From e9decb46d3f17d96ab256df17ac6eab86aec8c29 Mon Sep 17 00:00:00 2001 From: Nemanja Grujic <109360083+nemanjagrujic@users.noreply.github.com> Date: Fri, 7 Feb 2025 12:18:39 +0100 Subject: [PATCH 001/316] #6539: (#7749 #3176 #4514 #5145 #3601 #3602 #6947) Fix multiple unit and sweep tests (#16850) ### Ticket https://github.com/tenstorrent/tt-metal/issues/6539 https://github.com/tenstorrent/tt-metal/issues/7749 https://github.com/tenstorrent/tt-metal/issues/3176 https://github.com/tenstorrent/tt-metal/issues/4514 https://github.com/tenstorrent/tt-metal/issues/5145 https://github.com/tenstorrent/tt-metal/issues/3601 https://github.com/tenstorrent/tt-metal/issues/3602 https://github.com/tenstorrent/tt-metal/issues/6947 ### Problem description API changes and various other changes caused some sweep and unit tests to stop working. ### What's changed 1. Fixed non-working sweep and unit tests. 2. For the ttnn.reshape sweep, moved the xfail sweeps to the nightly suite. ### Checklist - [X] Post commit CI passes (https://github.com/tenstorrent/tt-metal/actions/runs/12830695055) - [X] Sweep tests pass --- .../sweeps/data_movement/reshape/reshape.py | 2 +- .../logaddexp2_bw/logaddexp2_bw.py | 13 +++-- .../logaddexp_bw/logaddexp_bw.py | 13 +++-- .../sweeps/eltwise/unary/logit/logit.py | 20 ++------ .../sweep_framework/sweeps/losses/l1_loss.py | 30 +++++++----- .../sweep_framework/sweeps/losses/mse_loss.py | 31 +++++++----- .../grayskull/test_backward_fill.py | 36 -------------- ...est_eltwise_scale_mask_softmax_in_place.py | 2 +- .../test_eltwise_softmax_in_place.py | 10 ++-- .../wormhole/test_backward_fill.py | 35 -------------- .../sweep_tests/generation_funcs.py | 2 +- .../sweep_tests/pytorch_ops.py | 12 ++--- .../grayskull/test_reshape.py | 37 +++++++----- .../wormhole/test_min_max.py | 43 +++++------------ .../python_api_testing/sweep_tests/op_map.py | 2 +- .../grayskull/ttnn_eltwise_signbit_test.yaml | 48 ------------------- .../wormhole/ttnn_eltwise_signbit_test.yaml | 48 ------------------- .../wormhole/ttnn_sum_test.yaml | 2 +- .../grayskull/ttnn_eltwise_signbit_test.yaml | 25 ++++++++++ .../wormhole/ttnn_eltwise_signbit_test.yaml | 25 ++++++++++ .../sweep_tests/ttnn_ops.py | 15 +----- 21 files changed, 154 insertions(+), 297 deletions(-) delete mode 100644 tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_backward_fill.py delete mode 100644 tests/tt_eager/python_api_testing/non_working_unit_tests/wormhole/test_backward_fill.py delete mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/grayskull/ttnn_eltwise_signbit_test.yaml delete mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/wormhole/ttnn_eltwise_signbit_test.yaml create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_signbit_test.yaml create mode 100644 tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_signbit_test.yaml diff --git a/tests/sweep_framework/sweeps/data_movement/reshape/reshape.py b/tests/sweep_framework/sweeps/data_movement/reshape/reshape.py index 69d188257e1..e7c1847c9f9 100644 --- a/tests/sweep_framework/sweeps/data_movement/reshape/reshape.py +++ b/tests/sweep_framework/sweeps/data_movement/reshape/reshape.py @@ -63,7 +63,7 @@ def gen_reshape_shape(input_shape, step=1): # Does not have memory_config parameter parameters = { - "xfail": { + "nightly": { "input_shape": gen_shapes([1, 1, 1, 1], [6, 6, 256, 256], [1, 1, 1, 1],
16) + gen_shapes([1, 1, 1], [6, 256, 256], [1, 1, 1], 16) + gen_shapes([1, 1], [256, 256], [1, 1], 16), diff --git a/tests/sweep_framework/sweeps/eltwise/binary_backward/logaddexp2_bw/logaddexp2_bw.py b/tests/sweep_framework/sweeps/eltwise/binary_backward/logaddexp2_bw/logaddexp2_bw.py index 7e7adad1aa4..030bd454d9e 100644 --- a/tests/sweep_framework/sweeps/eltwise/binary_backward/logaddexp2_bw/logaddexp2_bw.py +++ b/tests/sweep_framework/sweeps/eltwise/binary_backward/logaddexp2_bw/logaddexp2_bw.py @@ -26,15 +26,15 @@ # Developers can create their own generator functions and pass them to the parameters as inputs. parameters = { "nightly": { - "input_shape": gen_shapes([1, 1, 1, 2], [6, 12, 256, 256], [1, 1, 1, 2], 2) - + gen_shapes([1, 1, 2], [12, 256, 256], [1, 1, 2], 2) - + gen_shapes([1, 2], [256, 256], [1, 2], 2), + "input_shape": gen_shapes([1, 1, 1, 2], [6, 12, 256, 256], [1, 1, 1, 2], 3) + + gen_shapes([1, 1, 2], [12, 256, 256], [1, 1, 2], 3) + + gen_shapes([1, 2], [256, 256], [1, 2], 3), "grad_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], - "grad_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT], # , ttnn.ROW_MAJOR_LAYOUT - "input_a_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT], - "input_b_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT], + "grad_layout": [ttnn.TILE_LAYOUT], # , ttnn.ROW_MAJOR_LAYOUT + "input_a_layout": [ttnn.TILE_LAYOUT], + "input_b_layout": [ttnn.TILE_LAYOUT], "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], @@ -130,5 +130,4 @@ def run( pcc[1] = min(pcc[1], str_to_float(pcc_tmp[1])) pcc[1] = str(pcc[1]) - # print(f"pcc {pcc} - {grad_dtype}, {input_a_dtype}, {input_b_dtype}") return [pcc, e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/binary_backward/logaddexp_bw/logaddexp_bw.py b/tests/sweep_framework/sweeps/eltwise/binary_backward/logaddexp_bw/logaddexp_bw.py index 329c9cb3f59..d5166dc5289 100644 --- a/tests/sweep_framework/sweeps/eltwise/binary_backward/logaddexp_bw/logaddexp_bw.py +++ b/tests/sweep_framework/sweeps/eltwise/binary_backward/logaddexp_bw/logaddexp_bw.py @@ -26,15 +26,15 @@ # Developers can create their own generator functions and pass them to the parameters as inputs. 
parameters = { "nightly": { - "input_shape": gen_shapes([1, 1, 1, 2], [6, 12, 256, 256], [1, 1, 1, 2], 2) - + gen_shapes([1, 1, 2], [12, 256, 256], [1, 1, 2], 2) - + gen_shapes([1, 2], [256, 256], [1, 2], 2), + "input_shape": gen_shapes([1, 1, 1, 2], [6, 12, 256, 256], [1, 1, 1, 2], 3) + + gen_shapes([1, 1, 2], [12, 256, 256], [1, 1, 2], 3) + + gen_shapes([1, 2], [256, 256], [1, 2], 3), "grad_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], - "grad_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT], - "input_a_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT], - "input_b_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT], + "grad_layout": [ttnn.TILE_LAYOUT], + "input_a_layout": [ttnn.TILE_LAYOUT], + "input_b_layout": [ttnn.TILE_LAYOUT], "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], @@ -130,5 +130,4 @@ def run( pcc[1] = min(pcc[1], str_to_float(pcc_tmp[1])) pcc[1] = str(pcc[1]) - # print(f"pcc {pcc}") return [pcc, e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary/logit/logit.py b/tests/sweep_framework/sweeps/eltwise/unary/logit/logit.py index 3baa2df0d11..2e88a7d05d6 100644 --- a/tests/sweep_framework/sweeps/eltwise/unary/logit/logit.py +++ b/tests/sweep_framework/sweeps/eltwise/unary/logit/logit.py @@ -21,21 +21,11 @@ # Developers can create their own generator functions and pass them to the parameters as inputs. parameters = { "nightly": { - "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16) - + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16) - + gen_shapes([1, 1], [256, 256], [1, 1], 16), - "eps": [0, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1], - "input_a_dtype": [ttnn.bfloat16], - "input_a_layout": [ttnn.TILE_LAYOUT], - "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], - "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], - }, - "xfail": { - "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 1) - + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 1) - + gen_shapes([1, 1], [256, 256], [1, 1], 1), - "eps": [0, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1], - "input_a_dtype": [ttnn.bfloat8_b], + "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 8) + + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 8) + + gen_shapes([1, 1], [256, 256], [1, 1], 8), + "eps": [0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1], + "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], "input_a_layout": [ttnn.TILE_LAYOUT], "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], diff --git a/tests/sweep_framework/sweeps/losses/l1_loss.py b/tests/sweep_framework/sweeps/losses/l1_loss.py index 52a5c747974..bf11235de78 100644 --- a/tests/sweep_framework/sweeps/losses/l1_loss.py +++ b/tests/sweep_framework/sweeps/losses/l1_loss.py @@ -25,14 +25,10 @@ # Developers can create their own generator functions and pass them to the parameters as inputs. 
parameters = { "nightly": { - "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 8) - + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 8) - + gen_shapes([1, 1], [256, 256], [1, 1], 8), - "reduction": [ - ["none", ttnn.LossReductionMode.NONE], - ["mean", ttnn.LossReductionMode.MEAN], - ["sum", ttnn.LossReductionMode.SUM], - ], + "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 4) + + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 4) + + gen_shapes([1, 1], [256, 256], [1, 1], 4), + "reduction": ["__none", "__mean", "__sum"], "input_reference_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], "input_reference_layout": [ttnn.TILE_LAYOUT], "input_reference_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], @@ -72,10 +68,21 @@ def run( partial(torch_random, low=-100, high=100, dtype=torch.float32), input_prediction_dtype )(input_shape) + reduction_0 = "none" + reduction_1 = ttnn.LossReductionMode.NONE + + if reduction == "__mean": + reduction_0 = "mean" + reduction_1 = ttnn.LossReductionMode.MEAN + + if reduction == "__sum": + reduction_0 = "sum" + reduction_1 = ttnn.LossReductionMode.SUM + golden_function = ttnn.get_golden_function(ttnn.l1_loss) torch_output_tensor = golden_function( - torch_input_reference_tensor, torch_input_prediction_tensor, reduction=reduction[0] + torch_input_reference_tensor, torch_input_prediction_tensor, reduction=reduction_0 ) input_reference_tensor = ttnn.from_torch( @@ -98,16 +105,13 @@ def run( result = ttnn.l1_loss( input_reference_tensor, input_prediction_tensor, - reduction=reduction[1], + reduction=reduction_1, output_tensor=None, memory_config=output_memory_config, ) output_tensor = ttnn.to_torch(result) - if reduction[0] != "none": - output_tensor = output_tensor[0, 0, 0, 0] e2e_perf = stop_measuring_time(start_time) pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999) - # print(f"pcc {pcc} input_shape {input_shape} reduction {reduction[0]} {input_reference_dtype} {input_prediction_dtype}") return [pcc, e2e_perf] diff --git a/tests/sweep_framework/sweeps/losses/mse_loss.py b/tests/sweep_framework/sweeps/losses/mse_loss.py index 3a1b2a3bd3f..7429664474c 100644 --- a/tests/sweep_framework/sweeps/losses/mse_loss.py +++ b/tests/sweep_framework/sweeps/losses/mse_loss.py @@ -8,6 +8,7 @@ import torch import random import ttnn +import json from tests.sweep_framework.sweep_utils.utils import gen_shapes from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt @@ -25,14 +26,10 @@ # Developers can create their own generator functions and pass them to the parameters as inputs. 
parameters = { "nightly": { - "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 8) - + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 8) - + gen_shapes([1, 1], [256, 256], [1, 1], 8), - "reduction": [ - ["none", ttnn.LossReductionMode.NONE], - ["mean", ttnn.LossReductionMode.MEAN], - ["sum", ttnn.LossReductionMode.SUM], - ], + "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 4) + + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 4) + + gen_shapes([1, 1], [256, 256], [1, 1], 4), + "reduction": ["__none", "__mean", "__sum"], "input_reference_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], "input_reference_layout": [ttnn.TILE_LAYOUT], "input_reference_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], @@ -72,12 +69,23 @@ def run( partial(torch_random, low=-100, high=100, dtype=torch.float32), input_prediction_dtype )(input_shape) + reduction_0 = "none" + reduction_1 = ttnn.LossReductionMode.NONE + + if reduction == "__mean": + reduction_0 = "mean" + reduction_1 = ttnn.LossReductionMode.MEAN + + if reduction == "__sum": + reduction_0 = "sum" + reduction_1 = ttnn.LossReductionMode.SUM + golden_function = ttnn.get_golden_function(ttnn.mse_loss) torch_output_tensor = golden_function( torch_input_reference_tensor.to(torch.float32), torch_input_prediction_tensor.to(torch.float32), - reduction=reduction[0], + reduction=reduction_0, ) input_reference_tensor = ttnn.from_torch( @@ -100,16 +108,13 @@ def run( result = ttnn.mse_loss( input_reference_tensor, input_prediction_tensor, - reduction=reduction[1], + reduction=reduction_1, output_tensor=None, memory_config=output_memory_config, ) output_tensor = ttnn.to_torch(result) - if reduction[0] != "none": - output_tensor = output_tensor[0, 0, 0, 0] e2e_perf = stop_measuring_time(start_time) pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999) - # print(f"pcc {pcc} input_shape {input_shape} reduction {reduction[0]} {input_reference_dtype} {input_prediction_dtype}") return [pcc, e2e_perf] diff --git a/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_backward_fill.py b/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_backward_fill.py deleted file mode 100644 index 199a4edb06d..00000000000 --- a/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_backward_fill.py +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
- -# SPDX-License-Identifier: Apache-2.0 - -import torch -import pytest -import ttnn -from tests.tt_eager.python_api_testing.unit_testing.backward_ops.utility_funcs import data_gen_pt_tt, compare_results - - -@pytest.mark.parametrize( - "input_shapes", - ( - (torch.Size([1, 1, 32, 32])), - (torch.Size([1, 1, 320, 384])), - (torch.Size([1, 3, 320, 384])), - (torch.Size([8, 17, 160, 32])), - ), -) -# Pytorch Reference -# - name: fill.Tensor(Tensor self, Tensor value) -> Tensor -# self: zeros_like(grad) -# value: grad.sum() -# result: at::fill(self_t, value_t) -def test_bw_fill(input_shapes, device): - # torch.manual_seed(12386) - grad_data, grad_tensor = data_gen_pt_tt(input_shapes, device) - pyt_y = torch.zeros_like(grad_data) - grad_sum = grad_data.sum() - pyt_y.fill_(grad_sum) - - tt_output_tensor_on_device = ttnn.fill_bw(grad_tensor) - - golden_tensor = [pyt_y] - comp_pass = compare_results(tt_output_tensor_on_device, golden_tensor) - assert comp_pass diff --git a/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_eltwise_scale_mask_softmax_in_place.py b/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_eltwise_scale_mask_softmax_in_place.py index 471cbf2baf8..d71f47671d9 100644 --- a/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_eltwise_scale_mask_softmax_in_place.py +++ b/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_eltwise_scale_mask_softmax_in_place.py @@ -10,7 +10,7 @@ from tests.tt_eager.python_api_testing.sweep_tests import pytorch_ops from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc -from tests.tt_eager.python_api_testing.sweep_tests.tt_lib_ops import ( +from tests.ttnn.python_api_testing.sweep_tests.ttnn_ops import ( eltwise_scale_mask_softmax_in_place as tt_eltwise_scale_mask_softmax_in_place, ) diff --git a/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_eltwise_softmax_in_place.py b/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_eltwise_softmax_in_place.py index 07b387b0662..0744e470c74 100644 --- a/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_eltwise_softmax_in_place.py +++ b/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_eltwise_softmax_in_place.py @@ -5,12 +5,10 @@ from loguru import logger import pytest import torch +import ttnn -from tests.tt_eager.python_api_testing.sweep_tests import pytorch_ops from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc -from tests.tt_eager.python_api_testing.sweep_tests.tt_lib_ops import ( - eltwise_softmax_in_place as tt_eltwise_softmax_in_place, -) +from tests.ttnn.python_api_testing.sweep_tests import ttnn_ops def run_eltwise_softmax_in_place_tests(input_shape, dtype, dlayout, in_mem_config, data_seed, device): @@ -23,9 +21,9 @@ def run_eltwise_softmax_in_place_tests(input_shape, dtype, dlayout, in_mem_confi x_ref = x.detach().clone() # get ref result - ref_value = pytorch_ops.softmax_in_place(x_ref) + ref_value = torch.softmax(x_ref, -1) - tt_result = tt_eltwise_softmax_in_place( + tt_result = ttnn_ops.eltwise_softmax_in_place( x=x, device=device, dtype=[dtype], layout=[dlayout], input_mem_config=[in_mem_config], output_mem_config=None ) diff --git a/tests/tt_eager/python_api_testing/non_working_unit_tests/wormhole/test_backward_fill.py b/tests/tt_eager/python_api_testing/non_working_unit_tests/wormhole/test_backward_fill.py deleted file mode 100644 index 
76788ff980f..00000000000 --- a/tests/tt_eager/python_api_testing/non_working_unit_tests/wormhole/test_backward_fill.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import torch -import pytest -from tests.tt_eager.python_api_testing.unit_testing.backward_ops.utility_funcs import data_gen_pt_tt, compare_results - - -@pytest.mark.parametrize( - "input_shapes", - ( - (torch.Size([1, 1, 32, 32])), - (torch.Size([1, 1, 320, 384])), - (torch.Size([1, 3, 320, 384])), - (torch.Size([8, 17, 160, 32])), - ), -) -# Pytorch Reference -# - name: fill.Tensor(Tensor self, Tensor value) -> Tensor -# self: zeros_like(grad) -# value: grad.sum() -# result: at::fill(self_t, value_t) -def test_bw_fill(input_shapes, device): - # torch.manual_seed(12386) - grad_data, grad_tensor = data_gen_pt_tt(input_shapes, device) - pyt_y = torch.zeros_like(grad_data) - grad_sum = grad_data.sum() - pyt_y.fill_(grad_sum) - - tt_output_tensor_on_device = ttnn.fill_bw(grad_tensor) - - golden_tensor = [pyt_y] - comp_pass = compare_results(tt_output_tensor_on_device, golden_tensor) - assert comp_pass diff --git a/tests/tt_eager/python_api_testing/sweep_tests/generation_funcs.py b/tests/tt_eager/python_api_testing/sweep_tests/generation_funcs.py index 27469ea9087..9390fee7df8 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/generation_funcs.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/generation_funcs.py @@ -1520,7 +1520,7 @@ def gen_polyval_args( yield input_info -def gen_arange_args(input_shapes, dtypes, layouts, mem_configs, low=-100, high=100, do_sanitize_args=True): +def gen_arange_args(input_shapes, dtypes, layouts, mem_configs, low=-100, high=100, do_sanitize_args=True, coregrid=[]): for input_info in gen_two_scalar_args( input_shapes, dtypes, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index 1e0b12e44fd..fcc41f186a6 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -890,14 +890,10 @@ def fill_rm(x, *args, **kwargs): return y -def fill_bw(x, *args, **kwargs): - grad_data = x.detach().clone() - - put_y = torch.zeros_like(grad_data) - grad_sum = grad_data.sum() - put_y.fill_(grad_sum) - - return put_y +def fill_bw(x, y, *args, **kwargs): + y.requires_grad = True + golden_function = ttnn.get_golden_function(ttnn.fill_bw) + return golden_function(x, y)[0] def fill_zero_bw(x, *args, **kwargs): diff --git a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_reshape.py b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_reshape.py index 64ff9c3b6e2..f035fd8d491 100644 --- a/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_reshape.py +++ b/tests/ttnn/python_api_testing/non_working_unit_tests/grayskull/test_reshape.py @@ -20,10 +20,16 @@ def run_reshape_tests(input_shape, dtype, dlayout, in_mem_config, output_mem_con try: # get ref result ref_value = torch.reshape(x, reshape_dims) - x = ttnn_ops.setup_ttnn_tensor(x, device, dlayout[0], in_mem_config, dtype[0]) - tt_result = ttnn.reshape(x, reshape_dims) - tt_result = ttnn_ops.ttnn_tensor_to_torch(tt_result, output_mem_config) + tt_result = ttnn_ops.reshape( + x, + device=device, + dtype=dtype, + layout=dlayout, + input_mem_config=[in_mem_config], + output_mem_config=output_mem_config, + reshape_dims=reshape_dims, + ) except Exception as e: 
logger.warning(f"Operation execution crashed") @@ -39,27 +45,36 @@ def run_reshape_tests(input_shape, dtype, dlayout, in_mem_config, output_mem_con (224, 128), [ttnn.bfloat16], [ttnn.TILE_LAYOUT], + (None), (ttnn.DRAM_MEMORY_CONFIG), + (448, 64), + 14748599, + ), + ( + (224, 128), + [ttnn.bfloat8_b], + [ttnn.TILE_LAYOUT], + (None), (ttnn.DRAM_MEMORY_CONFIG), (448, 64), - 11871267, + 14748599, ), ( - (10, 192, 64), + (12, 32, 160), [ttnn.bfloat16], [ttnn.TILE_LAYOUT], + (None), (ttnn.DRAM_MEMORY_CONFIG), - (ttnn.DRAM_MEMORY_CONFIG), - (4, 192, 160), - 14337480, + (1, 192, 320), + 14748599, ), ( - (6, 4, 224, 64), + (4, 12, 64, 224), [ttnn.bfloat16], [ttnn.TILE_LAYOUT], + (None), (ttnn.DRAM_MEMORY_CONFIG), - (ttnn.DRAM_MEMORY_CONFIG), - (24, 2, 32, 224), + (6, 8, 224, 64), 14748599, ), ] diff --git a/tests/ttnn/python_api_testing/non_working_unit_tests/wormhole/test_min_max.py b/tests/ttnn/python_api_testing/non_working_unit_tests/wormhole/test_min_max.py index d17199a4543..3f582c14f9c 100644 --- a/tests/ttnn/python_api_testing/non_working_unit_tests/wormhole/test_min_max.py +++ b/tests/ttnn/python_api_testing/non_working_unit_tests/wormhole/test_min_max.py @@ -12,43 +12,18 @@ from tests.ttnn.python_api_testing.sweep_tests import ttnn_ops -def run_min_tests(input_shape, dtype, dlayout, in_mem_config, output_mem_config, data_seed, dim, device): +def run_op_tests( + input_shape, dtype, dlayout, in_mem_config, output_mem_config, data_seed, dim, torch_op, tt_op, device +): torch.manual_seed(data_seed) x = torch.Tensor(size=input_shape[0]).uniform_(-100, 100).to(torch.bfloat16) try: # get ref result - ref_value = torch.min(x, dim) + ref_value = torch_op(x, dim).values - tt_result = ttnn_ops.min( - x, - dim=dim, - device=device, - dtype=dtype, - layout=dlayout, - input_mem_config=in_mem_config, - output_mem_config=output_mem_config, - ) - - except Exception as e: - logger.warning(f"Operation execution crashed") - raise e - - assert len(tt_result.shape) == len(ref_value.shape) - assert_with_pcc(ref_value, tt_result, 0.99) - - -def run_max_tests(input_shape, dtype, dlayout, in_mem_config, output_mem_config, data_seed, dim, device): - torch.manual_seed(data_seed) - - x = torch.Tensor(size=input_shape[0]).uniform_(-100, 100).to(torch.bfloat16) - - try: - # get ref result - ref_value = torch.max(x, dim) - - tt_result = ttnn_ops.max( + tt_result = tt_op( x, dim=dim, device=device, @@ -102,7 +77,9 @@ def run_max_tests(input_shape, dtype, dlayout, in_mem_config, output_mem_config, (test_sweep_args), ) def test_min(input_shape, dtype, dlayout, in_mem_config, output_mem_config, data_seed, dim, device): - run_min_tests(input_shape, dtype, dlayout, in_mem_config, output_mem_config, data_seed, dim, device) + run_op_tests( + input_shape, dtype, dlayout, in_mem_config, output_mem_config, data_seed, dim, torch.min, ttnn_ops.min, device + ) @pytest.mark.parametrize( @@ -110,4 +87,6 @@ def test_min(input_shape, dtype, dlayout, in_mem_config, output_mem_config, data (test_sweep_args), ) def test_max(input_shape, dtype, dlayout, in_mem_config, output_mem_config, data_seed, dim, device): - run_max_tests(input_shape, dtype, dlayout, in_mem_config, output_mem_config, data_seed, dim, device) + run_op_tests( + input_shape, dtype, dlayout, in_mem_config, output_mem_config, data_seed, dim, torch.max, ttnn_ops.max, device + ) diff --git a/tests/ttnn/python_api_testing/sweep_tests/op_map.py b/tests/ttnn/python_api_testing/sweep_tests/op_map.py index 49f08546578..0809643bc41 100644 --- 
a/tests/ttnn/python_api_testing/sweep_tests/op_map.py +++ b/tests/ttnn/python_api_testing/sweep_tests/op_map.py @@ -387,7 +387,7 @@ }, "ttnn-transpose_13": { "tt_op": ttnn_ops.transpose_13, - "pytorch_op": partial(pytorch_ops.transpose, dim0=0, dim1=3), + "pytorch_op": partial(pytorch_ops.transpose, dim0=1, dim1=3), }, "ttnn-transpose_23": { "tt_op": ttnn_ops.transpose_23, diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/grayskull/ttnn_eltwise_signbit_test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/grayskull/ttnn_eltwise_signbit_test.yaml deleted file mode 100644 index c9c76c45502..00000000000 --- a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/grayskull/ttnn_eltwise_signbit_test.yaml +++ /dev/null @@ -1,48 +0,0 @@ ---- -test-list: - - ttnn-eltwise-signbit: - shape: - start-shape: [1, 1, 32, 32] - end-shape: [6, 12, 256, 256] - interval: [1, 1, 32, 32] - num-dims: [2, 3, 4] - num-shapes: 1 - num-samples: 64 - args-sampling-strategy: "all" - datagen: - function: gen_rand - args: - low: -100 - high: 100 - comparison: - function: comp_equal - args-gen: gen_dtype_layout_device - args: - data-layout: ["TILE"] - data-type: ["BFLOAT16", "BFLOAT8_B"] - buffer-type: ["DRAM", "L1"] - out-buffer-type: ["DRAM", "L1"] - output-file: eltwise_signbit_sweep.csv - - ttnn-eltwise-signbit: - shape: - start-shape: [1, 1, 2, 2] - end-shape: [6, 12, 256, 256] - interval: [1, 1, 1, 2] - num-dims: [2, 3, 4] - num-shapes: 1 - num-samples: 64 - args-sampling-strategy: "all" - datagen: - function: gen_rand - args: - low: -100 - high: 100 - comparison: - function: comp_equal - args-gen: gen_dtype_layout_device - args: - data-layout: ["ROW_MAJOR"] - data-type: ["BFLOAT16"] - buffer-type: ["DRAM", "L1"] - out-buffer-type: ["DRAM", "L1"] - output-file: eltwise_signbit_sweep.csv diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/wormhole/ttnn_eltwise_signbit_test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/wormhole/ttnn_eltwise_signbit_test.yaml deleted file mode 100644 index c9c76c45502..00000000000 --- a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/wormhole/ttnn_eltwise_signbit_test.yaml +++ /dev/null @@ -1,48 +0,0 @@ ---- -test-list: - - ttnn-eltwise-signbit: - shape: - start-shape: [1, 1, 32, 32] - end-shape: [6, 12, 256, 256] - interval: [1, 1, 32, 32] - num-dims: [2, 3, 4] - num-shapes: 1 - num-samples: 64 - args-sampling-strategy: "all" - datagen: - function: gen_rand - args: - low: -100 - high: 100 - comparison: - function: comp_equal - args-gen: gen_dtype_layout_device - args: - data-layout: ["TILE"] - data-type: ["BFLOAT16", "BFLOAT8_B"] - buffer-type: ["DRAM", "L1"] - out-buffer-type: ["DRAM", "L1"] - output-file: eltwise_signbit_sweep.csv - - ttnn-eltwise-signbit: - shape: - start-shape: [1, 1, 2, 2] - end-shape: [6, 12, 256, 256] - interval: [1, 1, 1, 2] - num-dims: [2, 3, 4] - num-shapes: 1 - num-samples: 64 - args-sampling-strategy: "all" - datagen: - function: gen_rand - args: - low: -100 - high: 100 - comparison: - function: comp_equal - args-gen: gen_dtype_layout_device - args: - data-layout: ["ROW_MAJOR"] - data-type: ["BFLOAT16"] - buffer-type: ["DRAM", "L1"] - out-buffer-type: ["DRAM", "L1"] - output-file: eltwise_signbit_sweep.csv diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/wormhole/ttnn_sum_test.yaml 
b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/wormhole/ttnn_sum_test.yaml index 778788a0fc1..8e2e6fc26a0 100644 --- a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/wormhole/ttnn_sum_test.yaml +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_broken/wormhole/ttnn_sum_test.yaml @@ -7,7 +7,7 @@ test-list: interval: [1, 1, 32, 32] num-dims: [2, 3, 4] num-shapes: 1 - num-samples: 128 + num-samples: 256 args-sampling-strategy: "all" datagen: function: gen_rand diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_signbit_test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_signbit_test.yaml new file mode 100644 index 00000000000..f4a537149bd --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/grayskull/ttnn_eltwise_signbit_test.yaml @@ -0,0 +1,25 @@ +--- +test-list: + - ttnn-eltwise-signbit: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [2, 3, 4] + num-shapes: 1 + num-samples: 256 + args-sampling-strategy: "all" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_equal + args-gen: gen_dtype_layout_device + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM", "L1"] + out-buffer-type: ["DRAM", "L1"] + output-file: eltwise_signbit_sweep.csv diff --git a/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_signbit_test.yaml b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_signbit_test.yaml new file mode 100644 index 00000000000..f4a537149bd --- /dev/null +++ b/tests/ttnn/python_api_testing/sweep_tests/test_configs/ci_sweep_tests_working/wormhole/ttnn_eltwise_signbit_test.yaml @@ -0,0 +1,25 @@ +--- +test-list: + - ttnn-eltwise-signbit: + shape: + start-shape: [1, 1, 32, 32] + end-shape: [6, 12, 256, 256] + interval: [1, 1, 32, 32] + num-dims: [2, 3, 4] + num-shapes: 1 + num-samples: 256 + args-sampling-strategy: "all" + datagen: + function: gen_rand + args: + low: -100 + high: 100 + comparison: + function: comp_equal + args-gen: gen_dtype_layout_device + args: + data-layout: ["TILE"] + data-type: ["BFLOAT16", "BFLOAT8_B"] + buffer-type: ["DRAM", "L1"] + out-buffer-type: ["DRAM", "L1"] + output-file: eltwise_signbit_sweep.csv diff --git a/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py b/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py index 5388de12fc4..c7c0678f5bb 100644 --- a/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py +++ b/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py @@ -813,7 +813,7 @@ def reshape( **kwargs, ): t0 = setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) - t1 = ttnn.reshape(t0, reshape_dims) # , memory_config=output_mem_config) + t1 = ttnn.reshape(t0, reshape_dims, memory_config=output_mem_config) return ttnn_tensor_to_torch(t1) @@ -2772,9 +2772,7 @@ def arange( output_mem_config, **kwargs, ): - t0 = setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) - - t1 = ttnn.arange(start, end, step, device) + t1 = ttnn.arange(start, end, step, dtype=dtype[0], device=device, memory_config=input_mem_config[0]) return ttnn_tensor_to_torch(t1) @@ -2875,8 +2873,6 @@ def zeros( output_mem_config, **kwargs, ): - # t0 = setup_ttnn_tensor(x, 
device, layout[0], input_mem_config[0], dtype[0]) - t1 = ttnn.zeros( x.shape, device=device, @@ -3459,13 +3455,6 @@ def eltwise_unary_fmod( return ttnn_tensor_to_torch(t1) -def eltwise_softmax_in_place(x, *args, device, dtype, layout, input_mem_config, output_mem_config, **kwargs): - t0 = setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) - t1 = ttnn.softmax_in_place(t0) - - return ttnn_tensor_to_torch(t1) - - def eltwise_scale_mask_softmax_in_place( x, y, From 3387c94b80aab994631621526e1e16317b194315 Mon Sep 17 00:00:00 2001 From: Sofija Jovic <148721049+s-jovic@users.noreply.github.com> Date: Fri, 7 Feb 2025 13:15:49 +0100 Subject: [PATCH 002/316] #17134: Add SD down block unit test (#17653) --- .../tests/test_basic_transformer_block.py | 4 - .../tests/test_cross_attn_up_block_2d.py | 33 ++---- .../tests/test_downblock_2d.py | 105 ++++++++++++++++++ .../tests/test_transformer_2d_model.py | 8 +- .../stable_diffusion/tests/test_upblock_2d.py | 6 +- .../tests/test_upsample_2d.py | 5 +- .../ttnn_functional_downsample_2d_new_conv.py | 21 +--- .../ttnn_functional_resnetblock2d_new_conv.py | 16 +-- ...tional_unet_2d_condition_model_new_conv.py | 17 +-- .../ttnn_functional_upsample_2d_new_conv.py | 10 +- .../tt/ttnn_functional_utility_functions.py | 75 ++++++++----- 11 files changed, 181 insertions(+), 119 deletions(-) create mode 100644 models/demos/wormhole/stable_diffusion/tests/test_downblock_2d.py diff --git a/models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py b/models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py index 138d3ea6793..c478c77842c 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py @@ -12,10 +12,6 @@ from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_basic_transformer_block import basic_transformer_block from ttnn.model_preprocessing import preprocess_model_parameters from tests.ttnn.utils_for_testing import assert_with_pcc -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( - pre_process_input, - post_process_output, -) from models.utility_functions import ( skip_for_grayskull, ) diff --git a/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py index 517e6d85cfe..1b56106af40 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py @@ -11,6 +11,9 @@ from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_cross_attn_upblock_new_conv import ( cross_attention_upblock2d, ) +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( + preprocess_and_push_input_to_device, +) from models.utility_functions import skip_for_grayskull, torch_random from ttnn.model_preprocessing import preprocess_model_parameters from tests.ttnn.utils_for_testing import assert_with_pcc @@ -23,24 +26,6 @@ def ttnn_to_torch(input): return input -def prepare_input_and_push_to_device(input, device, memory_config): - input = torch.permute(input, (0, 2, 3, 1)) - input = torch.reshape( - input, - ( - 1, - 1, - input.shape[0] * input.shape[1] * input.shape[2], - input.shape[3], - ), - ) - - input = ttnn.from_torch(input, ttnn.bfloat16) - input = ttnn.to_layout(input, ttnn.TILE_LAYOUT) - input = ttnn.to_dtype(input, 
ttnn.bfloat8_b) - return ttnn.to_device(input, device, memory_config=memory_config) - - @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True) @pytest.mark.parametrize( @@ -163,10 +148,10 @@ def test_cross_attn_up_block_2d_512x512( norm_type = "layer_norm" attn_num_head_channels = 8 - hidden_state = prepare_input_and_push_to_device( - hidden_state, + hidden_state = preprocess_and_push_input_to_device( device, - ttnn.MemoryConfig( + hidden_state, + memory_config=ttnn.MemoryConfig( ttnn.TensorMemoryLayout.BLOCK_SHARDED, ttnn.BufferType.L1, ttnn.ShardSpec( @@ -184,9 +169,9 @@ def test_cross_attn_up_block_2d_512x512( ), ) - res0 = prepare_input_and_push_to_device(res0, device, ttnn.DRAM_MEMORY_CONFIG) - res1 = prepare_input_and_push_to_device(res1, device, ttnn.DRAM_MEMORY_CONFIG) - res2 = prepare_input_and_push_to_device(res2, device, ttnn.DRAM_MEMORY_CONFIG) + res0 = preprocess_and_push_input_to_device(device, res0) + res1 = preprocess_and_push_input_to_device(device, res1) + res2 = preprocess_and_push_input_to_device(device, res2) res_hidden_states_tuple = (res0, res1, res2) temb = temb.permute(2, 0, 1, 3) # pre-permute temb diff --git a/models/demos/wormhole/stable_diffusion/tests/test_downblock_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_downblock_2d.py new file mode 100644 index 00000000000..0148b2f10f4 --- /dev/null +++ b/models/demos/wormhole/stable_diffusion/tests/test_downblock_2d.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +from diffusers import StableDiffusionPipeline +import os +import ttnn +import pytest + +from models.utility_functions import torch_random +from tests.ttnn.utils_for_testing import assert_with_pcc +from models.utility_functions import ( + skip_for_grayskull, +) + +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_downblock_2d_new_conv import downblock2d +from models.demos.wormhole.stable_diffusion.custom_preprocessing import custom_preprocessor +from ttnn.model_preprocessing import preprocess_model_parameters +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( + get_default_compute_config, + preprocess_and_push_input_to_device, + post_process_output_and_move_to_host, +) + + +@skip_for_grayskull() +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True) +@pytest.mark.parametrize("hidden_states, shard_end_core, shard_shape", [([2, 1280, 8, 8], (7, 3), (32, 160))]) +@pytest.mark.parametrize("temb", [[1, 1, 2, 1280]]) +def test_downblock_512x512(reset_seeds, device, hidden_states, shard_end_core, shard_shape, temb): + # Initialize PyTorch component + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float32) + unet = pipe.unet + unet.eval() + torch_down_block = pipe.unet.down_blocks[3] + + # Initialize ttnn component + reader_patterns_cache = {} + parameters = preprocess_model_parameters( + initialize_model=lambda: unet, custom_preprocessor=custom_preprocessor, device=device + ) + parameters = parameters.down_blocks[3] + N, _, H, W = hidden_states + compute_kernel_config = get_default_compute_config(device) + + ttnn_down_block = downblock2d(device, parameters, reader_patterns_cache, N, H, W, compute_kernel_config) + + # Prepare inputs + in_channels = hidden_states[1] + out_channels = in_channels + temb_channels = 1280 + input_shape = hidden_states + hidden_states = torch_random(input_shape, 
-0.1, 0.1, dtype=torch.float32) + temb = torch_random(temb, -0.1, 0.1, dtype=torch.float32) + + # Run PyTorch component + torch_output, torch_residuals = torch_down_block(hidden_states, temb.squeeze(0).squeeze(0)) + + # Prepare inputs for ttnn component + hidden_states = preprocess_and_push_input_to_device( + device, + hidden_states, + memory_config=ttnn.MemoryConfig( + ttnn.TensorMemoryLayout.BLOCK_SHARDED, + ttnn.BufferType.L1, + ttnn.ShardSpec( + ttnn.CoreRangeSet( + { + ttnn.CoreRange( + ttnn.CoreCoord(0, 0), + ttnn.CoreCoord(shard_end_core[0], shard_end_core[1]), + ), + } + ), + shard_shape, + ttnn.ShardOrientation.ROW_MAJOR, + ), + ), + ) + + temb = temb.permute(2, 0, 1, 3) + temb = ttnn.from_torch(temb, ttnn.bfloat16) + temb = ttnn.to_layout(temb, ttnn.TILE_LAYOUT, ttnn.bfloat8_b) + temb = ttnn.to_device(temb, device, memory_config=ttnn.L1_MEMORY_CONFIG) + + # Run ttnn component + output, residuals = ttnn_down_block( + temb, + hidden_states, + in_channels, + out_channels, + temb_channels, + num_layers=2, + resnet_eps=1e-5, + resnet_act_fn="silu", + ) + + # Compare outputs + output = post_process_output_and_move_to_host(output, N, H, W, out_channels) + assert_with_pcc(torch_output, output, 0.97) + + for torch_residual, residual in zip(torch_residuals, residuals): + residual = post_process_output_and_move_to_host(residual, N, H, W, out_channels) + assert_with_pcc(torch_residual, residual, 0.97) diff --git a/models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py b/models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py index 2a82be21d1f..62ebd8ae241 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py @@ -17,7 +17,7 @@ from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_transformer_2d_new_conv import transformer_2d_model from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( pre_process_input, - post_process_output, + post_process_output_and_move_to_host, ) @@ -117,7 +117,7 @@ def test_transformer_2d_model_512x512( model = transformer_2d_model( device, parameters, input_shape[0], input_shape[2], input_shape[3], compute_kernel_config ) - ttnn_hidden_state = pre_process_input(model.device, ttnn_hidden_state) + ttnn_hidden_state = pre_process_input(ttnn_hidden_state) ttnn_hidden_state = ttnn.reshape( ttnn_hidden_state, ( @@ -147,14 +147,12 @@ def test_transformer_2d_model_512x512( upcast_attention=upcast_attention, ) - output = post_process_output( - model.device, + ttnn_output_torch = post_process_output_and_move_to_host( output, model.batch_size, model.input_height, model.input_width, model.proj_out_out_channels, ) - ttnn_output_torch = ttnn.to_torch(ttnn.to_layout(ttnn.from_device(output), layout=ttnn.ROW_MAJOR_LAYOUT)) assert_with_pcc(torch_output, ttnn_output_torch, 0.99) diff --git a/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py index a87164ca16c..e6d512614f0 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py @@ -19,7 +19,7 @@ from models.demos.wormhole.stable_diffusion.custom_preprocessing import custom_preprocessor from ttnn.model_preprocessing import preprocess_model_parameters from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( - post_process_output, + 
post_process_output_and_move_to_host, weight_to_bfp8, ) @@ -100,6 +100,6 @@ def test_upblock_512x512(reset_seeds, device, res_hidden_states_tuple, hidden_st upsample_size=None, ) - op = post_process_output(device, op, N, H * 2, W * 2, in_channels) - op = ttnn.to_torch(op) + op = post_process_output_and_move_to_host(op, N, H * 2, W * 2, in_channels) + assert_with_pcc(torch_output, op, 0.95) diff --git a/models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py index 5805ac33aac..348f645f497 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py @@ -18,7 +18,7 @@ from models.utility_functions import torch_random from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( pre_process_input, - post_process_output, + post_process_output_and_move_to_host, ) @@ -80,7 +80,6 @@ def test_upsample2d_512x512(device, scale_factor, batch_size, in_channels, input in_channels, out_channels, ) - tt_up = post_process_output(device, tt_up, batch_size, input_height * 2, input_width * 2, in_channels) - torch_up = ttnn.to_torch(tt_up) + torch_up = post_process_output_and_move_to_host(tt_up, batch_size, input_height * 2, input_width * 2, in_channels) assert_with_pcc(torch_output, torch_up, 0.99) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py index 7879a1ed984..0072f0ee88c 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py @@ -10,9 +10,9 @@ from tt_lib.fallback_ops import fallback_ops from models.utility_functions import torch_to_tt_tensor_rm, tt_to_torch_tensor from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( - run_ttnn_conv_with_pre_and_post_tensor_formatting, + conv_cache, + get_default_compute_config, ) -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import conv_cache import math @@ -136,13 +136,7 @@ def __call__( if hidden_states.memory_config() != self.input_memory_config: hidden_states = ttnn.to_memory_config(hidden_states, self.input_memory_config) - compute_config = ttnn.init_device_compute_kernel_config( - self.device.arch(), - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=True, - packer_l1_acc=False, - ) + compute_config = get_default_compute_config(self.device) if self.conv_config_override and "act_block_h" in self.conv_config_override: conv_config.act_block_h_override = self.conv_config_override["act_block_h"] @@ -187,14 +181,5 @@ def __call__( compute_config=compute_config, conv_op_cache=conv_cache, ) - # hidden_states = run_ttnn_conv_with_pre_and_post_tensor_formatting( - # self.device, - # self.conv, - # hidden_states, - # self.conv.batch_size, - # self.conv.output_height, - # self.conv.output_width, - # self.conv.out_channels, - # ) return hidden_states diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py index e106d541684..691081f1952 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py +++ 
b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py @@ -11,13 +11,13 @@ import torch from typing import Optional, Dict from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( - pre_process_input, - post_process_output, permute_conv_parameters, weight_to_bfp8, - dealloc_input, ) -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import conv_cache +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( + conv_cache, + get_default_compute_config, +) from loguru import logger @@ -721,13 +721,7 @@ def __call__( transpose_shards=False, reshard_if_not_optimal=False, ) - compute_config = ttnn.init_device_compute_kernel_config( - self.device.arch(), - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=True, - packer_l1_acc=False, - ) + compute_config = get_default_compute_config(self.device) if self.conv2_config_override and "act_block_h" in self.conv2_config_override: conv_config.act_block_h_override = self.conv2_config_override["act_block_h"] diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py index 5f50f1666cb..d4f5faa00b5 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py @@ -38,6 +38,7 @@ pad_group_norm_weight, pre_process_input, conv_cache, + get_default_compute_config, ) fp32_accum = True @@ -389,13 +390,7 @@ def __call__( transpose_shards=False, reshard_if_not_optimal=True, ) - compute_config = ttnn.init_device_compute_kernel_config( - self.device.arch(), - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=True, - packer_l1_acc=False, - ) + compute_config = get_default_compute_config(self.device) conv_kwargs = { "in_channels": in_channels, @@ -681,13 +676,7 @@ def __call__( transpose_shards=False, reshard_if_not_optimal=True, ) - compute_config = ttnn.init_device_compute_kernel_config( - self.device.arch(), - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=True, - packer_l1_acc=False, - ) + compute_config = get_default_compute_config(self.device) conv_kwargs_1 = { "in_channels": self.conv_out_in_channels, diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py index 7418626cd30..0c064214e57 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py @@ -12,10 +12,10 @@ from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_upsample_nearest_2d import upsample_nearest2d from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( - run_ttnn_conv_with_pre_and_post_tensor_formatting, conv_cache, ) from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( + get_default_compute_config, permute_conv_parameters, ) from loguru import logger @@ -97,13 +97,7 @@ def __call__(self, input, in_channels, out_channels): transpose_shards=False, reshard_if_not_optimal=False, # Reshard has error : 1616 Bytes unique+common runtime args targeting kernel reshard_reader on 
(x=0,y=0) are too large. Cannot be written as they will run into memory region reserved for result. Max allowable size is 1024 Bytes ) - compute_config = ttnn.init_device_compute_kernel_config( - self.device.arch(), - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=True, - packer_l1_acc=False, - ) + compute_config = get_default_compute_config(self.device) if self.conv_config_override and "act_block_h" in self.conv_config_override: conv_config.act_block_h_override = self.conv_config_override["act_block_h"] diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_utility_functions.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_utility_functions.py index c4f23188f1d..6460ca2eeaa 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_utility_functions.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_utility_functions.py @@ -20,10 +20,32 @@ def is_tile_dim_alligned(dim): return dim % 32 == 0 -def pre_process_input(device, tensor): +def pre_process_input(tensor): return ttnn.permute(tensor, (0, 2, 3, 1)) +# This function takes torch tensor in [N, Ci, H, W] format, transforms it to +# [1, 1, N*H*W, Ci] format and applies needed layout, type and memory config +def preprocess_and_push_input_to_device( + device, input, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG +): + input = torch.permute(input, (0, 2, 3, 1)) + input = torch.reshape( + input, + ( + 1, + 1, + input.shape[0] * input.shape[1] * input.shape[2], + input.shape[3], + ), + ) + + input = ttnn.from_torch(input, ttnn.bfloat16) + input = ttnn.to_layout(input, layout) + input = ttnn.to_dtype(input, dtype) + return ttnn.to_device(input, device, memory_config=memory_config) + + def pad_encoder_hidden_states(device, tensor, required_sequence_length): tensor = ttnn.to_layout(tensor, ttnn.ROW_MAJOR_LAYOUT) assert tensor.shape[0] == 1 @@ -60,36 +82,21 @@ def pad_encoder_hidden_states(device, tensor, required_sequence_length): return tensor -def post_process_output(device, tensor, batch_size, output_height, output_width, output_channels): - tensor = ttnn.to_layout( - tensor, - ttnn.ROW_MAJOR_LAYOUT, # use_multicore=ttnn.get_memory_config(tensor).shard_spec is not None - ) - tensor = ttnn.from_device(tensor) +def post_process_output_and_move_to_host(tensor, batch_size, output_height, output_width, output_channels): assert output_channels == tensor.shape[3] - tensor = fallback_ops.reshape( - tensor, - batch_size, - output_height, - output_width, - output_channels, - output_layout=ttnn.ROW_MAJOR_LAYOUT, - output_on_device=False, - ) - tensor = fallback_ops.permute(tensor, (0, 3, 1, 2), output_layout=ttnn.ROW_MAJOR_LAYOUT, output_on_device=False) - tensor = ttnn.to_layout(tensor, ttnn.TILE_LAYOUT) - tensor = ttnn.to_device(tensor, device) - return tensor - -def run_ttnn_conv_with_pre_and_post_tensor_formatting( - device, ttnn_conv_op, tensor: ttnn.Tensor, batch_size, output_height, output_width, output_channels -) -> ttnn.Tensor: - tensor = pre_process_input(device, tensor) - # print("Running conv op") - tensor = ttnn_conv_op(tensor) - tensor = post_process_output(device, tensor, batch_size, output_height, output_width, output_channels) - return tensor + torch_tensor = ttnn.to_torch(tensor) + torch_tensor = torch.reshape( + torch_tensor, + ( + batch_size, + output_height, + output_width, + output_channels, + ), + ) + torch_tensor = torch.permute(torch_tensor, (0, 3, 1, 2)) + return torch_tensor def 
ttnn_to_torch(input): @@ -267,3 +274,13 @@ def reshard_to(tensor, grid_size, layout, col_major=False, shape=None): ttnn.ShardOrientation.ROW_MAJOR, ) return tensor + + +def get_default_compute_config(device): + return ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) From 558da69ae48ef63ad2f8b13a92cd40a00f9f3972 Mon Sep 17 00:00:00 2001 From: Slavko Krstic Date: Fri, 7 Feb 2025 13:22:09 +0100 Subject: [PATCH 003/316] Add torch tensor cache to conv2d unit tests to speedup test execution (#17708) The function `torch.randn` takes a significant amount of time while executing `tests/ttnn/unit_tests/operations/test_new_conv2d.py`. The idea is to cache torch tensors with specific dimensions and reuse them in other tests that require tensors of those dimensions. It turns out that, out of approximately 3000 tensors that needed to be generated, there were only around 300 unique dimensions. This approach reduces the test execution time by 10%. --- .../unit_tests/operations/test_new_conv2d.py | 112 ++++++++++++++---- 1 file changed, 91 insertions(+), 21 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index c3f02edef65..7627f60e285 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -20,8 +20,27 @@ WS = ttnn.TensorMemoryLayout.WIDTH_SHARDED +# Cache map used for torch tensor reuse - the tensor will not be generated if a tensor of the same dimensions has already been generated +@pytest.fixture(scope="module") +def torch_tensor_map(request): + torch_tensor_map = {} + + return torch_tensor_map + + +def randomize_torch_tensor(torch_tensor_map, tensor_shape): + if tensor_shape in torch_tensor_map.keys(): + torch_tensor = torch_tensor_map[tensor_shape] + else: + torch_tensor = torch.randn(tensor_shape, dtype=torch.bfloat16).float() + torch_tensor_map[tensor_shape] = torch_tensor + + return torch_tensor + + def run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -64,15 +83,15 @@ def run_conv( total_batch_size = batch_size torch.manual_seed(0) - conv_input_shape = [total_batch_size, input_channels, input_height, input_width] - conv_weight_shape = [output_channels, input_channels // groups, filter_height, filter_width] - conv_bias_shape = [1, 1, 1, output_channels] - torch_input_tensor_nchw = torch.randn(conv_input_shape, dtype=torch.bfloat16).float() - + conv_input_shape = (total_batch_size, input_channels, input_height, input_width) + conv_weight_shape = (output_channels, input_channels // groups, filter_height, filter_width) + conv_bias_shape = (1, 1, 1, output_channels) + torch_input_tensor_nchw = randomize_torch_tensor(torch_tensor_map, conv_input_shape) torch_input_tensor = torch.permute(torch_input_tensor_nchw, (0, 2, 3, 1)) - torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16).float() - torch_bias_tensor = torch.randn(conv_bias_shape, dtype=torch.bfloat16).float() if has_bias else None + torch_weight_tensor = randomize_torch_tensor(torch_tensor_map, conv_weight_shape) + torch_bias_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) if has_bias else None + torch_out_golden_tensor = torch.nn.functional.conv2d( torch_input_tensor_nchw, torch_weight_tensor, @@ -190,6 +209,7 @@ def run_conv( def run_conv_with_split( device, + torch_tensor_map, 
math_fidelity, activations_dtype, weights_dtype, @@ -214,13 +234,13 @@ def run_conv_with_split( torch.manual_seed(0) assert input_channels % split_factor == 0 split_input_channels = input_channels // split_factor - full_conv_input_shape = [batch_size, input_channels, input_height, input_width] - full_conv_weight_shape = [output_channels, input_channels, filter_height, filter_width] - torch_input_tensor_nchw = torch.randn(full_conv_input_shape, dtype=torch.bfloat16).float() - torch_weight_tensor = torch.randn(full_conv_weight_shape, dtype=torch.bfloat16).float() - conv_bias_shape = [1, 1, 1, output_channels] - torch_bias_tensor = torch.randn(conv_bias_shape, dtype=torch.bfloat16).float() - torch_bias_zeroes_tensor = torch.randn(conv_bias_shape, dtype=torch.bfloat16).float() + full_conv_input_shape = (batch_size, input_channels, input_height, input_width) + full_conv_weight_shape = (output_channels, input_channels, filter_height, filter_width) + torch_input_tensor_nchw = randomize_torch_tensor(torch_tensor_map, full_conv_input_shape) + torch_weight_tensor = randomize_torch_tensor(torch_tensor_map, full_conv_weight_shape) + conv_bias_shape = (1, 1, 1, output_channels) + torch_bias_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) + torch_bias_zeroes_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) torch_out_golden_tensor = torch.nn.functional.conv2d( torch_input_tensor_nchw, torch_weight_tensor, @@ -344,6 +364,7 @@ def run_conv_with_split( @pytest.mark.parametrize("output_layout", [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]) def test_conv_features( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -370,6 +391,7 @@ def test_conv_features( run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -424,6 +446,7 @@ def test_conv_features( @pytest.mark.parametrize("output_layout", [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]) def test_conv_features_multi_device( mesh_device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -446,6 +469,7 @@ def test_conv_features_multi_device( run_conv( mesh_device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -507,6 +531,7 @@ def test_conv_features_multi_device( @pytest.mark.parametrize("tilized_input", [True, False], ids=["tilized", "row_major"]) def test_conv_ws( device, + torch_tensor_map, use_program_cache, batch_size, output_channels, @@ -536,20 +561,19 @@ def test_conv_ws( debug = False groups = 1 - conv_input_shape = [batch_size, input_channels, input_height, input_width] - conv_weight_shape = [output_channels, input_channels // groups, filter_height, filter_width] - conv_bias_shape = [1, 1, 1, output_channels] + conv_input_shape = (batch_size, input_channels, input_height, input_width) + conv_weight_shape = (output_channels, input_channels // groups, filter_height, filter_width) + conv_bias_shape = (1, 1, 1, output_channels) - torch_input_tensor_nchw = torch.randn(conv_input_shape, dtype=torch.bfloat16).float() - torch_input_tensor_nchw = torch_input_tensor_nchw.broadcast_to(conv_input_shape).float() + torch_input_tensor_nchw = randomize_torch_tensor(torch_tensor_map, conv_input_shape) torch_input_tensor = torch.permute(torch_input_tensor_nchw, (0, 2, 3, 1)) - torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16).float() + torch_weight_tensor = randomize_torch_tensor(torch_tensor_map, conv_weight_shape) tt_bias_tensor = None torch_bias_tensor = None if has_bias: - torch_bias_tensor = 
torch.randn(conv_bias_shape, dtype=torch.bfloat16).float() * 50 + torch_bias_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) * 50 tt_bias_tensor = ttnn.from_torch( torch_bias_tensor, weights_dtype if weights_dtype != ttnn.bfloat8_b else ttnn.float32 ) @@ -678,6 +702,7 @@ def test_conv_ws( @skip_for_grayskull() def test_conv_for_segformer_512x512( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -702,6 +727,7 @@ def test_conv_for_segformer_512x512( ): run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -767,6 +793,7 @@ def test_conv_for_segformer_512x512( @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_resnet50_conv_gs( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -805,6 +832,7 @@ def test_resnet50_conv_gs( run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -893,6 +921,7 @@ def test_resnet50_conv_gs( @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_resnet50_conv_wh( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -920,6 +949,7 @@ def test_resnet50_conv_wh( use_shallow_conv_variant = (input_channels == 16) and device.arch() == ttnn.device.Arch.GRAYSKULL run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -956,6 +986,7 @@ def test_resnet50_conv_wh( @pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG]) def test_conv_mem_config_wh( device, + torch_tensor_map, use_program_cache, batch_size, output_channels, @@ -978,6 +1009,7 @@ def test_conv_mem_config_wh( use_shallow_conv_variant = (input_channels == 16) and device.arch() != ttnn.device.Arch.WORMHOLE_B0 run_conv( device, + torch_tensor_map, ttnn.MathFidelity.LoFi, ttnn.bfloat8_b, ttnn.bfloat8_b, @@ -1060,6 +1092,7 @@ def test_conv_mem_config_wh( @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_resnet50_conv_wh_fp32( device, + torch_tensor_map, use_program_cache, math_fidelity, fp32_accum, @@ -1100,6 +1133,7 @@ def test_resnet50_conv_wh_fp32( use_shallow_conv_variant = (input_channels == 16) and device.arch() != ttnn.device.Arch.WORMHOLE_B0 run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1190,6 +1224,7 @@ def test_resnet50_conv_wh_fp32( @pytest.mark.parametrize("auto_shard", [False], ids=["no_auto_shard"]) def test_sd_conv( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -1215,6 +1250,7 @@ def test_sd_conv( pytest.skip("Not running split SD conv with auto formatting") run_conv_with_split( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1236,6 +1272,7 @@ def test_sd_conv( else: run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1328,6 +1365,7 @@ def test_sd_conv( @pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi]) def test_sd_conv_wh( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -1366,6 +1404,7 @@ def test_sd_conv_wh( if filter_height > 1 and (input_channels > 1280 or (input_channels > 640 and input_height > 16)): run_conv_with_split( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1389,6 +1428,7 @@ def test_sd_conv_wh( else: run_conv( device, + torch_tensor_map, math_fidelity, 
activations_dtype, weights_dtype, @@ -1466,6 +1506,7 @@ def test_sd_conv_wh( @pytest.mark.parametrize("auto_shard", [True, BS], ids=["auto_shard", "no_auto_shard"]) def test_unet_conv( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -1496,6 +1537,7 @@ def test_unet_conv( pytest.skip("OOM") run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1557,6 +1599,7 @@ def test_unet_conv( @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_unet_conv_wh( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -1586,6 +1629,7 @@ def test_unet_conv_wh( pytest.skip("OOM") run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1652,6 +1696,7 @@ def test_unet_conv_wh( @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_unet_conv_groups_2_wh( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -1682,6 +1727,7 @@ def test_unet_conv_groups_2_wh( pytest.skip("OOM") run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1748,6 +1794,7 @@ def test_unet_conv_groups_2_wh( @pytest.mark.parametrize("output_layout", [ttnn.TILE_LAYOUT]) def test_unet_conv_groups_4_6_wh( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -1777,6 +1824,7 @@ def test_unet_conv_groups_4_6_wh( pytest.skip("OOM") run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1843,6 +1891,7 @@ def test_unet_conv_groups_4_6_wh( @pytest.mark.parametrize("auto_shard", [False], ids=["no_auto_shard"]) def test_unet_conv_groups_8_wh( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -1873,6 +1922,7 @@ def test_unet_conv_groups_8_wh( pytest.skip("OOM") run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1913,6 +1963,7 @@ def test_unet_conv_groups_8_wh( @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_halo_reshard_conv( device, + torch_tensor_map, use_program_cache, shard_layout, batch_size, @@ -1935,6 +1986,7 @@ def test_halo_reshard_conv( run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -1971,6 +2023,7 @@ def test_halo_reshard_conv( @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_conv_core_nondivis( device, + torch_tensor_map, use_program_cache, shard_layout, batch_size, @@ -1997,6 +2050,7 @@ def test_conv_core_nondivis( run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -2057,6 +2111,7 @@ def test_conv_core_nondivis( ) def test_conv_dilation( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -2077,6 +2132,7 @@ def test_conv_dilation( config_override = {"act_block_w_div": act_block_w_div} run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -2145,6 +2201,7 @@ def test_conv_dilation( # @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_conv_groups( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -2168,6 +2225,7 @@ def test_conv_groups( ): run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -2257,6 +2315,7 @@ def test_conv_groups( 
@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_yolov4_conv_groups_larger_than_one( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -2285,6 +2344,7 @@ def test_yolov4_conv_groups_larger_than_one( pytest.skip("OOM") run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -2330,6 +2390,7 @@ def test_yolov4_conv_groups_larger_than_one( @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_swin_s_conv( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -2358,6 +2419,7 @@ def test_swin_s_conv( pytest.skip("OOM issue for batch_size 8") run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -2405,6 +2467,7 @@ def test_swin_s_conv( @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"]) def test_model_k_256x256( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -2426,6 +2489,7 @@ def test_model_k_256x256( ): run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -2481,6 +2545,7 @@ def test_model_k_256x256( @skip_for_grayskull() def test_conv_for_vanilla_unet( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -2505,6 +2570,7 @@ def test_conv_for_vanilla_unet( pytest.skip("This test is not supported for N300") run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -2564,6 +2630,7 @@ def test_conv_for_vanilla_unet( @pytest.mark.parametrize("has_bias", [True, False], ids=["with_bias", "no_bias"]) def test_non_tile_multiple_height_conv_wh( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -2618,6 +2685,7 @@ def test_non_tile_multiple_height_conv_wh( use_shallow_conv_variant = (input_channels == 16) and device.arch() != ttnn.device.Arch.WORMHOLE_B0 run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, @@ -2682,6 +2750,7 @@ def test_non_tile_multiple_height_conv_wh( @pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi]) def test_non_tile_multiple_width_conv_wh( device, + torch_tensor_map, use_program_cache, math_fidelity, activations_dtype, @@ -2702,6 +2771,7 @@ def test_non_tile_multiple_width_conv_wh( ): run_conv( device, + torch_tensor_map, math_fidelity, activations_dtype, weights_dtype, From d0b59bdf04c7a7c6d7781d4dd1113aca2ae0ebe0 Mon Sep 17 00:00:00 2001 From: Mark O'Connor Date: Fri, 7 Feb 2025 13:32:46 +0100 Subject: [PATCH 004/316] Add HF model support inc. DS-R1-Distill, Qwen needs yarn support (#17421) ### Problem description Existing codebase loads the meta checkpoint format but many derivative models are only available on huggingface. ### What's changed Add support for loading HuggingFace model formats, paving the way for full Qwen support (pending yarn rope implementation) and adding DeepSeek-R1-Distill-Llama-70B support. ### Checklist All passing locally. 
- [x] [all-post-commit](https://github.com/tenstorrent/tt-metal/actions/runs/13181023765) - [FIXED] Failing on loading the tokenizer on this pipeline only (investigating) - [x] [Single](https://github.com/tenstorrent/tt-metal/actions/runs/13142509908/job/36672984561) - [x] [Single-demos](https://github.com/tenstorrent/tt-metal/actions/runs/13180995444) - Only failing on N300 performance - Investigating - [ ] [T3K](https://github.com/tenstorrent/tt-metal/actions/runs/13142519276) - [x] [Unit](https://github.com/tenstorrent/tt-metal/actions/runs/13163296158/job/36737812258) - [x] [Model-perf](https://github.com/tenstorrent/tt-metal/actions/runs/13164376159) - [x] [Frequent-1](https://github.com/tenstorrent/tt-metal/actions/runs/13174954913) - [x] [Frequent-2](https://github.com/tenstorrent/tt-metal/actions/runs/13164380377/job/36742877847) - [x] [Demo](https://github.com/tenstorrent/tt-metal/actions/runs/13180986094) - [x] [TG](https://github.com/tenstorrent/tt-metal/actions/runs/13154035596/job/36707218743) - Pipelines have issues not related to these changes. --------- Signed-off-by: Salar Hosseini Co-authored-by: mtairum Co-authored-by: Salar Hosseini --- README.md | 5 +- models/common/rmsnorm.py | 3 + models/demos/llama3/PERF.md | 93 +- models/demos/llama3/README.md | 27 +- models/demos/llama3/demo/demo.py | 254 ++--- .../demo/input_data_questions_reasoning.json | 20 + .../demos/llama3/demo/simple_vision_demo.py | 2 +- models/demos/llama3/lt | 57 +- models/demos/llama3/requirements.txt | 1 + .../llama3/tests/generate_reference_hf.py | 148 +++ .../tests/generate_reference_outputs.py | 60 +- .../tests/generate_reference_outputs.sh | 27 +- ..._llama_cross_attention_transformer_text.py | 9 +- ...{70b.refpt => Llama3.1-70B-Instruct.refpt} | Bin .../{8b.refpt => Llama3.1-8B-Instruct.refpt} | Bin ...{11b.refpt => Llama3.2-11B-Instruct.refpt} | Bin .../{1b.refpt => Llama3.2-1B-Instruct.refpt} | Bin .../{3b.refpt => Llama3.2-3B-Instruct.refpt} | Bin .../Qwen2.5-72B-Instruct.refpt | Bin 0 -> 50726 bytes .../Qwen2.5-7B-Instruct.refpt | Bin 0 -> 50720 bytes .../tests/test_interleaved_to_sharded.py | 35 +- .../demos/llama3/tests/test_llama_accuracy.py | 41 +- .../llama3/tests/test_llama_attention.py | 28 +- .../tests/test_llama_attention_prefill.py | 14 +- .../demos/llama3/tests/test_llama_decoder.py | 7 +- .../tests/test_llama_decoder_prefill.py | 11 +- .../llama3/tests/test_llama_embedding.py | 8 +- models/demos/llama3/tests/test_llama_mlp.py | 22 +- models/demos/llama3/tests/test_llama_model.py | 114 +-- .../llama3/tests/test_llama_model_prefill.py | 23 +- .../demos/llama3/tests/test_llama_rms_norm.py | 9 +- models/demos/llama3/tests/test_llama_torch.py | 13 +- models/demos/llama3/tests/test_lm_head.py | 3 +- models/demos/llama3/tests/test_ref.py | 104 ++ models/demos/llama3/tt/generator_vllm.py | 2 +- models/demos/llama3/tt/llama_attention.py | 139 ++- models/demos/llama3/tt/llama_ccl.py | 8 +- models/demos/llama3/tt/llama_common.py | 95 +- models/demos/llama3/tt/llama_decoder.py | 2 + models/demos/llama3/tt/llama_mlp.py | 75 +- models/demos/llama3/tt/llama_model.py | 14 +- models/demos/llama3/tt/llama_rope.py | 33 +- models/demos/llama3/tt/lm_head.py | 16 +- models/demos/llama3/tt/load_checkpoints.py | 303 ++++++ models/demos/llama3/tt/model_config.py | 897 +++++++++++++----- .../tt/multimodal/llama_cross_attention.py | 2 + .../llama_cross_attention_transformer_text.py | 8 +- ...lama_cross_attention_transformer_vision.py | 14 +- .../llama3/tt/multimodal/llama_image_mlp.py | 14 +- 
.../tt/multimodal/llama_vision_model.py | 3 +- 50 files changed, 1983 insertions(+), 780 deletions(-) create mode 100644 models/demos/llama3/demo/input_data_questions_reasoning.json mode change 100644 => 100755 models/demos/llama3/lt create mode 100644 models/demos/llama3/tests/generate_reference_hf.py rename models/demos/llama3/tests/reference_outputs/{70b.refpt => Llama3.1-70B-Instruct.refpt} (100%) rename models/demos/llama3/tests/reference_outputs/{8b.refpt => Llama3.1-8B-Instruct.refpt} (100%) rename models/demos/llama3/tests/reference_outputs/{11b.refpt => Llama3.2-11B-Instruct.refpt} (100%) rename models/demos/llama3/tests/reference_outputs/{1b.refpt => Llama3.2-1B-Instruct.refpt} (100%) rename models/demos/llama3/tests/reference_outputs/{3b.refpt => Llama3.2-3B-Instruct.refpt} (100%) create mode 100644 models/demos/llama3/tests/reference_outputs/Qwen2.5-72B-Instruct.refpt create mode 100644 models/demos/llama3/tests/reference_outputs/Qwen2.5-7B-Instruct.refpt create mode 100644 models/demos/llama3/tests/test_ref.py create mode 100644 models/demos/llama3/tt/load_checkpoints.py diff --git a/README.md b/README.md index e4d2c5b951d..817558ebf75 100644 --- a/README.md +++ b/README.md @@ -36,12 +36,13 @@ | [Llama 3.1 70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.54.0-rc2](https://github.com/tenstorrent/tt-metal/tree/v0.54.0-rc2) | [9531611](https://github.com/tenstorrent/vllm/tree/953161188c50f10da95a88ab305e23977ebd3750) | | [Falcon 40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19) | | | [Mixtral 8x7B (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 227 | 14.9 | 33 | 476.8 | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19) | | +| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](./models/demos/llama3) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 1113 | 16.4 | 33 |386.4 | [main](https://github.com/tenstorrent/tt-metal/) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | | [Falcon 7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 223 | 4.8 | 26 | 4915.2 | [v0.55.0-rc18](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc18) | | | [Llama 3.1 70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | | | [Llama 3.1 70B (TP=32)](./models/demos/llama3) | 32 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 763 | 13.5 | 80 | 432.0 | [v0.55.0-rc12](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc12) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | -| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](https://github.com/tenstorrent/tt-metal/tree/hf-llama/models/demos/llama3) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 1113 | 16.4 | 33 |524.8 | [hf-llama](https://github.com/tenstorrent/tt-metal/tree/hf-llama) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | +| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](https://github.com/tenstorrent/tt-metal/tree/main/models/demos/llama3) | 32 | 
[QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 1113 | 16.4 | 33 |524.8 | [main](https://github.com/tenstorrent/tt-metal/) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | -> **Last Update:** January 27, 2025 +> **Last Update:** February 5, 2025 > > **Notes:** > diff --git a/models/common/rmsnorm.py b/models/common/rmsnorm.py index 36f06ea8cc4..28eb9cadf55 100644 --- a/models/common/rmsnorm.py +++ b/models/common/rmsnorm.py @@ -49,10 +49,12 @@ def __init__( eps: float = 1e-05, sharded_program_config=None, sharded_output_config=None, + ccl_topology=ttnn.Topology.Ring, ): super().__init__() self.eps = eps self.is_distributed = is_distributed + self.ccl_topology = ccl_topology if state_dict_prefix: weight_name = f"{state_dict_prefix}{weight_key}.weight" @@ -144,6 +146,7 @@ def _distributed_rmsnorm( tt_stats, dim=3, num_links=1, + topology=self.ccl_topology, memory_config=ttnn.DRAM_MEMORY_CONFIG, ) # Run distributed rmsnorm part 2 diff --git a/models/demos/llama3/PERF.md b/models/demos/llama3/PERF.md index 62ac609d2ce..f0bb11616df 100644 --- a/models/demos/llama3/PERF.md +++ b/models/demos/llama3/PERF.md @@ -4,51 +4,54 @@ Performance collected from [demo/demo.py](demo/demo.py) and accuracy collected f Note that `test_llama_accuracy.py` parses the below to determine expected values +- 0.5. -## LlamaOptimizations.performance +## Performance This configuration uses bfp4 MLP FF1+FF3 for all models. -| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | -|-------|--------|-----------|-----------|---------------| -| 1b | N150 | 87 | 98 | 91.0 | -| 1b | N300 | 87 | 98 | 98.8 | -| 1b | T3K | 87 | 98 | 97.8 | -| 1b | TG | 88 | 99 | 51.0 | -| 3b | N150 | 90 | 98 | 49.2 | -| 3b | N300 | 90 | 98 | 56.8 | -| 3b | T3K | 88 | 98 | 54.5 | -| 3b | TG | 90 | 97 | 33.5 | -| 8b | N150 | 86 | 99 | 28.6 | -| 8b | N300 | 85 | 98 | 38.9 | -| 8b | T3K | 84 | 97 | 53.7 | -| 8b | TG | 86 | 98 | 29.5 | -| 11b | N300 | 87 | 98 | 38.6 | -| 11b | T3K | 88 | 98 | 52.6 | -| 11b | TG | 86 | 98 | 29.5 | -| 70b | T3K | 95 | 99 | 14.7 | -| 70b | TG | 95 | 100 | 12.7 | - - -## LlamaOptimizations.accuracy - -This configuration uses bfp4 MLP FF1+FF3 only for the 3.1-70B model. 
- -| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | -|-------|--------|-----------|-----------|---------------| -| 1b | N150 | 89 | 98 | 86.8 | -| 1b | N300 | 88 | 99 | 98.1 | -| 1b | T3K | 86 | 99 | 97.5 | -| 1b | TG | 87 | 98 | 51.3 | -| 3b | N150 | 92 | 100 | 44.2 | -| 3b | N300 | 92 | 99 | 54.2 | -| 3b | T3K | 91 | 98 | 55.6 | -| 3b | TG | 91 | 98 | 33.6 | -| 8b | N150 | 91 | 99 | 23.6 | -| 8b | N300 | 91 | 99 | 34.5 | -| 8b | T3K | 90 | 99 | 49.8 | -| 8b | TG | 88 | 100 | 29.5 | -| 11b | N300 | 91 | 99 | 33.8 | -| 11b | T3K | 91 | 99 | 52.6 | -| 11b | TG | 88 | 100 | 29.5 | -| 70b | T3K | 95 | 99 | 14.7 | -| 70b | TG | 95 | 100 | 12.7 | +| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | +|----------------|--------|-----------|-----------|---------------| +| Llama3.2-1B | N150 | 89 | 98 | 86.9 | +| Llama3.2-1B | N300 | 91 | 98 | 104.3 | +| Llama3.2-1B | T3K | 91 | 98 | 118.5 | +| Llama3.2-1B | TG | | | 72.3 | +| Llama3.2-3B | N150 | 92 | 96 | 53.3 | +| Llama3.2-3B | N300 | 91 | 96 | 66.1 | +| Llama3.2-3B | T3K | 91 | 96 | 66.9 | +| Llama3.2-3B | TG | | | 48.5 | +| Llama3.1-8B | N150 | 87 | 99 | 27.9 | +| Llama3.1-8B | N300 | 88 | 99 | 43.7 | +| Llama3.1-8B | T3K | 91 | 100 | 64.2 | +| Llama3.1-8B | TG | | | 41.0 | +| Llama3.2-11B | N300 | 89 | 99 | 43.5 | +| Llama3.2-11B | T3K | 88 | 99 | 63.4 | +| Llama3.2-11B | TG | | | 40.9 | +| Llama3.1-70B | T3K | 96 | 100 | 16.1 | +| Llama3.1-70B | TG | | | | +| Qwen2.5-7B | N300 | 81 | 96 | 37.9 | +| Qwen2.5-72B | T3K | 99 | 100 | 12.8 | + +## Accuracy + +This configuration uses bfp4 MLP FF1+FF3 only for the Llama-3.1-70B model and the Qwen-2.5-72B model. + +| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | +|----------------|--------|-----------|-----------|---------------| +| Llama3.2-1B | N150 | 88 | 98 | 86.8 | +| Llama3.2-1B | N300 | 90 | 98 | 98.1 | +| Llama3.2-1B | T3K | 90 | 98 | 97.5 | +| Llama3.2-1B | TG | 87 | 98 | 51.3 | +| Llama3.2-3B | N150 | 93 | 99 | 44.2 | +| Llama3.2-3B | N300 | 92 | 98 | 54.2 | +| Llama3.2-3B | T3K | 93 | 98 | 55.6 | +| Llama3.2-3B | TG | 91 | 98 | 33.6 | +| Llama3.1-8B | N150 | 93 | 100 | 23.6 | +| Llama3.1-8B | N300 | 93 | 100 | 34.5 | +| Llama3.1-8B | T3K | 92 | 100 | 49.8 | +| Llama3.1-8B | TG | 88 | 100 | 29.5 | +| Llama3.2-11B | N300 | 93 | 100 | 33.8 | +| Llama3.2-11B | T3K | 94 | 100 | 52.6 | +| Llama3.2-11B | TG | 88 | 100 | 29.5 | +| Llama3.1-70B | T3K | 97 | 100 | 14.7 | +| Llama3.1-70B | TG | 95 | 100 | 12.7 | +| Qwen2.5-7B | N300 | 81 | 96 | 33.4 | +| Qwen2.5-72B | T3K | 99 | 100 | 12.8 | diff --git a/models/demos/llama3/README.md b/models/demos/llama3/README.md index b64f4739a90..65d370e4a5b 100644 --- a/models/demos/llama3/README.md +++ b/models/demos/llama3/README.md @@ -8,6 +8,7 @@ The current version supports the following Llama3 models: - Llama3.1-8B - Llama3.2-11B - Llama3.1-70B (T3000 and TG-only) +- DeepSeek R1 Distill Llama 3.3 70B (T3000 and TG-only) All the above llama models (with the exception of 70B due to its large size) are compatible and tested on the following Tenstorrent hardware: - N150 (1-chip) @@ -25,13 +26,15 @@ Max Prefill Chunk Sizes (text-only): | Llama3.1-8B | 4k tokens | 64k tokens | 128k tokens | 128k tokens | | Llama3.2-11B | 4k tokens | 64k tokens | 128k tokens | 128k tokens | | Llama3.1-70B | Not supported | Not supported | 32k tokens | 128k tokens | +| DeepSeek-R1-Distill-Llama3.3-70B | Not supported | Not supported | 32k tokens | 128k tokens | + - These max chunk sizes are specific to max context length 128k and are configured via 
`MAX_PREFILL_CHUNK_SIZES_DIV1024` in [model_config.py](https://github.com/tenstorrent/tt-metal/blob/main/models/demos/llama3/tt/model_config.py). If the max context length is set to a smaller value using the `max_seq_len` flag (see [Run the demo](#run-the-demo)), these chunk sizes can possibly be increased due to using a smaller KV cache. **Max Context Lengths (Llama3.2-11B multimodal)**: Llama3.2-11B multimodal is currently only supported on N300 and T3000. On N300, a max prefill context length of 8k is supported, while T3000 supports a max context length of 128k. ## How to Run -### Download the weights +### Llama models: download the weights Download the weights [directly from Meta](https://llama.meta.com/llama-downloads/), this will mean accepting their license terms. @@ -59,17 +62,33 @@ Llama3.2-11B multimodal requires extra python dependencies. Install them from: pip install -r models/demos/llama3/requirements.txt ``` +### HuggingFace models (e.g. DeepSeek R1 Distill Llama 3.3 70B) + +Download the weights from [HuggingFace](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B). Your model directory should have the following structure: + +``` +DeepSeek-R1-Distill-Llama-70B/ + config.json + generation_config.json + model-00001-of-00062.safetensors + ... +``` + ### Setup TT environment 1. Set up environment variables: ``` -export LLAMA_DIR= +export LLAMA_DIR= +``` + +On N150, N300 and T3K: +``` export WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ``` - `$LLAMA_DIR` sets the path for the Llama3 model weights and caches. -- `$WH_ARCH_YAML` sets the dispatch over ethernet cores. This is optional for N150 and required for N300 and T3000, enabling a full core grid utilization (8x8), allowing for maximum performance of LLama3 models. +- `$WH_ARCH_YAML` sets the dispatch over ethernet cores. This is optional for N150 and required for N300 and T3000, enabling a full core grid utilization (8x8), allowing for maximum performance of LLama3 models. Do not set this for TG. On the first execution of each model, TTNN will create weight cache files for that model, to speed up future runs. These cache files only need to be created once for each model and each weight (i.e. new finetuned weights will need to be cached) and will be stored accordingly to the machine you are running the models: @@ -80,7 +99,6 @@ $LLAMA_DIR/T3K # For T3000 $LLAMA_DIR/TG # For TG ``` - ### Run the demo The Llama3 demo includes 3 main modes of operation and is fully parametrized to support other configurations. @@ -88,6 +106,7 @@ The Llama3 demo includes 3 main modes of operation and is fully parametrized to - `batch-1`: Runs a small prompt for a single user - `batch-32`: Runs a small prompt for a a batch of 32 users - `long-context`: Runs a large prompt (64k tokens) for a single user +- `reasoning-1`: Runs a reasoning prompt for a single user If you want to provide your own demo configuration, please take a look at the pytest parametrize calls in `models/demos/llama3/demo/demo.py`. 
For convenience we list all the supported params below: diff --git a/models/demos/llama3/demo/demo.py b/models/demos/llama3/demo/demo.py index a0b09e4dae1..21aea65fb6b 100644 --- a/models/demos/llama3/demo/demo.py +++ b/models/demos/llama3/demo/demo.py @@ -15,22 +15,17 @@ from pathlib import Path import hashlib -from models.utility_functions import nearest_32 from models.demos.llama3.tt.llama_common import ( get_prefill_rot_mat, - get_rot_transformation_mat, - HostEmbedding, - encode_prompt_llama_instruct, PagedAttentionConfig, sample_host, ) from models.demos.llama3.tt.llama_model import TtTransformer from models.demos.llama3.tt.llama_embedding import TtLlamaEmbedding -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer from models.demos.llama3.tt.model_config import TtModelArgs from models.perf.benchmarking_utils import BenchmarkProfiler -from models.demos.utils.llm_demo_utils import create_benchmark_data, verify_perf +from models.demos.utils.llm_demo_utils import create_benchmark_data from models.demos.llama3.tt.model_config import LlamaOptimizations @@ -108,10 +103,7 @@ def preprocess_inputs_prefill( if max_prefill_len == 128 * 1024: max_prefill_len = 128 * 1024 - max_generated_tokens - if instruct: - encoded_prompts = [encode_prompt_llama_instruct(tokenizer, prompt) for prompt in input_prompts] - else: - encoded_prompts = [tokenizer.encode(prompt, bos=True, eos=False) for prompt in input_prompts] + encoded_prompts = [model_args.encode_prompt(prompt, instruct=instruct) for prompt in input_prompts] # Print the length of encoded prompts logger.info("Encoded prompt lengths:" + ", ".join(str(len(prompt)) for prompt in encoded_prompts)) @@ -122,14 +114,26 @@ def preprocess_inputs_prefill( # The large input demo we provide contains more tokens than the maximum (32k tokens) # To avoid running out of memory, clip to max_prefill_len + if min_prompt_len > max_prefill_len: - logger.info(f"Clipping prompts to {max_prefill_len}") - if instruct: # When clipping, make sure to add the ` 】 token at the end (4 tokens) - encoded_prompts = [encod[: max_prefill_len - 4] for encod in encoded_prompts] - dec_prompts = [tokenizer.decode(encod) + " 】" for encod in encoded_prompts] - encoded_prompts = [tokenizer.encode(prompt, bos=True, eos=False) for prompt in dec_prompts] + logger.info(f"Left-clipping prompts to {max_prefill_len}") + if instruct: + # We need to allow a few tokens for the system prompt and the special turn tokens for assistant and user; + # to find out how big those will be, we will: + # 1. Tokenize the entire prompt with non-instruct tokenization + # 2. Calculate overhead = length of instruct tokenization - length of non-instruct tokenization + # 3. Shorten the tokenized clipped prompt by the overhead and convert back to text + # 4. Tokenize the result with instruct tokenization + # 5. 
Assert that the length of this is equal to the max_prefill_len + raw_prompts = [model_args.encode_prompt(prompt, instruct=False) for prompt in input_prompts] + overhead = [len(e) - len(r) for e, r in zip(encoded_prompts, raw_prompts)] + shortened = [tokenizer.decode(e[-(max_prefill_len - o) :]) for e, o in zip(raw_prompts, overhead)] + encoded_prompts = [model_args.encode_prompt(prompt, instruct=instruct) for prompt in shortened] + assert all( + len(e) == max_prefill_len for e in encoded_prompts + ), f"Clipped prompts are not of the correct length, expected {max_prefill_len} but got {[len(e) for e in encoded_prompts]}" else: - encoded_prompts = [encod[:max_prefill_len] for encod in encoded_prompts] + encoded_prompts = [encod[-max_prefill_len:] for encod in encoded_prompts] # Update prompt lengths prompt_lens = [len(x) for x in encoded_prompts] @@ -227,20 +231,20 @@ def run_llama3_demo( max_seq_len=max_seq_len, ) - tokenizer = Tokenizer(model_args.tokenizer_path) + tokenizer = model_args.tokenizer # Check max sequence length compatibility with model and architecture. Refer to README for more information - llama_model_name = model_args.model_name # ["3.2-1B", "3.2-3B", "3.1-8B", "3.2-11B", "3.1-70B"] + llama_model_name = model_args.base_model_name # ["3.2-1B", "3.2-3B", "3.1-8B", "3.2-11B", "3.1-70B"] tt_device_name = model_args.device_name # ["N150", "N300", "T3K", "TG"] - if llama_model_name in ["3.1-8B", "3.2-11B"] and tt_device_name == "N150": + if llama_model_name in ["Llama3.1-8B", "Llama3.2-11B"] and tt_device_name == "N150": assert ( max_seq_len <= 64 * 1024 ), "N150 only supports a max context length of 64k tokens for Llama3.1-8B and Llama3.2-11B" else: - assert max_seq_len <= 128 * 1024, f"Llama{llama_model_name} supports a max context length of 128k tokens" + assert max_seq_len <= 128 * 1024, f"{llama_model_name} supports a max context length of 128k tokens" - if llama_model_name == "3.1-70B": + if llama_model_name == "Llama3.1-70B": assert tt_device_name in ["T3K", "TG"], "Llama3.1-70B is only supported on T3K or TG" logger.info("Loading weights...") @@ -284,7 +288,7 @@ def run_llama3_demo( state_dict=state_dict, dtype=ttnn.bfloat16, # Row major layout requires bfloat16 ) - embd = HostEmbedding(model_args) + embd = model_args.reference_embedding() state_dict_prefix = model_args.get_state_dict_prefix("", None) embd.load_state_dict({"emb.weight": state_dict[f"{state_dict_prefix}tok_embeddings.weight"]}) profiler.end("loading_weights_to_device") @@ -340,8 +344,10 @@ def run_llama3_demo( model_args.head_dim, model_args.max_seq_len, mesh_device, - seq_len=prefill_seq_len, - scale_factor=model_args.rope_scaling_factor, + prefill_seq_len, + model_args.rope_theta, + model_args.rope_scaling_factor, + model_args.orig_context_len, ) if decoding_pos[batch_id] < prefill_seq_len: pt_prefill_input[batch_id][ @@ -483,10 +489,15 @@ def run_llama3_demo( if tt_model.args.num_devices > 1: if tt_model.args.is_galaxy: tt_out_gathered = ttnn.all_gather( - tt_out, dim=3, num_links=2, cluster_axis=0, mesh_device=mesh_device, topology=ttnn.Topology.Linear + tt_out, + dim=3, + num_links=2, + cluster_axis=0, + mesh_device=mesh_device, + topology=model_args.ccl_topology(), ) else: - tt_out_gathered = ttnn.all_gather(tt_out, dim=3, num_links=1, topology=ttnn.Topology.Linear) + tt_out_gathered = ttnn.all_gather(tt_out, dim=3, num_links=1, topology=model_args.ccl_topology()) ttnn.deallocate(tt_out) else: tt_out_gathered = tt_out @@ -527,10 +538,15 @@ def run_llama3_demo( if tt_model.args.num_devices > 1: if 
tt_model.args.is_galaxy: tt_out_gathered = ttnn.all_gather( - tt_out, dim=3, num_links=2, cluster_axis=0, mesh_device=mesh_device, topology=ttnn.Topology.Linear + tt_out, + dim=3, + num_links=2, + cluster_axis=0, + mesh_device=mesh_device, + topology=model_args.ccl_topology(), ) else: - tt_out_gathered = ttnn.all_gather(tt_out, dim=3, num_links=1, topology=ttnn.Topology.Linear) + tt_out_gathered = ttnn.all_gather(tt_out, dim=3, num_links=1, topology=model_args.ccl_topology()) ttnn.deallocate(tt_out) else: tt_out_gathered = tt_out @@ -550,13 +566,15 @@ def run_llama3_demo( current_pos_reset = ttnn.from_torch( current_pos, dtype=ttnn.int32, - mesh_mapper=ttnn.ShardTensor2dMesh( - mesh_device, - dims=(None, 0) if (model_args.is_galaxy and batch_size > 1) else (None, None), - mesh_shape=model_args.cluster_shape, - ) - if tt_model.args.num_devices > 1 - else None, + mesh_mapper=( + ttnn.ShardTensor2dMesh( + mesh_device, + dims=(None, 0) if (model_args.is_galaxy and batch_size > 1) else (None, None), + mesh_shape=model_args.cluster_shape, + ) + if tt_model.args.num_devices > 1 + else None + ), ) tt_out_tok_reset = ttnn.from_torch( torch.nn.functional.pad( @@ -629,8 +647,8 @@ def run_llama3_demo( for user in range(batch_size): user_tok = tt_output_torch[user].tolist() if ( - user_tok != 128009 and user_done[user] == False - ): # Stop saving the ouput after hitting the eos token (<|eot_id|>) (128009) + user_tok not in tokenizer.stop_tokens and user_done[user] == False + ): # Read until an eos token (e.g. <|eot_id|>); create_tokenizer adds stop_tokens to HF tokenizers all_outputs[user].append(user_tok) else: user_done[user] = True @@ -680,14 +698,10 @@ def run_llama3_demo( profiler.start(f"log_saving_file", iteration=batch_idx) for i, (output, prompt) in enumerate(zip(all_outputs, input_prompts)): text = tokenizer.decode(output) - if instruct_mode: - split_text = text.split("<|start_header_id|>assistant<|end_header_id|>", 1) - else: - split_text = text.split(prompt, 1) - if len(split_text) > 1: - text_after_prompt = split_text[1] - else: - text_after_prompt = text # If prompt is not found, use the whole text + prompt_including_assistant_tags = tokenizer.decode( + model_args.encode_prompt(prompt, instruct=instruct_mode) + ) + text_after_prompt = text.replace(prompt_including_assistant_tags, "", 1) if print_to_file: with open(output_filename, "a") as f: f.write( @@ -770,76 +784,78 @@ def run_llama3_demo( ) logger.info("") - supported_models = ["3.2-1B", "3.2-3B", "3.1-8B", "3.2-11B", "3.1-70B"] + supported_models = ["Llama3.2-1B", "Llama3.2-3B", "Llama3.1-8B", "Llama3.2-11B", "Llama3.1-70B"] supported_devices = ["N150", "N300", "T3K", "TG"] # TODO update targets based on the llama3 model and the target device - llama_model_name = model_args.model_name tt_device_name = model_args.device_name - assert llama_model_name in supported_models, f"Model {llama_model_name} not supported" - assert tt_device_name in supported_devices, f"Device {tt_device_name} not supported" - - # Set the target times to first token for every combination of device and model - target_prefill_tok_s = { - "N150_3.2-1B": 1050, # TODO Update target - "N300_3.2-1B": 1050, # TODO Update target - "T3K_3.2-1B": 1050, # TODO Update target - "TG_3.2-1B": 1050, # TODO Update target - # - "N150_3.2-3B": 1050, # TODO Update target - "N300_3.2-3B": 1050, # TODO Update target - "T3K_3.2-3B": 1050, # TODO Update target - "TG_3.2-3B": 1050, # TODO Update target - # - "N150_3.1-8B": 1050, - "N300_3.1-8B": 1050, - "T3K_3.1-8B": 1050, - "TG_3.1-8B": 
1050, - # - "N150_3.2-11B": 1050, # TODO Update target - "N300_3.2-11B": 1050, # TODO Update target - "T3K_3.2-11B": 1050, # TODO Update target - "TG_3.2-11B": 1050, # TODO Update target - # - "N150_3.1-70B": 1050, # TODO Update target - "N300_3.1-70B": 1050, # TODO Update target - "T3K_3.1-70B": 1050, # TODO Update target - "TG_3.1-70B": 1050, # TODO Update target - }[f"{tt_device_name}_{llama_model_name}"] - - # Set the target decode timesfor every combination of device and model - target_decode_tok_s_u = { - "N150_3.2-1B": 160, # TODO Update target - "N300_3.2-1B": 250, # TODO Update target - "T3K_3.2-1B": 300, # TODO Update target - "TG_3.2-1B": 300, # TODO Update target - # - "N150_3.2-3B": 60, # TODO Update target - "N300_3.2-3B": 100, # TODO Update target - "T3K_3.2-3B": 150, # TODO Update target - "TG_3.2-3B": 150, # TODO Update target - # - "N150_3.1-8B": 23, # TODO Update target - "N300_3.1-8B": 38, - "T3K_3.1-8B": 45, - "TG_3.1-8B": 45, # TODO Update target - # - "N150_3.2-11B": 23, - "N300_3.2-11B": 38, # TODO Update target - "T3K_3.2-11B": 45, # TODO Update target - "TG_3.2-11B": 45, # TODO Update target - # - "T3K_3.1-70B": 20, # TODO Update target - "TG_3.1-70B": 20, # TODO Update target - }[f"{tt_device_name}_{llama_model_name}"] - - target_decode_tok_s = target_decode_tok_s_u * batch_size - targets = { - "prefill_t/s": target_prefill_tok_s, - "decode_t/s": target_decode_tok_s, - "decode_t/s/u": target_decode_tok_s_u, - } + if model_args.base_model_name in supported_models: + assert tt_device_name in supported_devices, f"Device {tt_device_name} not supported" + + # Set the target times to first token for every combination of device and model + target_prefill_tok_s = { + "N150_Llama3.2-1B": 1050, # TODO Update target + "N300_Llama3.2-1B": 1050, # TODO Update target + "T3K_Llama3.2-1B": 1050, # TODO Update target + "TG_Llama3.2-1B": 1050, # TODO Update target + # + "N150_Llama3.2-3B": 1050, # TODO Update target + "N300_Llama3.2-3B": 1050, # TODO Update target + "T3K_Llama3.2-3B": 1050, # TODO Update target + "TG_Llama3.2-3B": 1050, # TODO Update target + # + "N150_Llama3.1-8B": 1050, + "N300_Llama3.1-8B": 1050, + "T3K_Llama3.1-8B": 1050, + "TG_Llama3.1-8B": 1050, + # + "N150_Llama3.2-11B": 1050, # TODO Update target + "N300_Llama3.2-11B": 1050, # TODO Update target + "T3K_Llama3.2-11B": 1050, # TODO Update target + "TG_Llama3.2-11B": 1050, # TODO Update target + # + "N150_Llama3.1-70B": 1050, # TODO Update target + "N300_Llama3.1-70B": 1050, # TODO Update target + "T3K_Llama3.1-70B": 1050, # TODO Update target + "TG_Llama3.1-70B": 1050, # TODO Update target + }[f"{tt_device_name}_{model_args.base_model_name}"] + + # Set the target decode timesfor every combination of device and model + target_decode_tok_s_u = { + "N150_Llama3.2-1B": 160, # TODO Update target + "N300_Llama3.2-1B": 250, # TODO Update target + "T3K_Llama3.2-1B": 300, # TODO Update target + "TG_Llama3.2-1B": 300, # TODO Update target + # + "N150_Llama3.2-3B": 60, # TODO Update target + "N300_Llama3.2-3B": 100, # TODO Update target + "T3K_Llama3.2-3B": 150, # TODO Update target + "TG_Llama3.2-3B": 150, # TODO Update target + # + "N150_Llama3.1-8B": 23, # TODO Update target + "N300_Llama3.1-8B": 38, + "T3K_Llama3.1-8B": 45, + "TG_Llama3.1-8B": 45, # TODO Update target + # + "N150_Llama3.2-11B": 23, + "N300_Llama3.2-11B": 38, # TODO Update target + "T3K_Llama3.2-11B": 45, # TODO Update target + "TG_Llama3.2-11B": 45, # TODO Update target + # + "T3K_Llama3.1-70B": 20, # TODO Update target + "TG_Llama3.1-70B": 20, # 
TODO Update target + }[f"{tt_device_name}_{model_args.base_model_name}"] + + target_decode_tok_s = target_decode_tok_s_u * batch_size + targets = { + "prefill_t/s": target_prefill_tok_s, + "decode_t/s": target_decode_tok_s, + "decode_t/s/u": target_decode_tok_s_u, + } + else: + logger.warning(f"Model {model_args.base_model_name} not does not have performance targets set") + targets = {} # Save benchmark data for CI dashboard if is_ci_env: @@ -847,7 +863,7 @@ def run_llama3_demo( benchmark_data.save_partial_run_json( profiler, run_type=f"{tt_device_name}-demo", - ml_model_name=llama_model_name, + ml_model_name=model_args.base_model_name, ml_model_type="llm", num_layers=model_args.n_layers, batch_size=batch_size, @@ -873,6 +889,17 @@ def run_llama3_demo( @pytest.mark.parametrize( "input_prompts, instruct, repeat_batches, max_seq_len, batch_size, max_generated_tokens, paged_attention, page_params, sampling_params", [ + ( # Batch-1 run (Reasoning) - single user, small prompt, long thinking time + "models/demos/llama3/demo/input_data_questions_reasoning.json", # input_prompts + True, # instruct mode + 1, # repeat_batches + 16384, # max_seq_len + 1, # batch_size + 15000, # max_generated_tokens + True, # paged_attention + {"page_block_size": 32, "page_max_num_blocks": 1024}, # page_params # TODO This will be serviced by vLLM + {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) + ), ( # Batch-1 run (Latency) - single user, small prompt "models/demos/llama3/demo/input_data_questions_prefill_128.json", # input_prompts True, # instruct mode @@ -908,6 +935,7 @@ def run_llama3_demo( ), ], ids=[ + "reasoning-1", # reasoning "batch-1", # latency "batch-32", # throughput "long-context", # max-length @@ -946,7 +974,9 @@ def test_llama_demo( is_ci_env, reset_seeds, ): - if is_ci_env and ("long" in input_prompts or optimizations == LlamaOptimizations.accuracy): + if is_ci_env and ( + "long" in input_prompts or "reasoning" in input_prompts or optimizations == LlamaOptimizations.accuracy + ): pytest.skip("Do not run the 'long-context' or accuracy tests on CI to reduce load") # TODO: Remove this once all batch sizes are supported on TG diff --git a/models/demos/llama3/demo/input_data_questions_reasoning.json b/models/demos/llama3/demo/input_data_questions_reasoning.json new file mode 100644 index 00000000000..360a4b49cad --- /dev/null +++ b/models/demos/llama3/demo/input_data_questions_reasoning.json @@ -0,0 +1,20 @@ +[ + { + "prompt": "Find all integer solutions (x, y) to the equation x^2 - 3y^2 = 1." + }, + { + "prompt": "Find the least odd prime factor of 2019^8 + 1" + }, + { + "prompt": "Compose a maximally-catchy piece of piano music; the left hand should only play chords and the right hand a simple melody. The song should get stuck in the listener's head for days." + }, + { + "prompt": "Compose the most beautiful and maximally-elegant haiku that captures the poignancy of the human condition; think carefully about how to make sure it packs the maximum possible emotional punch for the reader." + }, + { + "prompt": "A fair coin is tossed 8 times. What is the probability (in simplest fractional form) of getting exactly 5 heads?" + }, + { + "prompt": "How many 7-digit integers have digits strictly increasing from left to right? 
(For example, 1234567 is valid, 1357899 is not because of the repeated 9.)" + } +] diff --git a/models/demos/llama3/demo/simple_vision_demo.py b/models/demos/llama3/demo/simple_vision_demo.py index 47719f91462..7eaed8091a7 100644 --- a/models/demos/llama3/demo/simple_vision_demo.py +++ b/models/demos/llama3/demo/simple_vision_demo.py @@ -108,7 +108,7 @@ def test_llama_multimodal_demo_text( mesh_device.enable_async(True) model_args, model = create_multimodal_model(mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len) generator = LlamaGenerator(model, model_args, mesh_device) - tokenizer = Tokenizer(model_path=tokenizer_path) + tokenizer = model_args.tokenizer formatter = ChatFormat(tokenizer) xattn_caches = generator.model.setup_cache(model_args.max_batch_size) diff --git a/models/demos/llama3/lt b/models/demos/llama3/lt old mode 100644 new mode 100755 index 2a807109237..c088bb586d8 --- a/models/demos/llama3/lt +++ b/models/demos/llama3/lt @@ -61,13 +61,17 @@ def ensure_ttsmi_installed(): def reset_device_sync(config_file): - reset_cmd = ["tt-smi", "-r", config_file] - try: + if os.environ.get("RESET_CMD"): + reset_cmd = os.environ.get("RESET_CMD").split(" ") + print(f"Resetting device using custom command: {reset_cmd}") + else: + reset_cmd = ["tt-smi", "-r", config_file] print(f"Resetting device using config file: {config_file}") + try: result = subprocess.run(reset_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print(f"Device reset successfully: {result.stdout}") except subprocess.CalledProcessError as e: - print(f"Error during device reset: {e.stderr}") + print(f"Error during device reset: {e.stdout} {e.stderr}") sys.exit(1) @@ -82,7 +86,7 @@ def get_device(): device = "N150" elif total_devices == 8: device = "T3K" - else: # TG has 36 devices + else: # TG has 36 devices device = "TG" # Old method of getting device name based on hostname @@ -109,11 +113,13 @@ def list_supported_devices(device): # Counts number of devices using `tt-smi -ls` output def count_devices(output): # Split the output into available boards section - sections = output.split('All available boards on host') - available_boards = sections[1].split('Boards that can be reset')[0] + sections = output.split("All available boards on host") + available_boards = sections[1].split("Boards that can be reset")[0] # Count total PCI devices (ignoring N/A) - total_pci_devices = len([line for line in available_boards.split('\n') if ('Wormhole' or 'Grayskull' or 'Blackhole') in line]) + total_pci_devices = len( + [line for line in available_boards.split("\n") if ("Wormhole" or "Grayskull" or "Blackhole") in line] + ) return total_pci_devices @@ -332,7 +338,7 @@ def main(stdscr): # Input fields positions (reordered) input_fields = [ {"label": "Command [demo]", "value": "", "x": 0, "y": 0}, - {"label": "Model (1b, 3b, 8b, 11b, 70b) [all]", "value": "", "x": 0, "y": 1}, + {"label": "Model (1b, 3b, 8b, 11b, 70b, 70b-r1, q7b, q72b) [all]", "value": "", "x": 0, "y": 1}, { "label": f"Device ({list_supported_devices(host_device)}) [all]", "value": "", @@ -447,10 +453,8 @@ def main(stdscr): if current_field == len(input_fields) - 1: # Submit command command_input = input_fields[0]["value"] or "demo" - model_input = input_fields[1]["value"] or "1b,3b,8b,11b,70b" - device_input = ( - input_fields[2]["value"] or list_supported_devices(host_device) - ) + model_input = input_fields[1]["value"] or "1b,3b,8b,11b,70b,70b-r1,q7b,q72b" + device_input = input_fields[2]["value"] or 
list_supported_devices(host_device) if command_input == "modules": command_input = "rmsnorm,attention,attention-prefill,mlp,lm-head" @@ -461,6 +465,9 @@ def main(stdscr): if command_input == "table": command_input = "accuracy,demo,accuracy-acc,demo-acc" + if command_input == "vision": + command_input = "vision-mlp,vision-attn,vision-block,vision-xfmr,vision-xattn,vision-xblock,vision-conv,vision-class,vision-tile-pos,vision-pos,vision-encoder,vision-text-xfmr,vision-vision-xfmr" + # Parse models, devices, and commands models = parse_list(model_input) devices = parse_list(device_input) @@ -469,7 +476,9 @@ def main(stdscr): # Generate combinations (reordered) # Ignore invalid combinations: # - 11b and 11b-b models on n150 device - # - 70b model on n150 and n300 devices + # - 70b and 70b-r1 model on n150 and n300 devices + # - 72b model on n150 and n300 devices + # - q7b on anything other than N300 # - Vision commands on non-vision (11b) models combinations = [ (c, m, d) @@ -479,6 +488,9 @@ def main(stdscr): if not ( (m in ["11b", "11b-b"] and d == "n150") or (m == "70b" and d in ["n150", "n300"]) + or (m == "70b-r1" and d in ["n150", "n300"]) + or (m == "q72b" and d in ["n150", "n300"]) + or (m == "q7b" and d != "n300") or ("vision" in c and m not in ["11b", "11b-b"]) ) ] @@ -1034,6 +1046,9 @@ def get_llama_dir(model): "11b": os.environ.get("LLAMA_32_11B_DIR", "/proj_sw/user_dev/llama32-data/Llama3.2-11B-Vision-Instruct"), "11b-b": os.environ.get("LLAMA_32_11B_BASE_DIR", "/proj_sw/user_dev/llama32-data/Llama3.2-11B-Vision"), "70b": os.environ.get("LLAMA_31_70B_DIR", "/proj_sw/llama3_1-weights/Meta-Llama-3.1-70B-Instruct/repacked"), + "70b-r1": os.environ.get("DEEPSEEK_R1_LLAMA_70B_DIR", "/proj_sw/deepseek/DeepSeek-R1-Distill-Llama-70B"), + "q7b": os.environ.get("QWEN_7B_DIR", "/proj_sw/user_dev/Qwen/Qwen2.5-7B-Instruct"), + "q72b": os.environ.get("QWEN_72B_DIR", "/proj_sw/user_dev/Qwen/Qwen2.5-72B-Instruct"), }.get(model.lower(), "") if not llama_dir or not os.path.exists(llama_dir): @@ -1044,6 +1059,9 @@ def get_llama_dir(model): print(" - LLAMA_31_8B_DIR for 8b model") print(" - LLAMA_32_11B_DIR for 11b model") print(" - LLAMA_31_70B_DIR for 70b model") + print(" - DEEPSEEK_R1_LLAMA_70B_DIR for DeepSeek R1 Llama 70b distill model") + print(" - QWEN_7B_DIR for 7b Qwen2.5 model") + print(" - QWEN_72B_DIR for 72b Qwen2.5 model") sys.exit(1) return llama_dir @@ -1250,6 +1268,17 @@ def export_results_to_markdown(output_entries, stdscr): "|-------|--------|-----------|-----------|---------------|", ] + fullname = { + "1b": "Llama-3.2-1B", + "3b": "Llama-3.2-3B", + "8b": "Llama-3.1-8B", + "11b": "Llama-3.2-11B", + "70b": "Llama-3.1-70B", + "70b-r1": "DeepSeek-R1-Llama-70B", + "q7b": "Qwen-2.5-7B", + "q72b": "Qwen-2.5-72B", + } + # Add rows for performance table in original order for entry in perf_entries: (model, device), top1, top5, speed = entry @@ -1271,7 +1300,7 @@ def export_results_to_markdown(output_entries, stdscr): # Add rows for accuracy table in original order for entry in acc_entries: (model, device), top1, top5, speed = entry - markdown_lines.append(f"| {model} | {device} | {top1} | {top5} | {speed} |") + markdown_lines.append(f"| {fullname[model]} | {device} | {top1} | {top5} | {speed} |") # Write to PERF.md with open("PERF.md", "w") as f: diff --git a/models/demos/llama3/requirements.txt b/models/demos/llama3/requirements.txt index e830cffd233..438cea7dbee 100644 --- a/models/demos/llama3/requirements.txt +++ b/models/demos/llama3/requirements.txt @@ -1 +1,2 @@ 
git+https://github.com/tenstorrent/llama-models.git@tt_metal_tag +transformers >= 4.46.3 diff --git a/models/demos/llama3/tests/generate_reference_hf.py b/models/demos/llama3/tests/generate_reference_hf.py new file mode 100644 index 00000000000..f275584e6da --- /dev/null +++ b/models/demos/llama3/tests/generate_reference_hf.py @@ -0,0 +1,148 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import torch +import bz2 +import os +import argparse +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer +from loguru import logger + + +def generate_reference_outputs(total_length, output_file, model_name): + # Set device + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + logger.info(f"Using device: {device}") + + # Load model and tokenizer from HuggingFace + config = AutoConfig.from_pretrained(model_name) + + # Qwen only: add rope scaling to the config + # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct#processing-long-texts + if "Qwen" in model_name: + config.rope_scaling = {"factor": 4.0, "original_max_position_embeddings": 32768, "type": "yarn"} + + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name, config=config, device_map="auto") + model.eval() + + # Load the book text + current_file_path = os.path.abspath(__file__) + current_file_dir = os.path.dirname(current_file_path) + prompt_file = os.path.join(current_file_dir, "tale-of-two-cities.txt.bz2") + + with bz2.open(prompt_file, "rt", encoding="utf-8") as f: + text = f.read() + + # Encode text to tokens + encoded_tokens = tokenizer.encode(text, add_special_tokens=True)[:total_length] + encoded_tokens_tensor = torch.tensor(encoded_tokens, device=device).unsqueeze(0) # Shape [1, seq_len] on device + + print(f"{'Progress':<15}{'Correct':<8}{'Actual':<15}{'Top 5 Predictions':<75}") + print("-" * 113) + + # Initialize lists to store results + all_top1_correct = [] + all_top5_correct = [] + all_top5_tokens = [] + segment_accuracies = [] + chunk_size = 1024 + + with torch.no_grad(): + for chunk_start in range(0, total_length - 1, chunk_size): + chunk_end = min(chunk_start + chunk_size, total_length) + # Get input and target chunks + chunk_tokens = encoded_tokens_tensor[:, chunk_start:chunk_end] + chunk_next_tokens = encoded_tokens[chunk_start + 1 : chunk_end + 1] + actual_chunk_size = min(len(chunk_tokens[0]), len(chunk_next_tokens)) + + # Trim input chunk if needed + chunk_tokens = chunk_tokens[:, :actual_chunk_size] + + # Process chunk using HuggingFace model + outputs = model(chunk_tokens.to(device)) + logits = outputs.logits + + # Compute top-5 predictions + probs = torch.softmax(logits, dim=-1) + _, chunk_top5_tokens = torch.topk(probs, k=5, dim=-1) # Shape: [1, chunk_size, 5] + chunk_top5_tokens = chunk_top5_tokens.squeeze(0) # Shape: [chunk_size, 5] + + # Get next tokens tensor + chunk_next_tokens_tensor = torch.tensor( + chunk_next_tokens[:actual_chunk_size], device=device + ) # Move to same device + + # Calculate correctness + chunk_top1_correct = chunk_top5_tokens[:, 0] == chunk_next_tokens_tensor + chunk_top5_correct = torch.any(chunk_top5_tokens == chunk_next_tokens_tensor.unsqueeze(1), dim=1) + + # Store results + all_top1_correct.extend(chunk_top1_correct.tolist()) + all_top5_correct.extend(chunk_top5_correct.tolist()) + all_top5_tokens.append(chunk_top5_tokens) + + # Print predictions for this chunk + for i in range(len(chunk_next_tokens)): + global_pos = chunk_start + i + next_token = 
chunk_next_tokens[i] + + sanitize = lambda x: x.replace("\n", "").replace("\r", "").replace("\x0c", "") + actual_token = sanitize(tokenizer.decode([next_token])) + top5_tokens = [sanitize(tokenizer.decode([t.item()])) for t in chunk_top5_tokens[i]] + correct = "x" if chunk_top1_correct[i] else ("-" if chunk_top5_correct[i] else " ") + top5_str = " ".join(f"{t:<14}" for t in top5_tokens) + + progress_str = f"{global_pos+1}/{total_length-1}" + print(f"{progress_str:<15}{correct:<8}{actual_token:<15}{top5_str}") + + # Calculate and store segment accuracies every 100 tokens + if (global_pos + 1) % 100 == 0 or global_pos == total_length - 2: + start_idx = (global_pos // 100) * 100 + end_idx = min(start_idx + 100, len(all_top1_correct)) + segment_top1_acc = sum(all_top1_correct[start_idx:end_idx]) / (end_idx - start_idx) * 100 + segment_top5_acc = sum(all_top5_correct[start_idx:end_idx]) / (end_idx - start_idx) * 100 + if len(segment_accuracies) <= global_pos // 100: + segment_accuracies.append((segment_top1_acc, segment_top5_acc)) + + # Save the data - ensure tensors are concatenated and on CPU + data = { + "top5_tokens": torch.cat(all_top5_tokens, dim=0).cpu(), + "reference_tokens": encoded_tokens_tensor[:, :total_length].clone().cpu(), + } + + torch.save(data, output_file) + logger.info(f"Saved reference outputs to {output_file}") + + # Print all segment accuracy summaries as a table + print("\nSegment Accuracy Summaries:") + print(f"{'Tokens':<15}{'Top-1 Accuracy':<20}{'Top-5 Accuracy':<20}") + print("-" * 55) + for i, (top1_acc, top5_acc) in enumerate(segment_accuracies): + start_token = i * 100 + 1 + end_token = min((i + 1) * 100, total_length) + print(f"{f'{start_token}-{end_token}':<15}{f'{top1_acc:.2f}%':<20}{f'{top5_acc:.2f}%':<20}") + + # Calculate overall accuracy + overall_top1_acc = sum(acc[0] for acc in segment_accuracies) / len(segment_accuracies) + overall_top5_acc = sum(acc[1] for acc in segment_accuracies) / len(segment_accuracies) + print("-" * 55) + print(f"{'Overall':<15}{f'{overall_top1_acc:.2f}%':<20}{f'{overall_top5_acc:.2f}%':<20}") + + +def main(): + parser = argparse.ArgumentParser(description="Generate reference outputs using HuggingFace models.") + parser.add_argument("--total_length", type=int, default=1024, help="Total length of tokens to process") + parser.add_argument( + "--output_file", type=str, default="reference_outputs.pt", help="Output file path for reference data" + ) + parser.add_argument( + "--model", type=str, required=True, help="HuggingFace model name (e.g., 'meta-llama/Llama-2-7b-hf')" + ) + args = parser.parse_args() + + generate_reference_outputs(total_length=args.total_length, output_file=args.output_file, model_name=args.model) + + +if __name__ == "__main__": + main() diff --git a/models/demos/llama3/tests/generate_reference_outputs.py b/models/demos/llama3/tests/generate_reference_outputs.py index 1f0514bfe7b..f874e913a10 100644 --- a/models/demos/llama3/tests/generate_reference_outputs.py +++ b/models/demos/llama3/tests/generate_reference_outputs.py @@ -5,28 +5,40 @@ import bz2 import os import argparse -import time -from models.demos.llama3.tt.llama_common import HostEmbedding -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import Transformer -from models.demos.llama3.tt.model_config import TtModelArgs -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer +from models.demos.llama3.tt.model_config import TtModelArgs, CheckpointType from loguru import logger from transformers import 
AutoModelForCausalLM, AutoTokenizer def generate_reference_outputs(total_length, output_file, hf_model_name=None): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + logger.info(f"Using device: {device}") + if hf_model_name: # HuggingFace path tokenizer = AutoTokenizer.from_pretrained(hf_model_name) - model = AutoModelForCausalLM.from_pretrained(hf_model_name, torch_dtype=torch.float32) + config = AutoConfig.from_pretrained(hf_model_name) + # Qwen only: add rope scaling to the config + # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct#processing-long-texts + if "Qwen" in hf_model_name: + config.rope_scaling = {"factor": 4.0, "original_max_position_embeddings": 32768, "type": "yarn"} + model = AutoModelForCausalLM.from_pretrained( + hf_model_name, config=config, torch_dtype=torch.float32 if device == "cpu" else None, device_map="auto" + ) model.eval() + else: # Original path - load reference model model_args = TtModelArgs(mesh_device=None) model_args.max_seq_len = total_length tokenizer = Tokenizer(model_args.tokenizer_path) + # Special-case Hf models as they can load directly from the safetensors much more efficiently + if model_args.checkpoint_type == CheckpointType.Meta: + # Load the model state dict state_dict = model_args.load_state_dict() + + # Initialize the reference model state_dict_prefix = model_args.get_state_dict_prefix("", None) reference_state_dict = { k[len(state_dict_prefix) :]: v @@ -41,13 +53,20 @@ def generate_reference_outputs(total_length, output_file, hf_model_name=None): ) ) } - model = Transformer(model_args) - model.load_state_dict(reference_state_dict) - model.eval() + reference_model = model_args.reference_transformer() + reference_model.to(device) # Move model to device + reference_model.eval() # Set to evaluation mode + reference_model.load_state_dict(reference_state_dict) - # Initialize HostEmbedding - embd = HostEmbedding(model_args) + embd = model_args.reference_embedding(reference_model) + embd.to(device) # Move embedding to device embd.load_state_dict({"emb.weight": state_dict[f"{state_dict_prefix}tok_embeddings.weight"]}) + else: + reference_model = model_args.reference_transformer(load_checkpoint=True) + reference_model.to(device) # Move model to device + reference_model.eval() # Set to evaluation mode + embd = reference_model.model.model.embed_tokens + embd.to(device) # Move embedding to device # Load the book text and encode tokens current_file_path = os.path.abspath(__file__) @@ -57,13 +76,9 @@ def generate_reference_outputs(total_length, output_file, hf_model_name=None): with bz2.open(prompt_file, "rt", encoding="utf-8") as f: text = f.read() - # Modify token encoding based on model type - if hf_model_name: - encoded_tokens = tokenizer.encode(text, add_special_tokens=True)[:total_length] - else: - encoded_tokens = tokenizer.encode(text, bos=True, eos=False)[:total_length] - - encoded_tokens_tensor = torch.tensor(encoded_tokens).unsqueeze(0) # Shape [1, seq_len] + # Encode text to tokens + encoded_tokens = model_args.encode_prompt(text, instruct=False) + encoded_tokens_tensor = torch.tensor(encoded_tokens, device=device).unsqueeze(0) # Move to device print(f"{'Progress':<15}{'Correct':<8}{'Actual':<15}{'Top 5 Predictions':<75}") print("-" * 113) @@ -87,6 +102,7 @@ def generate_reference_outputs(total_length, output_file, hf_model_name=None): chunk_tokens = chunk_tokens[:, :actual_chunk_size] # Process chunk based on model type + chunk_tokens = chunk_tokens.to(device) if hf_model_name: outputs = model(chunk_tokens) ref_output = 
outputs.logits @@ -100,7 +116,7 @@ def generate_reference_outputs(total_length, output_file, hf_model_name=None): chunk_top5_tokens = chunk_top5_tokens.squeeze(0) # Shape: [chunk_size, 5] # Get next tokens tensor, ensuring same length as predictions - chunk_next_tokens_tensor = torch.tensor(chunk_next_tokens[:actual_chunk_size]) + chunk_next_tokens_tensor = torch.tensor(chunk_next_tokens[:actual_chunk_size], device=device) # Calculate correctness chunk_top1_correct = chunk_top5_tokens[:, 0] == chunk_next_tokens_tensor @@ -137,10 +153,10 @@ def generate_reference_outputs(total_length, output_file, hf_model_name=None): # Concatenate all top5 tokens into a single tensor all_top5_tokens = torch.cat(all_top5_tokens, dim=0) # Shape: [total_tokens, 5] - # Save the data + # Move tensors back to CPU before saving data = { - "top5_tokens": all_top5_tokens, - "reference_tokens": encoded_tokens_tensor, + "top5_tokens": torch.cat(all_top5_tokens, dim=0).cpu(), + "reference_tokens": encoded_tokens_tensor[:, :total_length].clone().cpu(), } torch.save(data, output_file) diff --git a/models/demos/llama3/tests/generate_reference_outputs.sh b/models/demos/llama3/tests/generate_reference_outputs.sh index a756a0b3ef4..bf419c42a08 100755 --- a/models/demos/llama3/tests/generate_reference_outputs.sh +++ b/models/demos/llama3/tests/generate_reference_outputs.sh @@ -33,6 +33,8 @@ LLAMA_DIRS=( "${LLAMA_31_8B_DIR:-/proj_sw/user_dev/llama31-8b-data/Meta-Llama-3.1-8B-Instruct}" "${LLAMA_32_11B_DIR:-/proj_sw/user_dev/llama32-data/Llama3.2-11B-Vision-Instruct}" "${LLAMA_31_70B_DIR:-/proj_sw/llama3_1-weights/Meta-Llama-3.1-70B-Instruct/repacked}" + "${QWEN_25_7B_DIR:-/proj_sw/user_dev/Qwen/Qwen2.5-7B-Instruct}" + "${QWEN_25_72B_DIR:-/proj_sw/user_dev/Qwen/Qwen2.5-72B-Instruct}" ) # Create reference_outputs directory if it doesn't exist @@ -40,21 +42,14 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" OUTPUT_DIR="${SCRIPT_DIR}/reference_outputs" mkdir -p "$OUTPUT_DIR" -# Function to get model size from directory path -get_model_size() { - if [[ $1 == *"-1B"* ]]; then - echo "1b" - elif [[ $1 == *"-3B"* ]]; then - echo "3b" - elif [[ $1 == *"-8B"* ]]; then - echo "8b" - elif [[ $1 == *"-11B"* ]]; then - echo "11b" - elif [[ $1 == *"-70B"* ]]; then - echo "70b" - else - echo "unknown" +# Function to get model name from directory path +get_model_name() { + local dir_name=$(basename "$1") + # If the path ends in /repacked, use the parent directory name instead + if [ "$dir_name" = "repacked" ]; then + dir_name=$(basename "$(dirname "$1")") fi + echo "$dir_name" } # Loop through each LLAMA directory @@ -65,8 +60,8 @@ for DIR in "${LLAMA_DIRS[@]}"; do fi # Get model size for output filename - MODEL_SIZE=$(get_model_size "$DIR") - OUTPUT_FILE="${OUTPUT_DIR}/${MODEL_SIZE}.refpt" + MODEL_NAME=$(get_model_name "$DIR") + OUTPUT_FILE="${OUTPUT_DIR}/${MODEL_NAME}_full.refpt" echo "Generating reference outputs for ${MODEL_SIZE} model..." 
echo "Using weights from: ${DIR}" diff --git a/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py b/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py index 631bdf31446..e23ea6e62bd 100644 --- a/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py +++ b/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py @@ -216,8 +216,10 @@ def test_llama_cross_attention_transformer_text_inference( model_args.head_dim, model_args.max_seq_len, mesh_device, - seq_len=seq_len, - scale_factor=model_args.rope_scaling_factor, + seq_len, + model_args.rope_theta, + model_args.rope_scaling_factor, + model_args.orig_context_len, ) tt_out = tt_model( tt_h, @@ -260,6 +262,9 @@ def test_llama_cross_attention_transformer_text_inference( mesh_device, model_args.num_devices, start_pos=cur_pos - 1, + theta=model_args.rope_theta, + scale_factor=model_args.rope_scaling_factor, + orig_context_len=model_args.orig_context_len, ) tt_rope_id = tt_model.rope_setup.get_rot_idxs(position_ids) rot_mats = tt_model.rope_setup.get_rot_mats(tt_rope_id) diff --git a/models/demos/llama3/tests/reference_outputs/70b.refpt b/models/demos/llama3/tests/reference_outputs/Llama3.1-70B-Instruct.refpt similarity index 100% rename from models/demos/llama3/tests/reference_outputs/70b.refpt rename to models/demos/llama3/tests/reference_outputs/Llama3.1-70B-Instruct.refpt diff --git a/models/demos/llama3/tests/reference_outputs/8b.refpt b/models/demos/llama3/tests/reference_outputs/Llama3.1-8B-Instruct.refpt similarity index 100% rename from models/demos/llama3/tests/reference_outputs/8b.refpt rename to models/demos/llama3/tests/reference_outputs/Llama3.1-8B-Instruct.refpt diff --git a/models/demos/llama3/tests/reference_outputs/11b.refpt b/models/demos/llama3/tests/reference_outputs/Llama3.2-11B-Instruct.refpt similarity index 100% rename from models/demos/llama3/tests/reference_outputs/11b.refpt rename to models/demos/llama3/tests/reference_outputs/Llama3.2-11B-Instruct.refpt diff --git a/models/demos/llama3/tests/reference_outputs/1b.refpt b/models/demos/llama3/tests/reference_outputs/Llama3.2-1B-Instruct.refpt similarity index 100% rename from models/demos/llama3/tests/reference_outputs/1b.refpt rename to models/demos/llama3/tests/reference_outputs/Llama3.2-1B-Instruct.refpt diff --git a/models/demos/llama3/tests/reference_outputs/3b.refpt b/models/demos/llama3/tests/reference_outputs/Llama3.2-3B-Instruct.refpt similarity index 100% rename from models/demos/llama3/tests/reference_outputs/3b.refpt rename to models/demos/llama3/tests/reference_outputs/Llama3.2-3B-Instruct.refpt diff --git a/models/demos/llama3/tests/reference_outputs/Qwen2.5-72B-Instruct.refpt b/models/demos/llama3/tests/reference_outputs/Qwen2.5-72B-Instruct.refpt new file mode 100644 index 0000000000000000000000000000000000000000..61de1e579435d473ee43954f9b6c28612db8c87d GIT binary patch literal 50726 zcmcJY3!INt{>LA;aY+fKSTV!6l-rnzF^F-eA(u+XZ7^e$Fbq->m1K2Ox+tOBwCO^m zE3MkDwkXwZEk&VLcGcEalJuYH^Lfu}o!w_A|Nran^{O-H_Pu<*-}61^JkNQ~Z$MVL z@=;W&QuJ^CPKzo;183$;N^RMuMS5!27JVj7E0{WcY(cAW*#+4xr(8U-XV)^(u(@Tg z$d8&1Ij*RC!Q?4zMiorHIA_wd{4!$;CQluk*K*YKf(a9+RT(ukXUy~o6UU7zD5{)1 zb<~X1D*0ubHXTy2sN1xnmf7QT^2@19mA;cFjXzgqRr1RZDO1!WCBMRuaz*LbDbw>4 z`b8%d!JYyL$JD!u9rrB^!bb-MEX8!6ps z`tLa{+@A50l@53NtIklqe|vXuIrjb8 z_)c>9ZUgt&32J}Z5T%8#kDP-4A`800a+YtZ26yMHUrzCOF8a6M{_~RYPyD4cirg(K z+@*rIu2DL}d{W)MqZ?9A{sotR(?I>e4ZHMlf9ji`s&*OGlwz-si{n?(zuoq$YUbb4 
zMwM(Rw|QOdP+$F4c^r0{ZF@fG^%MKNWwH8$A9BDCIHqdasK4XERo2fG`+bd?qVrvj zN}j)MyI3$^?TmlyE7j32>k9L;-Y_rxq~57<@Jl|q&2i$S`D7e+jvUZ$71((%kK;M> zg?R+|IsBCi^o=~MXS|PP-HV?f{H*b5NJkk3`o)eGYlXIYuKfXiqpmE-Nhvv=oe2hm< z#uE>jXXFzahuqvVfFIr?g>;|~J>n|+cJc(itHL?YwdR-kpv(F7TH~@Oumj%LPIJ8E z{RnX<(F_q6CY!NdH%Y#0y)I_BPE6jI^XY7-C?~43oj%omT3Hi$3I;;I}&LhCFZK~S;Z8^{rc%v`u z0eaXu{m4t8%iq!FURe*vzkj)8{UzVud4>2$p01L7hWA(4HFn0|&!T;*<0^K_^PP^5 zU;Ex3{otGXT=kt-Aul*!r_col=H2W8AB1s$d7Jv&;rnYOZztpOX`JdO2C00; z{s}$e68wboVTZIcAAIl}JLUb&8qY%O z3V9OuUf}m(kK5rAdFB|O(7%=}6n~8yt6Z9(5_@4@_A_DJU|!BIShx5)63jo*c(9NA zF;VldZyV+Ogm&ya97n%##vQ)c7xVo>?C=HW&)gqlAF zj#s;l#&dqkzVRXJxw`%9`grwQXuUjSyZyv@(Z}9*$6HTj-LI?rVSo4!c7dGichB{> zr(MG5h?_M(=cA(?_oq4k_}q%>VuJ7VQGfK$JlG%h!G7*Z%ZWV@M{4JZ&bQ7Vj+-ye zKQ48k;QXF`UOo!r_tW;HXRUYmLQm)qeW6G6*0h@VAbt=xUa2OAD%&228*#xr$Pexr zPm10M>lwMR_ocRH{5;$jLC?!d;djA!U&?yUI?lSj#`%3S>ks-Xu9FzKiZ+jaxSmW!ch#Gjo zF5OfO>q?04<(6lI>wL!nzR?Hs%roxsc2u5M8zZ{NhrH0iZuq?<=kg`(m)Dryp~j+f zx8*E~PPY4zZ;%I&XSA~($wTOO$Z`;`UiSXwg#^tX>IXTX_naN*ug>$x6Yp>xee!!0 z-mfe*{q~O6e=wc*?cdywIod#c@IL5><={O${J|G`z;Cfz?1b^@T|~EMFO|H9q91XN zbCOfGj|*MCWBE43X*}_`_U)>l7q1fevD=y6&vVX(U$dToOEd2mSD8*-`#*Mte8dmr zXT6?mdtm<7Pm3Sk13d11%TC85`qK{_x7u--k9iI(6ulYtfB0x%y38BW2M6e&AM!Zj z7x4-{xL<>O&?gUg%>$r|pZ(BDd?wlNSyzyUJcc~+UgME!J<}gPun+n-+a&sLTR+&r z(;i>O!XZp-BInBpr$)SB>dTar8gdN%fYusl;A6C(Cslc*Ak&ehaw6{37qVpq=kUkY_PJ z_L4D5^Fb%PANr>K8=S!-)Yl-}-4y#J^Y9+xLGzh=So6V86Y~pxU&rftHOsqltLkHn zC;OCgj^Ft2+1dqs@x2$u6>Fz?LOF?(Q^-Fo$04u#%pc3XUh62jmz#h3Z@xh7=nXyF zO(Ullg{9M-=zGtfsNoU*ali9B#+iJy-gf)Xjv5&9(aZsn`JEm_j1T$ax#5o}nHPNR zZV~=f)^^K# zy8Vo}c5Vv|>_1n}o4lnmUPbgv=Or#Oew_mt{El+me$@OvbH66a^vY4&7q8auR6F#p zFm9orpofY>)qmU;m9%q@i|+#wfB0Su>l@#D;d?HutDN(}2lV*v75BC_SRT$H`8^Q! z9FF5VzO*xr?*qje$9Hbyg8BH~5Pv@+)_A_#8qPbrmI%fg$9L*NIS1SzydOyX`SE-o zGMtyc%a?rY?DQ6X2c2^UzWe+M}h-IYF1iOm1m(j^Y?cX(sb=X;!?-4f6FJG7-dKl^e1PHZf?e0L0g zBLC#?sPgwMVvXm!itvR!g?}eB)_A_dhdziueD@Q*@^=#L-VuMtG8SFFUxz+o(Is#F zrS$o3TgZ3#cLe!vS}Z>K9vt}bcg*}hH9zAUx*zjzyHgFd?MM8b zB)*GR6ul5n>~5D6N%XYc^o#N@_zZF8@2UJ!I{cj%^nI|IrHRt&H94;vFOBFH~G85e1CqU_fO$>=e0*(s#HX z=Gwl>&Hs5m!gY}E@5GWjT;B@6ue$2z=ZUp$h3f|2QH?b}@gKVaw=iBFj2AzA|1RhHD5T6@6a{!{06XoZ#}U5+ zsB1pBzmMO6bE!HSfd6)0uR8bNGmRJe#-2DI=AJqH!^bT?Z|7b&d@#P4f7io3YtH4l z9}V93P7*)p3Hsba#?GL7!DtB_-UH_TE_T8_T=em+&lRDwWTEEeo-6lQnV);8$jyBX z?uim-!IgVv+|vYq?nj}Q;y&-^ehv4Op7A=&x&j~ED`foR);Dr&pDj6<54+^vD1OKL zCGN?kna)QoG@g5G+*4!z&&^jEzt_c1xCcgl<}@)Xl*{j}k&Am&=;LP3f6OKK2cVC9 z@Nw5f@yR`*UA8~u=Kc=);@%DSYf=N;c;9tos_0@jocH4Q+%rMX++$!~eveVh=P~fl zJq+Hjac=@WHgtdFz*-uv>W@JvEkTGe(^%*(&+>D$swtMF~XTea5 zjNi&iO@wrQ1=((lZ=>0?iIYM~i#i;s9+y9`;D3IpU@`;q zX`hMG=s7u=Q>Ty3oF0<;&x<7_IHL8}7^HqeGAX3K>Ors-VC6{x_XLdSd_p literal 0 HcmV?d00001 diff --git a/models/demos/llama3/tests/test_interleaved_to_sharded.py b/models/demos/llama3/tests/test_interleaved_to_sharded.py index 62a0a20dd2e..e2915d8b7a8 100644 --- a/models/demos/llama3/tests/test_interleaved_to_sharded.py +++ b/models/demos/llama3/tests/test_interleaved_to_sharded.py @@ -6,16 +6,7 @@ from loguru import logger import os import ttnn -from models.demos.llama3.tt.llama_common import ( - precompute_freqs, -) -from models.demos.llama3.tt.llama_decoder import TtTransformerBlock from models.demos.llama3.tt.model_config import TtModelArgs -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import TransformerBlock -from models.utility_functions import ( - comp_pcc, - comp_allclose, -) from models.utility_functions import skip_for_grayskull @@ -31,8 +22,6 @@ indirect=True, ) def test_llama_decoder_inference(mesh_device, use_program_cache, 
reset_seeds): - dtype = ttnn.bfloat8_b - mesh_device.enable_async(True) model_args = TtModelArgs(mesh_device) @@ -43,42 +32,20 @@ def test_llama_decoder_inference(mesh_device, use_program_cache, reset_seeds): partial_state_dict = { k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix)) } - reference_model = TransformerBlock(layer_id=0, args=model_args) + reference_model = model_args.reference_decoder() reference_model.load_state_dict(partial_state_dict) - generation_start_pos = 0 generation_length = 10 - all_tests_pass = True - - # Initialize TT model - tt_model = TtTransformerBlock( - args=model_args, - mesh_device=mesh_device, - dtype=dtype, - state_dict=state_dict, - layer_num=0, - weight_cache_path=model_args.weight_cache_path(dtype), - ) seqlen = 1 batch = model_args.max_batch_size - cos, sin = precompute_freqs(model_args.head_dim, model_args.max_seq_len * 2, model_args.rope_scaling_factor) - freqs_cis = torch.complex(cos, sin) - for i in range(generation_length): logger.info(f"[Decoder] Generating token {i}") # input = torch.randn(1, 32, 4096) pt_decode_input = (torch.rand(batch, seqlen, model_args.dim) * 2) - 1 tt_decode_input = pt_decode_input.clone() - current_pos = generation_start_pos + i - current_pos_tensor = ttnn.from_torch( - torch.tensor([current_pos] * batch), - device=mesh_device, - dtype=ttnn.int32, - mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), - ) decode_input = model_args.prepare_residual_tensor_decode( tt_decode_input, diff --git a/models/demos/llama3/tests/test_llama_accuracy.py b/models/demos/llama3/tests/test_llama_accuracy.py index c77f3e3c914..d0fd2d2a15b 100644 --- a/models/demos/llama3/tests/test_llama_accuracy.py +++ b/models/demos/llama3/tests/test_llama_accuracy.py @@ -9,21 +9,16 @@ import ttnn from models.demos.llama3.tt.llama_common import ( get_prefill_rot_mat, - HostEmbedding, PagedAttentionConfig, ) from models.demos.llama3.tt.llama_model import TtTransformer from models.demos.llama3.tt.model_config import TtModelArgs, LlamaOptimizations -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer from models.demos.llama3.demo.demo import preprocess_inputs_prefill from pathlib import Path -def get_accuracy_thresholds(model_name: str, device_name: str, optimizations: LlamaOptimizations): +def get_accuracy_thresholds(base_model_name: str, device_name: str, optimizations: LlamaOptimizations): """Parse accuracy thresholds from PERF.md for the given model, optimization mode, and device.""" - # Get model size (e.g., "1b", "3b", etc.) 
- model_size = model_name.split("-")[1].lower() - # Read PERF.md perf_file = Path(__file__).parent.parent / "PERF.md" with open(perf_file, "r") as f: @@ -31,22 +26,28 @@ def get_accuracy_thresholds(model_name: str, device_name: str, optimizations: Ll # Split into sections based on optimization mode sections = content.split("## ") - target_section = next(s for s in sections if s.startswith(f"LlamaOptimizations.{optimizations.__name__}\n")) + target_section = next(s for s in sections if s.lower().startswith(f"{optimizations.__name__}\n")) # Parse the table and find the row for our model and device + # Potential lines have the form "| Llama3.1-8b | T3K | 91 | 99 | 49.8 |" + correct_line = ( + lambda line: "|" in line + and base_model_name.lower() in line.split("|")[1].strip().lower() + and device_name.lower() in line.split("|")[2].strip().lower() + ) rows = [ line.split("|")[1:] # Each row starts with a separator - for line in target_section.replace(" ", "").split("\n") - if f"|{model_size}|{device_name}|" in line + for line in target_section.split("\n") + if correct_line(line) ] if not rows: raise ValueError( - f"Could not find accuracy data for {model_size} on {device_name} in {optimizations.__name__} mode" + f"Could not find accuracy data for {base_model_name} on {device_name} in {optimizations.__name__} mode" ) assert ( len(rows) == 1 - ), f"Found multiple rows for {model_size} on {device_name} in {optimizations.__name__} mode in PERF.md" + ), f"Found multiple rows for {base_model_name} on {device_name} in {optimizations.__name__} mode in PERF.md" row = rows[0] top1_acc = float(row[2].strip()) top5_acc = float(row[3].strip()) @@ -60,11 +61,12 @@ def get_accuracy_thresholds(model_name: str, device_name: str, optimizations: Ll @pytest.mark.parametrize( "prefill_len, decode_len, max_seq_len", # Max seqlen should be at least prefill_len + decode_len ((512, 128, 1024),), + # ((131072-8192, 8192-1, 131072),), ) @pytest.mark.parametrize( "mesh_device", [ - {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get( + {"N150": (1, 1), "N300": (1, 2), "N150x4": (1, 4), "T3K": (1, 8), "TG": (8, 4)}.get( os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids()) ) ], @@ -130,7 +132,7 @@ def test_tt_model_acc( mesh_device, optimizations=optimizations, max_batch_size=batch_size, max_seq_len=max_seq_len ) - tokenizer = Tokenizer(model_args.tokenizer_path) + tokenizer = model_args.tokenizer # Load state_dict for TT model logger.info("Loading weights...") @@ -138,11 +140,10 @@ def test_tt_model_acc( logger.info("Finished loading weights...") # Load the reference data - model_size = model_args.model_name.split("-")[1].lower() # e.g., "1b", "3b", "8b", "70b" if use_reference_file: # Existing reference file loading logic - reference_data_file = f"models/demos/llama3/tests/reference_outputs/{model_size}.refpt" + reference_data_file = f"models/demos/llama3/tests/reference_outputs/{model_args.model_name}.refpt" logger.info(f"Loading reference data from {reference_data_file}") assert os.path.exists(reference_data_file) reference_data = torch.load(reference_data_file) @@ -201,7 +202,7 @@ def test_tt_model_acc( paged_attention_config=paged_attention_config, ) # Initialize embedding - embd = HostEmbedding(model_args) + embd = model_args.reference_embedding() state_dict_prefix = model_args.get_state_dict_prefix("", None) embd.load_state_dict({"emb.weight": state_dict[f"{state_dict_prefix}tok_embeddings.weight"]}) @@ -230,8 +231,10 @@ def test_tt_model_acc( model_args.head_dim, model_args.max_seq_len, 
mesh_device, - seq_len=prefill_lens[0], - scale_factor=model_args.rope_scaling_factor, + prefill_lens[0], + model_args.rope_theta, + model_args.rope_scaling_factor, + model_args.orig_context_len, ) prefill_input = model_args.prepare_residual_tensor_prefill( @@ -438,7 +441,7 @@ def test_tt_model_acc( # Get accuracy thresholds from PERF.md min_top1_acc, min_top5_acc = get_accuracy_thresholds( - model_args.model_name, + model_args.base_model_name, model_args.device_name, optimizations, ) diff --git a/models/demos/llama3/tests/test_llama_attention.py b/models/demos/llama3/tests/test_llama_attention.py index c0a077b465c..e942eb8a3f8 100644 --- a/models/demos/llama3/tests/test_llama_attention.py +++ b/models/demos/llama3/tests/test_llama_attention.py @@ -13,7 +13,6 @@ precompute_freqs, PagedAttentionConfig, ) -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import Attention from models.utility_functions import ( comp_pcc, comp_allclose, @@ -71,7 +70,7 @@ def test_llama_attention_inference( mesh_device.enable_async(True) model_args = TtModelArgs(mesh_device, max_batch_size=batch_size, max_seq_len=max_seq_len) - model_args.n_layers = 1 # For the unit test, just run a sigle layer + model_args.n_layers = 1 # For the unit test, just run a single layer state_dict = model_args.load_state_dict() @@ -81,7 +80,7 @@ def test_llama_attention_inference( k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix)) } - reference_model = Attention(args=model_args) + reference_model = model_args.reference_attention() reference_model.load_state_dict(partial_state_dict) seq_len = 1 @@ -97,8 +96,8 @@ def test_llama_attention_inference( model_args.head_dim, model_args.max_seq_len, model_args.rope_theta, - model_args.use_scaled_rope, model_args.rope_scaling_factor, + model_args.orig_context_len, ) transformation_mats = rope_setup.get_both_trans_mats() @@ -146,8 +145,8 @@ def test_llama_attention_inference( model_args.head_dim, model_args.max_seq_len * 2, model_args.rope_theta, - model_args.use_scaled_rope, model_args.rope_scaling_factor, + model_args.orig_context_len, ) freqs_cis = torch.complex(cos, sin) @@ -166,7 +165,7 @@ def test_llama_attention_inference( for i in range(generation_length): # 70B attention block typically sees tensors with mean 0 and std 0.03 - 0.05 in layer 1 - pt_attention_input = torch.randn(batch_size, seq_len, model_args.dim) * 0.05 + pt_attention_input = torch.randn(batch_size, seq_len, model_args.dim) # Qwen2.5 0.5B sees 0.1 to 2.1 tt_attention_input = pt_attention_input.clone() @@ -209,7 +208,7 @@ def test_llama_attention_inference( all_tests_pass = False # Increment position - current_pos = torch.tensor([generation_start_pos + i for _ in range(batch_size)]) + current_pos = torch.tensor([generation_start_pos + i + 1 for _ in range(batch_size)]) current_pos_tensor = ttnn.from_torch( current_pos, device=mesh_device, @@ -266,21 +265,16 @@ def test_llama_attention_inference( )[:batch_size, :, :, :] for cache in tt_model.layer_past ] - - for i, (cache_pt, cache_tt) in enumerate(zip(pytorch_layer_present, tt_layer_present)): - cache_length_to_check = min(model_args.max_seq_len, generation_start_pos + generation_length + 1) + for label, cache_pt, cache_tt in zip(["K", "V"], pytorch_layer_present, tt_layer_present): + cache_length_to_check = min(model_args.max_seq_len, generation_start_pos + i + 1) cache_pt = cache_pt[:, :, generation_start_pos:cache_length_to_check, :] cache_tt = cache_tt[:, :, 
generation_start_pos:cache_length_to_check, :] does_pass, output_pcc = comp_pcc(cache_pt, cache_tt, pcc) - if i == 0: - logger.info(f"K cache output: {output_pcc}") - else: - logger.info(f"V cache output: {output_pcc}") - + logger.info(f"{label} cache output: {output_pcc}") if does_pass: - logger.info(f"KV Cache Passed!") + logger.info(f"{label} cache Passed!") else: - logger.warning(f"KV Cache Failed! PCC value is lower than {pcc}") + logger.warning(f"{label} Cache Failed! PCC value is lower than {pcc}") all_tests_pass = False if all_tests_pass: diff --git a/models/demos/llama3/tests/test_llama_attention_prefill.py b/models/demos/llama3/tests/test_llama_attention_prefill.py index b8496e652a2..bf1db31f622 100644 --- a/models/demos/llama3/tests/test_llama_attention_prefill.py +++ b/models/demos/llama3/tests/test_llama_attention_prefill.py @@ -13,7 +13,7 @@ get_rot_transformation_mat, PagedAttentionConfig, ) -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import Attention, precompute_freqs_cis +from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import precompute_freqs_cis from models.utility_functions import ( comp_pcc, comp_allclose, @@ -51,7 +51,7 @@ @pytest.mark.parametrize( "max_seq_len", ( - 2048, + 256, # 4096, # 1024 * 32, # 1024 * 64, ), @@ -80,7 +80,7 @@ def test_llama_attention_inference( partial_state_dict = { k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix)) } - reference_model = Attention(args=model_args) + reference_model = model_args.reference_attention() reference_model.load_state_dict(partial_state_dict) # pre-compute the rotational embedding matrix and send to device @@ -88,10 +88,13 @@ def test_llama_attention_inference( model_args.head_dim, model_args.max_seq_len, mesh_device, - seq_len=max_seq_len, - scale_factor=model_args.rope_scaling_factor, + max_seq_len, + model_args.rope_theta, + model_args.rope_scaling_factor, + model_args.orig_context_len, ) transformation_mat_torch = get_rot_transformation_mat(model_args.head_dim) + transformation_mats_prefill = ttnn.as_tensor( transformation_mat_torch, dtype=ttnn.bfloat16, @@ -165,7 +168,6 @@ def test_llama_attention_inference( model_args.head_dim, model_args.max_seq_len * 2, model_args.rope_theta, - model_args.use_scaled_rope, model_args.rope_scaling_factor, )[positions] attn_mask = torch.full((max_seq_len, max_seq_len), torch.finfo(torch.float32).min) diff --git a/models/demos/llama3/tests/test_llama_decoder.py b/models/demos/llama3/tests/test_llama_decoder.py index c74a4aa3dbc..df7562461c4 100644 --- a/models/demos/llama3/tests/test_llama_decoder.py +++ b/models/demos/llama3/tests/test_llama_decoder.py @@ -13,7 +13,6 @@ from models.demos.llama3.tt.model_config import TtModelArgs from models.demos.llama3.tt.llama_decoder import TtTransformerBlock from models.demos.llama3.tt.llama_rope import TtLlamaRotarySetup -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import TransformerBlock from models.utility_functions import ( comp_pcc, comp_allclose, @@ -78,7 +77,7 @@ def test_llama_decoder_inference( partial_state_dict = { k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix)) } - reference_model = TransformerBlock(layer_id=0, args=model_args) + reference_model = model_args.reference_decoder() reference_model.load_state_dict(partial_state_dict) generation_start_pos = 0 @@ -92,8 +91,8 @@ def test_llama_decoder_inference( model_args.head_dim, model_args.max_seq_len, 
model_args.rope_theta, - model_args.use_scaled_rope, model_args.rope_scaling_factor, + model_args.orig_context_len, ) transformation_mats = rope_setup.get_both_trans_mats() @@ -143,8 +142,8 @@ def test_llama_decoder_inference( model_args.head_dim, model_args.max_seq_len * 2, model_args.rope_theta, - model_args.use_scaled_rope, model_args.rope_scaling_factor, + model_args.orig_context_len, ) freqs_cis = torch.complex(cos, sin) diff --git a/models/demos/llama3/tests/test_llama_decoder_prefill.py b/models/demos/llama3/tests/test_llama_decoder_prefill.py index 85f767b3301..53cbf81cb03 100644 --- a/models/demos/llama3/tests/test_llama_decoder_prefill.py +++ b/models/demos/llama3/tests/test_llama_decoder_prefill.py @@ -13,7 +13,7 @@ ) from models.demos.llama3.tt.llama_decoder import TtTransformerBlock from models.demos.llama3.tt.model_config import TtModelArgs -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import TransformerBlock, precompute_freqs_cis +from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import precompute_freqs_cis from models.utility_functions import ( comp_pcc, comp_allclose, @@ -79,7 +79,7 @@ def test_llama_decoder_inference( k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix)) } - reference_model = TransformerBlock(layer_id=0, args=model_args) + reference_model = model_args.reference_decoder() reference_model.load_state_dict(partial_state_dict) generation_start_pos = 0 @@ -91,8 +91,10 @@ def test_llama_decoder_inference( model_args.head_dim, model_args.max_seq_len, mesh_device, - seq_len=max_seq_len, - scale_factor=model_args.rope_scaling_factor, + max_seq_len, + model_args.rope_theta, + model_args.rope_scaling_factor, + model_args.orig_context_len, ) transformation_mat_torch = get_rot_transformation_mat(model_args.head_dim) transformation_mats_prefill = ttnn.as_tensor( @@ -153,7 +155,6 @@ def test_llama_decoder_inference( model_args.head_dim, model_args.max_seq_len * 2, model_args.rope_theta, - model_args.use_scaled_rope, model_args.rope_scaling_factor, )[positions] diff --git a/models/demos/llama3/tests/test_llama_embedding.py b/models/demos/llama3/tests/test_llama_embedding.py index 9c42a859a94..71d56a3a7f4 100644 --- a/models/demos/llama3/tests/test_llama_embedding.py +++ b/models/demos/llama3/tests/test_llama_embedding.py @@ -8,13 +8,11 @@ import ttnn from models.demos.llama3.tt.llama_embedding import TtLlamaEmbedding from models.demos.llama3.tt.model_config import TtModelArgs -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer from models.utility_functions import ( comp_pcc, comp_allclose, ) from models.utility_functions import skip_for_grayskull -from models.demos.llama3.tt.llama_common import HostEmbedding @torch.no_grad() @@ -44,9 +42,9 @@ def test_llama_embedding(max_seq_len, batch_size, mesh_device, use_program_cache model_args.n_layers = 1 state_dict = model_args.load_state_dict() - tokenizer = Tokenizer(model_args.tokenizer_path) + tokenizer = model_args.tokenizer - reference_emb = HostEmbedding(model_args) + reference_emb = model_args.reference_embedding() if model_args.is_vision(): layer_name = "text_model.tok_embeddings.weight" else: @@ -62,7 +60,7 @@ def test_llama_embedding(max_seq_len, batch_size, mesh_device, use_program_cache ) prompts = ["Joy"] * 32 - pt_input = torch.tensor([tokenizer.encode(prompt, bos=False, eos=False) for prompt in prompts]) + pt_input = torch.tensor([model_args.encode_prompt(prompt, instruct=False) for prompt 
in prompts]) reference_output = reference_emb(pt_input) logger.info(f"reference_output: {reference_output.shape}") diff --git a/models/demos/llama3/tests/test_llama_mlp.py b/models/demos/llama3/tests/test_llama_mlp.py index 7d785a554b7..710ee9498c5 100644 --- a/models/demos/llama3/tests/test_llama_mlp.py +++ b/models/demos/llama3/tests/test_llama_mlp.py @@ -9,7 +9,6 @@ import ttnn from models.demos.llama3.tt.llama_mlp import TtLlamaMLP from models.demos.llama3.tt.model_config import TtModelArgs -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import FeedForward from models.utility_functions import ( comp_pcc, comp_allclose, @@ -57,12 +56,7 @@ def test_llama_mlp_inference(seq_len, batch_size, mesh_device, use_program_cache } model_args.WEIGHTS_DTYPE = dtype - reference_model = FeedForward( - dim=model_args.dim, - hidden_dim=4 * model_args.dim, - multiple_of=model_args.multiple_of, - ffn_dim_multiplier=model_args.ffn_dim_multiplier, - ) + reference_model = model_args.reference_mlp() reference_model.load_state_dict(partial_state_dict) tt_model = TtLlamaMLP( @@ -84,12 +78,14 @@ def test_llama_mlp_inference(seq_len, batch_size, mesh_device, use_program_cache ), # When both dims are None, the mapper used is `ReplicateTensorToMesh` dtype=ttnn.bfloat8_b, memory_config=( - tt_model.model_config["MLP_ACT_MEMCFG"] - if model_args.is_galaxy - else model_args.model_config["SHARDED_MLP_INPUT_MEMCFG"] - ) - if mode == "decode" - else ttnn.DRAM_MEMORY_CONFIG, + ( + tt_model.model_config["MLP_ACT_MEMCFG"] + if model_args.is_galaxy + else model_args.model_config["SHARDED_MLP_INPUT_MEMCFG"] + ) + if mode == "decode" + else ttnn.DRAM_MEMORY_CONFIG + ), layout=ttnn.TILE_LAYOUT, ) diff --git a/models/demos/llama3/tests/test_llama_model.py b/models/demos/llama3/tests/test_llama_model.py index a41645f3394..fefda03034f 100644 --- a/models/demos/llama3/tests/test_llama_model.py +++ b/models/demos/llama3/tests/test_llama_model.py @@ -8,14 +8,10 @@ import ttnn from models.demos.llama3.tt.llama_common import ( sample_host, - encode_prompt_llama_instruct, - HostEmbedding, PagedAttentionConfig, ) from models.demos.llama3.tt.model_config import TtModelArgs, LlamaOptimizations from models.demos.llama3.tt.llama_model import TtTransformer -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import Transformer -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer from models.utility_functions import ( comp_pcc, comp_allclose, @@ -92,7 +88,7 @@ def test_llama_model_inference( dtype = ttnn.bfloat8_b mesh_device.enable_async(True) mode_accuracy = optimizations == LlamaOptimizations.accuracy - instruct = True if weights == "instruct" else False + instruct = False # True if weights == "instruct" else False dummy_weights = True if weights == "random" else False model_args = TtModelArgs( mesh_device, @@ -103,49 +99,52 @@ def test_llama_model_inference( max_batch_size=batch_size, ) - model_name = { - (16, False): "llama32_1b", - (28, False): "llama32_3b", - (32, False): "llama31_8b", - (32, True): "llama32_11b", - (80, False): "llama31_70b", - }[(model_args.n_layers, model_args.is_vision())] - # Define minimum PCC for each iteration if layers == 1: pcc = 0.88 if mode_accuracy else 0.86 else: pcc = 0.94 if mode_accuracy else 0.86 - # Define tight final PCC thresholds for quick mode - final_model_pcc = { - "llama32_1b": 0.9990 if mode_accuracy else 0.9864, - "llama32_3b": 0.9989 if mode_accuracy else 0.9837, - "llama31_8b": 0.9987 if mode_accuracy else 0.9850, - 
"llama32_11b": 0.9987 if mode_accuracy else 0.9850, - "llama31_70b": 0.9419 if mode_accuracy else 0.9419, - }[model_name] - - final_k_cache_pcc = { - "llama32_1b": 0.9998, - "llama32_3b": 0.9998, - "llama31_8b": 0.9997, - "llama32_11b": 0.9995, - "llama31_70b": 0.9997, - }[model_name] - final_v_cache_pcc = { - "llama32_1b": 0.9996, - "llama32_3b": 0.9998, - "llama31_8b": 0.9997, - "llama32_11b": 0.9996, - "llama31_70b": 0.9997, - }[model_name] - - quick_iterations = {"llama32_1b": 2, "llama32_3b": 4, "llama31_8b": 6, "llama32_11b": 6, "llama31_70b": 6}[ - model_name - ] - - iterations = quick_iterations if layers == 1 else 9 + if layers == 1: # quick mode has tight PCC checks for known models + model_name = { + (16, False): "llama32_1b", + (28, False): "llama32_3b", + (32, False): "llama31_8b", + (32, True): "llama32_11b", + (80, False): "llama31_70b", + }[(model_args.n_layers, model_args.is_vision())] + + # Define tight final PCC thresholds for quick mode + final_model_pcc = { + "llama32_1b": 0.9991 if mode_accuracy else 0.9864, + "llama32_3b": 0.9989 if mode_accuracy else 0.9837, + "llama31_8b": 0.9987 if mode_accuracy else 0.9850, + "llama32_11b": 0.9987 if mode_accuracy else 0.9850, + "llama31_70b": 0.9843 if mode_accuracy else 0.97607, + }[model_name] + + final_k_cache_pcc = { + "llama32_1b": 0.9998, + "llama32_3b": 0.9998, + "llama31_8b": 0.9997, + "llama32_11b": 0.9995, + "llama31_70b": 0.9997, + }[model_name] + final_v_cache_pcc = { + "llama32_1b": 0.9996, + "llama32_3b": 0.9998, + "llama31_8b": 0.9997, + "llama32_11b": 0.9996, + "llama31_70b": 0.9997, + }[model_name] + + quick_iterations = {"llama32_1b": 2, "llama32_3b": 4, "llama31_8b": 6, "llama32_11b": 6, "llama31_70b": 6}[ + model_name + ] + + iterations = quick_iterations + else: + iterations = 9 if layers is not None: model_args.n_layers = layers @@ -172,18 +171,18 @@ def test_llama_model_inference( ] * model_args.max_batch_size # "This is a test" encoded prompt assert not instruct, "Instruct prompt not implemented with dummy weights" else: - tokenizer = Tokenizer(model_args.tokenizer_path) + tokenizer = model_args.tokenizer if instruct: - encoded_prompts = [encode_prompt_llama_instruct(tokenizer, prompt) for prompt in prompts] + encoded_prompts = [model_args.encode_prompt(prompt) for prompt in prompts] else: - encoded_prompts = [tokenizer.encode(prompt, bos=True, eos=False) for prompt in prompts] + encoded_prompts = [model_args.encode_prompt(prompt, instruct=False) for prompt in prompts] if run_ref_pt: - reference_model = Transformer(model_args) + reference_model = model_args.reference_transformer() reference_model.load_state_dict(reference_state_dict) # Embedding on host - embd = HostEmbedding(model_args) + embd = model_args.reference_embedding() embd.load_state_dict({"emb.weight": state_dict[f"{state_dict_prefix}tok_embeddings.weight"]}) generation_start_pos = 0 @@ -320,15 +319,21 @@ def test_llama_model_inference( pt_decode_input = embd(encoded_prompts_tensor[:, i]).view(batch, seqlen, -1) else: # Greedy decode (temperature = 0) the generated token and save it to print out later - tt_out_tok = sample_host(tt_output_torch, None, temperature=0, top_p=0.8) - tt_decode_input = embd(tt_out_tok) - all_outputs.append(tt_out_tok.squeeze(1).tolist()[0]) # Update generated token to list of TT outputs if run_ref_pt: + # Sample from reference model first pt_out_tok = sample_host(ref_output, None, temperature=0, top_p=0.8) pt_decode_input = embd(pt_out_tok) - all_outputs_ref.append( - pt_out_tok.squeeze(1).tolist()[0] - ) # Update 
generated token to list of ref outputs + all_outputs_ref.append(pt_out_tok.squeeze(1).tolist()[0]) + + # Use the same token for TT model (teacher forcing) + tt_decode_input = pt_decode_input + all_outputs.append(pt_out_tok.squeeze(1).tolist()[0]) + else: + # If not running reference model, sample from TT model directly + tt_out_tok = sample_host(tt_output_torch, None, temperature=0, top_p=0.8) + tt_decode_input = embd(tt_out_tok) + all_outputs.append(tt_out_tok.squeeze(1).tolist()[0]) + # Measure PCC if also running reference model if run_ref_pt: if layers == 1 and i == iterations - 1: # On last iteration in the quick test, set a tighter PCC @@ -432,6 +437,7 @@ def test_llama_model_inference( logger.info(f"All {generation_length} Llama decode iterations Passed!") else: logger.warning("One or more iterations of Llama decode had bad PCC") - assert final_tests_pass, f"PCC value is lower than {final_model_pcc} for final output. Check Warnings!" + if layers == 1: + assert final_tests_pass, f"PCC value is lower than {final_model_pcc} for final output. Check Warnings!" assert kv_cache_tests_pass, f"KV Cache PCC value is lower expected for some of the outputs. Check Warnings!" assert all_tests_pass, f"PCC value is lower than {pcc} for some of the outputs. Check Warnings!" diff --git a/models/demos/llama3/tests/test_llama_model_prefill.py b/models/demos/llama3/tests/test_llama_model_prefill.py index 91e45e8bc98..fb16414e979 100644 --- a/models/demos/llama3/tests/test_llama_model_prefill.py +++ b/models/demos/llama3/tests/test_llama_model_prefill.py @@ -9,15 +9,10 @@ import ttnn from models.demos.llama3.tt.llama_common import ( get_prefill_rot_mat, - get_rot_transformation_mat, - HostEmbedding, - encode_prompt_llama_instruct, PagedAttentionConfig, ) from models.demos.llama3.tt.llama_model import TtTransformer from models.demos.llama3.tt.model_config import TtModelArgs, LlamaOptimizations -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import Transformer -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer from models.utility_functions import ( comp_pcc, comp_allclose, @@ -98,7 +93,7 @@ def test_llama_model_inference( instruct = True model_args = TtModelArgs(mesh_device, max_batch_size=batch_size, optimizations=optimizations, max_seq_len=seq_len) - tokenizer = Tokenizer(model_args.tokenizer_path) + tokenizer = model_args.tokenizer logger.info("Loading weights...") state_dict_prefix = model_args.get_state_dict_prefix("", None) @@ -125,16 +120,14 @@ def test_llama_model_inference( with bz2.open(prompt_file, "rt", encoding="utf-8") as f: prompt = f.read() - if instruct: - encoded_prompt = encode_prompt_llama_instruct(tokenizer, prompt)[:seq_len] - else: - encoded_prompt = tokenizer.encode(prompt, bos=True, eos=False)[:seq_len] + encoded_prompt = model_args.encode_prompt(prompt, instruct=instruct)[:seq_len] if run_ref_pt: - reference_model = Transformer(model_args) + reference_model = model_args.reference_transformer() reference_model.load_state_dict(reference_state_dict) + # Embedding on host - embd = HostEmbedding(model_args) + embd = model_args.reference_embedding() embd.load_state_dict({"emb.weight": state_dict[f"{state_dict_prefix}tok_embeddings.weight"]}) # pre-compute the rotational embedding matrix and send to device @@ -142,8 +135,10 @@ def test_llama_model_inference( model_args.head_dim, model_args.max_seq_len, mesh_device, - seq_len=seq_len, - scale_factor=model_args.rope_scaling_factor, + seq_len, + model_args.rope_theta, + 
model_args.rope_scaling_factor, + model_args.orig_context_len, ) # Setup page table page_table_tt = None diff --git a/models/demos/llama3/tests/test_llama_rms_norm.py b/models/demos/llama3/tests/test_llama_rms_norm.py index 5fdc99ee14d..4493b8b4518 100644 --- a/models/demos/llama3/tests/test_llama_rms_norm.py +++ b/models/demos/llama3/tests/test_llama_rms_norm.py @@ -8,7 +8,6 @@ import ttnn from models.common.rmsnorm import RMSNorm as TtRMSNorm from models.demos.llama3.tt.model_config import TtModelArgs -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import RMSNorm as RefRMSNorm from models.utility_functions import ( comp_pcc, comp_allclose, @@ -77,7 +76,7 @@ def test_llama_rms_norm_inference( partial_state_dict = { k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix)) } - reference_model = RefRMSNorm(dim=model_args.dim, eps=model_args.norm_eps) + reference_model = model_args.reference_rms_norm() reference_model.load_state_dict(partial_state_dict) input = torch.rand(1, 1, 32, model_args.dim) @@ -90,9 +89,9 @@ def test_llama_rms_norm_inference( dtype=dtype, layout=ttnn.TILE_LAYOUT, mesh_mapper=ttnn.ShardTensor2dMesh(mesh_device, dims=(None, -1), mesh_shape=model_args.cluster_shape), - memory_config=model_args.get_model_config()["DECODE_RESIDUAL_MEMCFG"] - if mode == "decode" - else ttnn.DRAM_MEMORY_CONFIG, + memory_config=( + model_args.get_model_config()["DECODE_RESIDUAL_MEMCFG"] if mode == "decode" else ttnn.DRAM_MEMORY_CONFIG + ), ) tt_output = tt_model(tt_input, mode=mode) diff --git a/models/demos/llama3/tests/test_llama_torch.py b/models/demos/llama3/tests/test_llama_torch.py index 90713eb01ab..3ff878c5ec0 100644 --- a/models/demos/llama3/tests/test_llama_torch.py +++ b/models/demos/llama3/tests/test_llama_torch.py @@ -4,10 +4,7 @@ import torch # import ttnn -from models.demos.llama3.tt.llama_common import HostEmbedding from models.demos.llama3.tt.model_config import TtModelArgs -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import Transformer -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer from loguru import logger @@ -18,16 +15,16 @@ def test_llama_torch_inference(ensure_gc): model_args = TtModelArgs(mesh_device=None) state_dict = model_args.load_state_dict() - tokenizer = Tokenizer(model_args.tokenizer_path) + tokenizer = model_args.tokenizer prompts = ["1 2 3 4 "] * model_args.max_batch_size - encoded_prompts = [tokenizer.encode(prompt, bos=True, eos=False) for prompt in prompts] + encoded_prompts = [model_args.encode_prompt(prompt, instruct=False) for prompt in prompts] - reference_model = Transformer(model_args) + reference_model = model_args.reference_transformer() reference_model.load_state_dict(state_dict) # Embedding on host - embd = HostEmbedding(model_args) + embd = model_args.reference_embedding() state_dict_prefix = model_args.get_state_dict_prefix("", None) embd.load_state_dict({"emb.weight": state_dict[f"{state_dict_prefix}tok_embeddings.weight"]}) @@ -66,4 +63,4 @@ def test_llama_torch_inference(ensure_gc): all_outputs_ref.append(pt_out_tok.squeeze(1).tolist()[0]) # Update generated token to list of ref outputs # TODO print all 32 users - logger.info("[User 0] Ref generation: ", "".join(tokenizer.decode(all_outputs_ref))) + logger.info("[User 0] Ref generation: '" + "".join(tokenizer.decode(all_outputs_ref)) + "'") diff --git a/models/demos/llama3/tests/test_lm_head.py b/models/demos/llama3/tests/test_lm_head.py index 
b3b422b36dc..ea42d7c4eb4 100644 --- a/models/demos/llama3/tests/test_lm_head.py +++ b/models/demos/llama3/tests/test_lm_head.py @@ -9,7 +9,6 @@ import ttnn from models.demos.llama3.tt.lm_head import LMHead from models.demos.llama3.tt.model_config import TtModelArgs -from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import ColumnParallelLinear from models.utility_functions import ( comp_pcc, comp_allclose, @@ -52,7 +51,7 @@ def test_llama_lm_head_inference(seq_len, batch_size, mesh_device, use_program_c } model_args.WEIGHTS_DTYPE = dtype - reference_model = ColumnParallelLinear(model_args.dim, model_args.vocab_size, bias=False, init_method=lambda x: x) + reference_model = model_args.reference_lm_head() reference_model.load_state_dict(partial_state_dict) tt_model = LMHead( diff --git a/models/demos/llama3/tests/test_ref.py b/models/demos/llama3/tests/test_ref.py new file mode 100644 index 00000000000..d3ad5ba20bf --- /dev/null +++ b/models/demos/llama3/tests/test_ref.py @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 +import torch +import pytest +from loguru import logger +import os +import ttnn +from models.demos.llama3.tt.llama_attention import TtLlamaAttention +from models.demos.llama3.tt.llama_rope import TtLlamaRotarySetup +from models.demos.llama3.tt.model_config import TtModelArgs +from models.demos.llama3.tt.llama_common import ( + precompute_freqs, + PagedAttentionConfig, +) +from models.utility_functions import ( + comp_pcc, + comp_allclose, +) +from models.utility_functions import skip_for_grayskull +from models.demos.llama3.tt.load_checkpoints import convert_meta_to_hf, convert_hf_to_meta, map_hf_to_meta_keys + + +@torch.no_grad() +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize( + "mesh_device", + [ + {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get( + os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids()) + ) + ], + indirect=True, +) +@pytest.mark.parametrize( + "paged_attention", + ( + # True, + False, + ), + ids=( + # "paged_attention", + "default_attention", + ), +) +@pytest.mark.parametrize( + "page_params", + [{"page_block_size": 32, "page_max_num_blocks": 1024}], +) +@pytest.mark.parametrize( + "batch_size", + (1,), +) +@pytest.mark.parametrize( + "max_seq_len", + (128,), # For decode-only unit test, there's no need to run with large sequence lengths +) +def test_llama_attention_inference( + max_seq_len, + batch_size, + paged_attention, + page_params, + mesh_device, + use_program_cache, + reset_seeds, + ensure_gc, +): + dtype = ttnn.bfloat8_b + pcc = 0.99 + + mesh_device.enable_async(True) + + model_args = TtModelArgs(mesh_device, max_batch_size=batch_size, max_seq_len=max_seq_len) + model_args.n_layers = 1 # For the unit test, just run a single layer + + state_dict = model_args.load_state_dict() + + first_layer_prefix = model_args.get_state_dict_prefix("TtLlamaAttention", 0) + "." 
+ # Ref model needs partial state dict, but our models use full state dict keys as cached weight names + partial_state_dict = { + k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix)) + } + + ref_model = model_args.reference_attention() + ref_model.load_state_dict(partial_state_dict) + + from transformers import AutoModelForCausalLM + + hf_transformer = AutoModelForCausalLM.from_pretrained(model_args.DEFAULT_CKPT_DIR) + hf_model = hf_transformer.model.layers[0].self_attn + hf_model.eval() + + # Get the state dicts + ref_state_dict = ref_model.attention.state_dict() # should contain hf keys and weights + hf_state_dict = hf_model.state_dict() + + for key in ["k_proj", "q_proj"]: + for suffix in ["weight", "bias"]: + print( + f"{key}.{suffix}: ref matches hf : {torch.allclose(ref_state_dict[key + '.' + suffix], hf_state_dict[key + '.' + suffix])}" + ) + + print(" ".join(f"{x:+3.1f}" for x in ref_state_dict["k_proj.bias"])) + print(" ".join(f"{x:+3.1f}" for x in hf_state_dict["k_proj.bias"])) diff --git a/models/demos/llama3/tt/generator_vllm.py b/models/demos/llama3/tt/generator_vllm.py index 846e0cef34f..06a9b1e37ea 100644 --- a/models/demos/llama3/tt/generator_vllm.py +++ b/models/demos/llama3/tt/generator_vllm.py @@ -32,7 +32,7 @@ def initialize_vllm_text_transformer( # Load model args, weights model_args = TtModelArgs( mesh_device, - instruct=("Instruct" in hf_config._name_or_path), + instruct=("Instruct" in hf_config._name_or_path or "DeepSeek-R1-Distill-Llama-70B" in hf_config._name_or_path), max_batch_size=max_batch_size, optimizations=optimizations, max_seq_len=max_seq_len, diff --git a/models/demos/llama3/tt/llama_attention.py b/models/demos/llama3/tt/llama_attention.py index 322e2edf2d2..a2c5490fef8 100644 --- a/models/demos/llama3/tt/llama_attention.py +++ b/models/demos/llama3/tt/llama_attention.py @@ -2,12 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +import math import torch import ttnn from models.common.lightweightmodule import LightweightModule from models.demos.llama3.tt.llama_ccl import tt_all_reduce, tt_all_gather +from models.demos.llama3.tt.llama_common import first_five +from models.demos.llama3.tt.load_checkpoints import permute class TtLlamaAttention(LightweightModule): @@ -41,7 +43,7 @@ def __init__( self.num_reduce_scatter_links = configuration.num_reduce_scatter_links self.num_all_gather_links = configuration.num_all_gather_links self.MAX_QKV_MM_SEQ_LEN = configuration.MAX_QKV_MM_SEQ_LEN - + self.tile_size = configuration.tile_size self.num_device_groups = self.num_devices // self.n_kv_heads self.num_devices_per_group = self.n_kv_heads if self.TG else self.num_devices self.batch_size_per_device_group = ( @@ -99,10 +101,65 @@ def __init__( else: cache_name = lambda name: weight_cache_path / (f"{layer_name}.{name}") - wq_str = f"{layer_name}.wq.weight" - wk_str = f"{layer_name}.wk.weight" - wv_str = f"{layer_name}.wv.weight" - wo_str = f"{layer_name}.wo.weight" + wq_str = f"{layer_name}.wq" + wk_str = f"{layer_name}.wk" + wv_str = f"{layer_name}.wv" + wo_str = f"{layer_name}.wo" + + # Initialize bias tensors as None + self.wqkv_bias_decode = None + self.wqkv_bias_prefill = None + + # Create combined QKV bias if present in state dict + if f"{wq_str}.bias" in self.state_dict: + qkv_bias = torch.concat( + [ + torch.concat( + [ + torch.chunk(self.state_dict[f"{wq_str}.bias"], configuration.num_devices)[i], + torch.chunk(self.state_dict[f"{wk_str}.bias"], configuration.num_devices)[i], + 
torch.chunk(self.state_dict[f"{wv_str}.bias"], configuration.num_devices)[i], + ], + dim=-1, + ) + for i in range(configuration.num_devices) + ], + dim=-1, + ) + # Prefill can use broadcasting on the bias add so wants a 1d tensor + self.wqkv_bias_prefill = ttnn.as_tensor( + qkv_bias, + device=self.mesh_device, + mesh_mapper=ttnn.ShardTensorToMesh(self.mesh_device, dim=-1), + dtype=self.dtype, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + layout=ttnn.TILE_LAYOUT, + cache_file_name=cache_name("wqkv_bias_prefill_sharded"), + ) + # as_tensor returns (32, dim) which is incorrect, this reshape updates the padded size to the correct size + self.wqkv_bias_prefill = ttnn.reshape( + self.wqkv_bias_prefill, ttnn.Shape([1, 1, 1, self.wqkv_bias_prefill.shape[-1]]) + ) + + # Broadcasting does not seem to be supported inside execute_trace so expand to the whole batch size + # Create a list of bias tensors for each multiple of tile_size up to max_batch_size + self.wqkv_bias_decode = [] + for batch_size in range( + configuration.tile_size, + configuration.tile_padded_batch_rows + configuration.tile_size, + configuration.tile_size, + ): + qkv_bias_decode = qkv_bias.unsqueeze(0).expand(batch_size, -1) + bias_tensor = ttnn.as_tensor( + qkv_bias_decode, + device=self.mesh_device, + mesh_mapper=ttnn.ShardTensorToMesh(self.mesh_device, dim=-1), + dtype=self.dtype, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + layout=ttnn.TILE_LAYOUT, + cache_file_name=cache_name(f"wqkv_bias_decode_sharded_{batch_size}"), + ) + self.wqkv_bias_decode.append(bias_tensor) # when splitting the devices, we need to make sure that the number of heads is divisible by the number of devices assert self.n_heads % self.num_devices_per_group == 0 @@ -118,9 +175,9 @@ def __init__( qkv_list = [] for i in range(self.num_devices_per_group): # Chunk weights - wq_selected = torch.chunk(self.state_dict[wq_str], self.num_devices_per_group, dim=0)[i] - wk_selected = torch.chunk(self.state_dict[wk_str], self.num_devices_per_group, dim=0)[i] - wv_selected = torch.chunk(self.state_dict[wv_str], self.num_devices_per_group, dim=0)[i] + wq_selected = torch.chunk(self.state_dict[f"{wq_str}.weight"], self.num_devices_per_group, dim=0)[i] + wk_selected = torch.chunk(self.state_dict[f"{wk_str}.weight"], self.num_devices_per_group, dim=0)[i] + wv_selected = torch.chunk(self.state_dict[f"{wv_str}.weight"], self.num_devices_per_group, dim=0)[i] # Transpose the selected chunks wq = torch.transpose(wq_selected, -2, -1) @@ -146,7 +203,7 @@ def __init__( # For ring topology we can use all gather matmul for wo self.use_fused_all_gather_matmul = self.model_config["USE_FUSED_ALL_GATHER_MATMUL"] - pt_wo = self.state_dict[wo_str].transpose(-1, -2).unsqueeze(0).unsqueeze(0) + pt_wo = self.state_dict[f"{wo_str}.weight"].transpose(-1, -2).unsqueeze(0).unsqueeze(0) wo_mem_config = configuration.create_dram_sharded_mem_config( configuration.dim // configuration.num_devices, configuration.dim @@ -163,9 +220,9 @@ def __init__( dims=(2, 3) if (self.use_fused_all_gather_matmul or self.TG) else (3, 2), mesh_shape=configuration.cluster_shape, ), - cache_file_name=cache_name("wo_width_sharded_2d") - if (self.use_fused_all_gather_matmul or self.TG) - else cache_name("wo"), + cache_file_name=( + cache_name("wo_width_sharded_2d") if (self.use_fused_all_gather_matmul or self.TG) else cache_name("wo") + ), ) if not use_paged_kv_cache: # vLLM provides its own kv cache @@ -221,9 +278,11 @@ def init_kv_cache(self, configuration, weight_cache_path): device=self.mesh_device, 
memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), - cache_file_name=f"{weight_cache_path}/kvcache_{k_or_v.shape}" - if weight_cache_path and not configuration.dummy_weights - else None, + cache_file_name=( + f"{weight_cache_path}/kvcache_{k_or_v.shape}" + if weight_cache_path and not configuration.dummy_weights + else None + ), ) for k_or_v in [cache_k, cache_v] ] @@ -245,14 +304,28 @@ def forward_decode( # QKV matmuls # Use HiFi2 for DRAM-sharded matmuls as they are otherwise flop-bound. Loses 1 bit of activation precision. ### + + as_torch = lambda tensor: torch.Tensor( + ttnn.to_torch(tensor, mesh_composer=ttnn.ConcatMeshToTensor(self.mesh_device, dim=-1)) + ) + + # print(f"our x:", " ".join(f'{t:+3.1f}' for t in as_torch(x)[0, 0, 0].flatten())) xqkv_fused_sharded = ttnn.linear( x, self.wqkv, + # bias=self.wqkv_bias, memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG, program_config=self.model_config["XQKV_DECODE_PROGCFG"], compute_kernel_config=self.compute_kernel_config_hifi2, dtype=self.ccl_dtype if self.TG else ttnn.bfloat16, ) + # FIXME: File bug against dram-sharded matmuls with bias + if self.wqkv_bias_decode: + # select the bias tensor based on the number of tiles in the rows + # WARNING: must not change the batch size between compiling and executing a trace + num_tiles = int(math.ceil(xqkv_fused_sharded.shape[-2] / self.tile_size)) + xqkv_fused_sharded = xqkv_fused_sharded + self.wqkv_bias_decode[num_tiles - 1] + ttnn.deallocate(x) xqkv_fused = tt_all_reduce( xqkv_fused_sharded, @@ -263,6 +336,7 @@ def forward_decode( memory_config=self.model_config["QKV_OUT_GATHERED_MEMCFG"](list(self.mesh_device.shape)[1]), sharded=True, dtype=self.ccl_dtype, + topology=self.ccl_topology, ) if self.TG: @@ -437,13 +511,16 @@ def forward_decode( num_reduce_scatter_links=self.num_reduce_scatter_links, num_all_gather_links=self.num_all_gather_links, dim=0 if (self.TG and self.hidden_size < 8192) else 3, + topology=self.ccl_topology, memory_config=( - self.model_config["SELF_OUT_REDUCE_SCATTER_MEMCFG"] - if self.hidden_size == 8192 - else self.model_config["SELF_OUT_GATHERED_MEMCFG"](list(self.mesh_device.shape)[0]) - ) - if self.TG - else self.model_config["DECODE_RESIDUAL_MEMCFG"], + ( + self.model_config["SELF_OUT_REDUCE_SCATTER_MEMCFG"] + if self.hidden_size == 8192 + else self.model_config["SELF_OUT_GATHERED_MEMCFG"](list(self.mesh_device.shape)[0]) + ) + if self.TG + else self.model_config["DECODE_RESIDUAL_MEMCFG"] + ), sharded=True, dtype=self.ccl_dtype, use_composite=True if self.hidden_size == 8192 else False, @@ -481,12 +558,17 @@ def forward_prefill( xqkv_fused = ttnn.linear( x_11SH, self.wqkv, + # bias=self.wqkv_bias_prefill, dtype=self.ccl_dtype if self.TG else ttnn.bfloat16, memory_config=ttnn.DRAM_MEMORY_CONFIG, compute_kernel_config=self.compute_kernel_config_hifi2, program_config=self.model_config["XQKV_PREFILL_PROGCFG"](seq_len), ) + # FIXME: surely ttnn.linear bias should work? 
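        # Descriptive note on the manual bias add below: wqkv_bias_prefill is built in __init__ by
        # concatenating per-device [q_i | k_i | v_i] bias chunks along the last dim and sharding them
        # with ShardTensorToMesh(dim=-1), so each device ends up adding the bias slice that matches
        # its shard of the fused QKV matmul output, which is why a plain eltwise add is sufficient here.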
+ if self.wqkv_bias_prefill is not None: + xqkv_fused = xqkv_fused + self.wqkv_bias_prefill + xqkv_fused = tt_all_reduce( xqkv_fused, self.mesh_device, @@ -500,6 +582,18 @@ def forward_prefill( if seq_len > self.MAX_QKV_MM_SEQ_LEN: xqkv_fused = ttnn.reshape(xqkv_fused, [1, 1, seq_len, -1]) + def fix(xqkv): + torch_q = xqkv[: self.head_dim * self.n_local_heads] + torch_k = xqkv[ + self.head_dim * self.n_local_heads : self.head_dim * (self.n_local_heads + self.n_local_kv_heads) + ] + torch_v = xqkv[self.head_dim * (self.n_local_heads + self.n_local_kv_heads) :] + to_hf = lambda t: permute(t.unsqueeze(-1), t.shape[0] // self.head_dim, t.shape[0], 1).squeeze(-1) + torch_q = to_hf(torch_q) + torch_k = to_hf(torch_k) + torch_v = torch_v + return torch_k.flatten() + ttnn.deallocate(x_11SH) # split qkv into heads @@ -677,6 +771,7 @@ def forward_prefill( dim=0 if self.TG else 3, num_reduce_scatter_links=self.num_reduce_scatter_links, num_all_gather_links=self.num_all_gather_links, + topology=self.ccl_topology, memory_config=ttnn.DRAM_MEMORY_CONFIG, dtype=self.ccl_dtype, ) diff --git a/models/demos/llama3/tt/llama_ccl.py b/models/demos/llama3/tt/llama_ccl.py index 300c615c187..5e91c6f5209 100644 --- a/models/demos/llama3/tt/llama_ccl.py +++ b/models/demos/llama3/tt/llama_ccl.py @@ -13,6 +13,7 @@ def tt_all_reduce( dim=0, num_reduce_scatter_links=1, num_all_gather_links=2, + topology=ttnn.Topology.Linear, memory_config=None, sharded=False, dtype=ttnn.bfloat16, @@ -40,6 +41,7 @@ def tt_all_reduce( dim=dim, math_op=ttnn.ReduceType.Sum, num_links=num_reduce_scatter_links, + topology=topology, memory_config=memory_config, ) input_tensor.deallocate(True) @@ -63,7 +65,7 @@ def tt_all_reduce( num_links=num_all_gather_links, cluster_axis=cluster_axis, mesh_device=mesh_device, - topology=ttnn.Topology.Linear, + topology=topology, memory_config=ttnn.DRAM_MEMORY_CONFIG if not sharded else memory_config, ) @@ -87,7 +89,7 @@ def tt_all_reduce( cluster_axis=cluster_axis, mesh_device=mesh_device, math_op=ttnn.ReduceType.Sum, - topology=ttnn.Topology.Linear, + topology=topology, memory_config=ttnn.DRAM_MEMORY_CONFIG if not sharded else memory_config, ) @@ -97,7 +99,7 @@ def tt_all_reduce( num_links=num_all_gather_links, cluster_axis=cluster_axis, mesh_device=mesh_device, - topology=ttnn.Topology.Linear, + topology=topology, memory_config=input_mem_cfg, ) diff --git a/models/demos/llama3/tt/llama_common.py b/models/demos/llama3/tt/llama_common.py index 843cf066c78..d1de6bce149 100644 --- a/models/demos/llama3/tt/llama_common.py +++ b/models/demos/llama3/tt/llama_common.py @@ -44,15 +44,34 @@ def encode_prompt_llama_instruct(tokenizer, prompt_text, system_prompt_text=None return begin_of_text + system_prompt + user_prompt + assistant_reply -def apply_scaling(freqs: torch.Tensor, scale_factor: float = 8): - # Llama-3.x specific scaling +def encode_prompt_hf(tokenizer, prompt_text, system_prompt_text=None): + """See https://huggingface.co/docs/transformers/main/en/chat_templating""" + chat = [] + if system_prompt_text: + chat.append({"role": "system", "content": system_prompt_text}) + if prompt_text: + chat.append({"role": "user", "content": prompt_text}) + return tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True) + + +def encode_prompt_hf(tokenizer, prompt_text, system_prompt_text=None): + """See https://huggingface.co/docs/transformers/main/en/chat_templating""" + chat = [] + if system_prompt_text: + chat.append({"role": "system", "content": system_prompt_text}) + if prompt_text: + 
chat.append({"role": "user", "content": prompt_text}) + return tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True) + + +def apply_scaling(freqs: torch.Tensor, scale_factor: float, orig_context_len: int): + # FIXME: Llama-3.x specific scaling - we need to support yarn for Qwen2.5 models # Values obtained from grid search low_freq_factor = 1 high_freq_factor = 4 - old_context_len = 8192 # original llama3 length - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor + low_freq_wavelen = orig_context_len / low_freq_factor + high_freq_wavelen = orig_context_len / high_freq_factor new_freqs = [] for freq in freqs: wavelen = 2 * math.pi / freq @@ -62,12 +81,12 @@ def apply_scaling(freqs: torch.Tensor, scale_factor: float = 8): new_freqs.append(freq / scale_factor) else: assert low_freq_wavelen != high_freq_wavelen - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + smooth = (orig_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) -def precompute_freqs(dim: int, end: int, theta: float = 500000.0, use_scaled: bool = True, scale_factor: float = 8): +def precompute_freqs(dim: int, end: int, theta, scale_factor, orig_context_len): """ Precompute the frequency tensor for sine and cosine values with given dimensions. @@ -81,8 +100,8 @@ def precompute_freqs(dim: int, end: int, theta: float = 500000.0, use_scaled: bo """ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) t = torch.arange(end) - if use_scaled: - freqs = apply_scaling(freqs, scale_factor) + if scale_factor is not None: + freqs = apply_scaling(freqs, scale_factor, orig_context_len) freqs = torch.outer(t, freqs).float() return torch.cos(freqs), torch.sin(freqs) @@ -112,8 +131,10 @@ def gather_cos_sin(position_ids, cos, sin): return cos, sin -def get_prefill_rot_mat(head_dim, max_seq_len, mesh_device, seq_len, scale_factor, start_pos=0): - cos, sin = precompute_freqs(head_dim, max_seq_len * 2, scale_factor=scale_factor) +def get_prefill_rot_mat( + head_dim, max_seq_len, mesh_device, seq_len, theta, scale_factor, orig_context_len, start_pos=0 +): + cos, sin = precompute_freqs(head_dim, max_seq_len * 2, theta, scale_factor, orig_context_len) cos_gathered, sin_gathered = gather_cos_sin(torch.arange(start_pos, start_pos + seq_len), cos, sin) assert cos_gathered.size() == (1, 1, seq_len, head_dim) assert sin_gathered.size() == (1, 1, seq_len, head_dim) @@ -151,14 +172,15 @@ def get_single_rot_mat( dhead, mesh_device, num_devices, - start_pos=0, - theta: float = 500000.0, - use_scaled=True, + start_pos, + theta, + scale_factor, + orig_context_len, on_host=False, ): freqs_unscaled = 1.0 / (theta ** (torch.arange(0, dhead, 2)[: (dhead // 2)].float() / dhead)) - if use_scaled: - freqs = apply_scaling(freqs_unscaled) + if scale_factor is not None: + freqs = apply_scaling(freqs_unscaled, scale_factor, orig_context_len) sin_freqs, cos_freqs = torch.sin(freqs), torch.cos(freqs) rot_matrix = torch.zeros(dhead, dhead) rot_matrix[torch.arange(0, dhead, 2), torch.arange(0, dhead, 2)] = cos_freqs.clone() @@ -169,8 +191,8 @@ def get_single_rot_mat( # Support for start_pos different than 0 freqs = start_pos * freqs_unscaled - if use_scaled: - freqs = apply_scaling(freqs) + if scale_factor is not None: + freqs = apply_scaling(freqs, 
scale_factor, orig_context_len) sin_freqs, cos_freqs = torch.sin(freqs), torch.cos(freqs) current_rot_mat = torch.zeros(dhead, dhead) current_rot_mat[torch.arange(0, dhead, 2), torch.arange(0, dhead, 2)] = cos_freqs.clone() @@ -376,3 +398,40 @@ def get_max_prefill_chunk_size(seq_len, max_prefill_seq_len): return chunk_size raise ValueError("No valid chunk size found") + + +def nearest_multiple(x, multiple_of): + return math.ceil(x / multiple_of) * multiple_of + + +def pad_to_size(x: torch.Tensor, dim: int, size: int) -> torch.Tensor: + """ + Pads the specified dimension of the input tensor with zeros + + :param x: Input PyTorch Tensor + :param dim: The dimension to pad + :param size: The size to pad to + :return: Padded PyTorch Tensor + """ + # handle negative dim + if dim < 0: + dim = x.dim() + dim + assert isinstance(x, torch.Tensor), "Input must be a torch.Tensor" + assert -x.dim() <= dim < x.dim(), f"Dimension out of range (expected between {-x.dim()} and {x.dim()-1})" + dim = x.dim() + dim if dim < 0 else dim + + current_size = x.size(dim) + pad_size = size - current_size + + if pad_size == 0: + return x # No padding needed + + # Prepare the padding configuration for F.pad + # F.pad expects padding in the form (pad_last_dim_left, pad_last_dim_right, ..., pad_dim_left, pad_dim_right) + # We only pad on the "end" side of the specified dimension + pad = [0] * (2 * x.dim()) # Initialize padding for all dimensions + pad_index = 2 * (x.dim() - dim - 1) + pad[pad_index + 1] = pad_size # Pad on the "right" side of the specified dimension + + padded_x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + return padded_x diff --git a/models/demos/llama3/tt/llama_decoder.py b/models/demos/llama3/tt/llama_decoder.py index 96116cc6340..58404ec1e09 100644 --- a/models/demos/llama3/tt/llama_decoder.py +++ b/models/demos/llama3/tt/llama_decoder.py @@ -72,6 +72,7 @@ def __init__( is_distributed=self.args.is_distributed_norm, sharded_program_config=self.model_config["SHARDED_NORM_ATTN_PRGM_CFG"], sharded_output_config=self.model_config["SHARDED_ATTN_INPUT_MEMCFG"], + ccl_topology=self.args.ccl_topology(), ), args, TG=args.is_galaxy, @@ -88,6 +89,7 @@ def __init__( is_distributed=self.args.is_distributed_norm, sharded_program_config=self.model_config["SHARDED_NORM_MLP_PRGM_CFG"], sharded_output_config=self.model_config["SHARDED_MLP_INPUT_MEMCFG"], + ccl_topology=self.args.ccl_topology(), ), args, TG=args.is_galaxy, diff --git a/models/demos/llama3/tt/llama_mlp.py b/models/demos/llama3/tt/llama_mlp.py index 31a845052d1..4ea55b8865b 100644 --- a/models/demos/llama3/tt/llama_mlp.py +++ b/models/demos/llama3/tt/llama_mlp.py @@ -5,6 +5,7 @@ import torch import ttnn from models.common.lightweightmodule import LightweightModule +from models.demos.llama3.tt.llama_common import pad_to_size from models.demos.llama3.tt.llama_ccl import tt_all_reduce @@ -21,41 +22,44 @@ def __init__( self.model_config = model_config state_dict_prefix = state_dict_prefix or args.get_state_dict_prefix(self.__class__.__name__, layer_num) torch_weight = lambda name: torch.transpose(self.state_dict[f"{state_dict_prefix}.{name}.weight"], -2, -1) + pad_hidden_dim = lambda tensor, dim: pad_to_size(tensor, dim=dim, size=args.hidden_dim) + # If pading was applied (e.g. 
via env var), add the unpadded hidden dim to the cache name to avoid loading incorrect weights + hidden_dim_string = f".hidden_dim_{args.hidden_dim}" if args.hidden_dim != args.unpadded_hidden_dim else "" if args.dummy_weights: cache_name = lambda _: None else: - cache_name = lambda name: weight_cache_path / (state_dict_prefix + f".{name}") + cache_name = lambda name: weight_cache_path / f"{state_dict_prefix}.{name}{hidden_dim_string}" w1_w3_mem_config = args.create_dram_sharded_mem_config(args.dim, args.hidden_dim // args.num_devices) w2_mem_config = args.create_dram_sharded_mem_config(args.hidden_dim // args.num_devices, args.dim) # TODO Clean up this code. With sharding, we load the normal weights and then shard them - as_sharded_tensor = lambda name, type, dim: ttnn.as_tensor( - torch_weight(name[:2]), # Grab only the wX part of the name + as_sharded_tensor = lambda name, type, dims: ttnn.as_tensor( + pad_hidden_dim( + torch_weight(name[:2]), dims[0] if args.is_galaxy else dims[-1] + ), # Grab only the wX part of the name dtype=type, device=self.mesh_device, - mesh_mapper=ttnn.ShardTensor2dMesh(self.mesh_device, dims=dim, mesh_shape=args.cluster_shape), + mesh_mapper=ttnn.ShardTensor2dMesh(self.mesh_device, dims=dims, mesh_shape=args.cluster_shape), layout=ttnn.TILE_LAYOUT, - memory_config=ttnn.DRAM_MEMORY_CONFIG - if args.is_galaxy - else w2_mem_config - if "w2" in name - else w1_w3_mem_config, + memory_config=( + ttnn.DRAM_MEMORY_CONFIG if args.is_galaxy else w2_mem_config if "w2" in name else w1_w3_mem_config + ), cache_file_name=cache_name(name), ) self.four_bit_mlp = args.optimizations.bfp4_mlp # Sharded weights - w1_dim = (-1, -2) if args.is_galaxy else (-2, -1) - w2_dim = (-2, -1) if args.is_galaxy else (-1, -2) + w1_dims = (-1, -2) if args.is_galaxy else (-2, -1) + w2_dims = (-2, -1) if args.is_galaxy else (-1, -2) self.w1 = as_sharded_tensor( - "w1_sharded", ttnn.bfloat4_b if self.four_bit_mlp else ttnn.bfloat8_b, dim=w1_dim + "w1_sharded", ttnn.bfloat4_b if self.four_bit_mlp else ttnn.bfloat8_b, dims=w1_dims ) # bfp4 normally ok here but sub .99 pcc for llama 3.1 weights - self.w2 = as_sharded_tensor("w2_sharded", ttnn.bfloat8_b, dim=w2_dim) - self.w3 = as_sharded_tensor("w3_sharded", ttnn.bfloat4_b if self.four_bit_mlp else ttnn.bfloat8_b, dim=w1_dim) + self.w2 = as_sharded_tensor("w2_sharded", ttnn.bfloat8_b, dims=w2_dims) + self.w3 = as_sharded_tensor("w3_sharded", ttnn.bfloat4_b if self.four_bit_mlp else ttnn.bfloat8_b, dims=w1_dims) def forward(self, x: ttnn.Tensor, mode) -> ttnn.Tensor: """ @@ -89,10 +93,12 @@ def forward(self, x: ttnn.Tensor, mode) -> ttnn.Tensor: w1_out = ttnn.linear( x, self.w1, - compute_kernel_config=self.args.compute_kernel_config_lofi - if self.four_bit_mlp - else self.args.compute_kernel_config_hifi2_fp16, - core_grid=ttnn.CoreGrid(y=8, x=8) if not pc_1 else None, + compute_kernel_config=( + self.args.compute_kernel_config_lofi + if self.four_bit_mlp + else self.args.compute_kernel_config_hifi2_fp16 + ), + core_grid=None, # FIXME: validate on TG ttnn.CoreGrid(y=8, x=8) if not pc_1 else None, dtype=ttnn.bfloat8_b if TG else ttnn.bfloat16, program_config=pc_1, memory_config=x.memory_config(), @@ -101,11 +107,13 @@ def forward(self, x: ttnn.Tensor, mode) -> ttnn.Tensor: w3_out = ttnn.linear( x, self.w3, - compute_kernel_config=self.args.compute_kernel_config_lofi - if self.four_bit_mlp - else self.args.compute_kernel_config_hifi2_fp16, - core_grid=ttnn.CoreGrid(y=8, x=8) if not pc_3 else None, - dtype=ttnn.bfloat8_b if TG else ttnn.bfloat16, + 
compute_kernel_config=( + self.args.compute_kernel_config_lofi + if self.four_bit_mlp + else self.args.compute_kernel_config_hifi2_fp16 + ), + core_grid=None, # FIXME: validate on TG ttnn.CoreGrid(y=8, x=8) if not pc_3 else None, + dtype=ttnn.bfloat16, program_config=pc_3, memory_config=x.memory_config(), ) @@ -144,6 +152,7 @@ def forward(self, x: ttnn.Tensor, mode) -> ttnn.Tensor: cluster_axis=1, num_all_gather_links=2, sharded=True if mode == "decode" else False, + topology=self.args.ccl_topology(), memory_config=self.model_config["FF1_OUT_GATHERED_MEMCFG"] if mode == "decode" else None, ) w3_out = tt_all_reduce( @@ -152,6 +161,7 @@ def forward(self, x: ttnn.Tensor, mode) -> ttnn.Tensor: cluster_axis=1, num_all_gather_links=2, sharded=True if mode == "decode" else False, + topology=self.args.ccl_topology(), memory_config=self.model_config["FF1_OUT_GATHERED_MEMCFG"] if mode == "decode" else None, ) @@ -188,10 +198,12 @@ def forward(self, x: ttnn.Tensor, mode) -> ttnn.Tensor: compute_kernel_config=self.args.compute_kernel_config_hifi2_fp16, dtype=self.args.ccl_dtype if TG else ttnn.bfloat16, program_config=pc_2, - memory_config=(ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG if mode == "decode" else ttnn.DRAM_MEMORY_CONFIG) - if TG - else w2_in.memory_config(), - core_grid=ttnn.CoreGrid(y=8, x=8) if not pc_2 else None, + memory_config=( + (ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG if mode == "decode" else ttnn.DRAM_MEMORY_CONFIG) + if TG + else w2_in.memory_config() + ), + core_grid=None, # FIXME: validate on TG ttnn.CoreGrid(y=8, x=8) if not pc_2 else None, ) ttnn.deallocate(w2_in) # if mode == "decode" and not TG: @@ -204,11 +216,14 @@ def forward(self, x: ttnn.Tensor, mode) -> ttnn.Tensor: num_reduce_scatter_links=self.args.num_reduce_scatter_links, num_all_gather_links=self.args.num_all_gather_links, sharded=(mode == "decode"), - memory_config=(self.model_config["FF2_OUT_REDUCE_SCATTER_MEMCFG"] if TG else w2_out.memory_config()) - if mode == "decode" - else ttnn.DRAM_MEMORY_CONFIG, + memory_config=( + (self.model_config["FF2_OUT_REDUCE_SCATTER_MEMCFG"] if TG else w2_out.memory_config()) + if mode == "decode" + else ttnn.DRAM_MEMORY_CONFIG + ), dtype=self.args.ccl_dtype, use_composite=True if self.dim == 8192 else False, + topology=self.args.ccl_topology(), ) # Ensure dim 0 and 1 are 1 diff --git a/models/demos/llama3/tt/llama_model.py b/models/demos/llama3/tt/llama_model.py index 3b784ad0bbb..8a909981efb 100644 --- a/models/demos/llama3/tt/llama_model.py +++ b/models/demos/llama3/tt/llama_model.py @@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 -import os -import math import ttnn import torch import torch.nn as nn @@ -11,11 +9,10 @@ from models.demos.llama3.tt.llama_decoder import TtTransformerBlock from models.common.rmsnorm import RMSNorm import ttnn -from typing import Optional from models.common.lightweightmodule import LightweightModule from models.demos.llama3.tt.distributed_norm import DistributedNorm from models.demos.llama3.tt.lm_head import LMHead -from models.demos.llama3.tt.llama_common import copy_host_to_device, get_prefill_rot_mat, HostEmbedding +from models.demos.llama3.tt.llama_common import copy_host_to_device, get_prefill_rot_mat from models.demos.llama3.tt.llama_rope import TtLlamaRotarySetup from models.demos.llama3.tt.llama_embedding import TtLlamaEmbedding @@ -56,8 +53,8 @@ def __init__( args.head_dim, args.max_seq_len, args.rope_theta, - args.use_scaled_rope, args.rope_scaling_factor, + args.orig_context_len, ) self.trans_mats_dict = self.rope_setup.get_both_trans_mats() 
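As the hunk above shows, TtLlamaRotarySetup is now constructed with the RoPE parameters passed explicitly instead of a use_scaled_rope flag. A minimal sketch of the new positional order, assuming a TtModelArgs-style object that exposes rope_theta, rope_scaling_factor (None disables scaling) and orig_context_len, and a mesh_device handle from the caller:

    rope_setup = TtLlamaRotarySetup(
        mesh_device,               # device
        args.max_batch_size,       # batch_size
        args.head_dim,
        args.max_seq_len,
        args.rope_theta,
        args.rope_scaling_factor,  # None disables Llama-3.x RoPE scaling
        args.orig_context_len,     # only consulted when scaling is enabled
    )
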
@@ -87,6 +84,7 @@ def __init__( is_distributed=self.args.is_distributed_norm, sharded_program_config=self.model_config["SHARDED_NORM_LM_HEAD_PRGM_CFG"], sharded_output_config=self.model_config["LM_HEAD_INPUT_MEMCFG"], + ccl_topology=self.args.ccl_topology(), ), args, args.is_galaxy, @@ -124,8 +122,10 @@ def prepare_inputs_prefill(self, tokens, start_pos=0, page_table=None, chunk_pag self.args.head_dim, self.args.max_seq_len, self.mesh_device, - seq_len=S, - scale_factor=self.args.rope_scaling_factor, + S, + self.args.rope_theta, + self.args.rope_scaling_factor, + self.args.orig_context_len, start_pos=start_pos, ) diff --git a/models/demos/llama3/tt/llama_rope.py b/models/demos/llama3/tt/llama_rope.py index 06406a4eb2d..4b395c3eec5 100644 --- a/models/demos/llama3/tt/llama_rope.py +++ b/models/demos/llama3/tt/llama_rope.py @@ -11,8 +11,8 @@ from loguru import logger -def compute_gather_cos_sin(dhead, end, theta, position_ids, use_scaled_rope, scale_factor): - cos, sin = precompute_freqs(dhead, end, theta, use_scaled_rope, scale_factor) +def compute_gather_cos_sin(dhead, end, theta, scale_factor, orig_context_len, position_ids): + cos, sin = precompute_freqs(dhead, end, theta, scale_factor, orig_context_len) return gather_cos_sin(position_ids, cos, sin) @@ -23,9 +23,9 @@ def __init__( batch_size: int, head_dim: int, max_seq_len: int, - rope_theta: float = 10000, - use_scaled_rope: bool = False, - scale_factor: float = 8, + rope_theta: float, + scale_factor: float, # use None to disable rope scaling + orig_context_len: int, # only used if scaling enabled datatype=ttnn.bfloat16, ): super().__init__() @@ -40,16 +40,15 @@ def __init__( else: self.batch_size_per_device_group = self.batch_size self.core_grid = device.compute_with_storage_grid_size() - num_cores = self.core_grid.x * self.core_grid.y # Generate the cos/sin matrices needed for ttnn.embedding op cos_matrix, sin_matrix = compute_gather_cos_sin( dhead=head_dim, end=max_seq_len * 2, theta=rope_theta, - position_ids=torch.arange(max_seq_len), - use_scaled_rope=use_scaled_rope, scale_factor=scale_factor, + orig_context_len=orig_context_len, + position_ids=torch.arange(max_seq_len), ) self.cos_matrix = ttnn.from_torch( @@ -73,7 +72,7 @@ def __init__( 1, 1, batch_size, - 1 + 1, # 1, 1, num_cores, 1 ) # Repeat across all cores on device trans_mat_mem_config = ttnn.create_sharded_memory_config( @@ -89,13 +88,15 @@ def __init__( layout=ttnn.TILE_LAYOUT, dtype=datatype, memory_config=trans_mat_mem_config, - mesh_mapper=ShardTensor2dMesh( - device, - dims=(None, 2) if (self.num_devices == 32 and batch_size > 1) else (None, None), - mesh_shape=list(device.shape), - ) - if self.is_mesh_device - else None, + mesh_mapper=( + ShardTensor2dMesh( + device, + dims=(None, 2) if (self.num_devices == 32 and batch_size > 1) else (None, None), + mesh_shape=list(device.shape), + ) + if self.is_mesh_device + else None + ), ) # TODO: Colman, should this be TILE_SIZE or head_dim? Why should it be different for prefill and decode? 
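The llama_common.py and llama_rope.py changes above make the scaling inputs explicit all the way down to precompute_freqs: passing scale_factor=None skips apply_scaling, and orig_context_len replaces the previously hardcoded original context length. A small self-contained sketch with illustrative values (the real ones come from the model config):

    from models.demos.llama3.tt.llama_common import precompute_freqs

    head_dim, max_seq_len = 128, 8192   # illustrative values only
    theta = 500000.0                    # rope_theta
    scale_factor = 8                    # or None to disable Llama-3.x scaling
    orig_context_len = 8192             # original training context length

    cos, sin = precompute_freqs(head_dim, max_seq_len * 2, theta, scale_factor, orig_context_len)
    # cos and sin have shape (2 * max_seq_len, head_dim // 2), ready for gather_cos_sin
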
diff --git a/models/demos/llama3/tt/lm_head.py b/models/demos/llama3/tt/lm_head.py index bd5cbe6ba8f..a79f8856e66 100644 --- a/models/demos/llama3/tt/lm_head.py +++ b/models/demos/llama3/tt/lm_head.py @@ -103,13 +103,15 @@ def __init__( ) if args.is_galaxy: self.program_configs = [ - None - if args.dim == 2048 - else args.dram_matmul_config( - args.tile_padded_batch_rows, # (8k, 128k) -> (2k, 16k) - args.dim // 4, - 16 * 1024, - args.lm_head_core_grid.num_cores, + ( + None + if args.dim == 2048 + else args.dram_matmul_config( + args.tile_padded_batch_rows, # (8k, 128k) -> (2k, 16k) + args.dim // 4, + 16 * 1024, + args.lm_head_core_grid.num_cores, + ) ) ] diff --git a/models/demos/llama3/tt/load_checkpoints.py b/models/demos/llama3/tt/load_checkpoints.py new file mode 100644 index 00000000000..7e330a2e18d --- /dev/null +++ b/models/demos/llama3/tt/load_checkpoints.py @@ -0,0 +1,303 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import os +import torch +from safetensors.torch import load_file as safetensors_load_file +from tqdm import tqdm +import json +from pathlib import Path +from loguru import logger + + +# TODO Update function for large models: For 1 layer tests we only want to load 1 checkpoint file, instead of all. +def load_hf_state_dict(ckpt_dir): + # First check if index file exists + index_path = os.path.join(ckpt_dir, "model.safetensors.index.json") + if os.path.exists(index_path): + # Multi-file case: Read the index file and load all referenced safetensor files + with open(index_path, "r") as f: + index_data = json.load(f) + + # Retrieve the weight file names from the index JSON + weight_map = index_data["weight_map"] + safetensor_files = set(weight_map.values()) + + # Read each safetensors file mentioned in the index + loaded_weights = {} + for file in safetensor_files: + safetensor_path = os.path.join(ckpt_dir, file) + weights = safetensors_load_file(safetensor_path) + loaded_weights.update(weights) # Merge weights into a single dictionary + else: + # Single-file case: Load the single model.safetensors file + safetensor_path = os.path.join(ckpt_dir, "model.safetensors") + if not os.path.exists(safetensor_path): + raise FileNotFoundError(f"Neither model.safetensors.index.json nor model.safetensors found in {ckpt_dir}") + loaded_weights = safetensors_load_file(safetensor_path) + + if not "lm_head.weight" in loaded_weights: + # Assume tied to the embeddings if not present + loaded_weights["lm_head.weight"] = loaded_weights["model.embed_tokens.weight"] + + return loaded_weights + + +def convert_hf_to_meta(state_dict, head_dim): + state_dict = convert_hf_qkv_to_meta_format(state_dict, head_dim) + state_dict = map_hf_to_meta_keys(state_dict) + return state_dict + + +def map_hf_to_meta_keys(loaded_weights): + hf_to_meta = { + # Top level mappings + "model.embed_tokens.weight": "tok_embeddings.weight", + "model.norm.weight": "norm.weight", + "lm_head.weight": "output.weight", + # Layer level mappings + "input_layernorm.weight": "attention_norm.weight", + "post_attention_layernorm.weight": "ffn_norm.weight", + # Attention module mappings + "self_attn.q_proj.weight": "attention.wq.weight", + "self_attn.k_proj.weight": "attention.wk.weight", + "self_attn.v_proj.weight": "attention.wv.weight", + "self_attn.o_proj.weight": "attention.wo.weight", + "self_attn.q_proj.bias": "attention.wq.bias", + "self_attn.k_proj.bias": "attention.wk.bias", + "self_attn.v_proj.bias": "attention.wv.bias", + # Feed forward module mappings + 
"mlp.gate_proj.weight": "feed_forward.w1.weight", + "mlp.up_proj.weight": "feed_forward.w3.weight", + "mlp.down_proj.weight": "feed_forward.w2.weight", + # Direct module mappings + "gate_proj.weight": "w1.weight", + "down_proj.weight": "w2.weight", + "up_proj.weight": "w3.weight", + "q_proj.weight": "wq.weight", + "k_proj.weight": "wk.weight", + "v_proj.weight": "wv.weight", + "o_proj.weight": "wo.weight", + "q_proj.bias": "wq.bias", + "k_proj.bias": "wk.bias", + "v_proj.bias": "wv.bias", + "weight": "emb.weight", # For host embeddings + # Full path layer mappings + "model.layers.{layer}.input_layernorm.weight": "layers.{layer}.attention_norm.weight", + "model.layers.{layer}.post_attention_layernorm.weight": "layers.{layer}.ffn_norm.weight", + "model.layers.{layer}.self_attn.q_proj.weight": "layers.{layer}.attention.wq.weight", + "model.layers.{layer}.self_attn.k_proj.weight": "layers.{layer}.attention.wk.weight", + "model.layers.{layer}.self_attn.v_proj.weight": "layers.{layer}.attention.wv.weight", + "model.layers.{layer}.self_attn.o_proj.weight": "layers.{layer}.attention.wo.weight", + "model.layers.{layer}.self_attn.q_proj.bias": "layers.{layer}.attention.wq.bias", + "model.layers.{layer}.self_attn.k_proj.bias": "layers.{layer}.attention.wk.bias", + "model.layers.{layer}.self_attn.v_proj.bias": "layers.{layer}.attention.wv.bias", + "model.layers.{layer}.mlp.gate_proj.weight": "layers.{layer}.feed_forward.w1.weight", + "model.layers.{layer}.mlp.up_proj.weight": "layers.{layer}.feed_forward.w3.weight", + "model.layers.{layer}.mlp.down_proj.weight": "layers.{layer}.feed_forward.w2.weight", + } + + meta_state_dict = {} + for key, tensor in loaded_weights.items(): + if key in hf_to_meta: + # Direct match for top-level keys + meta_state_dict[hf_to_meta[key]] = tensor + elif "model.layers." in key: + # Extract layer number and form a template key + parts = key.split(".") + layer_num = parts[2] # e.g. "0" in "model.layers.0.input_layernorm.weight" + template_key = "model.layers.{layer}." + ".".join(parts[3:]) + if template_key in hf_to_meta: + meta_state_dict[hf_to_meta[template_key].format(layer=layer_num)] = tensor + + return meta_state_dict + + +def load_meta_state_dict(ckpt_dir, n_layers=None, start_layer_idx=0): + checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) + assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}" + is_chunked = "layers_" in str(checkpoints[0]) + if is_chunked: + checkpoint = load_chunked_checkpoints(checkpoints, n_layers, start_layer_idx) + else: + checkpoint = load_sharded_checkpoints(checkpoints, n_layers) + + return checkpoint + + +def load_chunked_checkpoints(checkpoints, n_layers, start_layer_idx): + checkpoint = {} + + (f"Loading {len(checkpoints)} checkpoint files") + for ckpt in tqdm(checkpoints): + if n_layers: + # Layer range is in the file name, like layers_start-end.pth + layer_range = ckpt.stem.split("_")[1] + start_layer, end_layer = map(int, layer_range.split("-")) + if start_layer > n_layers + start_layer_idx: + continue + if end_layer < start_layer_idx: + continue + + loaded_ckpt = torch.load(ckpt, map_location="cpu") + checkpoint.update(loaded_ckpt) + return checkpoint + + +def load_sharded_checkpoints(checkpoints, n_layers): + checkpoint = {} + logger.info(f"Loading {len(checkpoints)} checkpoint files") + for ckpt in tqdm(checkpoints): + loaded_ckpt = torch.load(ckpt, map_location="cpu") + for ( + key, + value, + ) in loaded_ckpt.items(): + if "layers." 
in key: + layer_num = int(key.split("layers.")[1].split(".")[0]) + if n_layers and layer_num >= n_layers: + continue + if key in checkpoint: + checkpoint[key] += [value] + else: + checkpoint[key] = [value] + del loaded_ckpt + + # concat checkpoint values + for key, value in checkpoint.items(): + if len(value) == 1 or "norm" in key: + checkpoint[key] = value[0] + else: + if key == "tok_embeddings.weight" or key == "output.weight": + assert value[0].shape[1] == 8192 # FIXME: do we need this hardcoded shape? + # Concatenate along dimension 0 for llama3 token embeddings weight and lm head + checkpoint[key] = torch.cat(value, dim=0) + else: + # cat_dim is index of the smallest dimension in value[0].shape + cat_dim = torch.argmin(torch.tensor(value[0].shape)) + checkpoint[key] = torch.cat(value, dim=cat_dim) + + return checkpoint + + +def convert_hf_qkv_to_meta_format(loaded_weights, head_dim): + """Convert HuggingFace QKV weights to Meta format for RoPE compatibility.""" + converted_weights = {} + for key, tensor in loaded_weights.items(): + if "q_proj.weight" in key or "k_proj.weight" in key: + # For weights: n_heads = tensor.shape[0] // head_dim + n_heads = tensor.shape[0] // head_dim + converted_weights[key] = reverse_permute(tensor, n_heads, tensor.shape[0], tensor.shape[1]) + elif "q_proj.bias" in key or "k_proj.bias" in key: + # For biases: n_heads = tensor.shape[0] // head_dim + n_heads = tensor.shape[0] // head_dim + converted_weights[key] = reverse_permute(tensor, n_heads, tensor.shape[0], 1).squeeze(-1) + else: + # Keep all other weights unchanged + converted_weights[key] = tensor + return converted_weights + + +def convert_meta_to_hf(state_dict, head_dim): + state_dict = convert_meta_qkv_to_hf_format(state_dict, head_dim) + state_dict = map_meta_to_hf_keys(state_dict) + return state_dict + + +def map_meta_to_hf_keys(loaded_weights): + # Define mappings at each level of the hierarchy + meta_to_hf_mappings = { + # Top level + "tok_embeddings.weight": "model.embed_tokens.weight", + "norm.weight": "model.norm.weight", + "output.weight": "lm_head.weight", + # Layer level + "attention_norm.weight": "input_layernorm.weight", + "ffn_norm.weight": "post_attention_layernorm.weight", + # Attention module + "attention.wq.weight": "self_attn.q_proj.weight", + "attention.wk.weight": "self_attn.k_proj.weight", + "attention.wv.weight": "self_attn.v_proj.weight", + "attention.wo.weight": "self_attn.o_proj.weight", + "attention.wq.bias": "self_attn.q_proj.bias", + "attention.wk.bias": "self_attn.k_proj.bias", + "attention.wv.bias": "self_attn.v_proj.bias", + # Feed forward module + "feed_forward.w1.weight": "mlp.gate_proj.weight", + "feed_forward.w3.weight": "mlp.up_proj.weight", + "feed_forward.w2.weight": "mlp.down_proj.weight", + # Direct mappings for when we get just the final components + "w1.weight": "gate_proj.weight", + "w2.weight": "down_proj.weight", + "w3.weight": "up_proj.weight", + "wq.weight": "q_proj.weight", + "wk.weight": "k_proj.weight", + "wv.weight": "v_proj.weight", + "wo.weight": "o_proj.weight", + "wq.bias": "q_proj.bias", + "wk.bias": "k_proj.bias", + "wv.bias": "v_proj.bias", + # Host embeddings + "emb.weight": "weight", + } + + hf_state_dict = {} + for key, tensor in loaded_weights.items(): + # Handle full model paths with layer numbers + if "layers." 
in key: + parts = key.split(".") + layer_num = parts[1] + remainder = ".".join(parts[2:]) + if remainder in meta_to_hf_mappings: + new_key = f"model.layers.{layer_num}.{meta_to_hf_mappings[remainder]}" + hf_state_dict[new_key] = tensor + continue + + # Try exact matches first + if key in meta_to_hf_mappings: + hf_state_dict[meta_to_hf_mappings[key]] = tensor + continue + + # For submodule state dicts, try matching the end of the key + matched = False + for meta_pattern, hf_pattern in meta_to_hf_mappings.items(): + if key.endswith(meta_pattern): + # Replace only the matching part at the end + prefix = key[: -len(meta_pattern)] + new_key = prefix + hf_pattern + hf_state_dict[new_key] = tensor + matched = True + break + + # If no mapping found, keep the original key + if not matched: + hf_state_dict[key] = tensor + + return hf_state_dict + + +def convert_meta_qkv_to_hf_format(loaded_weights, head_dim): + """Convert Meta QKV weights back to HuggingFace format.""" + converted_weights = {} + for key, tensor in loaded_weights.items(): + if "wq.weight" in key or "wk.weight" in key: + # For weights: n_heads = tensor.shape[0] // head_dim + n_heads = tensor.shape[0] // head_dim + converted_weights[key] = permute(tensor, n_heads, tensor.shape[0], tensor.shape[1]) + elif "wq.bias" in key or "wk.bias" in key: + # For biases: n_heads = tensor.shape[0] // head_dim + n_heads = tensor.shape[0] // head_dim + converted_weights[key] = permute(tensor.unsqueeze(-1), n_heads, tensor.shape[0], 1).squeeze(-1) + else: + # Keep all other weights unchanged + converted_weights[key] = tensor + return converted_weights + + +def reverse_permute(tensor, n_heads, dim1, dim2): + return tensor.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2) + + +def permute(tensor, n_heads, dim1, dim2): + return tensor.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) diff --git a/models/demos/llama3/tt/model_config.py b/models/demos/llama3/tt/model_config.py index 0002654966a..6c91825dbbc 100644 --- a/models/demos/llama3/tt/model_config.py +++ b/models/demos/llama3/tt/model_config.py @@ -16,12 +16,22 @@ num_to_core_range_set, calculate_hidden_dim, get_out_subblock_w, + encode_prompt_llama_instruct, + encode_prompt_hf, + nearest_multiple, ) from typing import Tuple from models.utility_functions import nearest_32 from pathlib import Path -from tqdm import tqdm from dataclasses import dataclass +from enum import Enum, auto +from models.demos.llama3.tt.load_checkpoints import ( + load_meta_state_dict, + load_hf_state_dict, + convert_hf_to_meta, + convert_meta_to_hf, + reverse_permute, +) @dataclass @@ -35,9 +45,10 @@ class LlamaOptimizations: @classmethod def accuracy(cls, model_name): """Configuration optimized for accuracy - Only 3.1-70B uses bfp4 MLPs in this configuration + Only 70B models uses bfp4 MLPs in this configuration """ - return cls(bfp4_mlp=model_name == "3.1-70B") + bfp4 = model_name in ["Llama3.1-70B", "DeepSeek-R1-Distill-Llama-70B", "Qwen2.5-72B"] + return cls(bfp4_mlp=bfp4) @classmethod def performance(cls, model_name): @@ -47,6 +58,11 @@ def performance(cls, model_name): return cls(bfp4_mlp=True) +class CheckpointType(Enum): + Meta = auto() + HuggingFace = auto() + + class TtModelArgs: OP_KEYS = ( # Embedding @@ -92,7 +108,7 @@ def __init__( ): self.num_devices = mesh_device.get_num_devices() if mesh_device else 0 self.mesh_device = mesh_device - self.device_name = {0: "CPU", 1: "N150", 2: "N300", 8: "T3K", 32: "TG"}[self.num_devices] + self.device_name = {0: 
"CPU", 1: "N150", 2: "N300", 4: "N150x4", 8: "T3K", 32: "TG"}[self.num_devices] self.model_name = "Unknown" # Llama model name will be dependent on the checkpoint directory self.max_seq_len = max_seq_len self.max_batch_size = max_batch_size @@ -108,6 +124,7 @@ def __init__( self.DEFAULT_CKPT_DIR = LLAMA_DIR self.DEFAULT_TOKENIZER_PATH = LLAMA_DIR self.DEFAULT_CACHE_PATH = os.path.join(LLAMA_DIR, self.device_name) + self.model_name = os.path.basename(LLAMA_DIR) # May be overridden by config else: assert "Please set $LLAMA_DIR to a valid checkpoint directory" @@ -116,14 +133,7 @@ def __init__( assert os.path.exists( self.DEFAULT_CKPT_DIR ), f"Checkpoint directory {self.DEFAULT_CKPT_DIR} does not exist, please set LLAMA_DIR=... or LLAMA_CKPT_DIR=..." - assert os.path.isfile( - self.DEFAULT_TOKENIZER_PATH + "/tokenizer.model" - ), f"Tokenizer file {self.DEFAULT_TOKENIZER_PATH + '/tokenizer.model'} does not exist, please set LLAMA_TOKENIZER_PATH=..." - if not os.path.exists(self.DEFAULT_CACHE_PATH): - os.makedirs(self.DEFAULT_CACHE_PATH) - assert os.path.exists( - self.DEFAULT_CACHE_PATH - ), f"Cache directory {self.DEFAULT_CACHE_PATH} does not exist, please set LLAMA_CACHE_PATH=..." + os.makedirs(self.DEFAULT_CACHE_PATH, exist_ok=True) # Check if weights exist in the specified folder. If not warn the user to run the download and untar script. # assert os.path.isfile( # self.DEFAULT_CKPT_DIR + "/consolidated.00.pth" @@ -133,57 +143,6 @@ def __init__( logger.info(f"Tokenizer file: {self.DEFAULT_TOKENIZER_PATH + '/tokenizer.model'}") logger.info(f"Cache directory: {self.DEFAULT_CACHE_PATH}") - # Set the model name based on the checkpoint directory being loaded - if "3.2-1B" in LLAMA_DIR: - local_params = "LLAMA3_2_1B_PARAMS" - self.model_name = "3.2-1B" - self.rope_scaling_factor = 32 - elif "3.2-3B" in LLAMA_DIR: - local_params = "LLAMA3_2_3B_PARAMS" - self.model_name = "3.2-3B" - self.rope_scaling_factor = 32 - elif "3.1-8B" in LLAMA_DIR: - local_params = "LLAMA3_1_8B_PARAMS" - self.model_name = "3.1-8B" - self.rope_scaling_factor = 8 - elif "3.2-11B" in LLAMA_DIR: - local_params = "LLAMA3_2_11B_PARAMS" - self.model_name = "3.2-11B" - self.rope_scaling_factor = 8 # shared with 3.1-8B - elif "3.1-70B" in LLAMA_DIR: - local_params = "LLAMA3_1_70B_PARAMS" - self.model_name = "3.1-70B" - self.rope_scaling_factor = 8 - self.is_70b = True # self.dim == 8192 and self.n_layers == 80 - else: - # NOTE: 3.2-90B and 3.3-70B also use scaling factor of 8 - raise ValueError(f"Unsupported LLAMA model: {LLAMA_DIR}") - - # Set the max number of tokens for each prefill chunk based on the model and device - MAX_PREFILL_CHUNK_SIZES_DIV1024 = { - "3.2-1B": {"N150": 128, "N300": 128, "T3K": 128, "TG": 128}, - "3.2-3B": {"N150": 8, "N300": 128, "T3K": 128, "TG": 128}, - "3.1-8B": {"N150": 4, "N300": 64, "T3K": 128, "TG": 128}, - "3.2-11B": {"N150": 4, "N300": 64, "T3K": 128, "TG": 128}, - "3.1-70B": {"N150": None, "N300": None, "T3K": 32, "TG": 128}, - } - max_prefill_chunk_size_div1024 = MAX_PREFILL_CHUNK_SIZES_DIV1024[self.model_name][self.device_name] - assert ( - max_prefill_chunk_size_div1024 is not None - ), f"Unsupported model {self.model_name} on device {self.device_name}" - self.max_prefill_chunk_size = max_prefill_chunk_size_div1024 * 1024 - - if callable(optimizations): - self.optimizations = optimizations(self.model_name) - else: - self.optimizations = optimizations - - # Load model params - if not dummy_weights: - self._set_llama_params(self.DEFAULT_CKPT_DIR) - else: # With Dummy weights, set the params 
from the local copy inside the model folder. This is required for CI pipeline that doesn't mount the external folders. - self._set_llama_params(self.LOCAL_LLAMA_PARAMS[local_params]) - # Some consumers like SentencePiece only accept str not Path for files self.model_base_path = Path(self.DEFAULT_CKPT_DIR) self.model_cache_path = Path(self.DEFAULT_CACHE_PATH) @@ -196,6 +155,58 @@ def __init__( # If the weights file contain the keyword `instruct` also set self.instruct to true if "instruct" in self.DEFAULT_CACHE_PATH.lower(): self.instruct = True + + # Load model params + if not dummy_weights: + self.checkpoint_type = self.detect_checkpoint_type() + self._set_model_params(self.DEFAULT_CKPT_DIR) + else: # With Dummy weights, set the params from the local copy inside the model folder. This is required for CI pipeline that doesn't mount the external folders. + self.checkpoint_type = CheckpointType.Meta + if "3.2-1B" in self.DEFAULT_CKPT_DIR: + local_params = "LLAMA3_2_1B_PARAMS" + elif "3.2-3B" in self.DEFAULT_CKPT_DIR: + local_params = "LLAMA3_2_3B_PARAMS" + elif "3.1-8B" in self.DEFAULT_CKPT_DIR: + local_params = "LLAMA3_1_8B_PARAMS" + elif "3.2-11B" in self.DEFAULT_CKPT_DIR: + local_params = "LLAMA3_2_11B_PARAMS" + elif "3.1-70B" in self.DEFAULT_CKPT_DIR: + local_params = "LLAMA3_1_70B_PARAMS" + else: + raise ValueError( + f"No local params found for {self.DEFAULT_CKPT_DIR}, dummy weights are not supported for this model" + ) + self._set_model_params(self.LOCAL_LLAMA_PARAMS[local_params]) + + # Set the max number of tokens for each prefill chunk based on the model and device + max_prefill_chunk_size_div1024 = os.getenv("MAX_PREFILL_CHUNK_SIZE") + if max_prefill_chunk_size_div1024 is None: + MAX_PREFILL_CHUNK_SIZES_DIV1024 = { + "Llama3.2-1B": {"N150": 128, "N300": 128, "T3K": 128, "TG": 128}, + "Llama3.2-3B": {"N150": 8, "N300": 128, "T3K": 128, "TG": 128}, + "Llama3.1-8B": {"N150": 4, "N300": 64, "T3K": 128, "TG": 128}, + "Llama3.2-11B": {"N150": 4, "N300": 64, "T3K": 128, "TG": 128}, + "Llama3.1-70B": {"N150": None, "N300": None, "T3K": 32, "TG": 128}, + "DeepSeek-R1-Distill-Llama-70B": {"N150": None, "N300": None, "T3K": 32, "TG": 128}, + "Qwen2.5-7B": {"N150": 4, "N300": 64, "T3K": 128, "TG": 128}, + "Qwen2.5-72B": {"N150": None, "N300": None, "T3K": 32, "TG": 128}, + } + try: + max_prefill_chunk_size_div1024 = MAX_PREFILL_CHUNK_SIZES_DIV1024[self.base_model_name][self.device_name] + except KeyError: + raise ValueError( + f"Unknown model {self.model_name} on device {self.device_name}, try setting MAX_PREFILL_CHUNK_SIZE between 4 (compatible) and 128 (faster)" + ) + assert ( + max_prefill_chunk_size_div1024 is not None + ), f"Unsupported model {self.model_name} on device {self.device_name}" + self.max_prefill_chunk_size = max_prefill_chunk_size_div1024 * 1024 + + if callable(optimizations): + self.optimizations = optimizations(self.model_name) + else: + self.optimizations = optimizations + self.dummy_weights = dummy_weights self.tile_padded_batch_rows = self.tile_size * int(math.ceil(self.max_batch_size / self.tile_size)) @@ -215,10 +226,12 @@ def __init__( self.model_config.update({f"{key}_TILE": ttnn.TILE_LAYOUT for key in self.OP_KEYS if "LAYOUT" in key}) self.cos, self.sin = precompute_freqs( - self.head_dim, self.max_seq_len * 2, self.rope_theta, self.use_scaled_rope, self.rope_scaling_factor + self.head_dim, self.max_seq_len * 2, self.rope_theta, self.rope_scaling_factor, self.orig_context_len ) # for prefill self.rot_emb = freqs_to_rotation_matrix(self.cos, self.sin) # for decode + 
self.tokenizer = None if dummy_weights else self.create_tokenizer() + device = mesh_device.get_devices()[0] if mesh_device is not None else None self.cluster_shape = list(mesh_device.shape) self.is_galaxy = self.num_devices == 32 @@ -350,45 +363,61 @@ def find_largest_divisor(n, max_divisor=8): else: self.model_config["ATTN_ALL_GATHER_MATMUL_PROGCFG"] = None + prefill_rows = lambda seq_len: min(seq_len, 1024) // self.tile_size + mlp1_3_grid = lambda seq_len: ( + (8, min(min(seq_len, 1024) // 32, 4)) + if self.is_galaxy + else self.find_prefill_grid(prefill_rows(seq_len), self.dim // self.tile_size) + ) + mlp2_grid = lambda seq_len: ( + (8, min(min(seq_len, 1024) // 32, 4)) + if self.is_galaxy + else self.find_prefill_grid(prefill_rows(seq_len), self.hidden_dim // self.tile_size) + ) + self.model_config["PREFILL_MLP_W1_W3_PRG_CONFIG"] = lambda seq_len: self.matmul_config( m=min(seq_len, 1024), k=self.dim // self.cluster_shape[0], n=self.hidden_dim // self.cluster_shape[1], - grid_size=(8, min(min(seq_len, 1024) // 32, 4)) - if self.is_galaxy - else ((8, 8) if seq_len >= 1024 else (8, 4)), + grid_size=mlp1_3_grid(seq_len), ) self.model_config["PREFILL_MLP_W2_PRG_CONFIG"] = lambda seq_len: self.matmul_config( m=min(seq_len, 1024), k=self.hidden_dim // (self.cluster_shape[1] if self.is_galaxy else 1), n=self.dim, - grid_size=(8, min(min(seq_len, 1024) // 32, 4)) - if self.is_galaxy - else ((8, 8) if seq_len >= 1024 else (8, 4)), + grid_size=mlp2_grid(seq_len), ) + k_dim = self.dim // self.cluster_shape[0] if self.is_galaxy else self.dim + n_dim = self.dim // self.cluster_shape[1] if self.is_galaxy else self.dim + num_rows = lambda seq_len: min(seq_len, 1024 if self.is_galaxy else 2048) self.model_config["WO_PREFILL_PROGCFG"] = lambda seq_len: self.matmul_config( - m=min(seq_len, 1024 if self.is_galaxy else 2048), - k=self.dim // self.cluster_shape[0] if self.is_galaxy else self.dim, - n=self.dim // self.cluster_shape[1] if self.is_galaxy else self.dim, - grid_size=(8, 8), + m=num_rows(seq_len), + k=k_dim, + n=n_dim, + grid_size=self.find_prefill_grid(num_rows(seq_len), n_dim // self.tile_size), in0_block_w=1, fuse_batch=seq_len <= 1024, # if self.is_galaxy else 2048), ) - # Calculate largest number of lm_head_num_rows such that self.dim % (lm_head_num_rows * 8) == 0 + # Calculate largest number of lm_head_num_rows such that self.dim % (lm_head_num_rows * lm_head_cores_per_row) == 0 if self.num_devices == 32: lm_head_num_rows = 4 while self.dim % (32 * 32 * lm_head_num_rows) != 0: lm_head_num_rows -= 1 else: lm_head_num_rows = 8 - while self.dim % (32 * lm_head_num_rows * 8) != 0: - lm_head_num_rows -= 1 - assert ( - lm_head_num_rows > 0 - ), f"Could not find a lm_head_num_rows such that self.dim(={self.dim}) % (lm_head_num_rows * 4) == 0" - self.lm_head_core_grid = ttnn.CoreGrid(y=lm_head_num_rows, x=8) + lm_head_cores_per_row = 8 + while self.dim % (32 * lm_head_num_rows * lm_head_cores_per_row) != 0: + lm_head_num_rows -= 1 + if lm_head_num_rows == 0: + lm_head_cores_per_row -= 1 + if lm_head_cores_per_row == 0: + raise ValueError( + f"Could not find a lm_head_num_rows such that self.dim(={self.dim}) % (lm_head_num_rows * 8) == 0" + ) + lm_head_num_rows = 8 + self.lm_head_core_grid = ttnn.CoreGrid(y=lm_head_num_rows, x=lm_head_cores_per_row) self.model_config["LM_HEAD_INPUT_MEMCFG"] = ttnn.create_sharded_memory_config( ( @@ -455,7 +484,6 @@ def find_largest_divisor(n, max_divisor=8): grid_by_batch = (1, 1) else: raise ValueError(f"Batch size {self.max_batch_size} not supported") - 
core_grid_by_batch = ttnn.CoreGrid(y=grid_by_batch[1], x=grid_by_batch[0]) core_range_set_by_batch = ttnn.CoreRangeSet( { ttnn.CoreRange( @@ -610,41 +638,42 @@ def find_largest_divisor(n, max_divisor=8): else self.model_config["FULL_GRID_MEMCFG"] ) - self.model_config["FF1_3_TG_PROGCFG"] = self.matmul_1d_config_from_tensor_shapes( - ( - 1, - 1, - 32, - self.dim // 4, - ), - ( - 1, - 1, - self.dim // 4, - self.hidden_dim // 8, - ), - grid=ttnn.CoreGrid(x=8, y=2), - overwrite_subblock_h=1, - overwrite_subblock_w=1, - ) + if self.is_galaxy: + self.model_config["FF1_3_TG_PROGCFG"] = self.matmul_1d_config_from_tensor_shapes( + ( + 1, + 1, + 32, + self.dim // 4, + ), + ( + 1, + 1, + self.dim // 4, + self.hidden_dim // 8, + ), + grid=ttnn.CoreGrid(x=8, y=2), + overwrite_subblock_h=1, + overwrite_subblock_w=1, + ) - self.model_config["FF2_TG_PROGCFG"] = self.matmul_1d_config_from_tensor_shapes( - ( - 1, - 1, - 32, - self.hidden_dim // 8, - ), - ( - 1, - 1, - self.hidden_dim // 8, - self.dim // 4, - ), - grid=ttnn.CoreGrid(x=8, y=2), - overwrite_subblock_h=1, - overwrite_subblock_w=1, - ) + self.model_config["FF2_TG_PROGCFG"] = self.matmul_1d_config_from_tensor_shapes( + ( + 1, + 1, + 32, + self.hidden_dim // 8, + ), + ( + 1, + 1, + self.hidden_dim // 8, + self.dim // 4, + ), + grid=ttnn.CoreGrid(x=8, y=2), + overwrite_subblock_h=1, + overwrite_subblock_w=1, + ) self.model_config["FF1_OUT_REDUCE_SCATTER_MEMCFG"] = ttnn.create_sharded_memory_config( shape=(32, self.hidden_dim // 28 // 8), # shard_grid_cores = 28, num_devices=8 @@ -815,6 +844,7 @@ def _get_xattn_kv_prefill_mem_cfg(seq_len): self.model_config["XATTN_KV_PREFILL_MEM_CFG"] = _get_xattn_kv_prefill_mem_cfg self.VISION_MAX_MM_SEQ = nearest_32(self.vision_chunk_ntok) + # RMS NORM self.model_config["SHARDED_NORM_ATTN_PRGM_CFG"] = self.create_sharded_norm_config(attn_input_grid) self.model_config["SHARDED_NORM_MLP_PRGM_CFG"] = self.create_sharded_norm_config(mlp_core_grid) @@ -835,7 +865,7 @@ def _get_xattn_kv_prefill_mem_cfg(seq_len): ), ) - self.model_config = set_tg_attention_config(self.model_config, self.dim) + self.set_tg_attention_config() self.is_multichip = self.num_devices > 1 self.num_reduce_scatter_links = 1 @@ -844,12 +874,20 @@ def _get_xattn_kv_prefill_mem_cfg(seq_len): ) # TODO: try out 3 for short axis and 4 for long axis (TG only) <- should work but untested in model self.ccl_dtype = ttnn.bfloat8_b + logger.info(f"Attention grid: {attn_input_grid}") + logger.info(f"MLP grid: {mlp_core_grid}") + logger.info(f"MLP prefill grids @ 32: w1/w3: {mlp1_3_grid(32)}, w2: {mlp2_grid(32)}") + logger.info( + f"MLP prefill grids @ max_seq_len({self.max_seq_len}): w1/w3: {mlp1_3_grid(self.max_seq_len)}, w2: {mlp2_grid(self.max_seq_len)}" + ) + logger.info(f"LM head grid: {self.lm_head_core_grid}") + def is_distributed_norm(self, mode): if not self.is_multichip: return False if all([dim > 1 for dim in list(self.mesh_device.shape)]): # 2D grid return True - elif self.dim >= 8192 and mode == "prefill": # Somewhere between 4k and 8k WH runs out of L1 if not distributed + elif self.dim > 4096 and mode == "prefill": # Somewhere between 4k and 8k WH runs out of L1 if not distributed return True return False @@ -932,23 +970,72 @@ def prepare_residual_tensor_prefill(self, x_bsh, force_replicated=False): ) return xs_1BSH - def _set_llama_params_from_dict(self, params): - # Text params - self.dim = params["dim"] - self.ffn_dim_multiplier = params["ffn_dim_multiplier"] - self.multiple_of = params["multiple_of"] - self.n_heads = params["n_heads"] - 
self.n_kv_heads = params["n_kv_heads"] - self.n_layers = params["n_layers"] - self.norm_eps = params["norm_eps"] - self.rope_theta = params["rope_theta"] - self.use_scaled_rope = params["use_scaled_rope"] + def _set_params_from_dict(self, params): + # Common params with different names between Meta and HF + self.dim = params.get("dim", params.get("hidden_size")) + self.n_heads = params.get("n_heads", params.get("num_attention_heads")) + self.n_kv_heads = params.get("n_kv_heads", params.get("num_key_value_heads")) + self.n_layers = params.get("n_layers", params.get("num_hidden_layers")) + self.full_model_n_layers = self.n_layers + self.norm_eps = params.get("norm_eps", params.get("rms_norm_eps")) self.vocab_size = params["vocab_size"] self.padded_vocab_size = 128 * 1024 self.head_dim = self.dim // self.n_heads - self.hidden_dim = calculate_hidden_dim(self.dim, self.ffn_dim_multiplier, self.multiple_of) - # Vision params + # Handle different MLP dimension specifications + if "intermediate_size" in params: + self.hidden_dim = params["intermediate_size"] + self.ffn_dim_multiplier = None + self.multiple_of = None + else: + self.ffn_dim_multiplier = params["ffn_dim_multiplier"] + self.multiple_of = params["multiple_of"] + self.hidden_dim = calculate_hidden_dim(self.dim, self.ffn_dim_multiplier, self.multiple_of) + + if "_name_or_path" in params: + self.model_name = os.path.basename(params["_name_or_path"]) + + if self.base_model_name == "Qwen2.5-7B" and self.num_devices not in [0, 2, 4]: + raise AssertionError( + "Qwen2.5-7B is only supported on 2 or 4 devices, run on an N300 or use FAKE_DEVICE=N150x4" + ) + + self.unpadded_hidden_dim = self.hidden_dim + # Don't need to pad for CPU runs + if self.num_devices: + # Default padding cores for each model, 0 if not set here + default_padded_cores = { + "Qwen2.5-72B": 32, + "Qwen2.5-7B": 16, + }.get(self.base_model_name, 0) + + # Override MLP padding cores from env var + mlp_padded_cores = int(os.environ.get("PAD_MLP_CORES", default_padded_cores)) + + # Only pad if MLP_PADDED_CORES is non-zero + if mlp_padded_cores > 0: + padded_hidden_dim = nearest_multiple( + self.hidden_dim, mlp_padded_cores * self.tile_size * self.num_devices + ) + if padded_hidden_dim != self.hidden_dim: + logger.info( + f"PAD_MLP_CORES={mlp_padded_cores}, padding hidden dim from {self.hidden_dim} to {padded_hidden_dim}" + ) + self.hidden_dim = padded_hidden_dim + + # RoPE params + self.rope_theta = params.get("rope_theta") + # If use_scaled_rope is not present, assume setting rope_scaling means use scaled rope + # If it is present and is set to false, do not use scaled rope + # Setting self.rope_scaling_factor to None is our way of saying do not use scaled rope + if "rope_scaling" in params and params.get("use_scaled_rope", True): + self.rope_scaling_factor = params.get("factor", None) + self.orig_context_len = params.get("original_max_position_embeddings", None) + else: + self.rope_scaling_factor = None + self.orig_context_len = None + + # Vision params (Meta-specific) self.vision_chunk_size = params.get("vision_chunk_size", -1) self.vision_max_num_chunks = params.get("vision_max_num_chunks", 4) self.vision_num_cross_attention_layers = params.get("vision_num_cross_attention_layers", -1) @@ -967,6 +1054,14 @@ def _set_llama_params_from_dict(self, params): self.vision_patch_size = 14 self.vision_in_channels = 3 + @property + def use_scaled_rope(self): + return self.rope_scaling_factor is not None + + @property + def base_model_name(self): + return self.model_name.split("B-")[0] + 
"B" if "B-" in self.model_name else self.model_name + @property def vision_chunk_ntok(self): """ @@ -974,12 +1069,50 @@ def vision_chunk_ntok(self): """ return (self.vision_chunk_size // self.vision_patch_size) ** 2 + 1 + def _set_model_params(self, checkpoint_dir): + if self.checkpoint_type == CheckpointType.Meta: + self._set_llama_params(checkpoint_dir) + elif self.checkpoint_type == CheckpointType.HuggingFace: + self._set_hf_params(checkpoint_dir) + else: + raise ValueError(f"Unsupported checkpoint type: {self.checkpoint_type}") + def _set_llama_params(self, checkpoint_dir): params_file = os.path.join(checkpoint_dir, "params.json") assert os.path.exists(params_file), f"params.json file not found at {params_file}" with open(params_file, "r") as f: params = json.load(f) - self._set_llama_params_from_dict(params) + self._set_params_from_dict(params) + + # Meta-style config dicts don't specity model name or rope_scaling_factor so hard-code these + # Set the model name based on the checkpoint directory being loaded + # FIXME: add a llama prefix to all llama-specific models and names + if "3.2-1B" in checkpoint_dir: + self.model_name = "Llama3.2-1B" + "-Instruct" if self.instruct else "" + self.rope_scaling_factor = 32 + elif "3.2-3B" in checkpoint_dir: + self.model_name = "Llama3.2-3B" + "-Instruct" if self.instruct else "" + self.rope_scaling_factor = 32 + elif "3.1-8B" in checkpoint_dir: + self.model_name = "Llama3.1-8B" + "-Instruct" if self.instruct else "" + self.rope_scaling_factor = 8 + elif "3.2-11B" in checkpoint_dir: + self.model_name = "Llama3.2-11B" + "-Instruct" if self.instruct else "" + self.rope_scaling_factor = 8 # shared with 3.1-8B + elif "3.1-70B" in checkpoint_dir: + self.model_name = "Llama3.1-70B" + "-Instruct" if self.instruct else "" + self.rope_scaling_factor = 8 + self.is_70b = True # self.dim == 8192 and self.n_layers == 80 + else: + logger.warning(f"Unknown Meta-style model: {checkpoint_dir}") + self.orig_context_len = 8192 + + def _set_hf_params(self, checkpoint_dir): + config_file = os.path.join(checkpoint_dir, "config.json") + assert os.path.exists(config_file), f"config.json file not found at {config_file}" + with open(config_file, "r") as f: + config = json.load(f) + self._set_params_from_dict(config) def __repr__(self): return f"""ModelArgs( @@ -992,7 +1125,7 @@ def __repr__(self): ffn_dim_multiplier={self.ffn_dim_multiplier}, norm_eps={self.norm_eps}, rope_theta={self.rope_theta}, - use_scaled_rope={self.use_scaled_rope}, + rope_scaling_factor={self.rope_scaling_factor}, max_batch_size={self.max_batch_size}, max_seq_len={self.max_seq_len}, vision_chunk_size={self.vision_chunk_size}, @@ -1031,19 +1164,19 @@ def get_model_config(self): # TODO Update function for large models: For 1 layer tests we only want to load 1 checkpoint file, instead of all. 
def load_state_dict(self): - """Generate or load state_dict for n_layers of the model""" if self.dummy_weights: reference_model = Transformer(self) state_dict = reference_model.state_dict() state_dict_prefix = self.get_state_dict_prefix("", None) state_dict = {f"{state_dict_prefix}{k}": torch.randn_like(v) for k, v in state_dict.items()} + elif self.checkpoint_type == CheckpointType.Meta: + state_dict = load_meta_state_dict(self.DEFAULT_CKPT_DIR, self.n_layers) else: - state_dict = load_llama_state_dict(self.DEFAULT_CKPT_DIR, self.n_layers) - + assert self.checkpoint_type == CheckpointType.HuggingFace + state_dict = load_hf_state_dict(self.DEFAULT_CKPT_DIR) + state_dict = convert_hf_to_meta(state_dict, self.head_dim) keys_dict = list(state_dict.keys())[:] - remv = [ - f"layers.{i}." for i in list(range(self.n_layers, 32)) - ] # TODO, this is not generalized to all models. it assumes max layers = 32 + remv = [f"layers.{i}." for i in list(range(self.n_layers, self.full_model_n_layers))] for k in keys_dict: if any([r in k for r in remv]): state_dict.pop(k) @@ -1068,7 +1201,7 @@ def matmul_config( in0_block_w: int = None, fuse_batch: bool = False, fused_activation=None, - ) -> ttnn.MatmulMultiCoreReuseMultiCastProgramConfig: + ): per_core_M = math.ceil(m / (self.tile_size * grid_size[1])) per_core_N = math.ceil(n / (self.tile_size * grid_size[0])) @@ -1134,6 +1267,31 @@ def find_grid(self, N): f"Cannot find a grid configuration for {N} tiles that evenly divides into {max_cores} cores of max size {max_rows}x{max_cols}." ) + def find_prefill_grid(self, row_tiles, col_tiles): + """Find a grid such that the number of row tiles evenly divides into the number + of rows and the number of column tiles evenly divides into the number of columns + """ + max_rows = 8 + max_cols = 8 + + # Find number of cols that evenly divides into the number of columns + cols = None + rows = None + + for i in range(max_cols, 0, -1): + if col_tiles % i == 0: + cols = i + break + + for i in range(max_rows, 0, -1): + if row_tiles % i == 0: + rows = i + break + + assert cols is not None, f"Cannot find a number of columns that evenly divides into {col_tiles}, not even 1(!)." + assert rows is not None, f"Cannot find a number of rows that evenly divides into {row_tiles}, not even 1(!)." + return rows, cols + def dram_shard_core_grid_for_k_and_n(self, k: int, n: int) -> Tuple[int, int]: rows, cols = self.find_grid_k_n(k // self.tile_size, n // self.tile_size) return ttnn.CoreGrid(x=cols, y=rows) @@ -1143,7 +1301,6 @@ def find_grid_k_n(self, K, N): Find the number of rows and columns for a grid of cores such that the total number of tiles N can be evenly divided among the cores. Each core will have the same integer number of tiles. - The grid size is limited to a maximum of 2 rows and 8 columns. Parameters: N (int): Total number of tiles to be distributed. @@ -1154,9 +1311,9 @@ def find_grid_k_n(self, K, N): Raises: AssertionError: If it's not possible to find such a grid configuration. 
""" - max_rows = 4 + max_rows = 8 max_cols = 8 # Maximum number of rows or columns - max_cores = max_rows * max_cols # Maximum number of cores (8x2 grid) + max_cores = max_rows * max_cols # Maximum number of cores # Find all possible numbers of cores that divide N and are less than or equal to max_cores possible_cores = [c for c in range(1, max_cores + 1) if K % c == 0 and N % c == 0] @@ -1175,12 +1332,10 @@ def find_grid_k_n(self, K, N): f"Cannot find a grid configuration such that both {K} and {N} tiles evenly divide into cores of max size {max_rows}x{max_cols}." ) - def dram_matmul_config( - self, m: int, k: int, n: int, num_cores=None - ) -> ttnn.MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig: + def dram_matmul_config(self, m: int, k: int, n: int, num_cores=None): # in0_block_w must evenly divide k and be no larger than tile_size * num_cores if num_cores is None: - # num_cores = self.dram_shard_core_grid_for_k_and_n(k).num_cores + # num_cores = self.dram_shard_core_grid_for_k(k).num_cores num_cores = self.dram_shard_core_grid_for_k_and_n(k, n).num_cores assert ( k % (self.tile_size * num_cores) == 0 @@ -1302,72 +1457,352 @@ def create_sharded_norm_config(self, grid): inplace=False, ) + def detect_checkpoint_type(self) -> CheckpointType: + """Detect if checkpoint directory contains Meta or HuggingFace format weights. + + Returns: + CheckpointType: Meta or HuggingFace enum value + + Raises: + ValueError: If neither Meta nor HuggingFace checkpoint format is detected + """ + config_path = os.path.join(self.DEFAULT_CKPT_DIR, "config.json") + params_path = os.path.join(self.DEFAULT_CKPT_DIR, "params.json") + + if os.path.exists(config_path): + with open(config_path) as f: + config = json.load(f) + if "transformers_version" in config: + return CheckpointType.HuggingFace + + if os.path.exists(params_path): + return CheckpointType.Meta + + raise ValueError( + f"Could not detect Meta or HuggingFace checkpoint format in {self.DEFAULT_CKPT_DIR}. " + "Directory should contain either config.json (HuggingFace) or params.json (Meta)." 
+ ) + + def create_tokenizer(self): + """Create and return a Tokenizer instance based on the checkpoint type.""" + if self.checkpoint_type == CheckpointType.Meta: + # Use the Meta Tokenizer + from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer + + return Tokenizer(self.tokenizer_path) + else: + # Create a HuggingFace AutoTokenizer + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(self.DEFAULT_TOKENIZER_PATH) -def load_llama_state_dict(ckpt_dir, n_layers=None, start_layer_idx=0): - checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) - assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}" - is_chunked = "layers_" in str(checkpoints[0]) - if is_chunked: - checkpoint = load_chunked_checkpoints(checkpoints, n_layers, start_layer_idx) - else: - checkpoint = load_sharded_checkpoints(checkpoints, n_layers) - - return checkpoint - - -def load_chunked_checkpoints(checkpoints, n_layers, start_layer_idx): - checkpoint = {} - - (f"Loading {len(checkpoints)} checkpoint files") - for ckpt in tqdm(checkpoints): - if n_layers: - # Layer range is in the file name, like layers_start-end.pth - layer_range = ckpt.stem.split("_")[1] - start_layer, end_layer = map(int, layer_range.split("-")) - if start_layer > n_layers + start_layer_idx: - continue - if end_layer < start_layer_idx: - continue - - loaded_ckpt = torch.load(ckpt, map_location="cpu") - checkpoint.update(loaded_ckpt) - return checkpoint - - -def load_sharded_checkpoints(checkpoints, n_layers): - checkpoint = {} - logger.info(f"Loading {len(checkpoints)} checkpoint files") - for ckpt in tqdm(checkpoints): - loaded_ckpt = torch.load(ckpt, map_location="cpu") - for ( - key, - value, - ) in loaded_ckpt.items(): - if "layers." 
in key: - layer_num = int(key.split("layers.")[1].split(".")[0]) - if n_layers and layer_num >= n_layers: - continue - if key in checkpoint: - checkpoint[key] += [value] + # Add meta-compatible stop token list to the HF tokenizer + if not "stop_tokens" in tokenizer.__dict__: + tokenizer.stop_tokens = [tokenizer.eos_token_id] + return tokenizer + + def encode_prompt(self, prompt_text, system_prompt_text=None, instruct=True): + if self.checkpoint_type == CheckpointType.Meta: + if instruct: + return encode_prompt_llama_instruct(self.tokenizer, prompt_text, system_prompt_text) + else: + return self.tokenizer.encode(prompt_text, bos=True, eos=False) + else: + if instruct: + return encode_prompt_hf(self.tokenizer, prompt_text, system_prompt_text) + else: + return self.tokenizer.encode(prompt_text, add_special_tokens=False) + + def reference_lm_head(self): + if self.checkpoint_type == CheckpointType.Meta: + from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import ColumnParallelLinear + + return ColumnParallelLinear(self.dim, self.vocab_size, bias=False, init_method=lambda x: x) + else: + model = self.reference_transformer(wrap=False) + layer = model.lm_head + layer._load_state_dict = layer.load_state_dict + layer.load_state_dict = lambda x: layer._load_state_dict(convert_meta_to_hf(x, self.head_dim)) + return layer + + def reference_transformer(self, wrap=True, load_checkpoint=False): + if self.checkpoint_type == CheckpointType.Meta: + from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import Transformer + + model = Transformer(self) + if load_checkpoint: + model.load_state_dict(self.load_state_dict()) + return model + else: + from transformers import AutoConfig, AutoModelForCausalLM + + if not load_checkpoint: + config = AutoConfig.from_pretrained(self.DEFAULT_CKPT_DIR) + config.num_layers = self.n_layers + model = AutoModelForCausalLM.from_config(config) else: - checkpoint[key] = [value] - del loaded_ckpt + model = AutoModelForCausalLM.from_pretrained(self.DEFAULT_CKPT_DIR) + if wrap: + wrapper = HfModelWrapper(model, self.head_dim) + return wrapper + else: + return model + + def reference_rms_norm(self): + if self.checkpoint_type == CheckpointType.Meta: + from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import RMSNorm + + return RMSNorm(self.dim, self.norm_eps) + else: + model = self.reference_transformer(wrap=False) + layer = model.model.norm + layer._load_state_dict = layer.load_state_dict + layer.load_state_dict = lambda x: layer._load_state_dict(convert_meta_to_hf(x, self.head_dim)) + return layer + + def reference_mlp(self): + if self.checkpoint_type == CheckpointType.Meta: + from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import FeedForward - # concat checkpoint values - for key, value in checkpoint.items(): - if len(value) == 1 or "norm" in key: - checkpoint[key] = value[0] + return FeedForward(self.dim, 4 * self.dim, self.multiple_of, self.ffn_dim_multiplier) else: - if key == "tok_embeddings.weight" or key == "output.weight": - assert value[0].shape[1] == 8192 # FIXME: do we need this hardcoded shape? 
- # Concatenate along dimension 0 for llama3 token embeddings weight and lm head - checkpoint[key] = torch.cat(value, dim=0) + model = self.reference_transformer(wrap=False) + layer = model.model.layers[0].mlp + layer._load_state_dict = layer.load_state_dict + layer.load_state_dict = lambda x: layer._load_state_dict(convert_meta_to_hf(x, self.head_dim)) + return layer + + def reference_embedding(self, reference_model=None): + if self.checkpoint_type == CheckpointType.Meta: + from models.demos.llama3.tt.llama_common import HostEmbedding + + return HostEmbedding(self) + else: + if reference_model is None: + model = self.reference_transformer(wrap=False) else: - # cat_dim is index of the smallest dimension in value[0].shape - cat_dim = torch.argmin(torch.tensor(value[0].shape)) - checkpoint[key] = torch.cat(value, dim=cat_dim) + model = reference_model + layer = model.model.embed_tokens + layer._load_state_dict = layer.load_state_dict + layer.load_state_dict = lambda x: layer._load_state_dict(convert_meta_to_hf(x, self.head_dim)) + return layer + + def reference_decoder(self): + if self.checkpoint_type == CheckpointType.Meta: + from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import TransformerBlock - return checkpoint + return TransformerBlock(layer_id=0, args=self) + else: + model = self.reference_transformer(wrap=False) + layer = model.model.layers[0] + wrapper = HfDecoderWrapper(layer, self.head_dim) + return wrapper + + def reference_attention(self): + if self.checkpoint_type == CheckpointType.Meta: + from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import Attention + + return Attention(self) + else: + model = self.reference_transformer(wrap=False) + layer = model.model.layers[0].self_attn + wrapper = HfAttentionWrapper(layer, self.head_dim) + return wrapper + + def set_tg_attention_config(self): + shard_spec_n_cores_grid = ttnn.CoreRangeSet({num_to_corerange(40)}) + + self.model_config["CREATE_HEAD_INPUT_MEMCFG"] = ( + None + if self.dim < 4096 + else ttnn.MemoryConfig( + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + ttnn.BufferType.L1, + ttnn.ShardSpec( + shard_spec_n_cores_grid, + [ + 32, + 32, + ], + ttnn.ShardOrientation.ROW_MAJOR, + ), + ) + ) + + if self.is_galaxy: + num_cores = 40 if self.dim == 8192 else (24 if self.dim == 4096 else (20 if self.dim == 3072 else 12)) + + self.model_config["QKV_OUT_GATHERED_MEMCFG"] = lambda mesh_cols: ttnn.create_sharded_memory_config( + shape=(32 * mesh_cols, 32), # mesh_cols = 4 + core_grid=num_to_coregrid(num_cores), + strategy=ttnn.ShardStrategy.WIDTH, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + + self.model_config["SELF_OUT_GATHERED_MEMCFG"] = lambda mesh_rows: ttnn.create_sharded_memory_config( + shape=(32 * mesh_rows, self.dim // 4 // min(32, self.dim // 4 // 32)), + core_grid=num_to_coregrid(min(32, self.dim // 4 // 32)), + strategy=ttnn.ShardStrategy.WIDTH, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + self.model_config["GATHER_USERS_MEMCFG"] = lambda mesh_cols: ttnn.create_sharded_memory_config( + shape=(32 * mesh_cols, 32), # mesh_cols = 4 + core_grid=num_to_coregrid(min(32, self.dim // 8 // 32)), + strategy=ttnn.ShardStrategy.WIDTH, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + else: + qkv_core_grid = self.dram_shard_core_grid_for_k(self.dim) + self.model_config["QKV_OUT_GATHERED_MEMCFG"] = lambda mesh_rows: ttnn.create_sharded_memory_config( + ( + 
self.tile_size * mesh_rows, + self.dim // qkv_core_grid.num_cores, + ), # Shard shape: [32, 128] -> 1 shard per core + core_grid=qkv_core_grid, + strategy=ttnn.ShardStrategy.WIDTH, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + gather_core_grid = self.dram_shard_core_grid_for_k(self.dim // 4) + self.model_config["SELF_OUT_GATHERED_MEMCFG"] = lambda mesh_rows: ttnn.create_sharded_memory_config( + ( + self.tile_size * mesh_rows, + self.dim // 4 // gather_core_grid.num_cores, + ), + core_grid=gather_core_grid, + strategy=ttnn.ShardStrategy.WIDTH, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + users_core_grid = self.dram_shard_core_grid_for_k(self.dim // 8) + self.model_config["GATHER_USERS_MEMCFG"] = lambda mesh_cols: ttnn.create_sharded_memory_config( + ( + self.tile_size * mesh_cols, + self.dim // 8 // users_core_grid.num_cores, + ), + core_grid=users_core_grid, + strategy=ttnn.ShardStrategy.WIDTH, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + + +class HfAttentionWrapper: + def __init__(self, attention, head_dim): + from transformers import DynamicCache + + super().__init__() + self.attention = attention + self.past_key_value = DynamicCache() + self.head_dim = head_dim + + def forward(self, x, start_pos, freqs_cis_i, mask=None): + position_ids = torch.tensor([list(range(start_pos, start_pos + x.shape[1]))] * x.shape[0]) + if mask is not None: + while len(mask.shape) < 4: + mask = mask.unsqueeze(0) + output, _, self.past_key_value = self.attention( + x, + past_key_value=self.past_key_value, + use_cache=True, + position_ids=position_ids, + attention_mask=mask, + ) + return output + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + def load_state_dict(self, state_dict): + return self.attention.load_state_dict(convert_meta_to_hf(state_dict, self.head_dim)) + + @property + def cache_k(self): + [(k, v)] = self.past_key_value.to_legacy_cache() + hf_k = k.permute(0, 2, 1, 3) # match meta-style reference which uses (batch_size, seq, n_kv_heads, head_dim) + batch_size, seq_len, n_heads, head_dim = hf_k.shape + + meta_k = torch.zeros_like(hf_k) + for b in range(batch_size): + for s in range(seq_len): + # Flatten just heads and head_dim + flat = hf_k[b, s].flatten() + # Apply reverse_permute + transformed = reverse_permute(flat.unsqueeze(-1), n_heads, flat.shape[0], 1).squeeze(-1) + # Restore heads and head_dim shape + meta_k[b, s] = transformed.reshape(n_heads, head_dim) + + return meta_k + + @property + def cache_v(self): + [(k, v)] = self.past_key_value.to_legacy_cache() + return v.permute(0, 2, 1, 3) # match meta-style reference which uses (batch_size, seq, n_kv_heads, head_dim) + + +class HfDecoderWrapper: + def __init__(self, decoder, head_dim): + from transformers import DynamicCache + + self.decoder = decoder + self.head_dim = head_dim + self.past_key_values = DynamicCache() + + def forward(self, x, start_pos, freqs_cis_i, mask=None): + position_ids = torch.tensor([list(range(start_pos, start_pos + x.shape[1]))] * x.shape[0]) + if mask is not None: + while len(mask.shape) < 4: + mask = mask.unsqueeze(0) + output, self.past_key_values = self.decoder.forward( + x, + past_key_value=self.past_key_values, + use_cache=True, + position_ids=position_ids, + attention_mask=mask, + ) + return output + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + def load_state_dict(self, state_dict): + 
return self.decoder.load_state_dict(convert_meta_to_hf(state_dict, self.head_dim)) + + +class HfModelWrapper: + def __init__(self, model, head_dim): + from transformers import DynamicCache + + self.model = model + self.head_dim = head_dim + self.past_key_values = DynamicCache() + + def forward(self, inputs_embeds, start_pos, mode="decode"): + position_ids = torch.tensor( + [list(range(start_pos, start_pos + inputs_embeds.shape[1]))] * inputs_embeds.shape[0] + ) + logits, new_cache, hidden_states = self.model.forward( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + use_cache=True, + past_key_values=self.past_key_values, + return_dict=False, + output_hidden_states=True, + ) + self.past_key_values = new_cache + return logits if mode == "decode" else hidden_states[-2] # last hidden state is final norm + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + def load_state_dict(self, state_dict): + return self.model.load_state_dict(convert_meta_to_hf(state_dict, self.head_dim)) + + def eval(self): + self.model.eval() def num_to_corerange(x): @@ -1388,51 +1823,3 @@ def num_to_coregrid(x): return ttnn.CoreGrid(y=2, x=6) if x == 20: return ttnn.CoreGrid(y=4, x=5) - - -def set_tg_attention_config(model_config, dim): - shard_spec_n_cores_grid = ttnn.CoreRangeSet({num_to_corerange(40)}) - - model_config["CREATE_HEAD_INPUT_MEMCFG"] = ( - None - if dim < 4096 - else ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.WIDTH_SHARDED, - ttnn.BufferType.L1, - ttnn.ShardSpec( - shard_spec_n_cores_grid, - [ - 32, - 32, - ], - ttnn.ShardOrientation.ROW_MAJOR, - ), - ) - ) - - num_cores = 40 if dim == 8192 else (24 if dim == 4096 else (20 if dim == 3072 else 12)) - - model_config["QKV_OUT_GATHERED_MEMCFG"] = lambda mesh_cols: ttnn.create_sharded_memory_config( - shape=(32 * mesh_cols, 32), # mesh_cols = 4 - core_grid=num_to_coregrid(num_cores), - strategy=ttnn.ShardStrategy.WIDTH, - orientation=ttnn.ShardOrientation.ROW_MAJOR, - use_height_and_width_as_shard_shape=True, - ) - - model_config["SELF_OUT_GATHERED_MEMCFG"] = lambda mesh_rows: ttnn.create_sharded_memory_config( - shape=(32 * mesh_rows, dim // 4 // min(32, dim // 4 // 32)), - core_grid=num_to_coregrid(min(32, dim // 4 // 32)), - strategy=ttnn.ShardStrategy.WIDTH, - orientation=ttnn.ShardOrientation.ROW_MAJOR, - use_height_and_width_as_shard_shape=True, - ) - model_config["GATHER_USERS_MEMCFG"] = lambda mesh_cols: ttnn.create_sharded_memory_config( - shape=(32 * mesh_cols, 32), # mesh_cols = 4 - core_grid=num_to_coregrid(min(32, dim // 8 // 32)), - strategy=ttnn.ShardStrategy.WIDTH, - orientation=ttnn.ShardOrientation.ROW_MAJOR, - use_height_and_width_as_shard_shape=True, - ) - - return model_config diff --git a/models/demos/llama3/tt/multimodal/llama_cross_attention.py b/models/demos/llama3/tt/multimodal/llama_cross_attention.py index 57bfedecffa..ef312334bcf 100644 --- a/models/demos/llama3/tt/multimodal/llama_cross_attention.py +++ b/models/demos/llama3/tt/multimodal/llama_cross_attention.py @@ -292,6 +292,7 @@ def forward_decode( dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, + topology=self.configuration.ccl_topology(), memory_config=ttnn.DRAM_MEMORY_CONFIG, ) @@ -382,6 +383,7 @@ def forward_prefill( dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, + topology=self.configuration.ccl_topology(), memory_config=ttnn.DRAM_MEMORY_CONFIG, ) return dense_out_reduced diff --git a/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_text.py 
b/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_text.py index 162f6dc6da7..28ee6e810ed 100644 --- a/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_text.py +++ b/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_text.py @@ -126,8 +126,8 @@ def __init__( configuration.head_dim, configuration.max_seq_len, configuration.rope_theta, - configuration.use_scaled_rope, configuration.rope_scaling_factor, + configuration.orig_context_len, ) self.trans_mats_dict = self.rope_setup.get_both_trans_mats() @@ -291,9 +291,9 @@ def forward( h = xattn_layer( h, xattn_mask=xattn_mask, - xattn_cache=xattn_caches[xattn_layer_idx] - if cross_page_table is None - else kv_cache[total_layer_idx], + xattn_cache=( + xattn_caches[xattn_layer_idx] if cross_page_table is None else kv_cache[total_layer_idx] + ), full_text_row_masked_out_mask_1NSH=full_text_row_masked_out_mask_1NSH, full_text_row_masked_out_mask_11SD=full_text_row_masked_out_mask_11SD, mode=mode, diff --git a/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_vision.py b/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_vision.py index 4c59ecec52b..06e5095d4ca 100644 --- a/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_vision.py +++ b/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_vision.py @@ -72,14 +72,16 @@ def shuffle_weight(weight): return w.transpose(-1, -2).view(orig_shape) as_interleaved_tensor = lambda name, suffix, type, dim: ttnn.as_tensor( - shuffle_weight(torch_weight(name, suffix)) - if suffix == "weight" - else torch_bias(name, suffix), # Grab only the wX part of the name + ( + shuffle_weight(torch_weight(name, suffix)) if suffix == "weight" else torch_bias(name, suffix) + ), # Grab only the wX part of the name dtype=type, device=self.mesh_device, - mesh_mapper=ttnn.ShardTensorToMesh(self.mesh_device, dim=dim) - if dim is not None - else ttnn.ReplicateTensorToMesh(self.mesh_device), + mesh_mapper=( + ttnn.ShardTensorToMesh(self.mesh_device, dim=dim) + if dim is not None + else ttnn.ReplicateTensorToMesh(self.mesh_device) + ), layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, cache_file_name=cache_name(name, suffix), diff --git a/models/demos/llama3/tt/multimodal/llama_image_mlp.py b/models/demos/llama3/tt/multimodal/llama_image_mlp.py index b0c63a83df2..45755f88f30 100644 --- a/models/demos/llama3/tt/multimodal/llama_image_mlp.py +++ b/models/demos/llama3/tt/multimodal/llama_image_mlp.py @@ -35,14 +35,16 @@ def __init__( cache_name = lambda name, suffix: weight_cache_path / (state_dict_prefix + f".{name}.{suffix}") as_interleaved_tensor = lambda name, suffix, type, dim: ttnn.as_tensor( - torch_weight(name, suffix) - if suffix == "weight" - else torch_bias(name, suffix), # Grab only the wX part of the name + ( + torch_weight(name, suffix) if suffix == "weight" else torch_bias(name, suffix) + ), # Grab only the wX part of the name dtype=type, device=self.mesh_device, - mesh_mapper=ttnn.ShardTensorToMesh(self.mesh_device, dim=dim) - if dim is not None - else ttnn.ReplicateTensorToMesh(self.mesh_device), + mesh_mapper=( + ttnn.ShardTensorToMesh(self.mesh_device, dim=dim) + if dim is not None + else ttnn.ReplicateTensorToMesh(self.mesh_device) + ), layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, cache_file_name=cache_name(name, suffix), diff --git a/models/demos/llama3/tt/multimodal/llama_vision_model.py b/models/demos/llama3/tt/multimodal/llama_vision_model.py index 
c22c4100f43..7a4918c96c1 100644 --- a/models/demos/llama3/tt/multimodal/llama_vision_model.py +++ b/models/demos/llama3/tt/multimodal/llama_vision_model.py @@ -28,7 +28,6 @@ from models.demos.llama3.tt.llama_common import ( get_prefill_rot_mat, get_rot_transformation_mat, - get_single_rot_mat, copy_host_to_device, get_padded_prefill_len, ) @@ -374,7 +373,9 @@ def prepare_inputs_prefill( self.configuration.max_seq_len, self.mesh_device, seq_len=S, + theta=self.configuration.rope_theta, scale_factor=self.configuration.rope_scaling_factor, + orig_context_len=self.configuration.orig_context_len, ) if isinstance(page_table, torch.Tensor): From a4b97710f1ac6abdd227bc082e713ffa10cbebfd Mon Sep 17 00:00:00 2001 From: Mark O'Connor Date: Fri, 7 Feb 2025 13:44:35 +0100 Subject: [PATCH 005/316] Update README.md (#17715) [skip ci] Fix duplicate README entry --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 817558ebf75..749849664cf 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,6 @@ | [Llama 3.1 70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.54.0-rc2](https://github.com/tenstorrent/tt-metal/tree/v0.54.0-rc2) | [9531611](https://github.com/tenstorrent/vllm/tree/953161188c50f10da95a88ab305e23977ebd3750) | | [Falcon 40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19) | | | [Mixtral 8x7B (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 227 | 14.9 | 33 | 476.8 | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19) | | -| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](./models/demos/llama3) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 1113 | 16.4 | 33 |386.4 | [main](https://github.com/tenstorrent/tt-metal/) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | | [Falcon 7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 223 | 4.8 | 26 | 4915.2 | [v0.55.0-rc18](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc18) | | | [Llama 3.1 70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | | | [Llama 3.1 70B (TP=32)](./models/demos/llama3) | 32 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 763 | 13.5 | 80 | 432.0 | [v0.55.0-rc12](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc12) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | From 2a2dc54131daf607236316a1f1d83bfe5d4c6acb Mon Sep 17 00:00:00 2001 From: Slavko Krstic Date: Fri, 7 Feb 2025 14:51:48 +0100 Subject: [PATCH 006/316] Removed workaround for blackhole alignment (#17710) ### Ticket Issue: #17226 ### What's changed As https://github.com/tenstorrent/tt-metal/pull/17122 is merged, allocator uses L1 and DRAM specific alignments, not max of these 2, so this workaround can be removed. This also resolves the 8 PCC errors in the Blackhole Conv2D unit tests (sticks in these cases were 16B aligned, but with the workaround, they were overridden to 32B, which caused reading from invalid source addresses in halo op). With this change, tests will have a 100% pass rate. 
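To make the alignment arithmetic concrete, here is a small self-contained sketch (not code from this patch; the stick size and the two alignment values are hypothetical, and `round_up` stands in for `tt::round_up`) showing why rounding to the allocator-reported alignment avoids the over-padding the old 32B override introduced:

```cpp
#include <cassert>
#include <cstdint>

// Stand-in for tt::round_up as used by the untilize-with-halo program factory.
static uint32_t round_up(uint32_t value, uint32_t multiple) {
    return ((value + multiple - 1) / multiple) * multiple;
}

int main() {
    const uint32_t out_stick_nbytes = 48;     // hypothetical stick size in bytes
    const uint32_t reported_alignment = 16;   // alignment the allocator now reports for this buffer
    const uint32_t workaround_alignment = 32; // the old Blackhole override

    // Rounding to the reported alignment: 48 is already 16B-aligned, so no padding is added.
    assert(round_up(out_stick_nbytes, reported_alignment) == 48);

    // With the old override the stick is padded to 64B, so reads walk past the real stick data,
    // which is the source of the PCC failures described above.
    assert(round_up(out_stick_nbytes, workaround_alignment) == 64);
    return 0;
}
```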
--- .../device/untilize_with_halo_v2_program_factory.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp index cbd408e01bf..749c570ac99 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp @@ -173,15 +173,8 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( log_debug(tt::LogOp, "out_stick_nbytes = {}", out_stick_nbytes); log_debug(tt::LogOp, "input_tensor.buffer()->alignment() = {}", input_tensor.buffer()->alignment()); - uint32_t input_buffer_alignment = input_tensor.buffer()->alignment(); - if (device->arch() == tt::ARCH::BLACKHOLE) { - // FIXME: Remove this workaround once the alignment is fixed in the allocator: - // https://github.com/tenstorrent/tt-metal/pull/13762, ticket: - // https://github.com/tenstorrent/tt-metal/issues/13609 - input_buffer_alignment = 32; // this is a workaround for the issue mentioned above - } - if (out_stick_nbytes % input_buffer_alignment != 0) { - aligned_input_nstick_nbytes = tt::round_up(out_stick_nbytes, input_buffer_alignment); + if (out_stick_nbytes % input_tensor.buffer()->alignment() != 0) { + aligned_input_nstick_nbytes = tt::round_up(out_stick_nbytes, input_tensor.buffer()->alignment()); } // reader kernel std::vector reader_ct_args = { From 4be16f5495f1d1f5f11a6717d4b1eaaf3613b46f Mon Sep 17 00:00:00 2001 From: Sofija Jovic <148721049+s-jovic@users.noreply.github.com> Date: Fri, 7 Feb 2025 16:26:56 +0100 Subject: [PATCH 007/316] #0: Update SD device perf margin to match other models (#17658) --- models/demos/wormhole/stable_diffusion/tests/test_perf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/demos/wormhole/stable_diffusion/tests/test_perf.py b/models/demos/wormhole/stable_diffusion/tests/test_perf.py index dc62a4fc16d..2056f5efcba 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_perf.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_perf.py @@ -211,7 +211,7 @@ def test_stable_diffusion_perf(device, batch_size, num_inference_steps, expected ) def test_stable_diffusion_device_perf(expected_kernel_samples_per_second): subdir = "ttnn_stable_diffusion" - margin = 0.01 + margin = 0.03 batch = 1 iterations = 1 command = f"pytest models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py::test_unet_2d_condition_model_512x512[2-4-64-64-device_params=l1_small_size_24576]" From 1a51cf225823ed58d56c62a6c196c5f8c7469fcf Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Fri, 7 Feb 2025 10:33:17 -0500 Subject: [PATCH 008/316] optimize edm fabric packet header structure (#17579) flatten the command and noc command type field to a single field to simplify header processing; lowers burden on workers and eriscs. ### Ticket https://github.com/tenstorrent/tt-metal/issues/17429 ### Problem description The packet header added an unnecessary additional level of nesting by having a field for write vs atomic and a separate one for noc_unicast vs noc_multicast. This leads to inefficiencies in kernels processing or inspecting headers because it means they often require nested checks. 
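For illustration, the nesting shows up at worker call sites roughly as in the sketch below (adapted from the kernel changes later in this patch; the routing distance, destination address, and payload size are placeholders, and the "before" shape is reproduced only in the comment):

```cpp
#include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp"

// Before the change, the command type and the noc command type were separate fields,
// so setting up a simple unicast write took an extra call, e.g.:
//   packet_header.to_write()
//       .to_chip_unicast(UnicastRoutingCommandHeader{distance})
//       .to_noc_unicast(NocUnicastCommandHeader{dest_addr, size, dest_noc_x, dest_noc_y});
// With the flattened header the send type is a single field and one call suffices:
inline void setup_unicast_write_header(
    tt::fabric::PacketHeader& packet_header, uint64_t noc0_dest_addr, uint32_t payload_plus_header_bytes) {
    packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{1})
        .to_noc_unicast_write(
            tt::fabric::NocUnicastCommandHeader{noc0_dest_addr, payload_plus_header_bytes});
}
```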
Additionally, it forces an additional call on the worker when setting up the packet header. ### What's changed Flattened/merged these two packet fields to remove nesting code in EDM fabric and workers. ### Additional Changes Updated reduce scatter async tests. The reduce scatter test is passing in the same global semaphores to reduce scatter for every back to back iteration which is unsafe and can lead to hangs due to race between op iterations using the same global semaphore. Additionally, reduce scatter currently does not correctly override RT args for global semaphores when the op is rerun with different global semaphores across iterations (to fix first mentioned issue). For that reason we are patching the test for CI pipeline stability until the reduce scatter RT arg update is resolved. We patch the test by synching after each iteration which removes the race for global semaphore usage between op iterations. --- .../gtests/ccl/kernels/edm_fabric_writer.cpp | 62 ++++----- ...c_erisc_datamover_sender_worker_sender.cpp | 33 ++--- .../fabric_worker_sender_multi_input.cpp | 33 ++--- .../ccl/kernels/test_kernels.common.hpp | 16 +-- .../ccl/test_reduce_scatter_async.py | 40 +++--- .../kernel_common/kernel_writers.hpp | 4 +- .../kernel_common/noc_addr.hpp | 10 +- .../kernels/ccl_send_reader_two_input.cpp | 69 ++++------ .../ccl/common/kernels/ccl_send_utils.hpp | 35 ++--- .../edm_fabric/fabric_edm_packet_header.hpp | 121 +++++++--------- .../fabric_edm_packet_header_validate.hpp | 8 +- .../fabric_edm_packet_transmission.hpp | 130 ++++++------------ .../edm_fabric/fabric_erisc_datamover.cpp | 5 +- .../interleaved_dim3_1_1_32_any_writer.cpp | 9 +- .../llama_post_binary_matmul_shape_writer.cpp | 9 +- .../device/kernels/minimal_ccl_common.hpp | 9 +- 16 files changed, 241 insertions(+), 352 deletions(-) diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp index 717791c746c..cd142bef8fd 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp @@ -7,6 +7,8 @@ #include "ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/noc_addr.hpp" #include "dataflow_api.h" +#include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp" + #include #include @@ -23,21 +25,21 @@ FORCE_INLINE void line_sync( size_t sync_noc_y, size_t sync_val) { using namespace tt::fabric; - mcast_fwd_packet_header->to_atomic_inc(); - mcast_bwd_packet_header->to_atomic_inc(); + auto dest_noc_addr = + safe_get_noc_addr(static_cast(sync_noc_x), static_cast(sync_noc_y), sync_bank_addr, 0); if (fabric_connection.has_forward_connection()) { - mcast_fwd_packet_header->to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader{ - sync_bank_addr, 1, 128, static_cast(sync_noc_x), static_cast(sync_noc_y)}); + mcast_fwd_packet_header->to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader{dest_noc_addr, 1, 128}); fabric_connection.get_forward_connection().wait_for_empty_write_slot(); + print_pkt_header(mcast_fwd_packet_header); fabric_connection.get_forward_connection().send_payload_flush_non_blocking_from_address( (uint32_t)mcast_fwd_packet_header, sizeof(tt::fabric::PacketHeader)); } if (fabric_connection.has_backward_connection()) { - mcast_bwd_packet_header->to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader{ - sync_bank_addr, 1, 128, static_cast(sync_noc_x), static_cast(sync_noc_y)}); + 
mcast_bwd_packet_header->to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader{dest_noc_addr, 1, 128}); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); + print_pkt_header(mcast_bwd_packet_header); fabric_connection.get_backward_connection().send_payload_flush_non_blocking_from_address( (uint32_t)mcast_bwd_packet_header, sizeof(tt::fabric::PacketHeader)); } @@ -101,10 +103,8 @@ void kernel_main() { reinterpret_cast(packet_header_buffer_address + sizeof(tt::fabric::PacketHeader)); auto* unicast_packet_header = reinterpret_cast(packet_header_buffer_address + sizeof(tt::fabric::PacketHeader) * 2); - mcast_fwd_packet_header->to_write().to_chip_multicast( - MulticastRoutingCommandHeader{1, static_cast(mcast_fwd_hops)}); - mcast_bwd_packet_header->to_write().to_chip_multicast( - MulticastRoutingCommandHeader{1, static_cast(mcast_bwd_hops)}); + mcast_fwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_fwd_hops)}); + mcast_bwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_bwd_hops)}); if (enable_start_synchronization) { line_sync( @@ -126,31 +126,27 @@ void kernel_main() { 2 * start_sync_val); } - mcast_fwd_packet_header->to_write().to_chip_multicast( - MulticastRoutingCommandHeader{1, static_cast(mcast_fwd_hops)}); - mcast_bwd_packet_header->to_write().to_chip_multicast( - MulticastRoutingCommandHeader{1, static_cast(mcast_bwd_hops)}); - unicast_packet_header->to_atomic_inc().to_chip_unicast( - UnicastRoutingCommandHeader{static_cast(unicast_hops)}); + mcast_fwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_fwd_hops)}); + mcast_bwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_bwd_hops)}); + unicast_packet_header->to_chip_unicast(UnicastRoutingCommandHeader{static_cast(unicast_hops)}); { DeviceZoneScopedN("MAIN-WRITE-ZONE"); for (size_t i = 0; i < num_mcasts; i++) { - noc_async_write( - source_l1_buffer_address, - safe_get_noc_addr(static_cast(dest_noc_x), static_cast(dest_noc_y), dest_bank_addr), - packet_payload_size_bytes); + auto noc0_dest_addr = safe_get_noc_addr( + static_cast(dest_noc_x), static_cast(dest_noc_y), dest_bank_addr, 0); + auto dest_addr = + safe_get_noc_addr(static_cast(dest_noc_x), static_cast(dest_noc_y), dest_bank_addr); + noc_async_write(source_l1_buffer_address, dest_addr, packet_payload_size_bytes); if (fabric_connection.has_forward_connection()) { DeviceZoneScopedN("WR-FWD"); - mcast_fwd_packet_header->to_noc_unicast(NocUnicastCommandHeader{ - dest_bank_addr, - packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader), - static_cast(dest_noc_x), - static_cast(dest_noc_y)}); + mcast_fwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{ + noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)}); { DeviceZoneScopedN("WR-FWD-WAIT"); fabric_connection.get_forward_connection().wait_for_empty_write_slot(); } + print_pkt_header(mcast_fwd_packet_header); fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_non_blocking_from_address( @@ -159,15 +155,13 @@ void kernel_main() { if (fabric_connection.has_backward_connection()) { DeviceZoneScopedN("WR-BWD"); - mcast_bwd_packet_header->to_noc_unicast(NocUnicastCommandHeader{ - dest_bank_addr, - packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader), - 
static_cast(dest_noc_x), - static_cast(dest_noc_y)}); + mcast_bwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{ + noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)}); { DeviceZoneScopedN("WR-BWD-WAIT"); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); } + print_pkt_header(mcast_bwd_packet_header); fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_non_blocking_from_address( @@ -180,14 +174,12 @@ void kernel_main() { } for (size_t i = 0; i < num_unicasts; i++) { + auto noc0_dest_addr = + safe_get_noc_addr(static_cast(dest_noc_x), static_cast(dest_noc_y), dest_bank_addr, 0); DeviceZoneScopedN("UNICAST-WRITE"); auto& fabric_conn = unicast_is_fwd ? fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); - unicast_packet_header->to_noc_unicast(NocUnicastCommandHeader{ - dest_bank_addr, - packet_payload_size_bytes, - static_cast(dest_noc_x), - static_cast(dest_noc_y)}); + unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr, packet_payload_size_bytes}); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index bd9b986c2f3..d0b384fc55f 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -8,7 +8,7 @@ #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp" #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp" #include "tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp" - +#include "ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/noc_addr.hpp" struct unicast_mode { uint8_t distance; }; @@ -122,31 +122,19 @@ void kernel_main() { // bit of a hack to extract X/Y const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t dest_addr = dest_noc_address & 0xFFFFFFFF; - const size_t dest_noc_x = (dest_noc_address >> NOC_ADDR_LOCAL_BITS) & ((1 << NOC_ADDR_NODE_ID_BITS) - 1); - const size_t dest_noc_y = - (dest_noc_address >> (NOC_ADDR_LOCAL_BITS + NOC_ADDR_NODE_ID_BITS)) & ((1 << NOC_ADDR_NODE_ID_BITS) - 1); const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); - auto packet_addr = get_read_ptr(cb_id_in0); auto& packet_header = *reinterpret_cast(packet_addr); if constexpr (mcast_mode) { - packet_header.to_write() + packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) - .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, - (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader), - static_cast(dest_noc_x), - static_cast(dest_noc_y)}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ + dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); packet_header.reserved2 = 0x1111; // debug only } else { - packet_header.to_write() - 
.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) - .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, - (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader), - static_cast(dest_noc_x), - static_cast(dest_noc_y)}); + packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ + dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); packet_header.reserved2 = 0x1111; // debug only } @@ -160,10 +148,11 @@ void kernel_main() { auto& packet_header = *reinterpret_cast(a_packet_header_addr); ASSERT(*last_message_semaphore_address == 0); - packet_header.to_atomic_inc(); + uint64_t last_message_semaphore_noc0_addr = + safe_get_noc_addr(my_x[0], my_y[0], (uint32_t)last_message_semaphore_address, 0); packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{2}); - packet_header.to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader( - reinterpret_cast(last_message_semaphore_address), 1, 32, my_x[0], my_y[0])); + packet_header.to_noc_unicast_atomic_inc( + tt::fabric::NocUnicastAtomicIncCommandHeader(last_message_semaphore_noc0_addr, 1, 32)); sender.send_payload_blocking_from_address( a_packet_header_addr, packet_header.get_payload_size_including_header()); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp index f699132dbca..98a60766922 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp @@ -51,28 +51,20 @@ auto forward_to_fabric_from_cb( sender.wait_for_empty_write_slot(); // bit of a hack to extract X/Y - const auto dest_noc_address = get_noc_addr(current_page, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const auto [dest_worker_noc, dest_addr] = get_noc_address_components(dest_noc_address); + const auto noc0_dest_address = get_noc_addr(current_page, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); auto packet_addr = get_read_ptr(cb_id); auto &packet_header = *reinterpret_cast(packet_addr); if constexpr (mcast_mode) { - packet_header.to_write() + packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) - .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, - (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader), - static_cast(dest_worker_noc.x), - static_cast(dest_worker_noc.y)}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ + noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); } else { - packet_header.to_write() - .to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) - .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, - (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader), - static_cast(dest_worker_noc.x), - static_cast(dest_worker_noc.y)}); + packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ + noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); } uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + 
sizeof(eth_channel_sync_t))); @@ -196,15 +188,10 @@ void kernel_main() { ASSERT(*last_message_semaphore_address == 0); packet_header.reserved = 0xE; packet_header.reserved2 = 0xFFFF; - packet_header.to_atomic_inc(); + uint64_t last_message_sem_noc_addr = get_noc_addr(my_x[0], my_y[0], last_message_semaphore_address); packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{kLoopbackNumHopsToMyChip}); - packet_header.to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader( - reinterpret_cast(last_message_semaphore_address), - 1, - 32, - my_x[0], - my_y[0] - )); + packet_header.to_noc_unicast_atomic_inc( + tt::fabric::NocUnicastAtomicIncCommandHeader(last_message_sem_noc_addr, 1, 32)); sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header()); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp index 53c102f6098..cae2798e893 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp @@ -23,20 +23,18 @@ bool terminate_fabric_endpoints_farthest_to_nearest ( closed = true; sender.close(); } + uint64_t termination_sig_noc_addr = get_noc_addr(edm_noc_x, edm_noc_y, termination_addr); if (distance == 0) { - noc_inline_dw_write(get_noc_addr(edm_noc_x, edm_noc_y, termination_addr), tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE); + noc_inline_dw_write( + get_noc_addr(edm_noc_x, edm_noc_y, termination_addr), + tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE); } else { auto &packet_header = *reinterpret_cast(a_packet_header_addr); reinterpret_cast(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; sender.wait_for_empty_write_slot(); - packet_header.to_write() - .to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{static_cast(distance)}) - .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - termination_addr, - sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t), - static_cast(edm_noc_x), - static_cast(edm_noc_y) - }); + packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{static_cast(distance)}) + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ + termination_sig_noc_addr, sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)}); sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header()); noc_async_writes_flushed(); } diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_async.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_async.py index 1c8e4b69deb..9235e247eb3 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_async.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_async.py @@ -51,8 +51,8 @@ def run_with_trace( output_tensor_mesh = ttnn.experimental.reduce_scatter_async( input_tensor_mesh, dim=dim, - from_remote_multi_device_global_semaphore=from_remote_semaphore_handles, - to_remote_multi_device_global_semaphore=to_remote_semaphore_handles, + from_remote_multi_device_global_semaphore=from_remote_semaphore_handles[0], + to_remote_multi_device_global_semaphore=to_remote_semaphore_handles[0], math_op=math_op, num_links=num_links, memory_config=output_mem_config, @@ -69,8 +69,10 @@ def run_with_trace( output_tensor_mesh = ttnn.experimental.reduce_scatter_async( input_tensor_mesh, dim=dim, - 
from_remote_multi_device_global_semaphore=from_remote_semaphore_handles, - to_remote_multi_device_global_semaphore=to_remote_semaphore_handles, + from_remote_multi_device_global_semaphore=from_remote_semaphore_handles[ + i % len(from_remote_semaphore_handles) + ], + to_remote_multi_device_global_semaphore=to_remote_semaphore_handles[i % len(to_remote_semaphore_handles)], math_op=math_op, num_links=num_links, memory_config=output_mem_config, @@ -168,16 +170,12 @@ def run_reduce_scatter_test( mesh_device.set_sub_device_stall_group(sub_device_stall_group) # create global semaphore handles - from_remote_semaphore_handles = create_global_semaphore_with_same_address( - mesh_device, - ccl_sub_device_crs, - 0, # , search_max=True - ) - to_remote_semaphore_handles = create_global_semaphore_with_same_address( - mesh_device, - ccl_sub_device_crs, - 0, # , search_max=True - ) + from_remote_semaphore_handles = [ + create_global_semaphore_with_same_address(mesh_device, ccl_sub_device_crs, 0) for _ in range(num_iters) + ] + to_remote_semaphore_handles = [ + create_global_semaphore_with_same_address(mesh_device, ccl_sub_device_crs, 0) for _ in range(num_iters) + ] mesh_device.set_sub_device_stall_group([worker_sub_device_id]) debug = False @@ -237,8 +235,12 @@ def run_reduce_scatter_test( output_tensor_mesh = ttnn.experimental.reduce_scatter_async( input_tensor_mesh, dim=dim, - from_remote_multi_device_global_semaphore=from_remote_semaphore_handles, - to_remote_multi_device_global_semaphore=to_remote_semaphore_handles, + from_remote_multi_device_global_semaphore=from_remote_semaphore_handles[ + i % len(from_remote_semaphore_handles) + ], + to_remote_multi_device_global_semaphore=to_remote_semaphore_handles[ + i % len(to_remote_semaphore_handles) + ], math_op=math_op, num_links=num_links, memory_config=output_mem_config, @@ -246,9 +248,9 @@ def run_reduce_scatter_test( subdevice_id=worker_sub_device_id, ) - logger.info(f"Waiting for op to finish all iterations") - ttnn.synchronize_devices(mesh_device, sub_device_ids=sub_device_stall_group) - logger.info(f"Done iterations") + logger.info(f"Waiting for op to finish all iterations") + ttnn.synchronize_devices(mesh_device, sub_device_ids=sub_device_stall_group) + logger.info(f"Done iterations") # Compute golden # TODO: Make it model how reduce scatter actually works for numerical correctness/ordering diff --git a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp index 827e5f6f649..b69b5caaad2 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp @@ -26,7 +26,6 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( FabricConnectionManager& fabric_connection, size_t& l1_read_addr, uint32_t payload_size_bytes) { - const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_noc_addr); const size_t payload_l1_address = l1_read_addr; auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); @@ -35,8 +34,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( #endif size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr->to_write()->to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, packet_send_size_bytes, static_cast(dest_noc_xy.x), static_cast(dest_noc_xy.y)}); + 
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes});
     switch (current_cmd_header.dest_type) {
         case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: {
diff --git a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/noc_addr.hpp b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/noc_addr.hpp
index c9a2ecb6559..e4988f9c973 100644
--- a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/noc_addr.hpp
+++ b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/noc_addr.hpp
@@ -7,6 +7,7 @@
 #include "cpp/ttnn/operations/ccl/shared_with_host/hetergeneous_data_structs.hpp"
 #include "dataflow_api.h"
+#include "noc_nonblocking_api.h"
 #include
 // NOTE: This will eventually be updated with an official API
@@ -16,15 +17,16 @@ FORCE_INLINE bool is_using_noc_coords(uint16_t noc_x, uint16_t noc_y) {
     return noc_x < VIRTUAL_COORDS_START_X && noc_y < VIRTUAL_COORDS_START_Y;
 }
-FORCE_INLINE uint64_t safe_get_noc_addr(uint8_t dest_noc_x, uint8_t dest_noc_y, uint32_t dest_bank_addr) {
+FORCE_INLINE uint64_t
+safe_get_noc_addr(uint8_t dest_noc_x, uint8_t dest_noc_y, uint32_t dest_bank_addr, uint8_t noc_id = noc_index) {
     bool using_noc_coords = is_using_noc_coords(dest_noc_x, dest_noc_y);
     uint8_t noc_x = dest_noc_x;
     uint8_t noc_y = dest_noc_y;
     if (using_noc_coords) {
-        noc_x = NOC_X_PHYS_COORD(dest_noc_x);
-        noc_y = NOC_Y_PHYS_COORD(dest_noc_y);
+        noc_x = NOC_0_X_PHYS_COORD(noc_id, noc_size_x, dest_noc_x);
+        noc_y = NOC_0_Y_PHYS_COORD(noc_id, noc_size_y, dest_noc_y);
     }
-    return get_noc_addr(noc_x, noc_y, dest_bank_addr);
+    return get_noc_addr(noc_x, noc_y, dest_bank_addr, noc_id);
 }
 // TODO: COMMONIZE WITH THE ONE IN `ccl_send_writer.cpp`
 FORCE_INLINE std::pair get_noc_address_components(uint64_t noc_addr) {
diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp
index 370be920c8c..4225247db41 100644
--- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp
@@ -125,7 +125,7 @@ template <
     tt::tt_metal::BufferType buffer_type,
     tt::tt_metal::Layout page_layout,
     typename ShardingInfoType>
-FORCE_INLINE auto build_source_address_generator(
+auto build_source_address_generator(
     std::size_t& arg_idx,
     address_t tensor_address,
     std::size_t page_size,
@@ -208,7 +208,7 @@ void update_ccl_command(
 template
 struct command_context_t final {
-    FORCE_INLINE command_context_t(
+    command_context_t(
         FabricConnectionManager& fabric_connection,
         Addrgen& addrgen,
         uint16_t num_commands,
@@ -269,7 +269,7 @@ struct command_context_t final {
     FORCE_INLINE bool current_command_active() const { return populated; }
-    FORCE_INLINE void fetch_next_command() {
+    void fetch_next_command() {
         populated = true;
         this->current_cmd_header = ttnn::ccl::cmd::CclCommandHeader::from_uint32(get_arg_val(arg_idx++));
@@ -416,7 +416,7 @@ void update_ccl_command(
 }
 template
-FORCE_INLINE void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) {
+void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) {
     const size_t value = cmd_ctx.cmd_specific_ctx.inline_value_ctx.value;
     const size_t dest_bank_addr = cmd_ctx.dest_addr_info.address;
     bool is_remote_atomic_inc_over_fabric = cmd_ctx.command_requires_fabric();
@@ -432,31 +432,23 @@ FORCE_INLINE void
try_advance_inline_write_or_atomic_inc(command_context_t(cmd_ctx.packet_header_buffer_addr); - if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) { - pkt_hdr->to_atomic_inc(); - } else { - pkt_hdr->to_write(); - } #ifdef DEBUG_PRINT_ENABLED pkt_hdr->reserved2 = my_chip_id; #endif - pkt_hdr->to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ - dest_bank_addr, - static_cast(value), - 32, - static_cast(dest_noc0_x), - static_cast(dest_noc0_y)}); + uint64_t dest_noc_addr_for_pkt = safe_get_noc_addr(dest_noc0_x, dest_noc0_y, dest_bank_addr, 0); + if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) { + pkt_hdr->to_noc_unicast_atomic_inc( + tt::fabric::NocUnicastAtomicIncCommandHeader{dest_noc_addr_for_pkt, static_cast(value), 32}); + } else { + pkt_hdr->to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{dest_noc_addr_for_pkt, static_cast(value)}); + } switch (cmd_ctx.current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { @@ -471,11 +463,12 @@ FORCE_INLINE void try_advance_inline_write_or_atomic_inc(command_context_tto_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{ 1, static_cast(mcast_args.num_targets_forward_direction)}); + cmd_ctx.fabric_connection.get_forward_connection().wait_for_empty_write_slot(); cmd_ctx.fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); } @@ -489,13 +482,6 @@ FORCE_INLINE void try_advance_inline_write_or_atomic_inc(command_context_t -FORCE_INLINE void try_advance_read_tensor_to_cb(command_context_t& cmd_ctx) { +void try_advance_read_tensor_to_cb(command_context_t& cmd_ctx) { if (!cb_pages_reservable_at_back(cmd_ctx.cb_id, cmd_ctx.packet_size_in_pages)) { return; } @@ -566,14 +553,13 @@ FORCE_INLINE void try_advance_read_tensor_to_cb(command_context_t& cmd_ } #endif -FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( +void write_and_advance_local_read_address_for_fabric_write( uint64_t noc0_dest_noc_addr, size_t packet_header_buffer_addr, const ttnn::ccl::cmd::CclCommandHeader& current_cmd_header, FabricConnectionManager& fabric_connection, size_t& l1_read_addr, uint32_t payload_size_bytes) { - const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_noc_addr); const size_t payload_l1_address = l1_read_addr; auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); @@ -582,8 +568,8 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( #endif size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr->to_write()->to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, packet_send_size_bytes, static_cast(dest_noc_xy.x), static_cast(dest_noc_xy.y)}); + pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ + noc0_dest_noc_addr, packet_send_size_bytes}); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { @@ -592,13 +578,16 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( : fabric_connection.get_backward_connection(); pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{unicast_args.distance_in_hops}); + fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, 
sizeof(tt::fabric::PacketHeader)); } break; case ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { + const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_noc_addr); + uint64_t dest_noc_addr = safe_get_noc_addr(static_cast(dest_noc_xy.x), static_cast(dest_noc_xy.y), dest_addr); noc_async_write( - payload_l1_address, safe_get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr), payload_size_bytes); + payload_l1_address, dest_noc_addr, payload_size_bytes); const auto& mcast_args = current_cmd_header.get_multicast_dest_args(); if (fabric_connection.has_forward_connection()) { pkt_hdr->to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{ @@ -670,7 +659,7 @@ FORCE_INLINE void write_payload_then_advance_read_address( // based on command type so we can avoid the perf overhead of the branching that would otherwise // be required. template -FORCE_INLINE void try_advance_write_tensor_from_cb(command_context_t& cmd_ctx) { +void try_advance_write_tensor_from_cb(command_context_t& cmd_ctx) { if (!cb_pages_available_at_front(cmd_ctx.cb_id, cmd_ctx.packet_size_in_pages)) { return; } @@ -748,7 +737,7 @@ FORCE_INLINE static ttnn::ccl::cmd::noc_transfer_info advance_to_next_noc_transa return noc_transfer_info; } -FORCE_INLINE static void try_advance_noc_read_burst( +static void try_advance_noc_read_burst( noc_transfer_burst_context& noc_burst_ctx, uint32_t cb_id, uint32_t packet_size_in_pages, arg_idx_t& arg_idx) { if (!cb_pages_reservable_at_back(cb_id, packet_size_in_pages)) { return; @@ -805,7 +794,7 @@ static void try_advance_noc_write_burst( } template -FORCE_INLINE void try_advance(command_context_t& cmd_ctx) { +void try_advance(command_context_t& cmd_ctx) { switch (cmd_ctx.current_cmd_header.code) { case ttnn::ccl::cmd::CclCommandCode::STREAM_TENSOR_TO_EDM: // STREAM TENSOR TO CB #ifndef NO_TENSOR_MODE diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp index 9fe68098a7b..0f662c4bfd4 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp @@ -84,7 +84,7 @@ std::pair get_noc_address_components(uint64_t noc_addr) { //------------------------------------------------------------------------------ void mcast_contig_pages_to_noc_address( - uint64_t noc_addr, + uint64_t noc0_dest_addr, size_t l1_read_addr, size_t contig_pages_advanced, size_t payload_page_size, @@ -95,12 +95,17 @@ void mcast_contig_pages_to_noc_address( size_t forward_direction_num_hops, size_t backward_direction_num_hops) { const size_t payload_size_bytes = contig_pages_advanced * payload_page_size; - const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc_addr); + const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_addr); const size_t payload_l1_address = l1_read_addr + sizeof(tt::fabric::PacketHeader); // Local chip write noc_async_write( - payload_l1_address, get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr, noc_index), payload_size_bytes); + payload_l1_address, + // We are writing out from local core so we need to normalize to our noc + // if the target is a virtual coord this is actually redundant but for DRAM + // coords it is necessary + get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr, noc_index), + payload_size_bytes); size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); // Forward fabric connection @@ -110,14 +115,12 @@ void 
mcast_contig_pages_to_noc_address( "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); auto& pkt_hdr = *reinterpret_cast(l1_read_addr); - pkt_hdr.to_write() + pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(forward_direction_num_hops)}) - .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, - packet_send_size_bytes, - static_cast(dest_noc_xy.x), - static_cast(dest_noc_xy.y)}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ + noc0_dest_addr, + packet_send_size_bytes}); forward_fabric_sender.wait_for_empty_write_slot(); forward_fabric_sender.send_payload_flush_blocking_from_address(l1_read_addr, packet_send_size_bytes); } @@ -125,14 +128,12 @@ void mcast_contig_pages_to_noc_address( // Backward fabric connection if (has_backward_fabric_connection) { auto& pkt_hdr = *reinterpret_cast(l1_read_addr); - pkt_hdr.to_write() + pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(backward_direction_num_hops)}) - .to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, - packet_send_size_bytes, - static_cast(dest_noc_xy.x), - static_cast(dest_noc_xy.y)}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ + noc0_dest_addr, + packet_send_size_bytes}); backward_fabric_sender.wait_for_empty_write_slot(); backward_fabric_sender.send_payload_non_blocking_from_address(l1_read_addr, packet_send_size_bytes); } @@ -170,7 +171,7 @@ void mcast_payload_chunk_to_output_tensor_address( contig_pages_advanced = std::min(contig_pages, n_pages); mcast_contig_pages_to_noc_address( - noc_addr, + noc0_dest_addr, l1_read_addr, contig_pages_advanced, payload_page_size, @@ -294,7 +295,7 @@ void mcast_sync_signal_to_addr( ASSERT((pkt_addr & (sizeof(tt::fabric::PacketHeader) - 1)) == 0); auto& pkt_hdr = *reinterpret_cast(pkt_addr); - pkt_hdr.to_atomic_inc() + pkt_hdr .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{1, static_cast(directional_num_hops)}) .to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ remote_sem_l1_addr, diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index 28771d3e9e7..be4f8c42ce4 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -19,22 +19,20 @@ enum TerminationSignal : uint32_t { IMMEDIATELY_TERMINATE = 2 }; + // 2 bits -enum CommandType : uint8_t { - WRITE = 0, - ATOMIC_INC = 1 +enum NocSendType : uint8_t { + NOC_UNICAST_WRITE = 0, + NOC_MULTICAST_WRITE = 1, + NOC_UNICAST_ATOMIC_INC = 2, + NOC_MULTICAST_ATOMIC_INC = 3 }; - // How to send the payload across the cluster // 1 bit enum ChipSendType : uint8_t { CHIP_UNICAST = 0, CHIP_MULTICAST = 1, }; -enum NocSendType : uint8_t { - NOC_UNICAST = 0, - NOC_MULTICAST = 1 -}; struct UnicastRoutingCommandHeader { @@ -53,27 +51,20 @@ union RoutingFields { static_assert(sizeof(RoutingFields) == sizeof(UnicastRoutingCommandHeader), "RoutingFields size is not 1 bytes"); struct NocUnicastCommandHeader { - // TODO: just encode the noc_addr as uint64_t directly - uint32_t address; + uint64_t noc_address; uint32_t size; - uint8_t noc_x; - uint8_t noc_y; - uint16_t reserved; // ignores header size inline uint32_t get_payload_only_size() const { return size; } }; struct NocUnicastAtomicIncCommandHeader { - 
NocUnicastAtomicIncCommandHeader(uint32_t address, uint16_t val, uint16_t wrap, uint8_t noc_x, uint8_t noc_y) - : address(address), val(val), wrap(wrap), noc_x(noc_x), noc_y(noc_y) {} + NocUnicastAtomicIncCommandHeader(uint64_t noc_address, uint16_t val, uint16_t wrap) + : noc_address(noc_address), val(val), wrap(wrap) {} - uint32_t address; + uint64_t noc_address; uint16_t val; uint16_t wrap; - uint8_t noc_x; - uint8_t noc_y; - }; struct NocMulticastCommandHeader { uint32_t address; @@ -97,17 +88,17 @@ struct NocMulticastAtomicIncCommandHeader { uint8_t size_x; uint8_t size_y; }; -static_assert(sizeof(NocUnicastCommandHeader) == 12, "NocUnicastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); static_assert(sizeof(NocMulticastCommandHeader) == 12, "NocMulticastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 12, "NocUnicastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 1 byte"); -union CommandFields{ +union NocCommandFields{ NocUnicastCommandHeader unicast_write; NocMulticastCommandHeader mcast_write; NocUnicastAtomicIncCommandHeader unicast_seminc; NocMulticastAtomicIncCommandHeader mcast_seminc; } ; -static_assert(sizeof(CommandFields) <= 15, "CommandFields size is not 15 bytes"); +static_assert(sizeof(NocCommandFields) <= 16, "CommandFields size is not 16 bytes"); // TODO: wrap this in a debug version that holds type info so we can assert for field/command/ struct PacketHeader { @@ -115,9 +106,9 @@ struct PacketHeader { // -> unicast_write, mcast_write, unicast_seminc, mcast_seminc // For now, kept it separate so I could do reads which would be handled differently // but for our purposes we shouldn't need read so we should be able to omit the support - CommandType command_type : 2; + NocSendType noc_send_type : 2; ChipSendType chip_send_type : 1; - NocSendType noc_send_type : 1; + uint8_t reserved : 1; // Used only by the EDM sender and receiver channels. Populated by EDM sender channel to // indicate to the receiver channel what channel was the source of this packet. Reserved // otherwise. @@ -125,7 +116,7 @@ struct PacketHeader { RoutingFields routing_fields; uint16_t reserved2; // can be tagged with src device for debug - CommandFields command_fields; + NocCommandFields command_fields; // size = 16B due to uint64_t alignment // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned // To simplify worker kernel code, we for now decide to pad up the packet header @@ -137,43 +128,34 @@ struct PacketHeader { // manage this complexity. 
uint32_t padding0; uint32_t padding1; - uint32_t padding2; - uint32_t padding3; - inline void set_command_type(CommandType &type) { this->command_type = type; } inline void set_chip_send_type(ChipSendType &type) { this->chip_send_type = type; } inline void set_noc_send_type(NocSendType &type) { this->noc_send_type = type; } inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } - inline void set_command_fields(CommandFields &fields) { this->command_fields = fields; } + inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } size_t get_payload_size_excluding_header() volatile const { - switch(this->command_type) { - case WRITE: { - switch(this->noc_send_type) { - case NOC_UNICAST: { - return this->command_fields.unicast_write.size - sizeof(PacketHeader); - } break; - case NOC_MULTICAST: { - return this->command_fields.mcast_write.size - sizeof(PacketHeader); - } break; - default: - return 0; - } + switch(this->noc_send_type) { + case NOC_UNICAST_WRITE: { + return this->command_fields.unicast_write.size - sizeof(PacketHeader); } break; - case ATOMIC_INC: { - return 0; + case NOC_MULTICAST_WRITE: { + return this->command_fields.mcast_write.size - sizeof(PacketHeader); } break; + case NOC_UNICAST_ATOMIC_INC: + case NOC_MULTICAST_ATOMIC_INC: + return 0; default: + #if defined(KERNEL_BUILD) || defined(FW_BUILD) + ASSERT(false); + #endif return 0; - } + }; } inline size_t get_payload_size_including_header() volatile const { return get_payload_size_excluding_header() + sizeof(PacketHeader); } - inline PacketHeader& to_write() { this->command_type = WRITE; return *this; } - inline PacketHeader& to_atomic_inc() { this->command_type = ATOMIC_INC; return *this; } - inline PacketHeader &to_chip_unicast(UnicastRoutingCommandHeader const &chip_unicast_command_header) { this->chip_send_type = CHIP_UNICAST; this->routing_fields.chip_unicast = chip_unicast_command_header; @@ -184,30 +166,29 @@ struct PacketHeader { this->routing_fields.chip_mcast = chip_multicast_command_header; return *this; } - inline PacketHeader &to_noc_unicast(NocUnicastCommandHeader const &noc_unicast_command_header) { - this->noc_send_type = NOC_UNICAST; + + inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header) { + this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write = noc_unicast_command_header; return *this; } - inline PacketHeader &to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header) { - this->noc_send_type = NOC_MULTICAST; + inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header) { + this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write = noc_multicast_command_header; return *this; } - inline PacketHeader &to_noc_unicast_atomic_inc( - NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { - this->noc_send_type = NOC_UNICAST; + inline PacketHeader &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { + this->noc_send_type = NOC_UNICAST_ATOMIC_INC; this->command_fields.unicast_seminc = noc_unicast_atomic_inc_command_header; return *this; } - inline PacketHeader &to_noc_multicast_atomic_inc( - NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header) { - this->noc_send_type = NOC_MULTICAST; - this->command_fields.mcast_seminc = noc_multicast_atomic_inc_command_header; + inline PacketHeader 
&to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header) { + #if defined(KERNEL_BUILD) || defined(FW_BUILD) + ASSERT(false); + while (1) {}; + #endif return *this; } - inline volatile PacketHeader* to_write() volatile { this->command_type = WRITE; return this; } - inline volatile PacketHeader* to_atomic_inc() volatile { this->command_type = ATOMIC_INC; return this; } inline volatile PacketHeader *to_chip_unicast(UnicastRoutingCommandHeader const &chip_unicast_command_header) volatile { this->chip_send_type = CHIP_UNICAST; @@ -220,17 +201,15 @@ struct PacketHeader { this->routing_fields.chip_mcast.start_distance_in_hops = chip_multicast_command_header.start_distance_in_hops; return this; } - inline volatile PacketHeader *to_noc_unicast(NocUnicastCommandHeader const &noc_unicast_command_header) volatile { - this->noc_send_type = NOC_UNICAST; - this->command_fields.unicast_write.address = noc_unicast_command_header.address; + inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header) volatile { + this->noc_send_type = NOC_UNICAST_WRITE; + this->command_fields.unicast_write.noc_address = noc_unicast_command_header.noc_address; this->command_fields.unicast_write.size = noc_unicast_command_header.size; - this->command_fields.unicast_write.noc_x = noc_unicast_command_header.noc_x; - this->command_fields.unicast_write.noc_y = noc_unicast_command_header.noc_y; return this; } inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header) volatile { - this->noc_send_type = NOC_MULTICAST; + this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write.mcast_rect_size_x = noc_multicast_command_header.mcast_rect_size_x; this->command_fields.mcast_write.mcast_rect_size_y = noc_multicast_command_header.mcast_rect_size_y; this->command_fields.mcast_write.noc_x_start = noc_multicast_command_header.noc_x_start; @@ -242,10 +221,8 @@ struct PacketHeader { } inline volatile PacketHeader *to_noc_unicast_atomic_inc( NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) volatile { - this->noc_send_type = NOC_UNICAST; - this->command_fields.unicast_seminc.address = noc_unicast_atomic_inc_command_header.address; - this->command_fields.unicast_seminc.noc_x = noc_unicast_atomic_inc_command_header.noc_x; - this->command_fields.unicast_seminc.noc_y = noc_unicast_atomic_inc_command_header.noc_y; + this->noc_send_type = NOC_UNICAST_ATOMIC_INC; + this->command_fields.unicast_seminc.noc_address = noc_unicast_atomic_inc_command_header.noc_address; this->command_fields.unicast_seminc.val = noc_unicast_atomic_inc_command_header.val; this->command_fields.unicast_seminc.wrap = noc_unicast_atomic_inc_command_header.wrap; @@ -253,7 +230,7 @@ struct PacketHeader { } inline volatile PacketHeader *to_noc_multicast_atomic_inc( NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header) volatile { - this->noc_send_type = NOC_MULTICAST; + this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; this->command_fields.mcast_seminc.address = noc_multicast_atomic_inc_command_header.address; this->command_fields.mcast_seminc.noc_x_start = noc_multicast_atomic_inc_command_header.noc_x_start; this->command_fields.mcast_seminc.noc_y_start = noc_multicast_atomic_inc_command_header.noc_y_start; diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp 
b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp
index 831b38063af..bb6b6603e11 100644
--- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp
+++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp
@@ -9,13 +9,9 @@
 namespace tt::fabric {
-FORCE_INLINE void validate(PacketHeader const& packet_header) {
-    ASSERT(packet_header.command_type == CommandType::WRITE || packet_header.command_type == CommandType::ATOMIC_INC);
-    ASSERT(packet_header.chip_send_type < 2);
-    ASSERT(packet_header.noc_send_type < 2);
-}
+FORCE_INLINE void validate(const PacketHeader& packet_header) { ASSERT(packet_header.chip_send_type < 2); }
 FORCE_INLINE bool is_valid(PacketHeader const& packet_header) {
-    return (packet_header.command_type < 2) && (packet_header.chip_send_type < 2) && (packet_header.noc_send_type < 2);
+    return (packet_header.chip_send_type < 2) && (packet_header.noc_send_type < 2);
 }
 } // namespace tt::fabric
diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp
index edde4791916..16d003b1c71 100644
--- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp
+++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp
@@ -32,36 +32,25 @@ void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader *const packe
 void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet_start) {
     switch (packet_start->noc_send_type) {
-        case tt::fabric::NocSendType::NOC_UNICAST: {
-            switch (packet_start->command_type) {
-                case tt::fabric::CommandType::WRITE: {
-                    DPRINT << "N_WR addr:"<<(uint32_t)packet_start->command_fields.unicast_write.address <<
-                        ", size:" << (uint32_t) packet_start->command_fields.unicast_write.size <<
-                        ", x:" << (uint32_t) packet_start->command_fields.unicast_write.noc_x <<
-                        ", y:" << (uint32_t) packet_start->command_fields.unicast_write.noc_y << "\n";
-                } break;
-                case tt::fabric::CommandType::ATOMIC_INC: {
-                    DPRINT << "N_WR addr:"<<(uint32_t)packet_start->command_fields.unicast_seminc.address <<
-                        ", val:" << (uint32_t) packet_start->command_fields.unicast_seminc.val <<
-                        ", x:" << (uint32_t) packet_start->command_fields.unicast_seminc.noc_x <<
-                        ", y:" << (uint32_t) packet_start->command_fields.unicast_seminc.noc_y << "\n";
-
-                } break;
-            }
-            break;
-        }
-        case tt::fabric::NocSendType::NOC_MULTICAST: {
-            ASSERT(false); // unimplemented
-            break;
-        }
-    }
+        case tt::fabric::NocSendType::NOC_UNICAST_WRITE: {
+            DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_write.noc_address <<
+                ", size:" << (uint32_t) packet_start->command_fields.unicast_write.size << "\n";
+        } break;
+        case tt::fabric::NocSendType::NOC_UNICAST_ATOMIC_INC: {
+            DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_seminc.noc_address <<
+                ", val:" << (uint32_t) packet_start->command_fields.unicast_seminc.val << "\n";
+
+        } break;
+        default:
+            ASSERT(false); // unimplemented
+            break;
+    };
 }
 void print_pkt_header(volatile tt::fabric::PacketHeader *const packet_start) {
     auto const& header = *packet_start;
-    DPRINT << "PKT: cmd_t:" << (uint32_t) packet_start->command_type <<
+    DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type <<
         ", csnd_t:" << (uint32_t) packet_start->chip_send_type <<
-        ", nsnd_t:" << (uint32_t) packet_start->noc_send_type <<
         ", src_chip:" <<
(uint32_t) packet_start->reserved2 << "\n"; print_pkt_hdr_routing_fields(packet_start); print_pkt_header_noc_fields(packet_start); @@ -73,73 +62,40 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const auto const& header = *packet_start; uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(tt::fabric::PacketHeader); - tt::fabric::CommandType command_type = packet_start->command_type; tt::fabric::NocSendType noc_send_type = packet_start->noc_send_type; - switch (command_type) { - case tt::fabric::CommandType::WRITE: { - switch (noc_send_type) { - case tt::fabric::NocSendType::NOC_UNICAST: { - DPRINT << "C_UNI to y|x" << (uint32_t)((header.command_fields.unicast_write.noc_y << 16) | header.command_fields.unicast_write.noc_x) << - ", " << (uint32_t)header.command_fields.unicast_write.address << "\n"; - auto const dest_address = get_noc_addr( - header.command_fields.unicast_write.noc_x, - header.command_fields.unicast_write.noc_y, - header.command_fields.unicast_write.address); - auto const size = header.command_fields.unicast_write.size - sizeof(tt::fabric::PacketHeader); - noc_async_write_one_packet_with_trid(payload_start_address, dest_address, size, transaction_id); - - }break; - case tt::fabric::NocSendType::NOC_MULTICAST: { - // TODO: confirm if we need to adjust dest core count if we span eth or dram cores - auto const mcast_dest_address = get_noc_multicast_addr( - header.command_fields.mcast_write.noc_x_start, - header.command_fields.mcast_write.noc_y_start, - header.command_fields.mcast_write.noc_x_start + header.command_fields.mcast_write.mcast_rect_size_x, - header.command_fields.mcast_write.noc_y_start + header.command_fields.mcast_write.mcast_rect_size_y, - header.command_fields.mcast_write.address); - auto const num_dests = header.command_fields.mcast_write.mcast_rect_size_x * header.command_fields.mcast_write.mcast_rect_size_y; - auto const size = header.command_fields.mcast_write.size - sizeof(tt::fabric::PacketHeader); - noc_async_write_one_packet_with_trid(payload_start_address, mcast_dest_address, size, num_dests, transaction_id); - - }break; - default: { - ASSERT(false); - } - } - break; - } - case tt::fabric::CommandType::ATOMIC_INC: { - DPRINT << "C_AT_INC\n"; - switch (noc_send_type) { - case tt::fabric::NocSendType::NOC_UNICAST: { - auto const dest_address = get_noc_addr( - header.command_fields.unicast_seminc.noc_x, - header.command_fields.unicast_seminc.noc_y, - header.command_fields.unicast_seminc.address); - auto const increment = header.command_fields.unicast_seminc.val; - DPRINT << "\tx=" << (uint32_t)header.command_fields.unicast_seminc.noc_x << - ", y=" << (uint32_t)header.command_fields.unicast_seminc.noc_y << - ", addr=" << (uint32_t)header.command_fields.unicast_seminc.address << - ", inc=" << (uint32_t)increment << "\n"; - noc_semaphore_inc(dest_address, increment); - - }break; - case tt::fabric::NocSendType::NOC_MULTICAST: { - ASSERT(false); - // noc_async_write(payload_start_address, header.dest_address, header.size_bytes); - - }break; - default: { - ASSERT(false); - } - } - break; + switch (noc_send_type) { + case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { + auto const dest_address = header.command_fields.unicast_write.noc_address; + auto const size = header.command_fields.unicast_write.size - sizeof(tt::fabric::PacketHeader); + noc_async_write_one_packet_with_trid(payload_start_address, dest_address, size, transaction_id); + + } break; + + case tt::fabric::NocSendType::NOC_MULTICAST_WRITE: { + // TODO: 
confirm if we need to adjust dest core count if we span eth or dram cores + auto const mcast_dest_address = get_noc_multicast_addr( + header.command_fields.mcast_write.noc_x_start, + header.command_fields.mcast_write.noc_y_start, + header.command_fields.mcast_write.noc_x_start + header.command_fields.mcast_write.mcast_rect_size_x, + header.command_fields.mcast_write.noc_y_start + header.command_fields.mcast_write.mcast_rect_size_y, + header.command_fields.mcast_write.address); + auto const num_dests = header.command_fields.mcast_write.mcast_rect_size_x * header.command_fields.mcast_write.mcast_rect_size_y; + auto const size = header.command_fields.mcast_write.size - sizeof(tt::fabric::PacketHeader); + noc_async_write_one_packet_with_trid(payload_start_address, mcast_dest_address, size, num_dests, transaction_id); - }; + } break; + + case tt::fabric::NocSendType::NOC_UNICAST_ATOMIC_INC: { + uint64_t const dest_address = header.command_fields.unicast_seminc.noc_address; + auto const increment = header.command_fields.unicast_seminc.val; + noc_semaphore_inc(dest_address, increment); + + } break; + case tt::fabric::NocSendType::NOC_MULTICAST_ATOMIC_INC: default: { ASSERT(false); - } + } break; }; } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index f296601f2a3..e913c18f7aa 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -778,7 +778,8 @@ void run_receiver_channel_step( print_pkt_header(packet_header); bool can_send_to_all_local_chip_receivers = can_forward_packet_completely(packet_header, downstream_edm_interface); - if (can_send_to_all_local_chip_receivers) { + bool trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); + if (can_send_to_all_local_chip_receivers && trid_flushed) { uint8_t trid = receiver_channel_trid_tracker.update_buffer_slot_to_next_trid_and_advance_trid_counter(receiver_buffer_index); receiver_forward_packet(packet_header, downstream_edm_interface, trid); wr_sent_ptr.increment(); @@ -789,6 +790,8 @@ void run_receiver_channel_step( bool unflushed_writes = !wr_flush_ptr.is_caught_up_to(wr_sent_ptr); if (unflushed_writes) { auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); + // Temporary patch for instability. Issue was not caught due to what appears to be a bug in CI + // not running all tests. Issue tracked here: https://github.com/tenstorrent/tt-metal/issues/17702 bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); if (next_trid_flushed) { local_receiver_channel.eth_clear_sender_channel_ack(receiver_buffer_index); diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp index 003d5934ded..a8dbeb8ade7 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp @@ -150,14 +150,13 @@ void kernel_main() { } // 2. 
mcast output ready semaphore + uint64_t out_ready_sem_noc_addr_in_pkt = + safe_get_noc_addr(out_ready_sem_noc0_x, out_ready_sem_noc0_y, out_ready_sem_bank_addr, 0); auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); - pkt_hdr->to_atomic_inc(); pkt_hdr->to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ - out_ready_sem_bank_addr, + out_ready_sem_noc_addr_in_pkt, static_cast(1), // increment 1 - 32, - static_cast(out_ready_sem_noc0_x), - static_cast(out_ready_sem_noc0_y)}); + 32}); // Write the mcast packet (forward) if (fabric_connection.has_forward_connection()) { fabric_connection.get_forward_connection().wait_for_empty_write_slot(); diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp index 54bfa996d39..b9f306cc42b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp @@ -159,13 +159,12 @@ void kernel_main() { // 2. mcast output ready semaphore auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); - pkt_hdr->to_atomic_inc(); + uint64_t out_ready_sem_noc_addr_in_pkt = + safe_get_noc_addr(out_ready_sem_noc0_x, out_ready_sem_noc0_y, out_ready_sem_bank_addr, 0); pkt_hdr->to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ - out_ready_sem_bank_addr, + out_ready_sem_noc_addr_in_pkt, static_cast(1), // increment 1 - 32, - static_cast(out_ready_sem_noc0_x), - static_cast(out_ready_sem_noc0_y)}); + 32}); // Write the mcast packet (forward) if (fabric_connection.has_forward_connection()) { fabric_connection.get_forward_connection().wait_for_empty_write_slot(); diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp index 777010fb399..a281806cafc 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp @@ -6,6 +6,7 @@ #include #include "cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/fabric_connection_manager.hpp" #include "cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/noc_addr.hpp" +#include "cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp" #include #include @@ -20,10 +21,10 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( const size_t payload_l1_address = l1_read_addr; size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr_forward->to_write()->to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, packet_send_size_bytes, static_cast(dest_noc_xy.x), static_cast(dest_noc_xy.y)}); - pkt_hdr_backward->to_write()->to_noc_unicast(tt::fabric::NocUnicastCommandHeader{ - dest_addr, packet_send_size_bytes, static_cast(dest_noc_xy.x), static_cast(dest_noc_xy.y)}); + pkt_hdr_forward->to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); + pkt_hdr_backward->to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); 
noc_async_write(payload_l1_address, safe_get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr), payload_size_bytes); if (fabric_connection.has_forward_connection()) { From e254ef42b46cd327388388501037f88df698b9c1 Mon Sep 17 00:00:00 2001 From: Sofija Jovic <148721049+s-jovic@users.noreply.github.com> Date: Fri, 7 Feb 2025 16:48:12 +0100 Subject: [PATCH 009/316] #0: fix golden functions for conv and matmul (#17592) --- ttnn/ttnn/operations/conv2d.py | 7 ++++++- ttnn/ttnn/operations/matmul.py | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ttnn/ttnn/operations/conv2d.py b/ttnn/ttnn/operations/conv2d.py index 1a506ec400f..1ce52333ce6 100644 --- a/ttnn/ttnn/operations/conv2d.py +++ b/ttnn/ttnn/operations/conv2d.py @@ -245,6 +245,8 @@ def _golden_function( groups: int = 1, bias_tensor=None, conv_config: Conv2dConfig = None, + return_output_dim=False, + return_weights_and_bias=False, **_, ): import torch @@ -272,7 +274,10 @@ def _golden_function( N, C, H, W = output_tensor.shape output_tensor = output_tensor.permute(0, 2, 3, 1).reshape(1, 1, N * H * W, C) # N, C, H, W -> 1, 1, NHW, C - return [output_tensor] + if return_output_dim or return_weights_and_bias: + return [output_tensor] + + return output_tensor ttnn.attach_golden_function( diff --git a/ttnn/ttnn/operations/matmul.py b/ttnn/ttnn/operations/matmul.py index 42b65471ec7..02cc4beaa24 100644 --- a/ttnn/ttnn/operations/matmul.py +++ b/ttnn/ttnn/operations/matmul.py @@ -17,7 +17,9 @@ ) -def _golden_function(input_tensor_a, input_tensor_b, *args, **kwargs): +def _golden_function( + input_tensor_a, input_tensor_b, transpose_a=False, transpose_b=False, *, bias=None, activation=None, **kwargs +): import torch if transpose_a: From 06e413bcfa69521201513a3bd22058555dff1346 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Fri, 7 Feb 2025 10:58:02 -0500 Subject: [PATCH 010/316] #0: remove duplicate header (#17722) ### Ticket NA ### Problem description I noticed this single instance of the wormhole ckernel_ops.h header. There are no matching GS nor BH instances in adjacent directories. This just looks redundant. ### What's changed Delete the header and the containing inc directory. ### Checklist - [YES] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [YES] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../hw/ckernels/wormhole_b0/inc/ckernel_ops.h | 1277 ----------------- 1 file changed, 1277 deletions(-) delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/inc/ckernel_ops.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/inc/ckernel_ops.h b/tt_metal/hw/ckernels/wormhole_b0/inc/ckernel_ops.h deleted file mode 100644 index 94947ef7456..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/inc/ckernel_ops.h +++ /dev/null @@ -1,1277 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -// -// Auto-generated file, do not modify! -// - -#pragma once - -#define TT_OP(opcode, params) ((opcode << 24) + params) -#define INSTRUCTION_WORD(x) \ - __asm__ __volatile__(".word (%0)" : : "i"((x))) // Drop 32 bits into the instruction stream. -#define TRISC_OP_SWIZZLE(x) \ - ((((x) >> 30) & 0x3) | (((x) & 0x3FFFFFFF) << 2)) // Put top 2 bits, which are currently never 'b11 to bottom, - // indicating to Risc that they are not risc instructions - -#define TT_OP_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - TT_OP(0x58, (((OpBisConst) << 23) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0))) -#define TT_ADDDMAREG_VALID(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(ResultRegIndex, 11) && ckernel::is_valid(OpBRegIndex, 6) && \ - ckernel::is_valid(OpARegIndex, 6)) -#define TT_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) -#define TTI_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex))) - -#define TT_OP_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - TT_OP( \ - 0x53, \ - (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + \ - ((BitMask) << 0))) -#define TT_ADDRCRXY_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && \ - ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) -#define TT_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - ckernel::instrn_buffer[0] = TT_OP_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) -#define TTI_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask))) - -#define TT_OP_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - TT_OP( \ - 0x56, \ - (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + \ - ((BitMask) << 0))) -#define TT_ADDRCRZW_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && \ - ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) -#define TT_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - ckernel::instrn_buffer[0] = TT_OP_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) -#define TTI_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask))) - -#define TT_OP_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ - TT_OP(0x25, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0))) -#define TT_APOOL3S1_VALID(clear_dvalid, addr_mode, index_en, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && \ - ckernel::is_valid(dst, 14)) -#define TT_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ - ckernel::instrn_buffer[0] = TT_OP_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) -#define TTI_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ 
- INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_APOOL3S1(clear_dvalid, addr_mode, index_en, dst))) - -#define TT_OP_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ - TT_OP(0x32, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0))) -#define TT_APOOL3S2_VALID(clear_dvalid, addr_mode, index_en, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && \ - ckernel::is_valid(dst, 14)) -#define TT_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ - ckernel::instrn_buffer[0] = TT_OP_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) -#define TTI_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_APOOL3S2(clear_dvalid, addr_mode, index_en, dst))) - -#define TT_OP_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \ - TT_OP( \ - 0x64, \ - (((MemHierSel) << 23) + ((SwapVal) << 18) + ((CmpVal) << 14) + ((Sel32b) << 12) + ((DataRegIndex) << 6) + \ - ((AddrRegIndex) << 0))) -#define TT_ATCAS_VALID(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \ - (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(SwapVal, 5) && ckernel::is_valid(CmpVal, 4) && \ - ckernel::is_valid(Sel32b, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6)) -#define TT_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) -#define TTI_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex))) - -#define TT_OP_ATGETM(mutex_index) TT_OP(0xa0, (((mutex_index) << 0))) -#define TT_ATGETM_VALID(mutex_index) (ckernel::is_valid(mutex_index, 24)) -#define TT_ATGETM(mutex_index) ckernel::instrn_buffer[0] = TT_OP_ATGETM(mutex_index) -#define TTI_ATGETM(mutex_index) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATGETM(mutex_index))) - -#define TT_OP_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ - TT_OP( \ - 0x61, \ - (((MemHierSel) << 23) + ((WrapVal) << 14) + ((Sel32b) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) -#define TT_ATINCGET_VALID(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ - (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(WrapVal, 9) && ckernel::is_valid(Sel32b, 2) && \ - ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6)) -#define TT_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) -#define TTI_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex))) - -#define TT_OP_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ - TT_OP( \ - 0x62, \ - (((MemHierSel) << 23) + ((NoIncr) << 22) + ((IncrVal) << 18) + ((WrapVal) << 14) + ((Sel32b) << 12) + \ - ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) -#define TT_ATINCGETPTR_VALID(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ - (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(NoIncr, 1) && ckernel::is_valid(IncrVal, 4) && \ - ckernel::is_valid(WrapVal, 4) && ckernel::is_valid(Sel32b, 2) && ckernel::is_valid(DataRegIndex, 6) && \ - ckernel::is_valid(AddrRegIndex, 6)) 
-#define TT_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ - ckernel::instrn_buffer[0] = \ - TT_OP_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) -#define TTI_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ - INSTRUCTION_WORD( \ - TRISC_OP_SWIZZLE(TT_OP_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex))) - -#define TT_OP_ATRELM(mutex_index) TT_OP(0xa1, (((mutex_index) << 0))) -#define TT_ATRELM_VALID(mutex_index) (ckernel::is_valid(mutex_index, 24)) -#define TT_ATRELM(mutex_index) ckernel::instrn_buffer[0] = TT_OP_ATRELM(mutex_index) -#define TTI_ATRELM(mutex_index) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATRELM(mutex_index))) - -#define TT_OP_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \ - TT_OP(0x63, (((MemHierSel) << 23) + ((SwapMask) << 14) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) -#define TT_ATSWAP_VALID(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \ - (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(SwapMask, 9) && ckernel::is_valid(DataRegIndex, 8) && \ - ckernel::is_valid(AddrRegIndex, 6)) -#define TT_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) -#define TTI_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex))) - -#define TT_OP_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - TT_OP( \ - 0x5b, \ - (((OpBisConst) << 23) + ((OpSel) << 18) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + \ - ((OpARegIndex) << 0))) -#define TT_BITWOPDMAREG_VALID(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(OpSel, 5) && ckernel::is_valid(ResultRegIndex, 6) && \ - ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) -#define TT_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) -#define TTI_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex))) - -#define TT_OP_CLEARDVALID(cleardvalid, reset) TT_OP(0x36, (((cleardvalid) << 22) + ((reset) << 0))) -#define TT_CLEARDVALID_VALID(cleardvalid, reset) (ckernel::is_valid(cleardvalid, 2) && ckernel::is_valid(reset, 22)) -#define TT_CLEARDVALID(cleardvalid, reset) ckernel::instrn_buffer[0] = TT_OP_CLEARDVALID(cleardvalid, reset) -#define TTI_CLEARDVALID(cleardvalid, reset) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CLEARDVALID(cleardvalid, reset))) - -#define TT_OP_CLREXPHIST TT_OP(0x21, 0) -#define TTI_CLREXPHIST INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CLREXPHIST)) - -#define TT_OP_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - TT_OP( \ - 0x5d, \ - (((OpBisConst) << 23) + ((OpSel) << 18) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + \ - ((OpARegIndex) << 0))) -#define TT_CMPDMAREG_VALID(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(OpSel, 5) && ckernel::is_valid(ResultRegIndex, 6) && \ - ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) -#define 
TT_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) -#define TTI_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex))) - -#define TT_OP_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ - TT_OP(0x22, (((clear_dvalid) << 22) + ((rotate_weights) << 17) + ((addr_mode) << 15) + ((dst) << 0))) -#define TT_CONV3S1_VALID(clear_dvalid, rotate_weights, addr_mode, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(rotate_weights, 5) && ckernel::is_valid(addr_mode, 2) && \ - ckernel::is_valid(dst, 15)) -#define TT_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ - ckernel::instrn_buffer[0] = TT_OP_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) -#define TTI_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst))) - -#define TT_OP_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) \ - TT_OP(0x23, (((clear_dvalid) << 22) + ((rotate_weights) << 17) + ((addr_mode) << 15) + ((dst) << 0))) -#define TT_CONV3S2_VALID(clear_dvalid, rotate_weights, addr_mode, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(rotate_weights, 5) && ckernel::is_valid(addr_mode, 2) && \ - ckernel::is_valid(dst, 15)) -#define TT_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) \ - ckernel::instrn_buffer[0] = TT_OP_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) -#define TTI_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst))) - -#define TT_OP_DMANOP TT_OP(0x60, 0) -#define TTI_DMANOP INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_DMANOP)) - -#define TT_OP_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - TT_OP( \ - 0x29, \ - (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + \ - ((dst) << 0))) -#define TT_DOTPV_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && \ - ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) -#define TT_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - ckernel::instrn_buffer[0] = TT_OP_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) -#define TTI_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst))) - -#define TT_OP_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - TT_OP( \ - 0x28, \ - (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + \ - ((dst) << 0))) -#define TT_ELWADD_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && \ - ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) -#define TT_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - ckernel::instrn_buffer[0] = TT_OP_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) -#define TTI_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - 
INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst))) - -#define TT_OP_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - TT_OP( \ - 0x27, \ - (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + \ - ((dst) << 0))) -#define TT_ELWMUL_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && \ - ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) -#define TT_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - ckernel::instrn_buffer[0] = TT_OP_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) -#define TTI_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst))) - -#define TT_OP_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - TT_OP( \ - 0x30, \ - (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + \ - ((dst) << 0))) -#define TT_ELWSUB_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && \ - ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) -#define TT_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - ckernel::instrn_buffer[0] = TT_OP_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) -#define TTI_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst))) - -#define TT_OP_FLUSHDMA(FlushSpec) TT_OP(0x46, (((FlushSpec) << 0))) -#define TT_FLUSHDMA_VALID(FlushSpec) (ckernel::is_valid(FlushSpec, 24)) -#define TT_FLUSHDMA(FlushSpec) ckernel::instrn_buffer[0] = TT_OP_FLUSHDMA(FlushSpec) -#define TTI_FLUSHDMA(FlushSpec) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_FLUSHDMA(FlushSpec))) - -#define TT_OP_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ - TT_OP( \ - 0x34, \ - (((clear_dvalid) << 22) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((max_pool_index_en) << 14) + \ - ((dst) << 0))) -#define TT_GAPOOL_VALID(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(instr_mod19, 3) && ckernel::is_valid(addr_mode, 4) && \ - ckernel::is_valid(max_pool_index_en, 1) && ckernel::is_valid(dst, 14)) -#define TT_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ - ckernel::instrn_buffer[0] = TT_OP_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) -#define TTI_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst))) - -#define TT_OP_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) \ - TT_OP(0x35, (((reset_srcb_gate_control) << 1) + ((reset_srca_gate_control) << 0))) -#define TT_GATESRCRST_VALID(reset_srcb_gate_control, reset_srca_gate_control) \ - (ckernel::is_valid(reset_srcb_gate_control, 23) && ckernel::is_valid(reset_srca_gate_control, 1)) -#define TT_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) \ - ckernel::instrn_buffer[0] = TT_OP_GATESRCRST(reset_srcb_gate_control,
reset_srca_gate_control) -#define TTI_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control))) - -#define TT_OP_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ - TT_OP( \ - 0x33, \ - (((clear_dvalid) << 22) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((max_pool_index_en) << 14) + \ - ((dst) << 0))) -#define TT_GMPOOL_VALID(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(instr_mod19, 3) && ckernel::is_valid(addr_mode, 4) && \ - ckernel::is_valid(max_pool_index_en, 1) && ckernel::is_valid(dst, 14)) -#define TT_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ - ckernel::instrn_buffer[0] = TT_OP_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) -#define TTI_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst))) - -#define TT_OP_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ - TT_OP(0x52, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6))) -#define TT_INCADCXY_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ - (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && \ - ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3)) -#define TT_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ - ckernel::instrn_buffer[0] = TT_OP_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) -#define TTI_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X))) - -#define TT_OP_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ - TT_OP(0x55, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6))) -#define TT_INCADCZW_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ - (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && \ - ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3)) -#define TT_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ - ckernel::instrn_buffer[0] = TT_OP_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) -#define TTI_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X))) - -#define TT_OP_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) \ - TT_OP(0x38, (((rwc_cr) << 18) + ((rwc_d) << 14) + ((rwc_b) << 10) + ((rwc_a) << 6))) -#define TT_INCRWC_VALID(rwc_cr, rwc_d, rwc_b, rwc_a) \ - (ckernel::is_valid(rwc_cr, 6) && ckernel::is_valid(rwc_d, 4) && ckernel::is_valid(rwc_b, 4) && \ - ckernel::is_valid(rwc_a, 4)) -#define TT_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) ckernel::instrn_buffer[0] = TT_OP_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) -#define TTI_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a))) - -#define TT_OP_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ - TT_OP( \ - 0x49, \ - (((SizeSel) << 22) + ((OffsetIndex) << 14) + ((AutoIncSpec) << 12) + ((DataRegIndex) << 6) + \ - ((AddrRegIndex) << 0))) -#define TT_LOADIND_VALID(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ - (ckernel::is_valid(SizeSel, 2) && ckernel::is_valid(OffsetIndex, 8) && ckernel::is_valid(AutoIncSpec, 2) && \ - 
ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6)) -#define TT_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) -#define TTI_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex))) - -#define TT_OP_LOADREG(TdmaDataRegIndex, RegAddr) TT_OP(0x68, (((TdmaDataRegIndex) << 18) + ((RegAddr) << 0))) -#define TT_LOADREG_VALID(TdmaDataRegIndex, RegAddr) \ - (ckernel::is_valid(TdmaDataRegIndex, 6) && ckernel::is_valid(RegAddr, 18)) -#define TT_LOADREG(TdmaDataRegIndex, RegAddr) ckernel::instrn_buffer[0] = TT_OP_LOADREG(TdmaDataRegIndex, RegAddr) -#define TTI_LOADREG(TdmaDataRegIndex, RegAddr) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_LOADREG(TdmaDataRegIndex, RegAddr))) - -#define TT_OP_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ - TT_OP(0x3a, (((clear_dvalid) << 22) + ((rotate_weights) << 17) + ((addr_mode) << 15) + ((dst) << 0))) -#define TT_MFCONV3S1_VALID(clear_dvalid, rotate_weights, addr_mode, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(rotate_weights, 5) && ckernel::is_valid(addr_mode, 2) && \ - ckernel::is_valid(dst, 15)) -#define TT_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ - ckernel::instrn_buffer[0] = TT_OP_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) -#define TTI_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst))) - -#define TT_OP_MOP(mop_type, loop_count, zmask_lo16) \ - TT_OP(0x01, (((mop_type) << 23) + ((loop_count) << 16) + ((zmask_lo16) << 0))) -#define TT_MOP_VALID(mop_type, loop_count, zmask_lo16) \ - (ckernel::is_valid(mop_type, 1) && ckernel::is_valid(loop_count, 7) && ckernel::is_valid(zmask_lo16, 16)) -#define TT_MOP(mop_type, loop_count, zmask_lo16) ckernel::instrn_buffer[0] = TT_OP_MOP(mop_type, loop_count, zmask_lo16) -#define TTI_MOP(mop_type, loop_count, zmask_lo16) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOP(mop_type, loop_count, zmask_lo16))) - -#define TT_OP_MOP_CFG(zmask_hi16) TT_OP(0x03, (((zmask_hi16) << 0))) -#define TT_MOP_CFG_VALID(zmask_hi16) (ckernel::is_valid(zmask_hi16, 24)) -#define TT_MOP_CFG(zmask_hi16) ckernel::instrn_buffer[0] = TT_OP_MOP_CFG(zmask_hi16) -#define TTI_MOP_CFG(zmask_hi16) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOP_CFG(zmask_hi16))) - -#define TT_OP_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - TT_OP(0x12, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) -#define TT_MOVA2D_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && \ - ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) -#define TT_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - ckernel::instrn_buffer[0] = TT_OP_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) -#define TTI_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst))) - -#define TT_OP_MOVB2A(srca, addr_mode, instr_mod, srcb) \ - TT_OP(0x0b, (((srca) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((srcb) << 0))) -#define TT_MOVB2A_VALID(srca, addr_mode, instr_mod, srcb) \ - 
(ckernel::is_valid(srca, 7) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && \ - ckernel::is_valid(srcb, 12)) -#define TT_MOVB2A(srca, addr_mode, instr_mod, srcb) \ - ckernel::instrn_buffer[0] = TT_OP_MOVB2A(srca, addr_mode, instr_mod, srcb) -#define TTI_MOVB2A(srca, addr_mode, instr_mod, srcb) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVB2A(srca, addr_mode, instr_mod, srcb))) - -#define TT_OP_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - TT_OP(0x13, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) -#define TT_MOVB2D_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && \ - ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) -#define TT_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - ckernel::instrn_buffer[0] = TT_OP_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) -#define TTI_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst))) - -#define TT_OP_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - TT_OP(0x08, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) -#define TT_MOVD2A_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && \ - ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) -#define TT_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - ckernel::instrn_buffer[0] = TT_OP_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) -#define TTI_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst))) - -#define TT_OP_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - TT_OP(0x0a, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) -#define TT_MOVD2B_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && \ - ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) -#define TT_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - ckernel::instrn_buffer[0] = TT_OP_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) -#define TTI_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst))) - -#define TT_OP_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - TT_OP(0x09, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) -#define TT_MOVDBGA2D_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && \ - ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) -#define TT_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - ckernel::instrn_buffer[0] = TT_OP_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) -#define TTI_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst))) - -#define TT_OP_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ - TT_OP(0x24, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) 
+ ((dst) << 0))) -#define TT_MPOOL3S1_VALID(clear_dvalid, addr_mode, index_en, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && \ - ckernel::is_valid(dst, 14)) -#define TT_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ - ckernel::instrn_buffer[0] = TT_OP_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) -#define TTI_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst))) - -#define TT_OP_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ - TT_OP(0x31, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0))) -#define TT_MPOOL3S2_VALID(clear_dvalid, addr_mode, index_en, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && \ - ckernel::is_valid(dst, 14)) -#define TT_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ - ckernel::instrn_buffer[0] = TT_OP_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) -#define TTI_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst))) - -#define TT_OP_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - TT_OP(0x5a, (((OpBisConst) << 23) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0))) -#define TT_MULDMAREG_VALID(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(ResultRegIndex, 11) && ckernel::is_valid(OpBRegIndex, 6) && \ - ckernel::is_valid(OpARegIndex, 6)) -#define TT_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) -#define TTI_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex))) - -#define TT_OP_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) \ - TT_OP(0x26, (((clear_dvalid) << 22) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0))) -#define TT_MVMUL_VALID(clear_dvalid, instr_mod19, addr_mode, dst) \ - (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(instr_mod19, 3) && ckernel::is_valid(addr_mode, 4) && \ - ckernel::is_valid(dst, 15)) -#define TT_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) \ - ckernel::instrn_buffer[0] = TT_OP_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) -#define TTI_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst))) - -#define TT_OP_NOP TT_OP(0x02, 0) -#define TTI_NOP INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_NOP)) - -#define TT_OP_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \ - TT_OP( \ - 0x41, \ - (((AddrMode) << 15) + ((ZeroWrite) << 12) + ((PackSel) << 8) + ((OvrdThreadId) << 7) + ((Concat) << 4) + \ - ((Flush) << 1) + ((Last) << 0))) -#define TT_PACR_VALID(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \ - (ckernel::is_valid(AddrMode, 9) && ckernel::is_valid(ZeroWrite, 3) && ckernel::is_valid(PackSel, 4) && \ - ckernel::is_valid(OvrdThreadId, 1) && ckernel::is_valid(Concat, 3) && ckernel::is_valid(Flush, 3) && \ - ckernel::is_valid(Last, 1)) -#define TT_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \ - ckernel::instrn_buffer[0] = TT_OP_PACR(AddrMode, ZeroWrite, PackSel, 
OvrdThreadId, Concat, Flush, Last) -#define TTI_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last))) - -#define TT_OP_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \ - TT_OP( \ - 0x4a, \ - (((Push) << 23) + ((AddrSel) << 22) + ((WrData) << 12) + ((PackSel) << 8) + ((StreamId) << 2) + \ - ((Flush) << 1) + ((Last) << 0))) -#define TT_PACR_SETREG_VALID(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \ - (ckernel::is_valid(Push, 1) && ckernel::is_valid(AddrSel, 1) && ckernel::is_valid(WrData, 10) && \ - ckernel::is_valid(PackSel, 4) && ckernel::is_valid(StreamId, 6) && ckernel::is_valid(Flush, 1) && \ - ckernel::is_valid(Last, 1)) -#define TT_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \ - ckernel::instrn_buffer[0] = TT_OP_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) -#define TTI_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last))) - -#define TT_OP_RAREB TT_OP(0x15, 0) -#define TTI_RAREB INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RAREB)) - -#define TT_OP_RDCFG(GprAddress, CfgReg) TT_OP(0xb1, (((GprAddress) << 16) + ((CfgReg) << 0))) -#define TT_RDCFG_VALID(GprAddress, CfgReg) (ckernel::is_valid(GprAddress, 8) && ckernel::is_valid(CfgReg, 16)) -#define TT_RDCFG(GprAddress, CfgReg) ckernel::instrn_buffer[0] = TT_OP_RDCFG(GprAddress, CfgReg) -#define TTI_RDCFG(GprAddress, CfgReg) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RDCFG(GprAddress, CfgReg))) - -#define TT_OP_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \ - TT_OP( \ - 0x48, \ - (((SizeSel) << 22) + ((TargetSel) << 20) + ((ByteOffset) << 18) + ((ContextId_2) << 16) + ((FlopIndex) << 6) + \ - ((RegIndex) << 0))) -#define TT_REG2FLOP_VALID(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \ - (ckernel::is_valid(SizeSel, 2) && ckernel::is_valid(TargetSel, 2) && ckernel::is_valid(ByteOffset, 2) && \ - ckernel::is_valid(ContextId_2, 2) && ckernel::is_valid(FlopIndex, 10) && ckernel::is_valid(RegIndex, 6)) -#define TT_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) -#define TTI_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex))) - -#define TT_OP_REPLAY(start_idx, len, execute_while_loading, load_mode) \ - TT_OP(0x04, (((start_idx) << 14) + ((len) << 4) + ((execute_while_loading) << 1) + ((load_mode) << 0))) -#define TT_REPLAY_VALID(start_idx, len, execute_while_loading, load_mode) \ - (ckernel::is_valid(start_idx, 10) && ckernel::is_valid(len, 10) && ckernel::is_valid(execute_while_loading, 3) && \ - ckernel::is_valid(load_mode, 1)) -#define TT_REPLAY(start_idx, len, execute_while_loading, load_mode) \ - ckernel::instrn_buffer[0] = TT_OP_REPLAY(start_idx, len, execute_while_loading, load_mode) -#define TTI_REPLAY(start_idx, len, execute_while_loading, load_mode) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_REPLAY(start_idx, len, execute_while_loading, load_mode))) - -#define TT_OP_RMWCIB0(Mask, Data, CfgRegAddr) TT_OP(0xb3, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) -#define 
TT_RMWCIB0_VALID(Mask, Data, CfgRegAddr) \ - (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) -#define TT_RMWCIB0(Mask, Data, CfgRegAddr) ckernel::instrn_buffer[0] = TT_OP_RMWCIB0(Mask, Data, CfgRegAddr) -#define TTI_RMWCIB0(Mask, Data, CfgRegAddr) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB0(Mask, Data, CfgRegAddr))) - -#define TT_OP_RMWCIB1(Mask, Data, CfgRegAddr) TT_OP(0xb4, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) -#define TT_RMWCIB1_VALID(Mask, Data, CfgRegAddr) \ - (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) -#define TT_RMWCIB1(Mask, Data, CfgRegAddr) ckernel::instrn_buffer[0] = TT_OP_RMWCIB1(Mask, Data, CfgRegAddr) -#define TTI_RMWCIB1(Mask, Data, CfgRegAddr) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB1(Mask, Data, CfgRegAddr))) - -#define TT_OP_RMWCIB2(Mask, Data, CfgRegAddr) TT_OP(0xb5, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) -#define TT_RMWCIB2_VALID(Mask, Data, CfgRegAddr) \ - (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) -#define TT_RMWCIB2(Mask, Data, CfgRegAddr) ckernel::instrn_buffer[0] = TT_OP_RMWCIB2(Mask, Data, CfgRegAddr) -#define TTI_RMWCIB2(Mask, Data, CfgRegAddr) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB2(Mask, Data, CfgRegAddr))) - -#define TT_OP_RMWCIB3(Mask, Data, CfgRegAddr) TT_OP(0xb6, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) -#define TT_RMWCIB3_VALID(Mask, Data, CfgRegAddr) \ - (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) -#define TT_RMWCIB3(Mask, Data, CfgRegAddr) ckernel::instrn_buffer[0] = TT_OP_RMWCIB3(Mask, Data, CfgRegAddr) -#define TTI_RMWCIB3(Mask, Data, CfgRegAddr) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB3(Mask, Data, CfgRegAddr))) - -#define TT_OP_RSTDMA TT_OP(0x44, 0) -#define TTI_RSTDMA INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RSTDMA)) - -#define TT_OP_SEMGET(sem_sel) TT_OP(0xa5, (((sem_sel) << 2))) -#define TT_SEMGET_VALID(sem_sel) (ckernel::is_valid(sem_sel, 22)) -#define TT_SEMGET(sem_sel) ckernel::instrn_buffer[0] = TT_OP_SEMGET(sem_sel) -#define TTI_SEMGET(sem_sel) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMGET(sem_sel))) - -#define TT_OP_SEMINIT(max_value, init_value, sem_sel) \ - TT_OP(0xa3, (((max_value) << 20) + ((init_value) << 16) + ((sem_sel) << 2))) -#define TT_SEMINIT_VALID(max_value, init_value, sem_sel) \ - (ckernel::is_valid(max_value, 4) && ckernel::is_valid(init_value, 4) && ckernel::is_valid(sem_sel, 14)) -#define TT_SEMINIT(max_value, init_value, sem_sel) \ - ckernel::instrn_buffer[0] = TT_OP_SEMINIT(max_value, init_value, sem_sel) -#define TTI_SEMINIT(max_value, init_value, sem_sel) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMINIT(max_value, init_value, sem_sel))) - -#define TT_OP_SEMPOST(sem_sel) TT_OP(0xa4, (((sem_sel) << 2))) -#define TT_SEMPOST_VALID(sem_sel) (ckernel::is_valid(sem_sel, 22)) -#define TT_SEMPOST(sem_sel) ckernel::instrn_buffer[0] = TT_OP_SEMPOST(sem_sel) -#define TTI_SEMPOST(sem_sel) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMPOST(sem_sel))) - -#define TT_OP_SEMWAIT(stall_res, sem_sel, wait_sem_cond) \ - TT_OP(0xa6, (((stall_res) << 15) + ((sem_sel) << 2) + ((wait_sem_cond) << 0))) -#define TT_SEMWAIT_VALID(stall_res, sem_sel, wait_sem_cond) \ - (ckernel::is_valid(stall_res, 9) && ckernel::is_valid(sem_sel, 13) && ckernel::is_valid(wait_sem_cond, 2)) -#define TT_SEMWAIT(stall_res, sem_sel, wait_sem_cond) \ - ckernel::instrn_buffer[0] = 
TT_OP_SEMWAIT(stall_res, sem_sel, wait_sem_cond) -#define TTI_SEMWAIT(stall_res, sem_sel, wait_sem_cond) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMWAIT(stall_res, sem_sel, wait_sem_cond))) - -#define TT_OP_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) \ - TT_OP(0x50, (((CntSetMask) << 21) + ((ChannelIndex) << 20) + ((DimensionIndex) << 18) + ((Value) << 0))) -#define TT_SETADC_VALID(CntSetMask, ChannelIndex, DimensionIndex, Value) \ - (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(ChannelIndex, 1) && ckernel::is_valid(DimensionIndex, 2) && \ - ckernel::is_valid(Value, 18)) -#define TT_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) \ - ckernel::instrn_buffer[0] = TT_OP_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) -#define TTI_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value))) - -#define TT_OP_SETADCXX(CntSetMask, x_end2, x_start) \ - TT_OP(0x5e, (((CntSetMask) << 21) + ((x_end2) << 10) + ((x_start) << 0))) -#define TT_SETADCXX_VALID(CntSetMask, x_end2, x_start) \ - (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(x_end2, 11) && ckernel::is_valid(x_start, 10)) -#define TT_SETADCXX(CntSetMask, x_end2, x_start) ckernel::instrn_buffer[0] = TT_OP_SETADCXX(CntSetMask, x_end2, x_start) -#define TTI_SETADCXX(CntSetMask, x_end2, x_start) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADCXX(CntSetMask, x_end2, x_start))) - -#define TT_OP_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - TT_OP( \ - 0x51, \ - (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + \ - ((BitMask) << 0))) -#define TT_SETADCXY_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && \ - ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) -#define TT_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - ckernel::instrn_buffer[0] = TT_OP_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) -#define TTI_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask))) - -#define TT_OP_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - TT_OP( \ - 0x54, \ - (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + \ - ((BitMask) << 0))) -#define TT_SETADCZW_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && \ - ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) -#define TT_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - ckernel::instrn_buffer[0] = TT_OP_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) -#define TTI_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask))) - -#define TT_OP_SETASHRMH(reg_mask, halo_mask) TT_OP(0x1e, (((reg_mask) << 1) + ((halo_mask) << 0))) -#define TT_SETASHRMH_VALID(reg_mask, halo_mask) (ckernel::is_valid(reg_mask, 23) && ckernel::is_valid(halo_mask, 1)) -#define TT_SETASHRMH(reg_mask, halo_mask) ckernel::instrn_buffer[0] = TT_OP_SETASHRMH(reg_mask, halo_mask) -#define TTI_SETASHRMH(reg_mask, halo_mask) 
INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMH(reg_mask, halo_mask))) - -#define TT_OP_SETASHRMH0(reg_mask, halo_mask) TT_OP(0x1a, (((reg_mask) << 1) + ((halo_mask) << 0))) -#define TT_SETASHRMH0_VALID(reg_mask, halo_mask) (ckernel::is_valid(reg_mask, 23) && ckernel::is_valid(halo_mask, 1)) -#define TT_SETASHRMH0(reg_mask, halo_mask) ckernel::instrn_buffer[0] = TT_OP_SETASHRMH0(reg_mask, halo_mask) -#define TTI_SETASHRMH0(reg_mask, halo_mask) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMH0(reg_mask, halo_mask))) - -#define TT_OP_SETASHRMH1(reg_mask, halo_mask) TT_OP(0x1b, (((reg_mask) << 1) + ((halo_mask) << 0))) -#define TT_SETASHRMH1_VALID(reg_mask, halo_mask) (ckernel::is_valid(reg_mask, 23) && ckernel::is_valid(halo_mask, 1)) -#define TT_SETASHRMH1(reg_mask, halo_mask) ckernel::instrn_buffer[0] = TT_OP_SETASHRMH1(reg_mask, halo_mask) -#define TTI_SETASHRMH1(reg_mask, halo_mask) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMH1(reg_mask, halo_mask))) - -#define TT_OP_SETASHRMV(reg_mask2) TT_OP(0x1c, (((reg_mask2) << 0))) -#define TT_SETASHRMV_VALID(reg_mask2) (ckernel::is_valid(reg_mask2, 24)) -#define TT_SETASHRMV(reg_mask2) ckernel::instrn_buffer[0] = TT_OP_SETASHRMV(reg_mask2) -#define TTI_SETASHRMV(reg_mask2) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMV(reg_mask2))) - -#define TT_OP_SETC16(setc16_reg, setc16_value) TT_OP(0xb2, (((setc16_reg) << 16) + ((setc16_value) << 0))) -#define TT_SETC16_VALID(setc16_reg, setc16_value) \ - (ckernel::is_valid(setc16_reg, 8) && ckernel::is_valid(setc16_value, 16)) -#define TT_SETC16(setc16_reg, setc16_value) ckernel::instrn_buffer[0] = TT_OP_SETC16(setc16_reg, setc16_value) -#define TTI_SETC16(setc16_reg, setc16_value) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETC16(setc16_reg, setc16_value))) - -#define TT_OP_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \ - TT_OP( \ - 0x45, \ - (((Payload_SigSelSize) << 22) + ((Payload_SigSel) << 8) + ((SetSignalsMode) << 7) + ((RegIndex16b) << 0))) -#define TT_SETDMAREG_VALID(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \ - (ckernel::is_valid(Payload_SigSelSize, 2) && ckernel::is_valid(Payload_SigSel, 14) && \ - ckernel::is_valid(SetSignalsMode, 1) && ckernel::is_valid(RegIndex16b, 7)) -#define TT_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \ - ckernel::instrn_buffer[0] = TT_OP_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) -#define TTI_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b))) - -#define TT_OP_SETDVALID(setvalid) TT_OP(0x57, (((setvalid) << 0))) -#define TT_SETDVALID_VALID(setvalid) (ckernel::is_valid(setvalid, 24)) -#define TT_SETDVALID(setvalid) ckernel::instrn_buffer[0] = TT_OP_SETDVALID(setvalid) -#define TTI_SETDVALID(setvalid) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETDVALID(setvalid))) - -#define TT_OP_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) \ - TT_OP(0x39, (((rwc_cr) << 18) + ((rwc_bias) << 6) + ((set_inc_ctrl) << 0))) -#define TT_SETIBRWC_VALID(rwc_cr, rwc_bias, set_inc_ctrl) \ - (ckernel::is_valid(rwc_cr, 6) && ckernel::is_valid(rwc_bias, 12) && ckernel::is_valid(set_inc_ctrl, 6)) -#define TT_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) \ - ckernel::instrn_buffer[0] = TT_OP_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) -#define TTI_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETIBRWC(rwc_cr, 
rwc_bias, set_inc_ctrl))) - -#define TT_OP_SETPKEDGOF(y_end, y_start, x_end, x_start) \ - TT_OP(0x1d, (((y_end) << 12) + ((y_start) << 8) + ((x_end) << 4) + ((x_start) << 0))) -#define TT_SETPKEDGOF_VALID(y_end, y_start, x_end, x_start) \ - (ckernel::is_valid(y_end, 12) && ckernel::is_valid(y_start, 4) && ckernel::is_valid(x_end, 4) && \ - ckernel::is_valid(x_start, 4)) -#define TT_SETPKEDGOF(y_end, y_start, x_end, x_start) \ - ckernel::instrn_buffer[0] = TT_OP_SETPKEDGOF(y_end, y_start, x_end, x_start) -#define TTI_SETPKEDGOF(y_end, y_start, x_end, x_start) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETPKEDGOF(y_end, y_start, x_end, x_start))) - -#define TT_OP_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \ - TT_OP( \ - 0x37, \ - (((clear_ab_vld) << 22) + ((rwc_cr) << 18) + ((rwc_d) << 14) + ((rwc_b) << 10) + ((rwc_a) << 6) + \ - ((BitMask) << 0))) -#define TT_SETRWC_VALID(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \ - (ckernel::is_valid(clear_ab_vld, 2) && ckernel::is_valid(rwc_cr, 4) && ckernel::is_valid(rwc_d, 4) && \ - ckernel::is_valid(rwc_b, 4) && ckernel::is_valid(rwc_a, 4) && ckernel::is_valid(BitMask, 6)) -#define TT_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \ - ckernel::instrn_buffer[0] = TT_OP_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) -#define TTI_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask))) - -#define TT_OP_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x7d, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPABS_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - TT_OP( \ - 0x85, \ - (((lreg_src_a) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + \ - ((instr_mod1) << 0))) -#define TT_SFPADD_VALID(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(lreg_src_a, 8) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && \ - ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) -#define TTI_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPADDI(imm16_math, lreg_dest, instr_mod1) \ - TT_OP(0x75, (((imm16_math) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPADDI_VALID(imm16_math, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm16_math, 16) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPADDI(imm16_math, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPADDI(imm16_math, lreg_dest, instr_mod1) -#define TTI_SFPADDI(imm16_math, lreg_dest, 
instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPADDI(imm16_math, lreg_dest, instr_mod1))) - -#define TT_OP_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x7e, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPAND_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) \ - TT_OP(0x90, (((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPCAST_VALID(lreg_src_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(lreg_src_c, 16) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) -#define TTI_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPCAST(lreg_src_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x8b, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPCOMPC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPCONFIG(imm16_math, config_dest, instr_mod1) \ - TT_OP(0x91, (((imm16_math) << 8) + ((config_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPCONFIG_VALID(imm16_math, config_dest, instr_mod1) \ - (ckernel::is_valid(imm16_math, 16) && ckernel::is_valid(config_dest, 4) && ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPCONFIG(imm16_math, config_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPCONFIG(imm16_math, config_dest, instr_mod1) -#define TTI_SFPCONFIG(imm16_math, config_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPCONFIG(imm16_math, config_dest, instr_mod1))) - -#define TT_OP_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x76, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPDIVP2_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x8a, (((imm12_math) << 12) + ((lreg_c) << 8) + 
((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPENCC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x77, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPEXEXP_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x78, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPEXMAN_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x79, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPIADD_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - TT_OP(0x70, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((sfpu_addr_mode) << 14) + ((dest_reg_addr) << 0))) -#define TT_SFPLOAD_VALID(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(sfpu_addr_mode, 2) && \ - ckernel::is_valid(dest_reg_addr, 14)) -#define TT_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - ckernel::instrn_buffer[0] = TT_OP_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) -#define TTI_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr))) - -#define TT_OP_SFPLOADI(lreg_ind, instr_mod0, imm16) \ - TT_OP(0x71, (((lreg_ind) << 20) + 
((instr_mod0) << 16) + ((imm16) << 0))) -#define TT_SFPLOADI_VALID(lreg_ind, instr_mod0, imm16) \ - (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(imm16, 16)) -#define TT_SFPLOADI(lreg_ind, instr_mod0, imm16) ckernel::instrn_buffer[0] = TT_OP_SFPLOADI(lreg_ind, instr_mod0, imm16) -#define TTI_SFPLOADI(lreg_ind, instr_mod0, imm16) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLOADI(lreg_ind, instr_mod0, imm16))) - -#define TT_OP_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - TT_OP(0x93, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((sfpu_addr_mode) << 14) + ((dest_reg_addr) << 0))) -#define TT_SFPLOADMACRO_VALID(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(sfpu_addr_mode, 2) && \ - ckernel::is_valid(dest_reg_addr, 14)) -#define TT_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - ckernel::instrn_buffer[0] = TT_OP_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) -#define TTI_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr))) - -#define TT_OP_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) \ - TT_OP(0x73, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((dest_reg_addr) << 0))) -#define TT_SFPLUT_VALID(lreg_ind, instr_mod0, dest_reg_addr) \ - (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(dest_reg_addr, 16)) -#define TT_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) \ - ckernel::instrn_buffer[0] = TT_OP_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) -#define TTI_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr))) - -#define TT_OP_SFPLUTFP32(lreg_dest, instr_mod1) TT_OP(0x95, (((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPLUTFP32_VALID(lreg_dest, instr_mod1) \ - (ckernel::is_valid(lreg_dest, 20) && ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPLUTFP32(lreg_dest, instr_mod1) ckernel::instrn_buffer[0] = TT_OP_SFPLUTFP32(lreg_dest, instr_mod1) -#define TTI_SFPLUTFP32(lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLUTFP32(lreg_dest, instr_mod1))) - -#define TT_OP_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x81, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPLZ_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - TT_OP( \ - 0x84, \ - (((lreg_src_a) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + \ - ((instr_mod1) << 0))) -#define TT_SFPMAD_VALID(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(lreg_src_a, 8) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && \ - ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) -#define 
TT_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) -#define TTI_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x7c, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPMOV_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - TT_OP( \ - 0x86, \ - (((lreg_src_a) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + \ - ((instr_mod1) << 0))) -#define TT_SFPMUL_VALID(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(lreg_src_a, 8) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && \ - ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) -#define TTI_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPMULI(imm16_math, lreg_dest, instr_mod1) \ - TT_OP(0x74, (((imm16_math) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPMULI_VALID(imm16_math, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm16_math, 16) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPMULI(imm16_math, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPMULI(imm16_math, lreg_dest, instr_mod1) -#define TTI_SFPMULI(imm16_math, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMULI(imm16_math, lreg_dest, instr_mod1))) - -#define TT_OP_SFPNOP TT_OP(0x8f, 0) -#define TTI_SFPNOP INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPNOP)) - -#define TT_OP_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x80, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPNOT_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x7f, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPOR_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ 
- (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x88, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPPOPC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x87, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPPUSHC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x7b, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPSETCC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x82, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPSETEXP_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x83, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPSETMAN_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && 
ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x89, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPSETSGN_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x7a, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPSHFT_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ - TT_OP(0x94, (((imm12_math) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPSHFT2_VALID(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) -#define TTI_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - TT_OP(0x72, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((sfpu_addr_mode) << 14) + ((dest_reg_addr) << 0))) -#define TT_SFPSTORE_VALID(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(sfpu_addr_mode, 2) && \ - ckernel::is_valid(dest_reg_addr, 14)) -#define TT_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - ckernel::instrn_buffer[0] = TT_OP_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) -#define TTI_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr))) - -#define TT_OP_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ - TT_OP(0x92, (((imm12_math) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define 
TT_SFPSWAP_VALID(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) -#define TTI_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x8c, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPTRANSP_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - TT_OP(0x8d, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) -#define TT_SFPXOR_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && \ - ckernel::is_valid(instr_mod1, 4)) -#define TT_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) -#define TTI_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1))) - -#define TT_OP_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - TT_OP( \ - 0x8e, \ - (((rnd_mode) << 21) + ((imm8_math) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + \ - ((instr_mod1) << 0))) -#define TT_SFP_STOCH_RND_VALID(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - (ckernel::is_valid(rnd_mode, 3) && ckernel::is_valid(imm8_math, 5) && ckernel::is_valid(lreg_src_b, 4) && \ - ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) -#define TT_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - ckernel::instrn_buffer[0] = TT_OP_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) -#define TTI_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ - INSTRUCTION_WORD( \ - TRISC_OP_SWIZZLE(TT_OP_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1))) - -#define TT_OP_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - TT_OP( \ - 0x5c, \ - (((OpBisConst) << 23) + ((OpSel) << 18) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + \ - ((OpARegIndex) << 0))) -#define TT_SHIFTDMAREG_VALID(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(OpSel, 5) && ckernel::is_valid(ResultRegIndex, 6) && \ - ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) -#define TT_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, 
OpARegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) -#define TTI_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex))) - -#define TT_OP_SHIFTXA(log2_amount2, shift_mode) TT_OP(0x17, (((log2_amount2) << 2) + ((shift_mode) << 0))) -#define TT_SHIFTXA_VALID(log2_amount2, shift_mode) \ - (ckernel::is_valid(log2_amount2, 22) && ckernel::is_valid(shift_mode, 2)) -#define TT_SHIFTXA(log2_amount2, shift_mode) ckernel::instrn_buffer[0] = TT_OP_SHIFTXA(log2_amount2, shift_mode) -#define TTI_SHIFTXA(log2_amount2, shift_mode) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SHIFTXA(log2_amount2, shift_mode))) - -#define TT_OP_SHIFTXB(addr_mode, rot_shift, shift_row) \ - TT_OP(0x18, (((addr_mode) << 15) + ((rot_shift) << 10) + ((shift_row) << 0))) -#define TT_SHIFTXB_VALID(addr_mode, rot_shift, shift_row) \ - (ckernel::is_valid(addr_mode, 9) && ckernel::is_valid(rot_shift, 5) && ckernel::is_valid(shift_row, 10)) -#define TT_SHIFTXB(addr_mode, rot_shift, shift_row) \ - ckernel::instrn_buffer[0] = TT_OP_SHIFTXB(addr_mode, rot_shift, shift_row) -#define TTI_SHIFTXB(addr_mode, rot_shift, shift_row) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SHIFTXB(addr_mode, rot_shift, shift_row))) - -#define TT_OP_STALLWAIT(stall_res, wait_res) TT_OP(0xa2, (((stall_res) << 15) + ((wait_res) << 0))) -#define TT_STALLWAIT_VALID(stall_res, wait_res) (ckernel::is_valid(stall_res, 9) && ckernel::is_valid(wait_res, 15)) -#define TT_STALLWAIT(stall_res, wait_res) ckernel::instrn_buffer[0] = TT_OP_STALLWAIT(stall_res, wait_res) -#define TTI_STALLWAIT(stall_res, wait_res) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_STALLWAIT(stall_res, wait_res))) - -#define TT_OP_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ - TT_OP( \ - 0x66, \ - (((MemHierSel) << 23) + ((SizeSel) << 22) + ((RegSizeSel) << 21) + ((OffsetIndex) << 14) + \ - ((AutoIncSpec) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) -#define TT_STOREIND_VALID(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ - (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(SizeSel, 1) && ckernel::is_valid(RegSizeSel, 1) && \ - ckernel::is_valid(OffsetIndex, 7) && ckernel::is_valid(AutoIncSpec, 2) && ckernel::is_valid(DataRegIndex, 6) && \ - ckernel::is_valid(AddrRegIndex, 6)) -#define TT_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ - ckernel::instrn_buffer[0] = \ - TT_OP_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) -#define TTI_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE( \ - TT_OP_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex))) - -#define TT_OP_STOREREG(TdmaDataRegIndex, RegAddr) TT_OP(0x67, (((TdmaDataRegIndex) << 18) + ((RegAddr) << 0))) -#define TT_STOREREG_VALID(TdmaDataRegIndex, RegAddr) \ - (ckernel::is_valid(TdmaDataRegIndex, 6) && ckernel::is_valid(RegAddr, 18)) -#define TT_STOREREG(TdmaDataRegIndex, RegAddr) ckernel::instrn_buffer[0] = TT_OP_STOREREG(TdmaDataRegIndex, RegAddr) -#define TTI_STOREREG(TdmaDataRegIndex, RegAddr) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_STOREREG(TdmaDataRegIndex, RegAddr))) - -#define 
TT_OP_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - TT_OP(0x59, (((OpBisConst) << 23) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0))) -#define TT_SUBDMAREG_VALID(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(ResultRegIndex, 11) && ckernel::is_valid(OpBRegIndex, 6) && \ - ckernel::is_valid(OpARegIndex, 6)) -#define TT_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - ckernel::instrn_buffer[0] = TT_OP_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) -#define TTI_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex))) - -#define TT_OP_TBUFCMD TT_OP(0x4b, 0) -#define TTI_TBUFCMD INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_TBUFCMD)) - -#define TT_OP_TRNSPSRCA TT_OP(0x14, 0) -#define TTI_TRNSPSRCA INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_TRNSPSRCA)) - -#define TT_OP_TRNSPSRCB TT_OP(0x16, 0) -#define TTI_TRNSPSRCB INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_TRNSPSRCB)) - -#define TT_OP_UNPACR( \ - Unpack_block_selection, \ - AddrMode, \ - CfgContextCntInc, \ - CfgContextId, \ - AddrCntContextId, \ - OvrdThreadId, \ - SetDatValid, \ - rareb_en, \ - ZeroWrite2, \ - AutoIncContextID, \ - RowSearch, \ - SearchCacheFlush, \ - Last) \ - TT_OP( \ - 0x42, \ - (((Unpack_block_selection) << 23) + ((AddrMode) << 15) + ((CfgContextCntInc) << 13) + ((CfgContextId) << 10) + \ - ((AddrCntContextId) << 8) + ((OvrdThreadId) << 7) + ((SetDatValid) << 6) + ((rareb_en) << 5) + \ - ((ZeroWrite2) << 4) + ((AutoIncContextID) << 3) + ((RowSearch) << 2) + ((SearchCacheFlush) << 1) + \ - ((Last) << 0))) -#define TT_UNPACR_VALID( \ - Unpack_block_selection, \ - AddrMode, \ - CfgContextCntInc, \ - CfgContextId, \ - AddrCntContextId, \ - OvrdThreadId, \ - SetDatValid, \ - rareb_en, \ - ZeroWrite2, \ - AutoIncContextID, \ - RowSearch, \ - SearchCacheFlush, \ - Last) \ - (ckernel::is_valid(Unpack_block_selection, 1) && ckernel::is_valid(AddrMode, 8) && \ - ckernel::is_valid(CfgContextCntInc, 2) && ckernel::is_valid(CfgContextId, 3) && \ - ckernel::is_valid(AddrCntContextId, 2) && ckernel::is_valid(OvrdThreadId, 1) && \ - ckernel::is_valid(SetDatValid, 1) && ckernel::is_valid(rareb_en, 1) && ckernel::is_valid(ZeroWrite2, 1) && \ - ckernel::is_valid(AutoIncContextID, 1) && ckernel::is_valid(RowSearch, 1) && \ - ckernel::is_valid(SearchCacheFlush, 1) && ckernel::is_valid(Last, 1)) -#define TT_UNPACR( \ - Unpack_block_selection, \ - AddrMode, \ - CfgContextCntInc, \ - CfgContextId, \ - AddrCntContextId, \ - OvrdThreadId, \ - SetDatValid, \ - rareb_en, \ - ZeroWrite2, \ - AutoIncContextID, \ - RowSearch, \ - SearchCacheFlush, \ - Last) \ - ckernel::instrn_buffer[0] = TT_OP_UNPACR( \ - Unpack_block_selection, \ - AddrMode, \ - CfgContextCntInc, \ - CfgContextId, \ - AddrCntContextId, \ - OvrdThreadId, \ - SetDatValid, \ - rareb_en, \ - ZeroWrite2, \ - AutoIncContextID, \ - RowSearch, \ - SearchCacheFlush, \ - Last) -#define TTI_UNPACR( \ - Unpack_block_selection, \ - AddrMode, \ - CfgContextCntInc, \ - CfgContextId, \ - AddrCntContextId, \ - OvrdThreadId, \ - SetDatValid, \ - rareb_en, \ - ZeroWrite2, \ - AutoIncContextID, \ - RowSearch, \ - SearchCacheFlush, \ - Last) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_UNPACR( \ - Unpack_block_selection, \ - AddrMode, \ - CfgContextCntInc, \ - CfgContextId, \ - AddrCntContextId, \ - OvrdThreadId, \ - SetDatValid, \ - rareb_en, \ - 
ZeroWrite2, \ - AutoIncContextID, \ - RowSearch, \ - SearchCacheFlush, \ - Last))) - -#define TT_OP_UNPACR_NOP(Unpack_block_selection, NoOp) TT_OP(0x43, (((Unpack_block_selection) << 23) + ((NoOp) << 0))) -#define TT_UNPACR_NOP_VALID(Unpack_block_selection, NoOp) \ - (ckernel::is_valid(Unpack_block_selection, 1) && ckernel::is_valid(NoOp, 23)) -#define TT_UNPACR_NOP(Unpack_block_selection, NoOp) \ - ckernel::instrn_buffer[0] = TT_OP_UNPACR_NOP(Unpack_block_selection, NoOp) -#define TTI_UNPACR_NOP(Unpack_block_selection, NoOp) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_UNPACR_NOP(Unpack_block_selection, NoOp))) - -#define TT_OP_WRCFG(GprAddress, wr128b, CfgReg) TT_OP(0xb0, (((GprAddress) << 16) + ((wr128b) << 15) + ((CfgReg) << 0))) -#define TT_WRCFG_VALID(GprAddress, wr128b, CfgReg) \ - (ckernel::is_valid(GprAddress, 8) && ckernel::is_valid(wr128b, 1) && ckernel::is_valid(CfgReg, 15)) -#define TT_WRCFG(GprAddress, wr128b, CfgReg) ckernel::instrn_buffer[0] = TT_OP_WRCFG(GprAddress, wr128b, CfgReg) -#define TTI_WRCFG(GprAddress, wr128b, CfgReg) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_WRCFG(GprAddress, wr128b, CfgReg))) - -#define TT_OP_XMOV(Mov_block_selection, Last) TT_OP(0x40, (((Mov_block_selection) << 23) + ((Last) << 0))) -#define TT_XMOV_VALID(Mov_block_selection, Last) \ - (ckernel::is_valid(Mov block selection, 1) && ckernel::is_valid(Last, 23)) -#define TT_XMOV(Mov_block_selection, Last) ckernel::instrn_buffer[0] = TT_OP_XMOV(Mov_block_selection, Last) -#define TTI_XMOV(Mov_block_selection, Last) INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_XMOV(Mov_block_selection, Last))) - -#define TT_OP_ZEROACC(clear_mode, AddrMode, dst) TT_OP(0x10, (((clear_mode) << 19) + ((AddrMode) << 15) + ((dst) << 0))) -#define TT_ZEROACC_VALID(clear_mode, AddrMode, dst) \ - (ckernel::is_valid(clear_mode, 5) && ckernel::is_valid(AddrMode, 4) && ckernel::is_valid(dst, 15)) -#define TT_ZEROACC(clear_mode, AddrMode, dst) ckernel::instrn_buffer[0] = TT_OP_ZEROACC(clear_mode, AddrMode, dst) -#define TTI_ZEROACC(clear_mode, AddrMode, dst) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ZEROACC(clear_mode, AddrMode, dst))) - -#define TT_OP_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) \ - TT_OP(0x11, (((zero_val) << 4) + ((write_mode) << 3) + ((bank_mask) << 2) + ((src_mask) << 0))) -#define TT_ZEROSRC_VALID(zero_val, write_mode, bank_mask, src_mask) \ - (ckernel::is_valid(zero_val, 20) && ckernel::is_valid(write_mode, 1) && ckernel::is_valid(bank_mask, 1) && \ - ckernel::is_valid(src_mask, 2)) -#define TT_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) \ - ckernel::instrn_buffer[0] = TT_OP_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) -#define TTI_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) \ - INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ZEROSRC(zero_val, write_mode, bank_mask, src_mask))) From ee47474dacd3512ebe390df89c4ad1451b1d7cb2 Mon Sep 17 00:00:00 2001 From: Sofija Jovic <148721049+s-jovic@users.noreply.github.com> Date: Fri, 7 Feb 2025 17:38:00 +0100 Subject: [PATCH 011/316] #17134: Add SD cross attn down block ut (#17712) --- .../tests/test_cross_attn_downblock_2d.py | 134 ++++++++++++++++++ .../test_cross_attn_downblock_2d.py | 1 + .../stable_diffusion/test_downblock_2d.py | 1 + 3 files changed, 136 insertions(+) create mode 100644 models/demos/wormhole/stable_diffusion/tests/test_cross_attn_downblock_2d.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_cross_attn_downblock_2d.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_downblock_2d.py diff 
--git a/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_downblock_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_downblock_2d.py new file mode 100644 index 00000000000..fbb3178dc47 --- /dev/null +++ b/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_downblock_2d.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + + +from diffusers import StableDiffusionPipeline +import pytest +import torch +import ttnn + +from models.demos.wormhole.stable_diffusion.custom_preprocessing import custom_preprocessor +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_cross_attention_down_block_2d_new_conv import ( + cross_attention_down_block_2d, +) +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( + get_default_compute_config, + preprocess_and_push_input_to_device, + post_process_output_and_move_to_host, +) +from models.utility_functions import skip_for_grayskull, torch_random +from ttnn.model_preprocessing import preprocess_model_parameters +from tests.ttnn.utils_for_testing import assert_with_pcc + + +@skip_for_grayskull() +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True) +@pytest.mark.parametrize( + "block_index, hidden_states, shard_layout, shard_end_core, shard_shape, out_channels", + [ + (0, [2, 320, 64, 64], ttnn.TensorMemoryLayout.HEIGHT_SHARDED, (7, 7), (128, 320), 320), + (1, [2, 320, 32, 32], ttnn.TensorMemoryLayout.BLOCK_SHARDED, (4, 7), (256, 64), 640), + (2, [2, 640, 16, 16], ttnn.TensorMemoryLayout.BLOCK_SHARDED, (4, 7), (64, 128), 1280), + ], +) +@pytest.mark.parametrize("temb", [[1, 1, 2, 1280]]) +def test_cross_attention_downblock_512x512( + reset_seeds, device, block_index, hidden_states, shard_layout, shard_end_core, shard_shape, out_channels, temb +): + # Initialize PyTorch component + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float32) + unet = pipe.unet + unet.eval() + torch_down_block = unet.down_blocks[block_index] + + # Initialize ttnn component + reader_patterns_cache = {} + parameters = preprocess_model_parameters( + initialize_model=lambda: unet, custom_preprocessor=custom_preprocessor, device=device + ) + parameters = parameters.down_blocks[block_index] + N, _, H, W = hidden_states + compute_kernel_config = get_default_compute_config(device) + + ttnn_down_block = cross_attention_down_block_2d( + device, parameters, reader_patterns_cache, N, H, W, compute_kernel_config + ) + + # Prepare inputs + in_channels = hidden_states[1] + temb_channels = 1280 + input_shape = hidden_states + hidden_states = torch_random(input_shape, -0.1, 0.1, dtype=torch.float32) + temb = torch_random(temb, -0.1, 0.1, dtype=torch.float32) + + encoder_hidden_states_shape = [1, 2, 77, 768] + encoder_hidden_states = torch.randn(encoder_hidden_states_shape) + + # Run PyTorch component + torch_output, torch_residuals = torch_down_block( + hidden_states, temb.squeeze(0).squeeze(0), encoder_hidden_states.squeeze(0) + ) + + # Prepare inputs for ttnn component + hidden_states = preprocess_and_push_input_to_device( + device, + hidden_states, + memory_config=ttnn.MemoryConfig( + shard_layout, + ttnn.BufferType.L1, + ttnn.ShardSpec( + ttnn.CoreRangeSet( + { + ttnn.CoreRange( + ttnn.CoreCoord(0, 0), + ttnn.CoreCoord(shard_end_core[0], shard_end_core[1]), + ), + } + ), + shard_shape, + ttnn.ShardOrientation.ROW_MAJOR, + ), + ), + ) + + temb = temb.permute(2, 0, 1, 3) + 
temb = ttnn.from_torch(temb, ttnn.bfloat16) + temb = ttnn.to_layout(temb, ttnn.TILE_LAYOUT, ttnn.bfloat8_b) + temb = ttnn.to_device(temb, device, memory_config=ttnn.L1_MEMORY_CONFIG) + + encoder_hidden_states = torch.nn.functional.pad(encoder_hidden_states, (0, 0, 0, 19)) + encoder_hidden_states = ttnn.from_torch( + encoder_hidden_states, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT, device=device + ) + encoder_hidden_states = ttnn.to_device(encoder_hidden_states, device, memory_config=ttnn.L1_MEMORY_CONFIG) + + # Run ttnn component + output, residuals = ttnn_down_block( + hidden_states=hidden_states, + temb=temb, + encoder_hidden_states=encoder_hidden_states, + config=unet.config, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=True, + resnet_eps=1e-5, + resnet_act_fn="silu", + ) + + # Compare outputs + output = post_process_output_and_move_to_host(output, N, H // 2, W // 2, out_channels) + assert_with_pcc(torch_output, output, 0.98) + + for residual_index, (torch_residual, residual) in enumerate(zip(torch_residuals, residuals)): + if residual_index < 2: + out_height = H + out_width = W + else: + out_height = H // 2 + out_width = W // 2 + + residual = post_process_output_and_move_to_host(residual, N, out_height, out_width, out_channels) + + assert_with_pcc(torch_residual, residual, 0.98) diff --git a/tests/nightly/single_card/stable_diffusion/test_cross_attn_downblock_2d.py b/tests/nightly/single_card/stable_diffusion/test_cross_attn_downblock_2d.py new file mode 120000 index 00000000000..5e00d1e08c8 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_cross_attn_downblock_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_cross_attn_downblock_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_downblock_2d.py b/tests/nightly/single_card/stable_diffusion/test_downblock_2d.py new file mode 120000 index 00000000000..4b25e9313af --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_downblock_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_downblock_2d.py \ No newline at end of file From f3f7cbf92e71e720e73d96299495eb350d365c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Fri, 7 Feb 2025 18:07:03 +0100 Subject: [PATCH 012/316] [UMD] Use new CoreCoord api for eth cores (#17642) ### Ticket Related to https://github.com/tenstorrent/tt-metal/issues/17002 ### Problem description Reduce the usage of old soc descriptor structures, and introduce the usage of .get_cores, and get_eth_core_for_channel. The only api which we don't provide is getting eth channel from a core. Our logical coordinates are defined such that channel == logical_coord.y, so we don't deem that necessary. However, I've left this "helper" inside metal_soc_descriptor for now, we might choose to remove it someday. ### Testing I've added the code to generate_logical_eth_coords_mapping to verify that the new and old code indeed return the same values for all. Now all of this code is without eth harvesting, which is being introduced for BH, so VIRTUAL was equal to PHYSICAL, which won't be true for BH. Some modifications might be needed in the future. I tried to honor physical coords throughout the code, but I think virtual might be needed at some places. Fortunately, after all these modifications switching between coord systems should be trivial. 
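
Roughly, the new lookup path boils down to the sketch below. This is an illustrative fragment, not code from this PR: the helper name is hypothetical, includes are omitted, and the exact overloads may differ slightly from what the diff uses, but `get_eth_core_for_channel` and `translate_coord_to` are the two calls the change is built on.

```cpp
// Hypothetical helper showing the intended replacement for the old
// chan_to_logical_eth_core_map / physical_ethernet_cores lookups.
CoreCoord physical_eth_core_for_channel(const tt_SocDescriptor& soc_desc, int channel) {
    // Channel -> logical coordinate (logical eth cores are laid out as {0, channel}).
    tt::umd::CoreCoord logical_eth =
        soc_desc.get_eth_core_for_channel(channel, CoordSystem::LOGICAL);
    // Logical -> physical via the generic coordinate translation.
    tt::umd::CoreCoord physical_eth =
        soc_desc.translate_coord_to(logical_eth, CoordSystem::PHYSICAL);
    return CoreCoord{physical_eth.x, physical_eth.y};
}
```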
### What's changed - Exchanged chan_to_logical_eth_core_map with get_eth_core_for_channel - Changed physical_ethernet_cores with get_cores and translate_coord_to - Changed get_logical_ethernet_cores with get_cores ### Checklist - [x] All post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197483019 - [x] Blackhole post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197485279 - [ ] (Single-card) Model perf tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197487222 - [ ] (Single-card) Device perf regressions : https://github.com/tenstorrent/tt-metal/actions/runs/13197488581 - [ ] (T3K) T3000 unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197490205 - [ ] (T3K) T3000 demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197492002 - [ ] (TG) TG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197494115 - [ ] (TG) TG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197496589 - [x] (TGG) TGG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197499153 - [x] (TGG) TGG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197501071 --- tt_fabric/control_plane.cpp | 7 ++-- .../api/tt-metalium/metal_soc_descriptor.h | 6 --- tt_metal/common/metal_soc_descriptor.cpp | 40 +++++-------------- tt_metal/impl/device/device.cpp | 12 +++--- tt_metal/llrt/tt_cluster.cpp | 20 ++++++---- tt_metal/third_party/umd | 2 +- 6 files changed, 33 insertions(+), 54 deletions(-) diff --git a/tt_fabric/control_plane.cpp b/tt_fabric/control_plane.cpp index d57cc6b884d..0bfede9f0a0 100644 --- a/tt_fabric/control_plane.cpp +++ b/tt_fabric/control_plane.cpp @@ -512,9 +512,10 @@ std::tuple ControlPlane::get_connected_mesh_chi mesh_id_t mesh_id, chip_id_t chip_id, chan_id_t chan_id) const { // TODO: simplify this and maybe have this functionality in ControlPlane auto physical_chip_id = logical_mesh_chip_id_to_physical_chip_id_mapping_[mesh_id][chip_id]; - auto eth_core = tt::Cluster::instance().get_soc_desc(physical_chip_id).chan_to_logical_eth_core_map.at(chan_id); - auto [connected_physical_chip_id, connected_eth_core] = - tt::Cluster::instance().get_connected_ethernet_core(std::make_tuple(physical_chip_id, eth_core)); + tt::umd::CoreCoord eth_core = + tt::Cluster::instance().get_soc_desc(physical_chip_id).get_eth_core_for_channel(chan_id, CoordSystem::LOGICAL); + auto [connected_physical_chip_id, connected_eth_core] = tt::Cluster::instance().get_connected_ethernet_core( + std::make_tuple(physical_chip_id, CoreCoord{eth_core.x, eth_core.y})); auto [connected_mesh_id, connected_chip_id] = this->get_mesh_chip_id_from_physical_chip_id(connected_physical_chip_id); diff --git a/tt_metal/api/tt-metalium/metal_soc_descriptor.h b/tt_metal/api/tt-metalium/metal_soc_descriptor.h index aa62a78c826..e554e1b7040 100644 --- a/tt_metal/api/tt-metalium/metal_soc_descriptor.h +++ b/tt_metal/api/tt-metalium/metal_soc_descriptor.h @@ -20,14 +20,10 @@ struct metal_SocDescriptor : public tt_SocDescriptor { std::vector dram_view_eth_cores; // per dram view preferred eth endpoint std::vector dram_view_address_offsets; // starting address offset - std::vector logical_ethernet_cores; uint64_t dram_core_size; uint64_t dram_view_size; - std::vector physical_ethernet_cores; - std::map logical_eth_core_to_chan_map; - std::map chan_to_logical_eth_core_map; metal_SocDescriptor(const tt_SocDescriptor& other, uint32_t harvesting_mask, const BoardType& board_type); metal_SocDescriptor() = default; 
@@ -41,8 +37,6 @@ struct metal_SocDescriptor : public tt_SocDescriptor { const std::vector& get_pcie_cores() const; const std::vector get_dram_cores() const; - const std::vector& get_logical_ethernet_cores() const; - const std::vector& get_physical_ethernet_cores() const; int get_dram_channel_from_logical_core(const CoreCoord& logical_coord) const; diff --git a/tt_metal/common/metal_soc_descriptor.cpp b/tt_metal/common/metal_soc_descriptor.cpp index ec2827a9edf..7b41d62c8cf 100644 --- a/tt_metal/common/metal_soc_descriptor.cpp +++ b/tt_metal/common/metal_soc_descriptor.cpp @@ -75,14 +75,6 @@ const std::vector metal_SocDescriptor::get_dram_cores() const { return cores; } -const std::vector& metal_SocDescriptor::get_physical_ethernet_cores() const { - return this->physical_ethernet_cores; -} - -const std::vector& metal_SocDescriptor::get_logical_ethernet_cores() const { - return this->logical_ethernet_cores; -} - int metal_SocDescriptor::get_dram_channel_from_logical_core(const CoreCoord& logical_coord) const { const uint32_t num_dram_views = this->get_num_dram_views(); TT_FATAL( @@ -94,25 +86,15 @@ int metal_SocDescriptor::get_dram_channel_from_logical_core(const CoreCoord& log } CoreCoord metal_SocDescriptor::get_physical_ethernet_core_from_logical(const CoreCoord& logical_coord) const { - const auto& eth_chan_map = this->logical_eth_core_to_chan_map; - TT_FATAL( - (eth_chan_map.find(logical_coord) != eth_chan_map.end()), - "Bounds-Error -- Logical_core={} is outside of ethernet logical grid", - logical_coord.str()); - return this->physical_ethernet_cores.at(eth_chan_map.at(logical_coord)); + tt::umd::CoreCoord physical_coord = + translate_coord_to({logical_coord, CoreType::ETH, CoordSystem::LOGICAL}, CoordSystem::PHYSICAL); + return {physical_coord.x, physical_coord.y}; } CoreCoord metal_SocDescriptor::get_logical_ethernet_core_from_physical(const CoreCoord& physical_coord) const { - const auto& phys_eth_map = this->physical_ethernet_cores; - auto it = std::find(phys_eth_map.begin(), phys_eth_map.end(), physical_coord); - - TT_FATAL( - (it != phys_eth_map.end()), - "Bounds-Error -- Physical_core={} is outside of ethernet physical grid", - physical_coord.str()); - - int chan = it - phys_eth_map.begin(); - return this->chan_to_logical_eth_core_map.at(chan); + tt::umd::CoreCoord logical_coord = + translate_coord_to({physical_coord, CoreType::ETH, CoordSystem::PHYSICAL}, CoordSystem::LOGICAL); + return {logical_coord.x, logical_coord.y}; } CoreCoord metal_SocDescriptor::get_physical_tensix_core_from_logical(const CoreCoord& logical_coord) const { @@ -189,12 +171,8 @@ CoordSystem metal_SocDescriptor::get_umd_coord_system() const { } void metal_SocDescriptor::generate_logical_eth_coords_mapping() { - this->physical_ethernet_cores = this->ethernet_cores; - for (int i = 0; i < this->physical_ethernet_cores.size(); i++) { - CoreCoord core = {0, static_cast(i)}; - this->logical_eth_core_to_chan_map.insert({core, i}); - this->chan_to_logical_eth_core_map.insert({i, core}); - this->logical_ethernet_cores.emplace_back(core); + for (int i = 0; i < this->get_cores(CoreType::ETH).size(); i++) { + this->logical_eth_core_to_chan_map.insert({{0, i}, i}); } } @@ -204,7 +182,7 @@ void metal_SocDescriptor::generate_physical_routing_to_profiler_flat_id() { this->physical_routing_to_profiler_flat_id.emplace((CoreCoord){core.x, core.y}, 0); } - for (auto& core : this->physical_ethernet_cores) { + for (auto& core : this->get_cores(CoreType::ETH, CoordSystem::PHYSICAL)) { 
this->physical_routing_to_profiler_flat_id.emplace((CoreCoord){core.x, core.y}, 0); } diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index d20696b8112..c544bf00a3c 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -324,8 +324,8 @@ std::unique_ptr Device::initialize_allocator(size_t l1_small_size, si const auto noc_coord = this->virtual_core_from_logical_core(core, dispatch_core_type); config.core_type_from_noc_coord_table[noc_coord] = AllocCoreType::Dispatch; } - for (const auto &core : soc_desc.get_logical_ethernet_cores()) { - this->ethernet_cores_.insert(core); + for (const tt::umd::CoreCoord& core : soc_desc.get_cores(CoreType::ETH, CoordSystem::LOGICAL)) { + this->ethernet_cores_.insert({core.x, core.y}); } // L1_BANKING scheme creates 1 bank per DRAM core and splits up L1 such that there are power 2 num L1 banks @@ -715,7 +715,7 @@ void Device::initialize_and_launch_firmware() { const std::vector &pcie_cores = soc_d.get_pcie_cores(); const std::vector &dram_cores = soc_d.get_dram_cores(); - const std::vector ð_cores = soc_d.get_physical_ethernet_cores(); + const std::vector& eth_cores = soc_d.get_cores(CoreType::ETH, CoordSystem::PHYSICAL); // The SOC descriptor can list a dram core multiple times, depending on how GDDR is assigned to banks // Get a list of unique DRAM cores. std::unordered_set unique_dram_cores(dram_cores.begin(), dram_cores.end()); @@ -739,14 +739,14 @@ void Device::initialize_and_launch_firmware() { for (const CoreCoord &core : unique_dram_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::DRAM}; } - for (const CoreCoord &core : eth_cores) { + for (const tt::umd::CoreCoord& core : eth_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::ETH}; } if (hal.is_coordinate_virtualization_enabled()) { // Track Virtual Non Worker Cores (In this case only Eth) separately uint32_t virtual_non_worker_cores_idx = 0; - for (const CoreCoord &core : eth_cores) { - auto virtual_core = this->virtual_core_from_physical_core(core); + for (const tt::umd::CoreCoord& core : eth_cores) { + auto virtual_core = this->virtual_core_from_physical_core({core.x, core.y}); core_info->virtual_non_worker_cores[virtual_non_worker_cores_idx++] = {virtual_core.x, virtual_core.y, AddressableCoreType::ETH}; } } diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 258e98e7273..59de00cd515 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -379,10 +379,13 @@ void Cluster::generate_logical_to_virtual_coord_mapping() { CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, phys_core); this->worker_logical_to_virtual_y_.at(board_type).insert({y_coords.first, virtual_coords.y}); } - for (std::size_t log_eth_core_y = 0; log_eth_core_y < soc_desc.physical_ethernet_cores.size(); log_eth_core_y++) { + for (std::size_t log_eth_core_y = 0; log_eth_core_y < soc_desc.get_cores(CoreType::ETH).size(); + log_eth_core_y++) { CoreCoord logical_eth_core = {0, log_eth_core_y}; - CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates( - chip_id, soc_desc.physical_ethernet_cores.at(log_eth_core_y)); + tt::umd::CoreCoord phys_eth_core = + soc_desc.translate_coord_to(soc_desc.get_eth_core_for_channel(log_eth_core_y), CoordSystem::PHYSICAL); + CoreCoord virtual_coords = + this->get_virtual_coordinate_from_physical_coordinates(chip_id, 
{phys_eth_core.x, phys_eth_core.y}); this->eth_logical_to_virtual_.at(board_type).insert({logical_eth_core, virtual_coords}); } } @@ -696,7 +699,7 @@ std::unordered_map> Cluster::get_ethernet_core this->cluster_desc_->get_directly_connected_ethernet_channels_between_chips(chip_id, other_chip_id)) { ethernet_channel_t local_chip_chan = std::get<0>(channel_pair); active_ethernet_cores.emplace_back( - get_soc_desc(chip_id).chan_to_logical_eth_core_map.at(local_chip_chan)); + get_soc_desc(chip_id).get_eth_core_for_channel(local_chip_chan, CoordSystem::LOGICAL)); } connected_chips.insert({other_chip_id, active_ethernet_cores}); } else { @@ -959,7 +962,8 @@ std::tuple Cluster::get_connected_ethernet_core(std::tuple auto connected_eth_core = this->cluster_desc_->get_chip_and_channel_of_remote_ethernet_core(std::get<0>(eth_core), eth_chan); return std::make_tuple( - std::get<0>(connected_eth_core), soc_desc.chan_to_logical_eth_core_map.at(std::get<1>(connected_eth_core))); + std::get<0>(connected_eth_core), + soc_desc.get_eth_core_for_channel(std::get<1>(connected_eth_core), CoordSystem::LOGICAL)); } std::vector Cluster::get_ethernet_sockets(chip_id_t local_chip, chip_id_t remote_chip) const { @@ -978,8 +982,10 @@ CoreCoord Cluster::ethernet_core_from_logical_core(chip_id_t chip_id, const Core } CoreCoord Cluster::get_virtual_eth_core_from_channel(chip_id_t chip_id, int channel) const { - CoreCoord logical_coord = this->get_soc_desc(chip_id).chan_to_logical_eth_core_map.at(channel); - return this->get_virtual_coordinate_from_logical_coordinates(chip_id, logical_coord, CoreType::ETH); + tt::umd::CoreCoord logical_coord = + this->get_soc_desc(chip_id).get_eth_core_for_channel(channel, CoordSystem::LOGICAL); + return this->get_virtual_coordinate_from_logical_coordinates( + chip_id, {logical_coord.x, logical_coord.y}, CoreType::ETH); } tt_cxy_pair Cluster::get_eth_core_for_dispatch_core( diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index b24a0c68150..5de287e9c5b 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit b24a0c68150fb664559be34fabcc4958a3de9705 +Subproject commit 5de287e9c5b2fa3d55fbfd53e9bc59e2050f32fb From e6440482eefe5acf5a670d8a00cce41371c66414 Mon Sep 17 00:00:00 2001 From: Juan Camilo Vega Date: Fri, 7 Feb 2025 12:20:59 -0500 Subject: [PATCH 013/316] #17246: Fixing invalid test in ccl (#17727) ### Ticket #17246 ### Problem description The input sharding configuration for the tiny tile test was invalid for width and block sharding. Changed the shapes in width and block to abide by the requirements. 
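
As a rough sanity check on the new width-sharded case (assuming the usual shard-consistency rules are what made the old config invalid): a (1, 1, 64, 1024) tensor flattens to 64 x 1024, so (64, 32) shards cover the full height and split the width into 1024 / 32 = 32 shards, which matches the 8 x 4 core grid used in the test.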
### What's changed Modify the pytest so tensor memory layout is tied to shape and setting the shape and shard shape based on the test being run ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../operations/ccl/test_all_gather.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/ccl/test_all_gather.py b/tests/ttnn/unit_tests/operations/ccl/test_all_gather.py index 8485abce37e..d80fc6d6193 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_all_gather.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_all_gather.py @@ -1959,24 +1959,29 @@ def test_all_gather_fp32( # https://github.com/tenstorrent/tt-metal/issues/9686 ttnn.bfloat16, ], ) -@pytest.mark.parametrize( - "tensor_mem_layout", - [ - ttnn.TensorMemoryLayout.WIDTH_SHARDED, - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.TensorMemoryLayout.BLOCK_SHARDED, - ], -) @pytest.mark.parametrize("orientation", [ttnn.ShardOrientation.ROW_MAJOR]) @pytest.mark.parametrize("num_links", [1]) @pytest.mark.parametrize( - "input_shape, input_shard_shape,shard_grid", + "input_shape, input_shard_shape,shard_grid,tensor_mem_layout", ( # LLama ( (4, 1, 256, 32), (32, 32), ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + ), + ( + (1, 1, 64, 1024), + (64, 32), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + ), + ( + (4, 1, 256, 64), + (256, 32), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TensorMemoryLayout.BLOCK_SHARDED, ), ), ) From b4c3918e449b97399d7e6565b0e237e26deb5451 Mon Sep 17 00:00:00 2001 From: mtairum Date: Fri, 7 Feb 2025 17:28:12 +0000 Subject: [PATCH 014/316] #0: Fix Llama3 RoPE eager test regression --- .../misc/test_rotary_embedding_llama.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py index e3c172ebb8c..6d4db95ccb7 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py @@ -106,7 +106,9 @@ def forward(self, xq, xk, freqs_cis): def compute_gather_cos_sin(dhead, end, position_ids): - cos, sin = precompute_freqs(dhead, end, theta=10000.0, use_scaled=False) # Using reference defaults + cos, sin = precompute_freqs( + dhead, end, theta=10000.0, scale_factor=None, orig_context_len=131072 + ) # Using reference defaults (no scaling) position_id_expanded = position_ids.unsqueeze(1).expand(-1, cos.shape[-1]) cos = 
cos.gather(0, position_id_expanded) sin = sin.gather(0, position_id_expanded) @@ -178,8 +180,10 @@ def run_test_rotary_embedding_llama( # inp: [seq_len, batch, n_heads, head_dim] if fuse_qk: - # Set up rope with 2 * batch size (for fused qk) - rope_setup_decode = TtLlamaRotarySetup(device, batch * 2, head_dim, max_seq_len) + # Set up rope with 2 * batch size (for fused qk) (no scaling) + rope_setup_decode = TtLlamaRotarySetup( + device, batch * 2, head_dim, max_seq_len, rope_theta=10000, scale_factor=None, orig_context_len=131072 + ) tt_model.transformation_mat = rope_setup_decode.transformation_mat cos, sin = rope_setup_decode.get_rot_mats(position_ids.repeat(2)) @@ -217,8 +221,11 @@ def run_test_rotary_embedding_llama( input_mem_configs = [q_input_mem_config, k_input_mem_config] else: - # Set up rope with batch size - rope_setup_decode = TtLlamaRotarySetup(device, batch, head_dim, max_seq_len) + # Set up rope with batch size (no scaling) + rope_setup_decode = TtLlamaRotarySetup( + device, batch, head_dim, max_seq_len, rope_theta=10000, scale_factor=None, orig_context_len=131072 + ) + tt_model.transformation_mat = rope_setup_decode.transformation_mat cos, sin = rope_setup_decode.get_rot_mats(position_ids) From d54089cafece8198ed7a7be54004567b3fa07da3 Mon Sep 17 00:00:00 2001 From: Aditya Saigal Date: Sun, 2 Feb 2025 18:26:10 -0800 Subject: [PATCH 015/316] Multi MeshCQ and MeshEvents API Bringup - Natively support Host <-> MeshCQ and MeshCQ <-> MeshCQ synchronization in TT-Mesh - Enable users to access up to 2 MeshCQs through MeshDevice - Add event synchronization APIs to distributed.hpp as per the spec - Share command assembly related to event APIs between MeshCQ and HardwareCommandQueue - With all core TT-Metal functionality added to TT-Mesh, the MeshCQ no longer relies on the single device HardwareCommandQueue to be available or initialized - Remove all bookkeeping done in MeshCQ to maintain shared state with HardwareCommandQueue - Add MeshEvent tests - Minor fixup for sending go signals to devices not involved in a MeshWorkload when SubDevices are loaded --- tests/tt_metal/distributed/CMakeLists.txt | 2 + .../tt_metal/distributed/test_mesh_events.cpp | 253 ++++++++++++++++++ .../distributed/test_mesh_sub_device.cpp | 32 +-- .../distributed/test_mesh_workload.cpp | 121 +-------- tests/tt_metal/distributed/utils.cpp | 126 +++++++++ tests/tt_metal/distributed/utils.hpp | 18 ++ .../tt_metal/common/multi_device_fixture.hpp | 27 +- tt_metal/api/tt-metalium/command_queue.hpp | 7 +- .../api/tt-metalium/dispatch_core_manager.hpp | 2 + tt_metal/api/tt-metalium/distributed.hpp | 20 +- .../api/tt-metalium/mesh_command_queue.hpp | 24 +- tt_metal/api/tt-metalium/mesh_device.hpp | 4 +- tt_metal/api/tt-metalium/mesh_device_view.hpp | 9 + tt_metal/api/tt-metalium/mesh_event.hpp | 19 ++ tt_metal/api/tt-metalium/mesh_workload.hpp | 5 - tt_metal/distributed/distributed.cpp | 30 ++- tt_metal/distributed/mesh_command_queue.cpp | 148 ++++++++-- tt_metal/distributed/mesh_device.cpp | 13 +- tt_metal/distributed/mesh_workload_utils.cpp | 75 ++---- tt_metal/distributed/mesh_workload_utils.hpp | 23 +- tt_metal/impl/CMakeLists.txt | 1 + tt_metal/impl/buffers/dispatch.hpp | 12 +- .../impl/dispatch/dispatch_core_manager.cpp | 5 + .../impl/dispatch/dispatch_query_manager.cpp | 49 +++- .../impl/dispatch/dispatch_query_manager.hpp | 4 + .../impl/dispatch/hardware_command_queue.cpp | 61 +---- .../impl/dispatch/hardware_command_queue.hpp | 6 +- .../impl/dispatch/host_runtime_commands.cpp | 162 +---------- 
.../impl/dispatch/host_runtime_commands.hpp | 55 ---- tt_metal/impl/event/dispatch.cpp | 183 +++++++++++++ tt_metal/impl/event/dispatch.hpp | 48 ++++ tt_metal/impl/program/dispatch.cpp | 9 +- 32 files changed, 1019 insertions(+), 534 deletions(-) create mode 100644 tests/tt_metal/distributed/test_mesh_events.cpp create mode 100644 tests/tt_metal/distributed/utils.cpp create mode 100644 tests/tt_metal/distributed/utils.hpp create mode 100644 tt_metal/api/tt-metalium/mesh_event.hpp create mode 100644 tt_metal/impl/event/dispatch.cpp create mode 100644 tt_metal/impl/event/dispatch.hpp diff --git a/tests/tt_metal/distributed/CMakeLists.txt b/tests/tt_metal/distributed/CMakeLists.txt index 97aa4feff0b..27bb9ee7b53 100644 --- a/tests/tt_metal/distributed/CMakeLists.txt +++ b/tests/tt_metal/distributed/CMakeLists.txt @@ -4,6 +4,8 @@ set(UNIT_TESTS_DISTRIBUTED_SRC ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_workload.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_sub_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_allocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_events.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp ) # Define the function to create test executables for each architecture diff --git a/tests/tt_metal/distributed/test_mesh_events.cpp b/tests/tt_metal/distributed/test_mesh_events.cpp new file mode 100644 index 00000000000..c19d3632800 --- /dev/null +++ b/tests/tt_metal/distributed/test_mesh_events.cpp @@ -0,0 +1,253 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include + +#include "tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp" +#include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" +#include "tests/tt_metal/distributed/utils.hpp" + +namespace tt::tt_metal::distributed::test { +namespace { + +using MeshEventsTest = T3000MultiCQMultiDeviceFixture; + +TEST_F(MeshEventsTest, ReplicatedAsyncIO) { + uint32_t NUM_TILES = 1000; + uint32_t num_iterations = 20; + int32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); + + DeviceLocalBufferConfig per_device_buffer_config{ + .page_size = single_tile_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = false}; + ReplicatedBufferConfig global_buffer_config = { + .size = NUM_TILES * single_tile_size, + }; + + std::shared_ptr buf = + MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get()); + + for (std::size_t i = 0; i < num_iterations; i++) { + std::vector src_vec(NUM_TILES * single_tile_size / sizeof(uint32_t), 0); + std::iota(src_vec.begin(), src_vec.end(), i); + + std::vector> readback_vecs = {}; + std::shared_ptr event = std::make_shared(); + // Writes on CQ 0 + EnqueueWriteMeshBuffer(mesh_device_->mesh_command_queue(0), buf, src_vec); + // Device to Device Synchronization + EnqueueRecordEvent(mesh_device_->mesh_command_queue(0), event); + EnqueueWaitForEvent(mesh_device_->mesh_command_queue(1), event); + + // Reads on CQ 1 + for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { + for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { + readback_vecs.push_back({}); + auto shard = buf->get_device_buffer(Coordinate(logical_y, logical_x)); + ReadShard( + mesh_device_->mesh_command_queue(1), readback_vecs.back(), buf, Coordinate(logical_y, logical_x)); + } + } + + for (auto& vec : readback_vecs) { + EXPECT_EQ(vec, src_vec); + } + } +} + +TEST_F(MeshEventsTest, ShardedAsyncIO) { + 
uint32_t num_iterations = 20; + uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); + + DeviceLocalBufferConfig per_device_buffer_config{ + .page_size = single_tile_size, + .buffer_type = BufferType::DRAM, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = true}; + + Shape2D global_buffer_shape = {2048, 2048}; + Shape2D shard_shape = {512, 1024}; + + uint32_t global_buffer_size = global_buffer_shape.height() * global_buffer_shape.width() * sizeof(uint32_t); + + ShardedBufferConfig sharded_config{ + .global_size = global_buffer_size, + .global_buffer_shape = global_buffer_shape, + .shard_shape = shard_shape, + .shard_orientation = ShardOrientation::ROW_MAJOR, + }; + + auto mesh_buffer = MeshBuffer::create(sharded_config, per_device_buffer_config, mesh_device_.get()); + for (std::size_t i = 0; i < num_iterations; i++) { + std::vector src_vec = + std::vector(global_buffer_shape.height() * global_buffer_shape.width(), 0); + std::iota(src_vec.begin(), src_vec.end(), i); + std::shared_ptr event = std::make_shared(); + // Writes on CQ 0 + EnqueueWriteMeshBuffer(mesh_device_->mesh_command_queue(0), mesh_buffer, src_vec); + if (i % 2) { + // Test Host <-> Device synchronization + EnqueueRecordEventToHost(mesh_device_->mesh_command_queue(0), event); + EventSynchronize(event); + } else { + // Test Device <-> Device synchronization + EnqueueRecordEvent(mesh_device_->mesh_command_queue(0), event); + EnqueueWaitForEvent(mesh_device_->mesh_command_queue(1), event); + } + // Reads on CQ 1 + std::vector dst_vec = {}; + EnqueueReadMeshBuffer(mesh_device_->mesh_command_queue(1), dst_vec, mesh_buffer); + + EXPECT_EQ(dst_vec, src_vec); + } +} + +TEST_F(MeshEventsTest, AsyncWorkloadAndIO) { + uint32_t num_iters = 5; + std::vector> src0_bufs = {}; + std::vector> src1_bufs = {}; + std::vector> output_bufs = {}; + + CoreCoord worker_grid_size = mesh_device_->compute_with_storage_grid_size(); + + auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( + mesh_device_, src0_bufs, src1_bufs, output_bufs); + auto mesh_workload = CreateMeshWorkload(); + LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0}); + LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1}); + + AddProgramToMeshWorkload(mesh_workload, *programs[0], devices_0); + AddProgramToMeshWorkload(mesh_workload, *programs[1], devices_1); + + for (int iter = 0; iter < num_iters; iter++) { + std::vector src0_vec = create_constant_vector_of_bfloat16(src0_bufs[0]->size(), iter + 2); + std::vector src1_vec = create_constant_vector_of_bfloat16(src1_bufs[0]->size(), iter + 3); + + std::shared_ptr write_event = std::make_shared(); + std::shared_ptr op_event = std::make_shared(); + + // Issue writes on MeshCQ 1 + for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { + for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { + EnqueueWriteMeshBuffer( + mesh_device_->mesh_command_queue(1), src0_bufs[col_idx * worker_grid_size.y + row_idx], src0_vec); + EnqueueWriteMeshBuffer( + mesh_device_->mesh_command_queue(1), src1_bufs[col_idx * worker_grid_size.y + row_idx], src1_vec); + } + } + if (iter % 2) { + // Test Host <-> Device Synchronization + EnqueueRecordEventToHost(mesh_device_->mesh_command_queue(1), write_event); + EventSynchronize(write_event); + } else { + // Test Device <-> Device Synchronization + EnqueueRecordEvent(mesh_device_->mesh_command_queue(1), write_event); + EnqueueWaitForEvent(mesh_device_->mesh_command_queue(0), 
write_event); + } + // Issue workloads on MeshCQ 0 + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(0), mesh_workload, false); + if (iter % 2) { + // Test Device <-> Device Synchronization + EnqueueRecordEvent(mesh_device_->mesh_command_queue(0), op_event); + EnqueueWaitForEvent(mesh_device_->mesh_command_queue(1), op_event); + } else { + // Test Host <-> Device Synchronization + EnqueueRecordEventToHost(mesh_device_->mesh_command_queue(0), op_event); + EventSynchronize(op_event); + } + + // Issue reads on MeshCQ 1 + for (std::size_t logical_y = 0; logical_y < mesh_device_->num_rows(); logical_y++) { + for (std::size_t logical_x = 0; logical_x < mesh_device_->num_cols(); logical_x++) { + for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { + for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { + std::vector dst_vec = {}; + ReadShard( + mesh_device_->mesh_command_queue(1), + dst_vec, + output_bufs[col_idx * worker_grid_size.y + row_idx], + Coordinate(logical_y, logical_x)); + if (logical_y == 0) { + for (int i = 0; i < dst_vec.size(); i++) { + EXPECT_EQ(dst_vec[i].to_float(), (2 * iter + 5)); + } + } else { + for (int i = 0; i < dst_vec.size(); i++) { + EXPECT_EQ(dst_vec[i].to_float(), (iter + 2) * (iter + 3)); + } + } + } + } + } + } + } +} + +TEST_F(MeshEventsTest, CustomDeviceRanges) { + uint32_t NUM_TILES = 1000; + uint32_t num_iterations = 20; + int32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); + + DeviceLocalBufferConfig per_device_buffer_config{ + .page_size = single_tile_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = false}; + ReplicatedBufferConfig global_buffer_config = { + .size = NUM_TILES * single_tile_size, + }; + + std::shared_ptr buf = + MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get()); + + for (std::size_t i = 0; i < num_iterations; i++) { + std::vector src_vec(NUM_TILES * single_tile_size / sizeof(uint32_t), i); + std::iota(src_vec.begin(), src_vec.end(), i); + LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0}); + LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1}); + + std::vector> readback_vecs = {}; + std::shared_ptr event_0 = std::make_shared(); + std::shared_ptr event_1 = std::make_shared(); + + mesh_device_->mesh_command_queue(1).enqueue_write_shard_to_sub_grid(*buf, src_vec.data(), devices_0, false); + EnqueueRecordEvent(mesh_device_->mesh_command_queue(1), event_0, {}, devices_0); + EnqueueWaitForEvent(mesh_device_->mesh_command_queue(0), event_0); + + for (std::size_t logical_x = devices_0.start_coord.x; logical_x < devices_0.end_coord.x; logical_x++) { + for (std::size_t logical_y = devices_0.start_coord.y; logical_y < devices_0.end_coord.y; logical_y++) { + readback_vecs.push_back({}); + auto shard = buf->get_device_buffer(Coordinate(logical_y, logical_x)); + ReadShard( + mesh_device_->mesh_command_queue(0), readback_vecs.back(), buf, Coordinate(logical_y, logical_x)); + } + } + + mesh_device_->mesh_command_queue(1).enqueue_write_shard_to_sub_grid(*buf, src_vec.data(), devices_1, false); + EnqueueRecordEventToHost(mesh_device_->mesh_command_queue(1), event_1, {}, devices_1); + EventSynchronize(event_1); + + for (std::size_t logical_x = devices_1.start_coord.x; logical_x < devices_1.end_coord.x; logical_x++) { + for (std::size_t logical_y = devices_1.start_coord.y; logical_y < devices_1.end_coord.y; logical_y++) { + readback_vecs.push_back({}); + auto shard = 
buf->get_device_buffer(Coordinate(logical_y, logical_x)); + ReadShard( + mesh_device_->mesh_command_queue(0), readback_vecs.back(), buf, Coordinate(logical_y, logical_x)); + } + } + for (auto& vec : readback_vecs) { + EXPECT_EQ(vec, src_vec); + } + } + Finish(mesh_device_->mesh_command_queue(0)); + Finish(mesh_device_->mesh_command_queue(1)); +} + +} // namespace +} // namespace tt::tt_metal::distributed::test diff --git a/tests/tt_metal/distributed/test_mesh_sub_device.cpp b/tests/tt_metal/distributed/test_mesh_sub_device.cpp index 90c0983f4c1..7a21597dd59 100644 --- a/tests/tt_metal/distributed/test_mesh_sub_device.cpp +++ b/tests/tt_metal/distributed/test_mesh_sub_device.cpp @@ -116,34 +116,10 @@ TEST_F(MeshSubDeviceTest, DataCopyOnSubDevices) { std::vector src_vec(input_buf->size() / sizeof(uint32_t)); std::iota(src_vec.begin(), src_vec.end(), i); - EnqueueWriteMeshBuffer(mesh_device_->mesh_command_queue(), input_buf, src_vec, false); - // Read Back global semaphore value across all cores to verify that it has been reset to 0 - // before updating it through host - auto shard_parameters = - ShardSpecBuffer(all_cores, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {all_cores.size(), 1}); - DeviceLocalBufferConfig global_sem_buf_local_config{ - .page_size = sizeof(uint32_t), - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, - .shard_parameters = shard_parameters, - .bottom_up = false}; - ReplicatedBufferConfig global_sem_buf_global_config{ - .size = all_cores.size() * sizeof(uint32_t), - }; - - auto global_sem_buf = MeshBuffer::create( - global_sem_buf_global_config, global_sem_buf_local_config, mesh_device_.get(), global_sem.address()); - - for (std::size_t logical_x = 0; logical_x < input_buf->device()->num_cols(); logical_x++) { - for (std::size_t logical_y = 0; logical_y < input_buf->device()->num_rows(); logical_y++) { - std::vector dst_vec; - ReadShard( - mesh_device_->mesh_command_queue(), dst_vec, global_sem_buf, Coordinate(logical_y, logical_x)); - for (const auto& val : dst_vec) { - EXPECT_EQ(val, 0); - } - } - } + // Block after this write on host, since the global semaphore update starting the + // program goes through an independent path (UMD) and can go out of order wrt the + // buffer data + EnqueueWriteMeshBuffer(mesh_device_->mesh_command_queue(), input_buf, src_vec, true); for (auto device : mesh_device_->get_devices()) { tt::llrt::write_hex_vec_to_core( diff --git a/tests/tt_metal/distributed/test_mesh_workload.cpp b/tests/tt_metal/distributed/test_mesh_workload.cpp index ec25670047e..dcf3f9a4158 100644 --- a/tests/tt_metal/distributed/test_mesh_workload.cpp +++ b/tests/tt_metal/distributed/test_mesh_workload.cpp @@ -11,6 +11,7 @@ #include "tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp" #include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" +#include "tests/tt_metal/distributed/utils.hpp" namespace tt::tt_metal::distributed::test { namespace { @@ -323,123 +324,6 @@ std::shared_ptr initialize_dummy_program(CoreCoord worker_grid_size) { return program; } -std::vector> create_eltwise_bin_programs( - std::shared_ptr& mesh_device, - std::vector>& src0_bufs, - std::vector>& src1_bufs, - std::vector>& output_bufs) { - const std::vector op_id_to_op_define = {"add_tiles", "mul_tiles"}; - const std::vector op_id_to_op_type_define = {"EltwiseBinaryType::ELWADD", "EltwiseBinaryType::ELWMUL"}; - - CoreCoord worker_grid_size = mesh_device->compute_with_storage_grid_size(); - - std::vector> programs = {std::make_shared(), 
std::make_shared()}; - auto full_grid = CoreRange({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - - for (std::size_t eltwise_op = 0; eltwise_op < op_id_to_op_define.size(); eltwise_op++) { - auto& program = *programs[eltwise_op]; - uint32_t single_tile_size = 2 * 1024; - uint32_t num_tiles = 2048; - uint32_t dram_buffer_size = - single_tile_size * num_tiles; // num_tiles of FP16_B, hard-coded in the reader/writer kernels - uint32_t page_size = single_tile_size; - - ReplicatedBufferConfig global_buffer_config{.size = dram_buffer_size}; - DeviceLocalBufferConfig per_device_buffer_config{ - .page_size = page_size, - .buffer_type = tt_metal::BufferType::DRAM, - .buffer_layout = TensorMemoryLayout::INTERLEAVED, - .bottom_up = true}; - - for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { - for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { - auto src0_dram_buffer = - MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); - src0_bufs.push_back(src0_dram_buffer); - - auto src1_dram_buffer = - MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); - src1_bufs.push_back(src1_dram_buffer); - auto dst_dram_buffer = - MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); - output_bufs.push_back(dst_dram_buffer); - } - } - - uint32_t src0_cb_index = tt::CBIndex::c_0; - uint32_t num_input_tiles = 2; - tt_metal::CircularBufferConfig cb_src0_config = - tt_metal::CircularBufferConfig( - num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(src0_cb_index, single_tile_size); - auto cb_src0 = tt_metal::CreateCircularBuffer(program, full_grid, cb_src0_config); - - uint32_t src1_cb_index = tt::CBIndex::c_1; - tt_metal::CircularBufferConfig cb_src1_config = - tt_metal::CircularBufferConfig( - num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(src1_cb_index, single_tile_size); - auto cb_src1 = tt_metal::CreateCircularBuffer(program, full_grid, cb_src1_config); - - uint32_t ouput_cb_index = tt::CBIndex::c_16; - uint32_t num_output_tiles = 2; - tt_metal::CircularBufferConfig cb_output_config = - tt_metal::CircularBufferConfig( - num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(ouput_cb_index, single_tile_size); - auto cb_output = tt_metal::CreateCircularBuffer(program, full_grid, cb_output_config); - - auto binary_reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp", - full_grid, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); - - auto unary_writer_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp", - full_grid, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); - - std::vector compute_kernel_args = {}; - - bool fp32_dest_acc_en = false; - bool math_approx_mode = false; - std::map binary_defines = { - {"ELTWISE_OP", op_id_to_op_define[eltwise_op]}, {"ELTWISE_OP_TYPE", op_id_to_op_type_define[eltwise_op]}}; - auto eltwise_binary_kernel = tt_metal::CreateKernel( - program, - "tt_metal/kernels/compute/eltwise_binary.cpp", - full_grid, - tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines}); - - 
SetRuntimeArgs(program, eltwise_binary_kernel, full_grid, {2048, 1}); - - for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { - for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { - CoreCoord curr_core = {col_idx, row_idx}; - const std::array reader_args = { - src0_bufs.at(col_idx * worker_grid_size.y + row_idx)->address(), - 0, - num_tiles, - src1_bufs.at(col_idx * worker_grid_size.y + row_idx)->address(), - 0, - num_tiles, - 0}; - - const std::array writer_args = { - output_bufs.at(col_idx * worker_grid_size.y + row_idx)->address(), 0, num_tiles}; - - SetRuntimeArgs(program, unary_writer_kernel, curr_core, writer_args); - SetRuntimeArgs(program, binary_reader_kernel, curr_core, reader_args); - } - } - } - return programs; -} - void verify_cb_config( std::shared_ptr& mesh_device, MeshWorkload& workload, @@ -650,7 +534,8 @@ TEST_F(MeshWorkloadTest, EltwiseBinaryMeshWorkload) { CoreCoord worker_grid_size = mesh_device_->compute_with_storage_grid_size(); - auto programs = create_eltwise_bin_programs(mesh_device_, src0_bufs, src1_bufs, output_bufs); + auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( + mesh_device_, src0_bufs, src1_bufs, output_bufs); auto mesh_workload = CreateMeshWorkload(); LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0}); LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1}); diff --git a/tests/tt_metal/distributed/utils.cpp b/tests/tt_metal/distributed/utils.cpp new file mode 100644 index 00000000000..c53f1c9d96a --- /dev/null +++ b/tests/tt_metal/distributed/utils.cpp @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tests/tt_metal/distributed/utils.hpp" + +namespace tt::tt_metal::distributed::test::utils { + +std::vector> create_eltwise_bin_programs( + std::shared_ptr& mesh_device, + std::vector>& src0_bufs, + std::vector>& src1_bufs, + std::vector>& output_bufs) { + const std::vector op_id_to_op_define = {"add_tiles", "mul_tiles"}; + const std::vector op_id_to_op_type_define = {"EltwiseBinaryType::ELWADD", "EltwiseBinaryType::ELWMUL"}; + + CoreCoord worker_grid_size = mesh_device->compute_with_storage_grid_size(); + + std::vector> programs = {std::make_shared(), std::make_shared()}; + auto full_grid = CoreRange({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); + + for (std::size_t eltwise_op = 0; eltwise_op < op_id_to_op_define.size(); eltwise_op++) { + auto& program = *programs[eltwise_op]; + uint32_t single_tile_size = 2 * 1024; + uint32_t num_tiles = 2048; + uint32_t dram_buffer_size = + single_tile_size * num_tiles; // num_tiles of FP16_B, hard-coded in the reader/writer kernels + uint32_t page_size = single_tile_size; + + ReplicatedBufferConfig global_buffer_config{.size = dram_buffer_size}; + DeviceLocalBufferConfig per_device_buffer_config{ + .page_size = page_size, + .buffer_type = tt_metal::BufferType::DRAM, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = true}; + + for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { + for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { + auto src0_dram_buffer = + MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + src0_bufs.push_back(src0_dram_buffer); + + auto src1_dram_buffer = + MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + src1_bufs.push_back(src1_dram_buffer); + auto dst_dram_buffer = + 
MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + output_bufs.push_back(dst_dram_buffer); + } + } + + uint32_t src0_cb_index = tt::CBIndex::c_0; + uint32_t num_input_tiles = 2; + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig( + num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt_metal::CreateCircularBuffer(program, full_grid, cb_src0_config); + + uint32_t src1_cb_index = tt::CBIndex::c_1; + tt_metal::CircularBufferConfig cb_src1_config = + tt_metal::CircularBufferConfig( + num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src1_cb_index, single_tile_size); + auto cb_src1 = tt_metal::CreateCircularBuffer(program, full_grid, cb_src1_config); + + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t num_output_tiles = 2; + tt_metal::CircularBufferConfig cb_output_config = + tt_metal::CircularBufferConfig( + num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(ouput_cb_index, single_tile_size); + auto cb_output = tt_metal::CreateCircularBuffer(program, full_grid, cb_output_config); + + auto binary_reader_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp", + full_grid, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); + + auto unary_writer_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp", + full_grid, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); + + std::vector compute_kernel_args = {}; + + bool fp32_dest_acc_en = false; + bool math_approx_mode = false; + std::map binary_defines = { + {"ELTWISE_OP", op_id_to_op_define[eltwise_op]}, {"ELTWISE_OP_TYPE", op_id_to_op_type_define[eltwise_op]}}; + auto eltwise_binary_kernel = tt_metal::CreateKernel( + program, + "tt_metal/kernels/compute/eltwise_binary.cpp", + full_grid, + tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines}); + + SetRuntimeArgs(program, eltwise_binary_kernel, full_grid, {2048, 1}); + + for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { + for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { + CoreCoord curr_core = {col_idx, row_idx}; + const std::array reader_args = { + src0_bufs.at(col_idx * worker_grid_size.y + row_idx)->address(), + 0, + num_tiles, + src1_bufs.at(col_idx * worker_grid_size.y + row_idx)->address(), + 0, + num_tiles, + 0}; + + const std::array writer_args = { + output_bufs.at(col_idx * worker_grid_size.y + row_idx)->address(), 0, num_tiles}; + + SetRuntimeArgs(program, unary_writer_kernel, curr_core, writer_args); + SetRuntimeArgs(program, binary_reader_kernel, curr_core, reader_args); + } + } + } + return programs; +} + +} // namespace tt::tt_metal::distributed::test::utils diff --git a/tests/tt_metal/distributed/utils.hpp b/tests/tt_metal/distributed/utils.hpp new file mode 100644 index 00000000000..36b1bbb2fdd --- /dev/null +++ b/tests/tt_metal/distributed/utils.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +namespace tt::tt_metal::distributed::test::utils { + +std::vector> create_eltwise_bin_programs( + std::shared_ptr& mesh_device, + std::vector>& src0_bufs, + std::vector>& src1_bufs, + std::vector>& output_bufs); + +} // namespace tt::tt_metal::distributed::test::utils diff --git a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp index 04a8ce84a78..1fa6f2443c9 100644 --- a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp @@ -52,7 +52,7 @@ class N300DeviceFixture : public MultiDeviceFixture { class T3000MultiDeviceFixture : public ::testing::Test { protected: - void SetUp() override { + virtual void SetUp() override { using tt::tt_metal::distributed::MeshDevice; using tt::tt_metal::distributed::MeshDeviceConfig; using tt::tt_metal::distributed::MeshShape; @@ -66,7 +66,7 @@ class T3000MultiDeviceFixture : public ::testing::Test { if (num_devices < 8 or arch != tt::ARCH::WORMHOLE_B0) { GTEST_SKIP() << "Skipping T3K Multi-Device test suite on non T3K machine."; } - mesh_device_ = MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape{2, 4}}); + create_mesh_device(); } void TearDown() override { @@ -77,5 +77,28 @@ class T3000MultiDeviceFixture : public ::testing::Test { mesh_device_->close(); mesh_device_.reset(); } + +protected: + virtual void create_mesh_device() { + using tt::tt_metal::distributed::MeshDevice; + using tt::tt_metal::distributed::MeshDeviceConfig; + using tt::tt_metal::distributed::MeshShape; + + mesh_device_ = MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape{2, 4}}); + } + std::shared_ptr mesh_device_; }; + +class T3000MultiCQMultiDeviceFixture : public T3000MultiDeviceFixture { +protected: + // Override only the mesh device creation logic + void create_mesh_device() override { + using tt::tt_metal::distributed::MeshDevice; + using tt::tt_metal::distributed::MeshDeviceConfig; + using tt::tt_metal::distributed::MeshShape; + + mesh_device_ = + MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape{2, 4}}, 0, 0, 2, DispatchCoreType::ETH); + } +}; diff --git a/tt_metal/api/tt-metalium/command_queue.hpp b/tt_metal/api/tt-metalium/command_queue.hpp index 9c9bb3b29de..3c1a57fe7e7 100644 --- a/tt_metal/api/tt-metalium/command_queue.hpp +++ b/tt_metal/api/tt-metalium/command_queue.hpp @@ -75,10 +75,9 @@ class CommandQueue { tt::stl::Span sub_device_ids = {}) = 0; virtual void enqueue_record_event( - const std::shared_ptr& event, - bool clear_count = false, - tt::stl::Span sub_device_ids = {}) = 0; - virtual void enqueue_wait_for_event(const std::shared_ptr& sync_event, bool clear_count = false) = 0; + const std::shared_ptr& event, tt::stl::Span sub_device_ids = {}) = 0; + + virtual void enqueue_wait_for_event(const std::shared_ptr& sync_event) = 0; virtual void enqueue_write_buffer( const std::variant, std::shared_ptr>& buffer, diff --git a/tt_metal/api/tt-metalium/dispatch_core_manager.hpp b/tt_metal/api/tt-metalium/dispatch_core_manager.hpp index 62433e832b5..2edda1f01ae 100644 --- a/tt_metal/api/tt-metalium/dispatch_core_manager.hpp +++ b/tt_metal/api/tt-metalium/dispatch_core_manager.hpp @@ -143,6 +143,8 @@ class dispatch_core_manager { bool is_dispatcher_s_core_allocated(chip_id_t device_id, uint16_t channel, uint8_t cq_id); + bool is_dispatcher_d_core_allocated(chip_id_t device_id, uint16_t channel, uint8_t cq_id); + /// @brief Gets the 
location of the kernel designated to relay fast dispatch commands to worker cores from a particular command queue /// @param device_id ID of the device that should be running the command /// @param channel assigned to the command queue where commands are enqueued diff --git a/tt_metal/api/tt-metalium/distributed.hpp b/tt_metal/api/tt-metalium/distributed.hpp index 96b3a23ed10..017214b437a 100644 --- a/tt_metal/api/tt-metalium/distributed.hpp +++ b/tt_metal/api/tt-metalium/distributed.hpp @@ -6,7 +6,7 @@ #include "mesh_buffer.hpp" #include "mesh_command_queue.hpp" -#include "mesh_workload.hpp" +#include "mesh_event.hpp" namespace tt::tt_metal { @@ -78,7 +78,23 @@ void EnqueueReadMeshBuffer( mesh_cq.enqueue_read_mesh_buffer(dst.data(), mesh_buffer, blocking); } -void Finish(MeshCommandQueue& mesh_cq); +void EnqueueRecordEvent( + MeshCommandQueue& mesh_cq, + const std::shared_ptr& event, + tt::stl::Span sub_device_ids = {}, + const std::optional& device_range = std::nullopt); + +void EnqueueRecordEventToHost( + MeshCommandQueue& mesh_cq, + const std::shared_ptr& event, + tt::stl::Span sub_device_ids = {}, + const std::optional& device_range = std::nullopt); + +void EnqueueWaitForEvent(MeshCommandQueue& mesh_cq, const std::shared_ptr& event); + +void EventSynchronize(const std::shared_ptr& event); + +void Finish(MeshCommandQueue& mesh_cq, tt::stl::Span sub_device_ids = {}); } // namespace distributed } // namespace tt::tt_metal diff --git a/tt_metal/api/tt-metalium/mesh_command_queue.hpp b/tt_metal/api/tt-metalium/mesh_command_queue.hpp index 61263207b9c..11ca2ab65e8 100644 --- a/tt_metal/api/tt-metalium/mesh_command_queue.hpp +++ b/tt_metal/api/tt-metalium/mesh_command_queue.hpp @@ -5,6 +5,8 @@ #pragma once #include +#include + #include "buffer.hpp" #include "command_queue_interface.hpp" #include "mesh_buffer.hpp" @@ -13,6 +15,9 @@ namespace tt::tt_metal::distributed { +class MeshEvent; +struct MeshReadEventDescriptor; + class MeshCommandQueue { // Main interface to dispatch data and workloads to a MeshDevice // Currently only supports dispatching workloads and relies on the @@ -39,12 +44,18 @@ class MeshCommandQueue { // Helper functions for read and write entire Sharded-MeshBuffers void write_sharded_buffer(const MeshBuffer& buffer, const void* src); void read_sharded_buffer(MeshBuffer& buffer, void* dst); + void enqueue_record_event_helper( + const std::shared_ptr& event, + tt::stl::Span sub_device_ids, + bool notify_host, + const std::optional& device_range = std::nullopt); std::array config_buffer_mgr_; std::array expected_num_workers_completed_; MeshDevice* mesh_device_ = nullptr; uint32_t id_ = 0; CoreCoord dispatch_core_; CoreType dispatch_core_type_ = CoreType::WORKER; + std::queue> event_descriptors_; public: MeshCommandQueue(MeshDevice* mesh_device, uint32_t id); @@ -76,7 +87,18 @@ class MeshCommandQueue { const std::shared_ptr& mesh_buffer, bool blocking); - void finish(); + void enqueue_record_event( + const std::shared_ptr& event, + tt::stl::Span sub_device_ids = {}, + const std::optional& device_range = std::nullopt); + void enqueue_record_event_to_host( + const std::shared_ptr& event, + tt::stl::Span sub_device_ids = {}, + const std::optional& device_range = std::nullopt); + void enqueue_wait_for_event(const std::shared_ptr& sync_event); + void drain_events_from_completion_queue(); + void verify_reported_events_after_draining(const std::shared_ptr& event); + void finish(tt::stl::Span sub_device_ids = {}); void reset_worker_state( bool reset_launch_msg_state, uint32_t 
num_sub_devices, diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index ec04ada058f..c4f1469ee46 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -58,7 +58,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this> submeshes_; // Parent owns submeshes and is responsible for their destruction std::weak_ptr parent_mesh_; // Submesh created with reference to parent mesh - std::unique_ptr mesh_command_queue_; + std::vector> mesh_command_queues_; std::unique_ptr sub_device_manager_tracker_; // This is a reference device used to query properties that are the same for all devices in the mesh. @@ -238,7 +238,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this sub_devices, DeviceAddr local_l1_size); // TODO #16526: Temporary api until migration to actual fabric is complete diff --git a/tt_metal/api/tt-metalium/mesh_device_view.hpp b/tt_metal/api/tt-metalium/mesh_device_view.hpp index 98a7cad5740..fbadc8f32c2 100644 --- a/tt_metal/api/tt-metalium/mesh_device_view.hpp +++ b/tt_metal/api/tt-metalium/mesh_device_view.hpp @@ -39,6 +39,15 @@ struct Coordinate { return os << "Coord(" << coord.row << ", " << coord.col << ")"; } }; +// TODO (Issue #17477): MeshWorkload and MeshEvent currently rely on the coordinate systems +// exposed below. These must be uplifted to an ND coordinate system (DeviceCoord and DeviceRange), +// keeping things more consistent across the stack. +// For now, since the LogicalDeviceRange concept is fundamentally identical to the CoreRange concept +// on a 2D Mesh use this definition. CoreRange contains several utility functions required +// in the MeshWorkload context. + +using DeviceCoord = CoreCoord; +using LogicalDeviceRange = CoreRange; /** * @brief The MeshDeviceView class provides a view of a specific sub-region within the MeshDevice. diff --git a/tt_metal/api/tt-metalium/mesh_event.hpp b/tt_metal/api/tt-metalium/mesh_event.hpp new file mode 100644 index 00000000000..f115a118d15 --- /dev/null +++ b/tt_metal/api/tt-metalium/mesh_event.hpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "mesh_device.hpp" + +namespace tt::tt_metal::distributed { + +class MeshEvent { +public: + MeshDevice* device = nullptr; + LogicalDeviceRange device_range = LogicalDeviceRange({0, 0}); + uint32_t cq_id = 0; + uint32_t event_id = 0; +}; + +} // namespace tt::tt_metal::distributed diff --git a/tt_metal/api/tt-metalium/mesh_workload.hpp b/tt_metal/api/tt-metalium/mesh_workload.hpp index 577c1f0e7d6..f57bccb3edf 100644 --- a/tt_metal/api/tt-metalium/mesh_workload.hpp +++ b/tt_metal/api/tt-metalium/mesh_workload.hpp @@ -9,11 +9,6 @@ #include "mesh_buffer.hpp" namespace tt::tt_metal::distributed { -// The LogicalDeviceRange concept is fundamentally identical to the CoreRange concept -// Use this definition for now, since CoreRange contains several utility functions required -// in the MeshWorkload context. CoreRange can eventually be renamed to Range2D. 
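For orientation, the MeshEvent and distributed event APIs declared above (mesh_event.hpp, distributed.hpp) compose roughly as follows. This is an illustrative sketch only, not part of the patch: device and buffer setup are assumed and include paths are abbreviated.

// Sketch: synchronize work across the two mesh command queues using the new event APIs.
// Assumes a fast-dispatch MeshDevice with two hardware CQs and an allocated MeshBuffer.
#include <memory>
#include <vector>

using namespace tt::tt_metal::distributed;

void record_and_sync_events(
    std::shared_ptr<MeshDevice>& mesh_device,
    std::shared_ptr<MeshBuffer>& buf,
    std::vector<uint32_t>& src_vec) {
    // Non-blocking write on CQ 0.
    EnqueueWriteMeshBuffer(mesh_device->mesh_command_queue(0), buf, src_vec, /*blocking=*/false);

    // Record an event on CQ 0 and make CQ 1 wait on it before issuing dependent work.
    auto event = std::make_shared<MeshEvent>();
    EnqueueRecordEvent(mesh_device->mesh_command_queue(0), event);
    EnqueueWaitForEvent(mesh_device->mesh_command_queue(1), event);

    // Record a host-visible event on CQ 1 and block the host until it is reported back.
    auto host_event = std::make_shared<MeshEvent>();
    EnqueueRecordEventToHost(mesh_device->mesh_command_queue(1), host_event);
    EventSynchronize(host_event);

    // Or drain both queues entirely.
    Finish(mesh_device->mesh_command_queue(0));
    Finish(mesh_device->mesh_command_queue(1));
}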
-using LogicalDeviceRange = CoreRange; -using DeviceCoord = CoreCoord; using RuntimeArgsPerCore = std::vector>; class MeshCommandQueue; diff --git a/tt_metal/distributed/distributed.cpp b/tt_metal/distributed/distributed.cpp index d7410816baa..b92546832a1 100644 --- a/tt_metal/distributed/distributed.cpp +++ b/tt_metal/distributed/distributed.cpp @@ -20,6 +20,34 @@ void EnqueueMeshWorkload(MeshCommandQueue& mesh_cq, MeshWorkload& mesh_workload, mesh_cq.enqueue_mesh_workload(mesh_workload, blocking); } -void Finish(MeshCommandQueue& mesh_cq) { mesh_cq.finish(); } +void EnqueueRecordEvent( + MeshCommandQueue& mesh_cq, + const std::shared_ptr& event, + tt::stl::Span sub_device_ids, + const std::optional& device_range) { + mesh_cq.enqueue_record_event(event, sub_device_ids, device_range); +} + +void EnqueueRecordEventToHost( + MeshCommandQueue& mesh_cq, + const std::shared_ptr& event, + tt::stl::Span sub_device_ids, + const std::optional& device_range) { + mesh_cq.enqueue_record_event_to_host(event, sub_device_ids, device_range); +} + +void EnqueueWaitForEvent(MeshCommandQueue& mesh_cq, const std::shared_ptr& event) { + mesh_cq.enqueue_wait_for_event(event); +} + +void EventSynchronize(const std::shared_ptr& event) { + auto& mesh_cq = event->device->mesh_command_queue(event->cq_id); + mesh_cq.drain_events_from_completion_queue(); + mesh_cq.verify_reported_events_after_draining(event); +} + +void Finish(MeshCommandQueue& mesh_cq, tt::stl::Span sub_device_ids) { + mesh_cq.finish(sub_device_ids); +} } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index 89eaaff1b03..d19911a3112 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -11,9 +12,15 @@ #include "tt_metal/distributed/mesh_workload_utils.hpp" #include "tt_metal/impl/buffers/dispatch.hpp" #include "tt_metal/impl/program/dispatch.hpp" +#include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" namespace tt::tt_metal::distributed { +struct MeshReadEventDescriptor { + ReadEventDescriptor single_device_descriptor; + LogicalDeviceRange device_range; +}; + MeshCommandQueue::MeshCommandQueue(MeshDevice* mesh_device, uint32_t id) { this->mesh_device_ = mesh_device; this->id_ = id; @@ -62,6 +69,8 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b auto sub_device_index = sub_device_id.to_index(); auto mesh_device_id = this->mesh_device_->id(); auto& sysmem_manager = mesh_device_->get_device(0, 0)->sysmem_manager(); + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + CoreType dispatch_core_type = dispatch_core_config.get_core_type(); TT_FATAL( mesh_workload.get_program_binary_status(mesh_device_id) != ProgramBinaryStatus::NotSent, @@ -105,7 +114,7 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b sysmem_manager.get_worker_launch_message_buffer_state()[sub_device_index].get_unicast_wptr(), expected_num_workers_completed_[sub_device_index], this->virtual_program_dispatch_core(), - this->dispatch_core_type(), + dispatch_core_type, sub_device_id, dispatch_metadata, mesh_workload.get_program_binary_status(mesh_device_id), @@ -117,14 +126,13 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b logical_x++) { for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y + 1; 
logical_y++) { - experimental::write_program_commands( - this->mesh_device_->get_device(logical_y, logical_x)->command_queue(this->id_), + program_dispatch::write_program_command_sequence( program_cmd_seq, - num_workers, - sub_device_id, + this->mesh_device_->get_device(logical_y, logical_x)->sysmem_manager(), + id_, + dispatch_core_type, dispatch_metadata.stall_first, - dispatch_metadata.stall_before_program, - false); + dispatch_metadata.stall_before_program); chip_ids_in_workload.insert(this->mesh_device_->get_device(logical_y, logical_x)->id()); } } @@ -132,8 +140,11 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b // Send go signals to devices not running a program to ensure consistent global state for (auto& device : this->mesh_device_->get_devices()) { if (chip_ids_in_workload.find(device->id()) == chip_ids_in_workload.end()) { - experimental::write_go_signal( - device->command_queue(this->id_), + write_go_signal( + id_, + device, + sub_device_id, + device->sysmem_manager(), expected_num_workers_completed_[sub_device_index], this->virtual_program_dispatch_core(), mcast_go_signals, @@ -159,10 +170,11 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b } } -void MeshCommandQueue::finish() { - for (auto device : this->mesh_device_->get_devices()) { - Finish(device->command_queue(this->id_)); - } +void MeshCommandQueue::finish(tt::stl::Span sub_device_ids) { + std::shared_ptr event = std::make_shared(); + this->enqueue_record_event_to_host(event, sub_device_ids); + this->drain_events_from_completion_queue(); + this->verify_reported_events_after_draining(event); } void MeshCommandQueue::write_shard_to_device( @@ -181,6 +193,7 @@ void MeshCommandQueue::read_shard_from_device( void* dst, const BufferRegion& region, tt::stl::Span sub_device_ids) { + this->drain_events_from_completion_queue(); auto device = shard_view->device(); chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); @@ -417,6 +430,110 @@ void MeshCommandQueue::enqueue_read_shards( } } +void MeshCommandQueue::enqueue_record_event_helper( + const std::shared_ptr& event, + tt::stl::Span sub_device_ids, + bool notify_host, + const std::optional& device_range) { + auto& sysmem_manager = mesh_device_->get_device(0, 0)->sysmem_manager(); + event->cq_id = id_; + event->event_id = sysmem_manager.get_next_event(id_); + event->device = mesh_device_; + event->device_range = + device_range.value_or(LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1})); + + sub_device_ids = buffer_dispatch::select_sub_device_ids(mesh_device_, sub_device_ids); + for (std::size_t logical_x = event->device_range.start_coord.x; logical_x < event->device_range.end_coord.x + 1; + logical_x++) { + for (std::size_t logical_y = event->device_range.start_coord.y; logical_y < event->device_range.end_coord.y + 1; + logical_y++) { + event_dispatch::issue_record_event_commands( + mesh_device_, + event->event_id, + id_, + mesh_device_->num_hw_cqs(), + mesh_device_->get_device(logical_y, logical_x)->sysmem_manager(), + sub_device_ids, + expected_num_workers_completed_, + notify_host); + } + } +} + +void MeshCommandQueue::enqueue_record_event( + const std::shared_ptr& event, + tt::stl::Span sub_device_ids, + const std::optional& device_range) { + this->enqueue_record_event_helper(event, sub_device_ids, false, device_range); +} + +void 
MeshCommandQueue::enqueue_record_event_to_host( + const std::shared_ptr& event, + tt::stl::Span sub_device_ids, + const std::optional& device_range) { + this->enqueue_record_event_helper(event, sub_device_ids, true, device_range); + event_descriptors_.push(std::make_shared(MeshReadEventDescriptor{ + .single_device_descriptor = ReadEventDescriptor(event->event_id), .device_range = event->device_range})); +} + +void MeshCommandQueue::enqueue_wait_for_event(const std::shared_ptr& sync_event) { + for (std::size_t logical_x = sync_event->device_range.start_coord.x; + logical_x < sync_event->device_range.end_coord.x + 1; + logical_x++) { + for (std::size_t logical_y = sync_event->device_range.start_coord.y; + logical_y < sync_event->device_range.end_coord.y + 1; + logical_y++) { + event_dispatch::issue_wait_for_event_commands( + id_, + sync_event->cq_id, + mesh_device_->get_device(logical_y, logical_x)->sysmem_manager(), + sync_event->event_id); + } + } +} + +void MeshCommandQueue::drain_events_from_completion_queue() { + constexpr bool exit_condition = false; + auto num_events = event_descriptors_.size(); + for (std::size_t event_idx = 0; event_idx < num_events; event_idx++) { + auto& mesh_read_descriptor = event_descriptors_.front(); + auto& device_range = mesh_read_descriptor->device_range; + for (std::size_t logical_x = device_range.start_coord.x; logical_x < device_range.end_coord.x + 1; + logical_x++) { + for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y + 1; + logical_y++) { + auto device = mesh_device_->get_device(logical_y, logical_x); + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); + bool exit_condition = false; + device->sysmem_manager().completion_queue_wait_front(id_, exit_condition); + event_dispatch::read_events_from_completion_queue( + mesh_read_descriptor->single_device_descriptor, + mmio_device_id, + channel, + id_, + device->sysmem_manager()); + } + } + event_descriptors_.pop(); + } +} + +void MeshCommandQueue::verify_reported_events_after_draining(const std::shared_ptr& event) { + auto& device_range = event->device_range; + for (std::size_t logical_x = device_range.start_coord.x; logical_x < device_range.end_coord.x + 1; logical_x++) { + for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y + 1; + logical_y++) { + TT_FATAL( + mesh_device_->get_device(logical_y, logical_x) + ->sysmem_manager() + .get_last_completed_event(event->cq_id) >= event->event_id, + "Expected to see event id {} in completion queue", + event->event_id); + } + } +} + void MeshCommandQueue::reset_worker_state( bool reset_launch_msg_state, uint32_t num_sub_devices, const vector_memcpy_aligned& go_signal_noc_data) { for (auto device : mesh_device_->get_devices()) { @@ -433,11 +550,6 @@ void MeshCommandQueue::reset_worker_state( } program_dispatch::reset_config_buf_mgrs_and_expected_workers( config_buffer_mgr_, expected_num_workers_completed_, mesh_device_->num_sub_devices()); - for (auto device : mesh_device_->get_devices()) { - for (int i = 0; i < mesh_device_->num_sub_devices(); i++) { - device->command_queue(id_).set_expected_num_workers_completed_for_sub_device(i, 0); - } - } if (reset_launch_msg_state) { auto& sysmem_manager = mesh_device_->get_device(0, 0)->sysmem_manager(); sysmem_manager.reset_worker_launch_message_buffer_state(num_sub_devices); diff --git 
a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index e02498c3c28..312d164934b 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -221,9 +221,10 @@ IDevice* MeshDevice::get_device(size_t row_idx, size_t col_idx) const { return this->get_device_index(row_idx * num_cols() + col_idx); } -MeshCommandQueue& MeshDevice::mesh_command_queue() { - TT_FATAL(this->using_fast_dispatch(), "Can only acess the MeshCommandQueue when using Fast Dispatch."); - return *(mesh_command_queue_); +MeshCommandQueue& MeshDevice::mesh_command_queue(std::size_t cq_id) const { + TT_FATAL(this->using_fast_dispatch(), "Can only access the MeshCommandQueue when using Fast Dispatch."); + TT_FATAL(cq_id < mesh_command_queues_.size(), "cq_id {} is out of range", cq_id); + return *(mesh_command_queues_[cq_id]); } const DeviceIds MeshDevice::get_device_ids() const { @@ -626,9 +627,11 @@ bool MeshDevice::initialize( const auto& allocator = reference_device()->allocator(); sub_device_manager_tracker_ = std::make_unique( this, std::make_unique(allocator->get_config()), sub_devices); - + mesh_command_queues_.reserve(this->num_hw_cqs()); if (this->using_fast_dispatch()) { - mesh_command_queue_ = std::make_unique(this, 0); + for (std::size_t cq_id = 0; cq_id < this->num_hw_cqs(); cq_id++) { + mesh_command_queues_.push_back(std::make_unique(this, cq_id)); + } } return true; } diff --git a/tt_metal/distributed/mesh_workload_utils.cpp b/tt_metal/distributed/mesh_workload_utils.cpp index 634249da09c..c51a99c957a 100644 --- a/tt_metal/distributed/mesh_workload_utils.cpp +++ b/tt_metal/distributed/mesh_workload_utils.cpp @@ -6,54 +6,28 @@ #include #include "tt_metal/impl/program/dispatch.hpp" +#include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" namespace tt::tt_metal::distributed { -namespace experimental { - -void write_program_commands( - CommandQueue& cq, - ProgramCommandSequence& program_cmd_seq, - uint32_t num_active_cores_in_program, - SubDeviceId sub_device_id, - bool stall_first, - bool stall_before_program, - bool blocking) { - auto sub_device_index = sub_device_id.to_index(); - // Increment expected num workers inside single device CQs to ensure other paths dont break. - // This is temporary, since data movement and events rely on single device CQs. Once MeshCommandQueue - // supports all runtime features, this will be removed, and program dispatch commands will be written - // directly through dedicated interfaces. - - uint32_t num_workers_in_cq = cq.get_expected_num_workers_completed_for_sub_device(sub_device_index); - cq.set_expected_num_workers_completed_for_sub_device( - sub_device_index, num_workers_in_cq + num_active_cores_in_program); - // Write program command stream to device - program_dispatch::write_program_command_sequence( - program_cmd_seq, - cq.device()->sysmem_manager(), - cq.id(), - dispatch_core_manager::instance().get_dispatch_core_type(cq.device()->id()), - stall_first, - stall_before_program); -} - // Use this function to send go signals to a device not running a program. // In the MeshWorkload context, a go signal must be sent to each device when // a workload is dispatched, in order to maintain consistent global state. 
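The optional device_range argument on the record APIs is a LogicalDeviceRange (per mesh_device_view.hpp above, an alias of CoreRange over 2D mesh coordinates), so an event can cover only part of the mesh. A hypothetical sketch, assuming a mesh with at least one full row and the per-CQ accessor shown above:

// Illustrative only: record a host-visible event covering the first row of the mesh
// (x = column, y = row, matching the coordinate use in enqueue_record_event_helper).
using namespace tt::tt_metal::distributed;

void sync_first_row(std::shared_ptr<MeshDevice>& mesh_device) {
    LogicalDeviceRange first_row({0, 0}, {mesh_device->num_cols() - 1, 0});
    auto event = std::make_shared<MeshEvent>();
    EnqueueRecordEventToHost(
        mesh_device->mesh_command_queue(0), event, /*sub_device_ids=*/{}, first_row);
    EventSynchronize(event);
}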
void write_go_signal( - CommandQueue& cq, + uint8_t cq_id, + IDevice* device, + SubDeviceId sub_device_id, + SystemMemoryManager& sysmem_manager, uint32_t expected_num_workers_completed, CoreCoord dispatch_core, bool send_mcast, bool send_unicasts, - int num_unicast_txns = -1) { + int num_unicast_txns) { uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); uint32_t cmd_sequence_sizeB = align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment) + hal.get_alignment(HalMemType::HOST); - auto& manager = cq.device()->sysmem_manager(); - void* cmd_region = manager.issue_queue_reserve(cmd_sequence_sizeB, cq.id()); + void* cmd_region = sysmem_manager.issue_queue_reserve(cmd_sequence_sizeB, cq_id); HugepageDeviceCommand go_signal_cmd_sequence(cmd_region, cmd_sequence_sizeB); go_msg_t run_program_go_signal; @@ -63,30 +37,37 @@ void write_go_signal( run_program_go_signal.master_y = dispatch_core.y; run_program_go_signal.dispatch_message_offset = 0; - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(cq.device()->id()); + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); uint32_t dispatch_message_addr = DispatchMemMap::get(dispatch_core_type) .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); - go_signal_cmd_sequence.add_notify_dispatch_s_go_signal_cmd( - 0, /* wait */ - 1 /* index_bitmask */); - + auto sub_device_index = sub_device_id.to_index(); + // When running with dispatch_s enabled: + // - dispatch_d must notify dispatch_s that a go signal can be sent + // - dispatch_s then mcasts the go signal to all workers. + // When running without dispatch_s: + // - dispatch_d handles sending the go signal to all workers + // There is no need for dispatch_d to barrier before sending the dispatch_s notification or go signal, + // since this go signal is not preceeded by NOC txns for program config data + if (DispatchQueryManager::instance().dispatch_s_enabled()) { + uint16_t index_bitmask = 1 << sub_device_index; + go_signal_cmd_sequence.add_notify_dispatch_s_go_signal_cmd( + 0, /* wait */ + index_bitmask /* index_bitmask */); // When running on sub devices, we must account for this + } go_signal_cmd_sequence.add_dispatch_go_signal_mcast( expected_num_workers_completed, *reinterpret_cast(&run_program_go_signal), dispatch_message_addr, - send_mcast ? cq.device()->num_noc_mcast_txns(SubDeviceId{0}) : 0, - send_unicasts ? ((num_unicast_txns > 0) ? num_unicast_txns : cq.device()->num_noc_unicast_txns(SubDeviceId{0})) - : 0, - 0, /* noc_data_start_idx */ + send_mcast ? device->num_noc_mcast_txns(sub_device_id) : 0, + send_unicasts ? ((num_unicast_txns > 0) ? 
num_unicast_txns : device->num_noc_unicast_txns(sub_device_id)) : 0, + device->noc_data_start_index(sub_device_id, send_mcast, send_unicasts), /* noc_data_start_idx */ DispatcherSelect::DISPATCH_SLAVE); - manager.issue_queue_push_back(cmd_sequence_sizeB, cq.id()); + sysmem_manager.issue_queue_push_back(cmd_sequence_sizeB, cq_id); - manager.fetch_queue_reserve_back(cq.id()); - manager.fetch_queue_write(cmd_sequence_sizeB, cq.id()); + sysmem_manager.fetch_queue_reserve_back(cq_id); + sysmem_manager.fetch_queue_write(cmd_sequence_sizeB, cq_id); } -} // namespace experimental - } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_workload_utils.hpp b/tt_metal/distributed/mesh_workload_utils.hpp index e6b0429dd54..1461aad13f8 100644 --- a/tt_metal/distributed/mesh_workload_utils.hpp +++ b/tt_metal/distributed/mesh_workload_utils.hpp @@ -4,30 +4,19 @@ #include +// Utility functions for dispatch MeshWorkloads +// Used by MeshCommandQueue namespace tt::tt_metal::distributed { -namespace experimental { -// Utility functions for writing program dispatch commands -// and go signals through the per device CQ. -// Usage of these functions is temporary, until the MeshCQ -// can function independently and support MeshBuffer reads and -// writes. -void write_program_commands( - CommandQueue& cq, - ProgramCommandSequence& program_cmd_seq, - uint32_t num_active_cores_in_program, - SubDeviceId sub_device_id, - bool stall_first, - bool stall_before_program, - bool blocking); - void write_go_signal( - CommandQueue& cq, + uint8_t cq_id, + IDevice* device, + SubDeviceId sub_device_id, + SystemMemoryManager& sysmem_manager, uint32_t expected_num_workers_completed, CoreCoord dispatch_core, bool send_mcast, bool send_unicasts, int num_unicast_txns = -1); -} // namespace experimental } // namespace tt::tt_metal::distributed diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index c72409857bf..46a2578a2af 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -47,6 +47,7 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/trace/trace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/trace/trace_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event/event.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/event/dispatch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/flatbuffer/base_types_from_flatbuffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/flatbuffer/base_types_to_flatbuffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/flatbuffer/buffer_types_from_flatbuffer.cpp diff --git a/tt_metal/impl/buffers/dispatch.hpp b/tt_metal/impl/buffers/dispatch.hpp index 15c3fa6e440..c2064fce6a4 100644 --- a/tt_metal/impl/buffers/dispatch.hpp +++ b/tt_metal/impl/buffers/dispatch.hpp @@ -8,6 +8,7 @@ #include #include #include "buffer.hpp" +#include "tt_metal/impl/event/dispatch.hpp" namespace tt::tt_metal { @@ -44,17 +45,6 @@ struct ReadBufferDescriptor { starting_host_page_id(starting_host_page_id) {} }; -// Used so host knows data in completion queue is just an event ID -struct ReadEventDescriptor { - uint32_t event_id; - uint32_t global_offset; - - explicit ReadEventDescriptor(uint32_t event) : event_id(event), global_offset(0) {} - - void set_global_offset(uint32_t offset) { global_offset = offset; } - uint32_t get_global_event_id() { return global_offset + event_id; } -}; - using CompletionReaderVariant = std::variant; // Contains helper functions to interface with buffers on device diff --git a/tt_metal/impl/dispatch/dispatch_core_manager.cpp b/tt_metal/impl/dispatch/dispatch_core_manager.cpp index 09b8f7e4b4a..401172737e9 100644 --- 
a/tt_metal/impl/dispatch/dispatch_core_manager.cpp +++ b/tt_metal/impl/dispatch/dispatch_core_manager.cpp @@ -225,6 +225,11 @@ bool dispatch_core_manager::is_dispatcher_s_core_allocated(chip_id_t device_id, return assignment.dispatcher_s.has_value(); } +bool dispatch_core_manager::is_dispatcher_d_core_allocated(chip_id_t device_id, uint16_t channel, uint8_t cq_id) { + dispatch_core_placement_t& assignment = this->dispatch_core_assignments[device_id][channel][cq_id]; + return assignment.dispatcher_d.has_value(); +} + const tt_cxy_pair& dispatch_core_manager::dispatcher_d_core(chip_id_t device_id, uint16_t channel, uint8_t cq_id) { dispatch_core_placement_t& assignment = this->dispatch_core_assignments[device_id][channel][cq_id]; if (assignment.dispatcher_d.has_value()) { diff --git a/tt_metal/impl/dispatch/dispatch_query_manager.cpp b/tt_metal/impl/dispatch/dispatch_query_manager.cpp index e49af46ef7e..9eef6cbc72a 100644 --- a/tt_metal/impl/dispatch/dispatch_query_manager.cpp +++ b/tt_metal/impl/dispatch/dispatch_query_manager.cpp @@ -6,6 +6,8 @@ #include "tt_cluster.hpp" +using dispatch_core_mgr = tt::tt_metal::dispatch_core_manager; + namespace { tt::tt_metal::DispatchCoreConfig dispatch_core_config() { @@ -13,7 +15,7 @@ tt::tt_metal::DispatchCoreConfig dispatch_core_config() { tt::tt_metal::DispatchCoreConfig first_dispatch_core_config; for (chip_id_t device_id = 0; device_id < tt::Cluster::instance().number_of_devices(); device_id++) { - dispatch_core_config = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_config(device_id); + dispatch_core_config = dispatch_core_mgr::instance().get_dispatch_core_config(device_id); if (device_id == 0) { first_dispatch_core_config = dispatch_core_config; } else { @@ -26,6 +28,36 @@ tt::tt_metal::DispatchCoreConfig dispatch_core_config() { return dispatch_core_config; } +tt_cxy_pair dispatch_core(uint8_t cq_id) { + tt_cxy_pair dispatch_core = tt_cxy_pair(0, 0, 0); + std::optional first_dispatch_core = std::nullopt; + for (chip_id_t device_id = 0; device_id < tt::Cluster::instance().number_of_devices(); device_id++) { + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + if (tt::Cluster::instance().get_associated_mmio_device(device_id) == device_id) { + // Dispatch core is not allocated on this MMIO device, skip it + if (not dispatch_core_mgr::instance().is_dispatcher_core_allocated(device_id, channel, cq_id)) { + continue; + } + dispatch_core = dispatch_core_mgr::instance().dispatcher_core(device_id, channel, cq_id); + } else { + // Dispatch core is not allocated on this Non-MMIO device, skip it + if (not dispatch_core_mgr::instance().is_dispatcher_d_core_allocated(device_id, channel, cq_id)) { + continue; + } + dispatch_core = dispatch_core_mgr::instance().dispatcher_d_core(device_id, channel, cq_id); + } + if (not first_dispatch_core.has_value()) { + first_dispatch_core = dispatch_core; + } else { + TT_FATAL( + dispatch_core.x == first_dispatch_core.value().x and dispatch_core.y == first_dispatch_core.value().y, + "Expected the Dispatch Cores to be consistent across physical devices"); + } + } + TT_FATAL(first_dispatch_core.has_value(), "Could not find the dispatch core for {}", cq_id); + return dispatch_core; +} + tt::tt_metal::DispatchQueryManager* inst = nullptr; } // namespace @@ -60,6 +92,8 @@ void DispatchQueryManager::reset(uint8_t num_hw_cqs) { distributed_dispatcher_ = (num_hw_cqs == 1 and dispatch_core_config_.get_dispatch_core_type() == DispatchCoreType::ETH); go_signal_noc_ = 
dispatch_s_enabled_ ? NOC::NOC_1 : NOC::NOC_0; + // Reset the dispatch cores reported by the manager. Will be re-populated when the associated query is made + dispatch_cores_ = {}; } const DispatchCoreConfig& DispatchQueryManager::get_dispatch_core_config() const { return dispatch_core_config_; } @@ -72,6 +106,19 @@ const std::vector& DispatchQueryManager::get_logical_dispatch_cores(u return tt::get_logical_dispatch_cores(device_id, num_hw_cqs_, dispatch_core_config_); } +tt_cxy_pair DispatchQueryManager::get_dispatch_core(uint8_t cq_id) const { + if (dispatch_cores_.empty()) { + for (auto cq = 0; cq < num_hw_cqs_; cq++) { + // Populate when queried. Statically allocating at + // the start of the process causes the dispatch core + // order to change, which leads to lower performance + // with ethernet dispatch. + dispatch_cores_.push_back(dispatch_core(cq)); + } + } + return dispatch_cores_[cq_id]; +} + DispatchQueryManager::DispatchQueryManager(uint8_t num_hw_cqs) { this->reset(num_hw_cqs); } } // namespace tt::tt_metal diff --git a/tt_metal/impl/dispatch/dispatch_query_manager.hpp b/tt_metal/impl/dispatch/dispatch_query_manager.hpp index e01cae1d068..9435871461f 100644 --- a/tt_metal/impl/dispatch/dispatch_query_manager.hpp +++ b/tt_metal/impl/dispatch/dispatch_query_manager.hpp @@ -31,6 +31,7 @@ class DispatchQueryManager { const DispatchCoreConfig& get_dispatch_core_config() const; const std::vector& get_logical_storage_cores(uint32_t device_id) const; const std::vector& get_logical_dispatch_cores(uint32_t device_id) const; + tt_cxy_pair get_dispatch_core(uint8_t cq_id) const; private: void reset(uint8_t num_hw_cqs); @@ -41,6 +42,9 @@ class DispatchQueryManager { NOC go_signal_noc_ = NOC::NOC_0; uint8_t num_hw_cqs_ = 0; DispatchCoreConfig dispatch_core_config_; + // Make this mutable, since this is JIT populated + // through a const instance when queried + mutable std::vector dispatch_cores_; }; } // namespace tt::tt_metal diff --git a/tt_metal/impl/dispatch/hardware_command_queue.cpp b/tt_metal/impl/dispatch/hardware_command_queue.cpp index ed24132819c..8a72db6e742 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.cpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.cpp @@ -399,7 +399,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { } void HWCommandQueue::enqueue_record_event( - const std::shared_ptr& event, bool clear_count, tt::stl::Span sub_device_ids) { + const std::shared_ptr& event, tt::stl::Span sub_device_ids) { ZoneScopedN("HWCommandQueue_enqueue_record_event"); TT_FATAL(!this->manager.get_bypass_mode(), "Enqueue Record Event cannot be used with tracing"); @@ -413,38 +413,22 @@ void HWCommandQueue::enqueue_record_event( event->ready = true; // what does this mean??? 
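The dispatch-core lookup added to DispatchQueryManager above is populated lazily through a mutable member from a const accessor, so the dispatch core order is fixed at first query rather than at process start (which, per the comment in the diff, degrades ethernet-dispatch performance). A generic sketch of that caching pattern, with illustrative names only:

#include <vector>

// Minimal stand-in for the query-time caching approach used by get_dispatch_core().
class LazyCoreCache {
public:
    explicit LazyCoreCache(int num_cqs) : num_cqs_(num_cqs) {}

    int core_for(int cq_id) const {
        if (cores_.empty()) {
            // Populate on first query so static-initialization order cannot change the result.
            for (int cq = 0; cq < num_cqs_; ++cq) {
                cores_.push_back(resolve_core(cq));
            }
        }
        return cores_[cq_id];
    }

private:
    static int resolve_core(int cq) { return 100 + cq; }  // stand-in for the real core lookup
    int num_cqs_ = 0;
    mutable std::vector<int> cores_;  // mutable: filled lazily from the const accessor
};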
sub_device_ids = buffer_dispatch::select_sub_device_ids(this->device_, sub_device_ids); - - auto command = EnqueueRecordEventCommand( - this->id_, - this->device_, - this->noc_index_, - this->manager, + event_dispatch::issue_record_event_commands( + device_, event->event_id, - this->expected_num_workers_completed, + id_, + device_->num_hw_cqs(), + this->manager, sub_device_ids, - clear_count, - true); - this->enqueue_command(command, false, sub_device_ids); - - if (clear_count) { - for (const auto& id : sub_device_ids) { - this->expected_num_workers_completed[id.to_index()] = 0; - } - } + this->expected_num_workers_completed); this->issued_completion_q_reads.push( std::make_shared(std::in_place_type, event->event_id)); this->increment_num_entries_in_completion_q(); } -void HWCommandQueue::enqueue_wait_for_event(const std::shared_ptr& sync_event, bool clear_count) { +void HWCommandQueue::enqueue_wait_for_event(const std::shared_ptr& sync_event) { ZoneScopedN("HWCommandQueue_enqueue_wait_for_event"); - - auto command = EnqueueWaitForEventCommand(this->id_, this->device_, this->manager, *sync_event, clear_count); - this->enqueue_command(command, false, {}); - - if (clear_count) { - this->manager.reset_event_id(this->id_); - } + event_dispatch::issue_wait_for_event_commands(id_, sync_event->cq_id, this->manager, sync_event->event_id); } void HWCommandQueue::enqueue_trace(const uint32_t trace_id, bool blocking) { @@ -528,29 +512,8 @@ void HWCommandQueue::read_completion_queue() { this->exit_condition); } else if constexpr (std::is_same_v) { ZoneScopedN("CompletionQueueReadEvent"); - uint32_t read_ptr = this->manager.get_completion_queue_read_ptr(this->id_); - thread_local static std::vector dispatch_cmd_and_event( - (sizeof(CQDispatchCmd) + DispatchSettings::EVENT_PADDED_SIZE) / sizeof(uint32_t)); - tt::Cluster::instance().read_sysmem( - dispatch_cmd_and_event.data(), - sizeof(CQDispatchCmd) + DispatchSettings::EVENT_PADDED_SIZE, - read_ptr, - mmio_device_id, - channel); - uint32_t event_completed = dispatch_cmd_and_event[sizeof(CQDispatchCmd) / sizeof(uint32_t)]; - - TT_ASSERT( - event_completed == read_descriptor.event_id, - "Event Order Issue: expected to read back completion signal for event {} but got {}!", - read_descriptor.event_id, - event_completed); - this->manager.completion_queue_pop_front(1, this->id_); - this->manager.set_last_completed_event(this->id_, read_descriptor.get_global_event_id()); - log_trace( - LogAlways, - "Completion queue popped event {} (global: {})", - event_completed, - read_descriptor.get_global_event_id()); + event_dispatch::read_events_from_completion_queue( + read_descriptor, mmio_device_id, channel, this->id_, this->manager); } }, read_descriptor); @@ -570,7 +533,7 @@ void HWCommandQueue::finish(tt::stl::Span sub_device_ids) { ZoneScopedN("HWCommandQueue_finish"); tt::log_debug(tt::LogDispatch, "Finish for command queue {}", this->id_); std::shared_ptr event = std::make_shared(); - this->enqueue_record_event(event, false, sub_device_ids); + this->enqueue_record_event(event, sub_device_ids); if (tt::llrt::RunTimeOptions::get_instance().get_test_mode_enabled()) { while (this->num_entries_in_completion_q > this->num_completed_completion_q_reads) { if (DPrintServerHangDetected()) { diff --git a/tt_metal/impl/dispatch/hardware_command_queue.hpp b/tt_metal/impl/dispatch/hardware_command_queue.hpp index b281934db54..eeb8c1b9fe8 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.hpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.hpp @@ -72,10 +72,8 @@ 
class HWCommandQueue : public CommandQueue { tt::stl::Span sub_device_ids = {}) override; void enqueue_record_event( - const std::shared_ptr& event, - bool clear_count = false, - tt::stl::Span sub_device_ids = {}) override; - void enqueue_wait_for_event(const std::shared_ptr& sync_event, bool clear_count = false) override; + const std::shared_ptr& event, tt::stl::Span sub_device_ids = {}) override; + void enqueue_wait_for_event(const std::shared_ptr& sync_event) override; void enqueue_write_buffer( const std::variant, std::shared_ptr>& buffer, diff --git a/tt_metal/impl/dispatch/host_runtime_commands.cpp b/tt_metal/impl/dispatch/host_runtime_commands.cpp index e1e0dfa8b5b..368bc663199 100644 --- a/tt_metal/impl/dispatch/host_runtime_commands.cpp +++ b/tt_metal/impl/dispatch/host_runtime_commands.cpp @@ -173,166 +173,6 @@ void EnqueueProgramCommand::process() { program.set_program_binary_status(device->id(), ProgramBinaryStatus::Committed); } -EnqueueRecordEventCommand::EnqueueRecordEventCommand( - uint32_t command_queue_id, - IDevice* device, - NOC noc_index, - SystemMemoryManager& manager, - uint32_t event_id, - tt::stl::Span expected_num_workers_completed, - tt::stl::Span sub_device_ids, - bool clear_count, - bool write_barrier) : - command_queue_id(command_queue_id), - device(device), - noc_index(noc_index), - manager(manager), - event_id(event_id), - expected_num_workers_completed(expected_num_workers_completed), - sub_device_ids(sub_device_ids), - clear_count(clear_count), - write_barrier(write_barrier) {} - -void EnqueueRecordEventCommand::process() { - std::vector event_payload(DispatchSettings::EVENT_PADDED_SIZE / sizeof(uint32_t), 0); - event_payload[0] = this->event_id; - - uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - uint8_t num_hw_cqs = - this->device->num_hw_cqs(); // Device initialize asserts that there can only be a maximum of 2 HW CQs - uint32_t packed_event_payload_sizeB = - align(sizeof(CQDispatchCmd) + num_hw_cqs * sizeof(CQDispatchWritePackedUnicastSubCmd), l1_alignment) + - (align(DispatchSettings::EVENT_PADDED_SIZE, l1_alignment) * num_hw_cqs); - uint32_t packed_write_sizeB = align(sizeof(CQPrefetchCmd) + packed_event_payload_sizeB, pcie_alignment); - uint32_t num_worker_counters = this->sub_device_ids.size(); - - uint32_t cmd_sequence_sizeB = - hal.get_alignment(HalMemType::HOST) * - num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT - packed_write_sizeB + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WRITE_PACKED + unicast subcmds + event - // payload - align( - sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd) + DispatchSettings::EVENT_PADDED_SIZE, - pcie_alignment); // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WRITE_LINEAR_HOST + event ID - - void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); - - HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); - - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->device->id()); - uint32_t dispatch_message_base_addr = - DispatchMemMap::get(dispatch_core_type) - .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); - - uint32_t last_index = num_worker_counters - 1; - // We only need the write barrier for the last wait cmd - for (uint32_t i = 0; i < last_index; ++i) { - auto offset_index = this->sub_device_ids[i].to_index(); - uint32_t dispatch_message_addr = - dispatch_message_base_addr 
+ - DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(offset_index); - command_sequence.add_dispatch_wait( - false, dispatch_message_addr, this->expected_num_workers_completed[offset_index], this->clear_count); - } - auto offset_index = this->sub_device_ids[last_index].to_index(); - uint32_t dispatch_message_addr = - dispatch_message_base_addr + - DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(offset_index); - command_sequence.add_dispatch_wait( - this->write_barrier, - dispatch_message_addr, - this->expected_num_workers_completed[offset_index], - this->clear_count); - - CoreType core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->device->id()); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id()); - std::vector unicast_sub_cmds(num_hw_cqs); - std::vector> event_payloads(num_hw_cqs); - - for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { - tt_cxy_pair dispatch_location; - if (device->is_mmio_capable()) { - dispatch_location = dispatch_core_manager::instance().dispatcher_core(this->device->id(), channel, cq_id); - } else { - dispatch_location = dispatch_core_manager::instance().dispatcher_d_core(this->device->id(), channel, cq_id); - } - - CoreCoord dispatch_virtual_core = this->device->virtual_core_from_logical_core(dispatch_location, core_type); - unicast_sub_cmds[cq_id] = CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, dispatch_virtual_core)}; - event_payloads[cq_id] = {event_payload.data(), event_payload.size() * sizeof(uint32_t)}; - } - - uint32_t completion_q0_last_event_addr = DispatchMemMap::get(core_type).get_device_command_queue_addr( - CommandQueueDeviceAddrType::COMPLETION_Q0_LAST_EVENT); - uint32_t completion_q1_last_event_addr = DispatchMemMap::get(core_type).get_device_command_queue_addr( - CommandQueueDeviceAddrType::COMPLETION_Q1_LAST_EVENT); - uint32_t address = this->command_queue_id == 0 ? completion_q0_last_event_addr : completion_q1_last_event_addr; - const uint32_t packed_write_max_unicast_sub_cmds = get_packed_write_max_unicast_sub_cmds(this->device); - command_sequence.add_dispatch_write_packed( - num_hw_cqs, - address, - DispatchSettings::EVENT_PADDED_SIZE, - packed_event_payload_sizeB, - unicast_sub_cmds, - event_payloads, - packed_write_max_unicast_sub_cmds); - - bool flush_prefetch = true; - command_sequence.add_dispatch_write_host( - flush_prefetch, DispatchSettings::EVENT_PADDED_SIZE, true, event_payload.data()); - - this->manager.issue_queue_push_back(cmd_sequence_sizeB, this->command_queue_id); - - this->manager.fetch_queue_reserve_back(this->command_queue_id); - this->manager.fetch_queue_write(cmd_sequence_sizeB, this->command_queue_id); -} - -EnqueueWaitForEventCommand::EnqueueWaitForEventCommand( - uint32_t command_queue_id, - IDevice* device, - SystemMemoryManager& manager, - const Event& sync_event, - bool clear_count) : - command_queue_id(command_queue_id), - device(device), - manager(manager), - sync_event(sync_event), - clear_count(clear_count) { - this->dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - // Should not be encountered under normal circumstances (record, wait) unless user is modifying sync event ID. - // TT_ASSERT(command_queue_id != sync_event.cq_id || event != sync_event.event_id, - // "EnqueueWaitForEventCommand cannot wait on it's own event id on the same CQ. 
Event ID: {} CQ ID: {}", - // event, command_queue_id); -} - -void EnqueueWaitForEventCommand::process() { - uint32_t cmd_sequence_sizeB = - hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT - - void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); - - HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); - uint32_t completion_q0_last_event_addr = - DispatchMemMap::get(this->dispatch_core_type) - .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q0_LAST_EVENT); - uint32_t completion_q1_last_event_addr = - DispatchMemMap::get(this->dispatch_core_type) - .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q1_LAST_EVENT); - - uint32_t last_completed_event_address = - sync_event.cq_id == 0 ? completion_q0_last_event_addr : completion_q1_last_event_addr; - - command_sequence.add_dispatch_wait(false, last_completed_event_address, sync_event.event_id, this->clear_count); - - this->manager.issue_queue_push_back(cmd_sequence_sizeB, this->command_queue_id); - - this->manager.fetch_queue_reserve_back(this->command_queue_id); - - this->manager.fetch_queue_write(cmd_sequence_sizeB, this->command_queue_id); -} - EnqueueTraceCommand::EnqueueTraceCommand( uint32_t command_queue_id, IDevice* device, @@ -584,7 +424,7 @@ void EnqueueProgram(CommandQueue& cq, Program& program, bool blocking) { void EnqueueRecordEvent( CommandQueue& cq, const std::shared_ptr& event, tt::stl::Span sub_device_ids) { detail::DispatchStateCheck(true); - cq.enqueue_record_event(event, false, sub_device_ids); + cq.enqueue_record_event(event, sub_device_ids); } void EnqueueWaitForEvent(CommandQueue& cq, const std::shared_ptr& event) { diff --git a/tt_metal/impl/dispatch/host_runtime_commands.hpp b/tt_metal/impl/dispatch/host_runtime_commands.hpp index 655a379deb1..6a62c3a2053 100644 --- a/tt_metal/impl/dispatch/host_runtime_commands.hpp +++ b/tt_metal/impl/dispatch/host_runtime_commands.hpp @@ -96,61 +96,6 @@ class EnqueueProgramCommand : public Command { constexpr bool has_side_effects() { return true; } }; -class EnqueueRecordEventCommand : public Command { -private: - uint32_t command_queue_id; - IDevice* device; - NOC noc_index; - SystemMemoryManager& manager; - uint32_t event_id; - tt::stl::Span expected_num_workers_completed; - tt::stl::Span sub_device_ids; - bool clear_count; - bool write_barrier; - -public: - EnqueueRecordEventCommand( - uint32_t command_queue_id, - IDevice* device, - NOC noc_index, - SystemMemoryManager& manager, - uint32_t event_id, - tt::stl::Span expected_num_workers_completed, - tt::stl::Span sub_device_ids, - bool clear_count = false, - bool write_barrier = true); - - void process(); - - EnqueueCommandType type() { return EnqueueCommandType::ENQUEUE_RECORD_EVENT; } - - constexpr bool has_side_effects() { return false; } -}; - -class EnqueueWaitForEventCommand : public Command { -private: - uint32_t command_queue_id; - IDevice* device; - SystemMemoryManager& manager; - const Event& sync_event; - CoreType dispatch_core_type; - bool clear_count; - -public: - EnqueueWaitForEventCommand( - uint32_t command_queue_id, - IDevice* device, - SystemMemoryManager& manager, - const Event& sync_event, - bool clear_count = false); - - void process(); - - EnqueueCommandType type() { return EnqueueCommandType::ENQUEUE_WAIT_FOR_EVENT; } - - constexpr bool has_side_effects() { return false; } -}; - class EnqueueTraceCommand : public Command { private: uint32_t command_queue_id; diff 
--git a/tt_metal/impl/event/dispatch.cpp b/tt_metal/impl/event/dispatch.cpp new file mode 100644 index 00000000000..36a62181c60 --- /dev/null +++ b/tt_metal/impl/event/dispatch.cpp @@ -0,0 +1,183 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/impl/event/dispatch.hpp" +#include +#include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" +#include + +namespace tt::tt_metal { + +namespace event_dispatch { + +namespace { +uint32_t get_packed_write_max_unicast_sub_cmds(IDevice* device) { + return device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y; +} +} // namespace + +void issue_record_event_commands( + IDevice* device, + uint32_t event_id, + uint8_t cq_id, + uint32_t num_command_queues, + SystemMemoryManager& manager, + tt::stl::Span sub_device_ids, + tt::stl::Span expected_num_workers_completed, + bool notify_host) { + std::vector event_payload(DispatchSettings::EVENT_PADDED_SIZE / sizeof(uint32_t), 0); + event_payload[0] = event_id; + + uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + uint32_t packed_event_payload_sizeB = + align(sizeof(CQDispatchCmd) + num_command_queues * sizeof(CQDispatchWritePackedUnicastSubCmd), l1_alignment) + + (align(DispatchSettings::EVENT_PADDED_SIZE, l1_alignment) * num_command_queues); + uint32_t packed_write_sizeB = align(sizeof(CQPrefetchCmd) + packed_event_payload_sizeB, pcie_alignment); + uint32_t num_worker_counters = sub_device_ids.size(); + + uint32_t cmd_sequence_sizeB = + hal.get_alignment(HalMemType::HOST) * + num_worker_counters + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + packed_write_sizeB + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WRITE_PACKED + + // unicast subcmds + event payload + align( + sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd) + DispatchSettings::EVENT_PADDED_SIZE, + pcie_alignment) * + notify_host; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WRITE_LINEAR_HOST + event ID ===> Write + // event notification back to host, if requested by user + + void* cmd_region = manager.issue_queue_reserve(cmd_sequence_sizeB, cq_id); + + HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); + + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + CoreType dispatch_core_type = dispatch_core_config.get_core_type(); + + uint32_t dispatch_message_base_addr = + DispatchMemMap::get(dispatch_core_type) + .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); + + uint32_t last_index = num_worker_counters - 1; + for (uint32_t i = 0; i < num_worker_counters; ++i) { + auto offset_index = sub_device_ids[i].to_index(); + uint32_t dispatch_message_addr = + dispatch_message_base_addr + + DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(offset_index); + // recording an event does not have any side-effects on the dispatch completion count + // hence clear_count is set to false, i.e. the number of workers on the dispatcher is + // not reset + // We only need the write barrier for the last wait cmd. 
+ command_sequence.add_dispatch_wait( + (i == num_worker_counters - 1), /* write_barrier ensures that all writes initiated by the dispatcher are + flushed before the event is recorded */ + dispatch_message_addr, + expected_num_workers_completed[offset_index], + false /* recording an event does not have any side-effects on the dispatch completion count */); + } + + std::vector unicast_sub_cmds(num_command_queues); + std::vector> event_payloads(num_command_queues); + + for (auto cq_id = 0; cq_id < num_command_queues; cq_id++) { + tt_cxy_pair dispatch_location = DispatchQueryManager::instance().get_dispatch_core(cq_id); + CoreCoord dispatch_virtual_core = device->virtual_core_from_logical_core(dispatch_location, dispatch_core_type); + unicast_sub_cmds[cq_id] = CQDispatchWritePackedUnicastSubCmd{ + .noc_xy_addr = device->get_noc_unicast_encoding(dispatch_downstream_noc, dispatch_virtual_core)}; + event_payloads[cq_id] = {event_payload.data(), event_payload.size() * sizeof(uint32_t)}; + } + + uint32_t completion_q0_last_event_addr = + DispatchMemMap::get(dispatch_core_type) + .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q0_LAST_EVENT); + uint32_t completion_q1_last_event_addr = + DispatchMemMap::get(dispatch_core_type) + .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q1_LAST_EVENT); + uint32_t address = cq_id == 0 ? completion_q0_last_event_addr : completion_q1_last_event_addr; + const uint32_t packed_write_max_unicast_sub_cmds = get_packed_write_max_unicast_sub_cmds(device); + command_sequence.add_dispatch_write_packed( + num_command_queues, + address, + DispatchSettings::EVENT_PADDED_SIZE, + packed_event_payload_sizeB, + unicast_sub_cmds, + event_payloads, + packed_write_max_unicast_sub_cmds); + + if (notify_host) { + bool flush_prefetch = true; + command_sequence.add_dispatch_write_host( + flush_prefetch, DispatchSettings::EVENT_PADDED_SIZE, true, event_payload.data()); + } + + manager.issue_queue_push_back(cmd_sequence_sizeB, cq_id); + + manager.fetch_queue_reserve_back(cq_id); + manager.fetch_queue_write(cmd_sequence_sizeB, cq_id); +} + +void issue_wait_for_event_commands( + uint8_t cq_id, uint8_t event_cq_id, SystemMemoryManager& sysmem_manager, uint32_t event_id) { + uint32_t cmd_sequence_sizeB = + hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT + + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + CoreType dispatch_core_type = dispatch_core_config.get_core_type(); + + void* cmd_region = sysmem_manager.issue_queue_reserve(cmd_sequence_sizeB, cq_id); + + HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); + uint32_t completion_q0_last_event_addr = + DispatchMemMap::get(dispatch_core_type) + .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q0_LAST_EVENT); + uint32_t completion_q1_last_event_addr = + DispatchMemMap::get(dispatch_core_type) + .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q1_LAST_EVENT); + + uint32_t last_completed_event_address = + event_cq_id == 0 ? 
completion_q0_last_event_addr : completion_q1_last_event_addr; + + command_sequence.add_dispatch_wait(false, last_completed_event_address, event_id, false); + + sysmem_manager.issue_queue_push_back(cmd_sequence_sizeB, cq_id); + + sysmem_manager.fetch_queue_reserve_back(cq_id); + + sysmem_manager.fetch_queue_write(cmd_sequence_sizeB, cq_id); +} + +void read_events_from_completion_queue( + ReadEventDescriptor& event_descriptor, + chip_id_t mmio_device_id, + uint16_t channel, + uint8_t cq_id, + SystemMemoryManager& sysmem_manager) { + uint32_t read_ptr = sysmem_manager.get_completion_queue_read_ptr(cq_id); + thread_local static std::vector dispatch_cmd_and_event( + (sizeof(CQDispatchCmd) + DispatchSettings::EVENT_PADDED_SIZE) / sizeof(uint32_t)); + tt::Cluster::instance().read_sysmem( + dispatch_cmd_and_event.data(), + sizeof(CQDispatchCmd) + DispatchSettings::EVENT_PADDED_SIZE, + read_ptr, + mmio_device_id, + channel); + uint32_t event_completed = dispatch_cmd_and_event[sizeof(CQDispatchCmd) / sizeof(uint32_t)]; + + TT_ASSERT( + event_completed == event_descriptor.event_id, + "Event Order Issue: expected to read back completion signal for event {} but got {}!", + event_descriptor.event_id, + event_completed); + sysmem_manager.completion_queue_pop_front(1, cq_id); + sysmem_manager.set_last_completed_event(cq_id, event_descriptor.get_global_event_id()); + log_trace( + LogAlways, + "Completion queue popped event {} (global: {})", + event_completed, + event_descriptor.get_global_event_id()); +} + +} // namespace event_dispatch + +} // namespace tt::tt_metal diff --git a/tt_metal/impl/event/dispatch.hpp b/tt_metal/impl/event/dispatch.hpp new file mode 100644 index 00000000000..461fd47018f --- /dev/null +++ b/tt_metal/impl/event/dispatch.hpp @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +namespace tt::tt_metal { + +// Used so host knows data in completion queue is just an event ID +struct ReadEventDescriptor { + uint32_t event_id; + uint32_t global_offset; + + explicit ReadEventDescriptor(uint32_t event) : event_id(event), global_offset(0) {} + + void set_global_offset(uint32_t offset) { global_offset = offset; } + uint32_t get_global_event_id() { return global_offset + event_id; } +}; + +namespace event_dispatch { + +void issue_record_event_commands( + IDevice* device, + uint32_t event_id, + uint8_t cq_id, + uint32_t num_command_queues, + SystemMemoryManager& manager, + tt::stl::Span sub_device_ids, + tt::stl::Span expected_num_workers_completed, + bool notify_host = true); + +void issue_wait_for_event_commands( + uint8_t cq_id, uint8_t event_cq_id, SystemMemoryManager& sysmem_manager, uint32_t event_id); + +void read_events_from_completion_queue( + ReadEventDescriptor& event_descriptor, + chip_id_t mmio_device_id, + uint16_t channel, + uint8_t cq_id, + SystemMemoryManager& sysmem_manager); + +} // namespace event_dispatch + +} // namespace tt::tt_metal diff --git a/tt_metal/impl/program/dispatch.cpp b/tt_metal/impl/program/dispatch.cpp index fcd9b76494d..67e9a1a2740 100644 --- a/tt_metal/impl/program/dispatch.cpp +++ b/tt_metal/impl/program/dispatch.cpp @@ -406,7 +406,8 @@ void insert_empty_program_dispatch_preamble_cmd(ProgramCommandSequence& program_ void insert_stall_cmds(ProgramCommandSequence& program_command_sequence, SubDeviceId sub_device_id, IDevice* device) { // Initialize stall command sequences for this program. 
- auto dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + auto dispatch_core_type = dispatch_core_config.get_core_type(); uint32_t dispatch_message_addr = DispatchMemMap::get(dispatch_core_type) .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE) + @@ -549,7 +550,8 @@ void assemble_runtime_args_commands( ProgramCommandSequence& program_command_sequence, Program& program, IDevice* device) { static const uint32_t packed_write_max_unicast_sub_cmds = get_packed_write_max_unicast_sub_cmds(device); NOC noc_index = dispatch_downstream_noc; - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + auto dispatch_core_type = dispatch_core_config.get_core_type(); const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); // Dispatch Commands to Unicast Unique Runtime Args to Workers @@ -812,7 +814,8 @@ void insert_write_packed_payloads( void assemble_device_commands( ProgramCommandSequence& program_command_sequence, Program& program, IDevice* device, SubDeviceId sub_device_id) { DeviceCommandCalculator calculator; - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + auto dispatch_core_type = dispatch_core_config.get_core_type(); NOC noc_index = dispatch_downstream_noc; const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size(); static const uint32_t packed_write_max_unicast_sub_cmds = get_packed_write_max_unicast_sub_cmds(device); From 14b5991c13592bc842bed8a59413f436228446d1 Mon Sep 17 00:00:00 2001 From: Debin Chen Date: Fri, 7 Feb 2025 10:49:57 -0800 Subject: [PATCH 016/316] #17128 Advanced programming example vecadd_multi_core (#17129) Advanced version of vecadd_multi_core compared to the old issue 16443. Changes: old: hardcoded to 4 cores. new: no hardcoding; work is split across the available cores automatically, as a real program would do. old: kernels take a core id as a runtime arg. new: a core id alone is not enough; each kernel now takes start_tile_id and num_tiles_per_core as runtime args, as a real kernel would. ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/17128) ### Problem description Advanced version of vecadd_multi_core compared to the old issue 16443. Changes: make the programming example closer to real-world behavior. ### What's changed old: hardcoded to 4 cores. new: no hardcoding; work is split across the available cores automatically, as a real program would do. old: kernels take a core id as a runtime arg. new: a core id alone is not enough; each kernel now takes start_tile_id and num_tiles_per_core as runtime args, as a real kernel would. 
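For reference, here is a minimal sketch of the new host-side work split. The helper names (`split_work_to_cores`, `grid_to_cores`, `SetRuntimeArgs`) follow the diff further down; includes, buffer/kernel creation, and the idle-core short-circuit used in the real example are omitted, so treat this as an illustration of the pattern rather than the exact source.

```cpp
// Sketch: distribute n_tiles over the compute grid and hand each core its own
// (num_tiles_per_core, start_tile_id) pair instead of a core id.
void assign_vecadd_runtime_args(
    Program& program,
    KernelHandle reader, KernelHandle writer, KernelHandle compute,
    const CoreCoord& grid_size, uint32_t n_tiles,
    uint32_t a_addr, uint32_t b_addr, uint32_t c_addr) {
    constexpr bool row_major = true;
    // The two core groups may come back with different tile counts when
    // n_tiles does not divide evenly across the grid.
    auto [num_cores, all_cores, core_group_1, core_group_2,
          num_tiles_per_core_group_1, num_tiles_per_core_group_2] =
        tt::tt_metal::split_work_to_cores(grid_size, n_tiles, row_major);

    auto cores = grid_to_cores(grid_size.x * grid_size.y, grid_size.x, grid_size.y, row_major);
    uint32_t start_tile_id = 0;
    for (const auto& core : cores) {
        uint32_t num_tiles_per_core = 0;  // cores outside both groups do no work
        if (core_group_1.contains(core)) {
            num_tiles_per_core = num_tiles_per_core_group_1;
        } else if (core_group_2.contains(core)) {
            num_tiles_per_core = num_tiles_per_core_group_2;
        }
        SetRuntimeArgs(program, reader, core, {a_addr, b_addr, num_tiles_per_core, start_tile_id});
        SetRuntimeArgs(program, writer, core, {c_addr, num_tiles_per_core, start_tile_id});
        SetRuntimeArgs(program, compute, core, {num_tiles_per_core, start_tile_id});
        start_tile_id += num_tiles_per_core;
    }
}
```

Passing an explicit tile range is what lets the two core groups returned by `split_work_to_cores` carry different tile counts; a bare core id cannot express an uneven split.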
### Checklist - [ ] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- tt_metal/programming_examples/CMakeLists.txt | 1 + .../kernels/add_multi_core.cpp | 17 ++-- .../interleaved_tile_read_multi_core.cpp | 8 +- .../kernels/tile_write_multi_core.cpp | 8 +- .../vecadd_multi_core/vecadd_multi_core.cpp | 85 ++++++++++++------- 5 files changed, 73 insertions(+), 46 deletions(-) diff --git a/tt_metal/programming_examples/CMakeLists.txt b/tt_metal/programming_examples/CMakeLists.txt index 5b0d988e663..7c2a70af0fa 100644 --- a/tt_metal/programming_examples/CMakeLists.txt +++ b/tt_metal/programming_examples/CMakeLists.txt @@ -14,6 +14,7 @@ set(PROGRAMMING_EXAMPLES_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/pad/pad_multi_core.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sharding/shard_data_rm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/vecadd_sharding/vecadd_sharding.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/vecadd_multi_core/vecadd_multi_core.cpp ) include(${PROJECT_SOURCE_DIR}/cmake/helper_functions.cmake) diff --git a/tt_metal/programming_examples/vecadd_multi_core/kernels/add_multi_core.cpp b/tt_metal/programming_examples/vecadd_multi_core/kernels/add_multi_core.cpp index d38a6d2e30a..debb8de4f83 100644 --- a/tt_metal/programming_examples/vecadd_multi_core/kernels/add_multi_core.cpp +++ b/tt_metal/programming_examples/vecadd_multi_core/kernels/add_multi_core.cpp @@ -11,7 +11,7 @@ namespace NAMESPACE { void MAIN { uint32_t n_tiles = get_arg_val(0); - uint32_t core_id = get_arg_val(1); // Add core ID argument + uint32_t start_tile_id = get_arg_val(1); // We are going to read from these two circular buffers constexpr auto cb_in0 = get_compile_time_arg_val(0); @@ -36,24 +36,27 @@ void MAIN { add_tiles_init(cb_in0, cb_in1); // Calculate the range of tiles this core should process - const uint32_t tiles_per_core = n_tiles; - const uint32_t start_tile = core_id * tiles_per_core; - const uint32_t end_tile = start_tile + tiles_per_core; + const uint32_t end_tile_id = start_tile_id + n_tiles; // Loop over the assigned tiles and perform the computation - for (uint32_t i = start_tile; i < end_tile; i++) { - // Make sure there is a valid register we can use. - acquire_dst(); + for (uint32_t i = start_tile_id; i < end_tile_id; i++) { // Wait until there is a tile in both input circular buffers cb_wait_front(cb_in0, 1); cb_wait_front(cb_in1, 1); + // Make sure there is a valid register we can use. 
+ tile_regs_acquire(); // Add the tiles from the input circular buffers and write the result to // the destination register add_tiles(cb_in0, cb_in1, 0, 0, dst_reg); + tile_regs_commit(); + // Make sure there is space in the output circular buffer cb_reserve_back(cb_out0, 1); + tile_regs_wait(); // Copy the result from adding the tiles to the output circular buffer pack_tile(dst_reg, cb_out0); + tile_regs_release(); + // Mark the output tile as ready and pop the input tiles cb_push_back(cb_out0, 1); cb_pop_front(cb_in0, 1); diff --git a/tt_metal/programming_examples/vecadd_multi_core/kernels/interleaved_tile_read_multi_core.cpp b/tt_metal/programming_examples/vecadd_multi_core/kernels/interleaved_tile_read_multi_core.cpp index 039b33b7a7d..f64fbf90823 100644 --- a/tt_metal/programming_examples/vecadd_multi_core/kernels/interleaved_tile_read_multi_core.cpp +++ b/tt_metal/programming_examples/vecadd_multi_core/kernels/interleaved_tile_read_multi_core.cpp @@ -11,7 +11,7 @@ void kernel_main() { uint32_t a_addr = get_arg_val(0); uint32_t b_addr = get_arg_val(1); uint32_t n_tiles = get_arg_val(2); - uint32_t core_id = get_arg_val(3); // Add core ID argument + uint32_t start_tile_id = get_arg_val(3); // The circular buffers to read the tiles into constexpr uint32_t cb_in0 = get_compile_time_arg_val(0); @@ -39,13 +39,11 @@ void kernel_main() { }; // Calculate the range of tiles this core should process - const uint32_t tiles_per_core = n_tiles; - const uint32_t start_tile = core_id * tiles_per_core; - const uint32_t end_tile = start_tile + tiles_per_core; + const uint32_t end_tile_id = start_tile_id + n_tiles; // Now we loop over the assigned tiles and read them into the circular // buffers - for (uint32_t i = start_tile; i < end_tile; i++) { + for (uint32_t i = start_tile_id; i < end_tile_id; i++) { // First we make sure there is space in the circular buffers cb_reserve_back(cb_in0, 1); cb_reserve_back( diff --git a/tt_metal/programming_examples/vecadd_multi_core/kernels/tile_write_multi_core.cpp b/tt_metal/programming_examples/vecadd_multi_core/kernels/tile_write_multi_core.cpp index b5599bb8baa..44565e321f4 100644 --- a/tt_metal/programming_examples/vecadd_multi_core/kernels/tile_write_multi_core.cpp +++ b/tt_metal/programming_examples/vecadd_multi_core/kernels/tile_write_multi_core.cpp @@ -7,7 +7,7 @@ void kernel_main() { uint32_t c_addr = get_arg_val(0); uint32_t n_tiles = get_arg_val(1); - uint32_t core_id = get_arg_val(2); // Add core ID argument + uint32_t start_tile_id = get_arg_val(2); // The circular buffer that we are going to read from and write to DRAM constexpr uint32_t cb_out0 = get_compile_time_arg_val(0); @@ -22,12 +22,10 @@ void kernel_main() { }; // Calculate the range of tiles this core should process - const uint32_t tiles_per_core = n_tiles; - const uint32_t start_tile = core_id * tiles_per_core; - const uint32_t end_tile = start_tile + tiles_per_core; + const uint32_t end_tile_id = start_tile_id + n_tiles; // Loop over the assigned tiles and write them to the output buffer - for (uint32_t i = start_tile; i < end_tile; i++) { + for (uint32_t i = start_tile_id; i < end_tile_id; i++) { // Make sure there is a tile in the circular buffer cb_wait_front(cb_out0, 1); uint32_t cb_out0_addr = get_read_ptr(cb_out0); diff --git a/tt_metal/programming_examples/vecadd_multi_core/vecadd_multi_core.cpp b/tt_metal/programming_examples/vecadd_multi_core/vecadd_multi_core.cpp index 8e44fc1295a..b8dca40a282 100644 --- a/tt_metal/programming_examples/vecadd_multi_core/vecadd_multi_core.cpp +++ 
b/tt_metal/programming_examples/vecadd_multi_core/vecadd_multi_core.cpp @@ -9,12 +9,15 @@ #include #include #include +#include + #include #include #include #include #include #include +#include using namespace tt; using namespace tt::tt_metal; @@ -104,21 +107,17 @@ int main(int argc, char** argv) { help(argv[0]); } } + // n_tiles is number of tiles of data for this programming example to add two vectors + const uint32_t n_tiles = 640; - IDevice* device = CreateDevice(device_id); - + auto* device = CreateDevice(device_id); Program program = CreateProgram(); - // Define 4 cores. - const uint32_t num_core = 4; - // designate 4 cores for utilization - cores (0,0), (0,1), (0,2), (0,3) - CoreCoord start_core = {0, 0}; - CoreCoord end_core = {0, 3}; - CoreRange cores(start_core, end_core); CommandQueue& cq = device->command_queue(); - const uint32_t n_tiles = 64; + const uint32_t tile_size = tt::constants::TILE_WIDTH * tt::constants::TILE_HEIGHT; - const uint32_t tiles_per_core = n_tiles / num_core; + std::vector tiles_per_core; + const uint32_t core_to_print = 4; // Create 3 buffers on DRAM. These will hold the input and output data. A // and B are the input buffers, C is the output buffer. @@ -130,10 +129,16 @@ int main(int argc, char** argv) { std::vector a_data = create_random_vector_of_bfloat16_native(tile_size * n_tiles * 2, 10, rng()); std::vector b_data = create_random_vector_of_bfloat16_native(tile_size * n_tiles * 2, 10, rng()); + auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + uint32_t num_cores_total = num_cores_x * num_cores_y; + auto all_device_cores = CoreRange({0, 0}, {num_cores_x - 1, num_cores_y - 1}); + const uint32_t cir_buf_num_title = 4; - CBHandle cb_a = MakeCircularBufferBFP16(program, cores, tt::CBIndex::c_0, cir_buf_num_title); - CBHandle cb_b = MakeCircularBufferBFP16(program, cores, tt::CBIndex::c_1, cir_buf_num_title); - CBHandle cb_c = MakeCircularBufferBFP16(program, cores, tt::CBIndex::c_2, cir_buf_num_title); + CBHandle cb_a = MakeCircularBufferBFP16(program, all_device_cores, tt::CBIndex::c_0, cir_buf_num_title); + CBHandle cb_b = MakeCircularBufferBFP16(program, all_device_cores, tt::CBIndex::c_1, cir_buf_num_title); + CBHandle cb_c = MakeCircularBufferBFP16(program, all_device_cores, tt::CBIndex::c_2, cir_buf_num_title); // A Tensix core is made up with 5 processors. 2 data movement processors, // and 3 compute processors. 
The 2 data movement processors act independent @@ -154,11 +159,12 @@ int main(int argc, char** argv) { std::vector writer_compile_time_args = {(std::uint32_t)tt::CBIndex::c_2}; std::vector compute_compile_time_args = { (std::uint32_t)tt::CBIndex::c_0, (std::uint32_t)tt::CBIndex::c_1, (std::uint32_t)tt::CBIndex::c_2}; + auto reader = CreateKernel( program, "tt_metal/programming_examples/vecadd_multi_core/kernels/" "interleaved_tile_read_multi_core.cpp", - cores, + all_device_cores, DataMovementConfig{ .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, @@ -167,7 +173,7 @@ int main(int argc, char** argv) { program, "tt_metal/programming_examples/vecadd_multi_core/kernels/" "tile_write_multi_core.cpp", - cores, + all_device_cores, DataMovementConfig{ .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, @@ -176,15 +182,36 @@ int main(int argc, char** argv) { program, "tt_metal/programming_examples/vecadd_multi_core/" "kernels/add_multi_core.cpp", - cores, + all_device_cores, ComputeConfig{.math_approx_mode = false, .compile_args = compute_compile_time_args, .defines = {}}); - for (int i = 0; i < num_core; ++i) { - // Set runtime arguments for each core. - CoreCoord core = {0, i}; - SetRuntimeArgs(program, reader, core, {a->address(), b->address(), tiles_per_core, i}); - SetRuntimeArgs(program, writer, core, {c->address(), tiles_per_core, i}); - SetRuntimeArgs(program, compute, core, {tiles_per_core, i}); + constexpr bool row_major = true; + auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = + tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, n_tiles, row_major); + + auto cores = grid_to_cores(num_cores_total, num_cores_x, num_cores_y, row_major); + for (uint32_t i = 0, start_tile_id = 0; i < num_cores_total; i++) { + const auto& core = cores[i]; + + uint32_t num_tiles_per_core; + + if (core_group_1.contains(core)) { + num_tiles_per_core = num_tiles_per_core_group_1; + } else if (core_group_2.contains(core)) { + num_tiles_per_core = num_tiles_per_core_group_2; + } else { + SetRuntimeArgs(program, reader, core, std::array{0}); + SetRuntimeArgs(program, writer, core, std::array{0}); + SetRuntimeArgs(program, compute, core, std::array{0}); + continue; + } + if (i < core_to_print) { + tiles_per_core.push_back(num_tiles_per_core); + } + SetRuntimeArgs(program, reader, core, {a->address(), b->address(), num_tiles_per_core, start_tile_id}); + SetRuntimeArgs(program, writer, core, {c->address(), num_tiles_per_core, start_tile_id}); + SetRuntimeArgs(program, compute, core, {num_tiles_per_core, start_tile_id}); + start_tile_id += num_tiles_per_core; } EnqueueWriteBuffer(cq, a, a_data, false); @@ -202,14 +229,14 @@ int main(int argc, char** argv) { // some error due to BFP16 precision) std::cout << "Partial results: (note we are running under BFP16. 
It's going " "to be less accurate)\n"; - size_t data_per_core = std::min((size_t)10, (size_t)tile_size * tiles_per_core); - - for (int core = 0; core < num_core; ++core) { - const auto core_offset = core * (tile_size + tiles_per_core); - for (int index = 0; index < data_per_core; index++) { + auto core_offset = 0; + for (int core_index = 0; core_index < std::min(core_to_print, num_cores_total); ++core_index) { + core_offset += core_index * tile_size * tiles_per_core[core_index]; + std::cout << "Core (0, " << core_index << "):\n"; + for (int index = 0; index < 10; index++) { const auto i = core_offset + index; - std::cout << " " << a_data[i].to_float() << " + " << b_data[i].to_float() << " = " << c_data[i].to_float() - << "\n"; + std::cout << "index " << i << " " << a_data[i].to_float() << " + " << b_data[i].to_float() << " = " + << c_data[i].to_float() << "\n"; } std::cout << std::endl; } From 14f9739ed89245fa47241ff057e8cf147b42c852 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Fri, 7 Feb 2025 10:55:31 -0800 Subject: [PATCH 017/316] Make QueueId a strong type (#17637) ### Ticket https://github.com/tenstorrent/tt-metal/issues/10605 ### Problem description We use uint8_t for command queue across the codebase. This is error prone. ### What's changed Changing to a strong type in TT-NN. Not **yet** changing in Metal, thats tbd. Currently QueueId type is defined in TT-NN inside `common/constants.hpp`, this is not great. I will take any advice on where best to place this type. This change should allow to evolve TT-NN infra to automatically add an overload w/o queue_id, which should further minimize # lines of code needed to define an operation. ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13188378352) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/runs/13205024345) - [ ] Device performance regression - not running, has a regression on main --- tests/tt_eager/ops/test_bcast_op.cpp | 10 +-- tests/tt_eager/ops/test_fold_op.cpp | 3 +- ...erisc_data_mover_loopback_with_workers.cpp | 10 +-- .../tensor/common_tensor_test_utils.cpp | 8 +- .../gtests/tensor/test_create_tensor.cpp | 2 +- .../unit_tests/gtests/test_async_runtime.cpp | 22 ++--- .../unit_tests/gtests/test_ccl_on_galaxy.cpp | 18 ++-- .../gtests/test_multi_cq_multi_dev.cpp | 20 ++--- .../gtests/test_multiprod_queue.cpp | 14 +-- .../ttml/ttnn_fixed/trivial_ttnn_ops.cpp | 2 +- tt_metal/tt_stl/strong_type.hpp | 1 + ttnn/CMakeLists.txt | 4 +- ttnn/cpp/pybind11/decorators.hpp | 5 +- ttnn/cpp/pybind11/device.cpp | 6 +- ttnn/cpp/pybind11/events.cpp | 10 ++- ttnn/cpp/pybind11/operations/__init__.hpp | 1 + ttnn/cpp/pybind11/operations/copy.hpp | 8 +- ttnn/cpp/pybind11/operations/core.hpp | 16 ++-- ttnn/cpp/pybind11/operations/creation.hpp | 7 +- ttnn/cpp/pybind11/pytensor.cpp | 8 +- ttnn/cpp/pybind11/types.cpp | 90 +++++++++++++++++++ ttnn/cpp/pybind11/types.hpp | 67 +------------- ttnn/cpp/ttnn/async_runtime.cpp | 8 +- ttnn/cpp/ttnn/async_runtime.hpp | 5 +- ttnn/cpp/ttnn/common/constants.hpp | 15 ---- ttnn/cpp/ttnn/common/queue_id.hpp | 27 ++++++ ttnn/cpp/ttnn/decorators.hpp | 4 +- ttnn/cpp/ttnn/device_operation.hpp | 8 +- ttnn/cpp/ttnn/events.cpp | 12 +-- ttnn/cpp/ttnn/events.hpp | 8 +- .../ccl/all_gather/device/all_gather_op.cpp | 2 +- .../ttnn/operations/conv/conv2d/conv2d.cpp | 4 +- .../ttnn/operations/conv/conv2d/conv2d.hpp | 4 +- .../operations/conv/conv2d/conv2d_pybind.cpp | 8 +- 
.../conv_transpose2d/conv_transpose2d.cpp | 4 +- .../conv_transpose2d/conv_transpose2d.hpp | 4 +- .../conv_transpose2d_pybind.cpp | 8 +- ttnn/cpp/ttnn/operations/copy.hpp | 8 +- ttnn/cpp/ttnn/operations/core/core.cpp | 20 ++--- ttnn/cpp/ttnn/operations/core/core.hpp | 14 +-- .../core/to_layout/to_layout_op.cpp | 2 +- ttnn/cpp/ttnn/operations/creation.hpp | 18 ++-- .../operations/data_movement/bcast/bcast.cpp | 4 +- .../operations/data_movement/bcast/bcast.hpp | 2 +- .../data_movement/bcast/bcast_pybind.cpp | 4 +- .../data_movement/common/common.cpp | 2 +- .../data_movement/common/common.hpp | 4 +- .../data_movement/concat/concat.cpp | 10 +-- .../data_movement/concat/concat.hpp | 2 +- .../data_movement/concat/concat_pybind.hpp | 4 +- .../operations/data_movement/copy/copy.cpp | 8 +- .../operations/data_movement/copy/copy.hpp | 6 +- .../data_movement/copy/copy_pybind.cpp | 12 +-- .../data_movement/expand/expand.cpp | 7 +- .../data_movement/expand/expand.hpp | 2 +- .../data_movement/expand/expand_pybind.cpp | 4 +- .../data_movement/fill_pad/fill_pad.cpp | 4 +- .../data_movement/fill_pad/fill_pad.hpp | 2 +- .../fill_pad/fill_pad_pybind.cpp | 4 +- .../data_movement/fill_rm/fill_rm.cpp | 6 +- .../data_movement/fill_rm/fill_rm.hpp | 4 +- .../data_movement/fill_rm/fill_rm_pybind.cpp | 8 +- .../operations/data_movement/fold/fold.cpp | 8 +- .../operations/data_movement/fold/fold.hpp | 2 +- .../data_movement/fold/fold_pybind.cpp | 4 +- .../indexed_fill/indexed_fill.cpp | 4 +- .../indexed_fill/indexed_fill.hpp | 2 +- .../indexed_fill/indexed_fill_pybind.cpp | 4 +- .../operations/data_movement/move/move.cpp | 8 +- .../operations/data_movement/move/move.hpp | 2 +- .../data_movement/move/move_pybind.cpp | 4 +- .../non_zero_indices/non_zero_indices.cpp | 4 +- .../non_zero_indices/non_zero_indices.hpp | 2 +- .../non_zero_indices_pybind.cpp | 4 +- .../ttnn/operations/data_movement/pad/pad.cpp | 10 +-- .../ttnn/operations/data_movement/pad/pad.hpp | 5 +- .../data_movement/pad/pad_pybind.hpp | 36 ++++---- .../data_movement/permute/permute.cpp | 4 +- .../data_movement/permute/permute.hpp | 2 +- .../data_movement/permute/permute_pybind.cpp | 4 +- .../data_movement/repeat/repeat.cpp | 8 +- .../data_movement/repeat/repeat.hpp | 2 +- .../data_movement/repeat/repeat_pybind.cpp | 4 +- .../reshape_on_device/reshape.cpp | 8 +- .../reshape_on_device/reshape.hpp | 6 +- .../reshape_on_device/reshape_pybind.cpp | 4 +- .../data_movement/reshape_view/reshape.cpp | 29 +++--- .../data_movement/reshape_view/reshape.hpp | 16 ++-- .../reshape_view/reshape_pybind.cpp | 12 +-- .../interleaved_to_sharded.cpp | 6 +- .../interleaved_to_sharded.hpp | 4 +- .../interleaved_to_sharded_pybind.cpp | 8 +- .../data_movement/sharded/reshard/reshard.cpp | 2 +- .../data_movement/sharded/reshard/reshard.hpp | 2 +- .../sharded/reshard/reshard_pybind.cpp | 4 +- .../sharded_to_interleaved.cpp | 4 +- .../sharded_to_interleaved.hpp | 2 +- .../sharded_to_interleaved_pybind.cpp | 4 +- .../interleaved_to_sharded_partial.cpp | 4 +- .../interleaved_to_sharded_partial.hpp | 2 +- .../interleaved_to_sharded_partial_pybind.cpp | 4 +- .../sharded_to_interleaved_partial.cpp | 4 +- .../sharded_to_interleaved_partial.hpp | 2 +- .../sharded_to_interleaved_partial_pybind.cpp | 4 +- .../operations/data_movement/slice/slice.cpp | 15 ++-- .../operations/data_movement/slice/slice.hpp | 6 +- .../data_movement/slice/slice_pybind.hpp | 8 +- .../operations/data_movement/split/split.cpp | 4 +- .../operations/data_movement/split/split.hpp | 2 +- 
.../data_movement/split/split_pybind.hpp | 4 +- .../data_movement/tilize/tilize.cpp | 4 +- .../data_movement/tilize/tilize.hpp | 2 +- .../data_movement/tilize/tilize_pybind.hpp | 4 +- .../tilize_with_val_padding.cpp | 8 +- .../tilize_with_val_padding.hpp | 8 +- .../tilize_with_val_padding_pybind.hpp | 8 +- .../data_movement/transpose/transpose.cpp | 4 +- .../data_movement/transpose/transpose.hpp | 2 +- .../transpose/transpose_pybind.cpp | 4 +- .../data_movement/untilize/untilize.cpp | 4 +- .../data_movement/untilize/untilize.hpp | 2 +- .../untilize/untilize_pybind.hpp | 4 +- .../untilize_with_halo_v2.cpp | 4 +- .../untilize_with_halo_v2.hpp | 2 +- .../untilize_with_halo_v2_pybind.hpp | 4 +- .../untilize_with_unpadding.cpp | 4 +- .../untilize_with_unpadding.hpp | 2 +- .../untilize_with_unpadding_pybind.hpp | 4 +- .../ttnn/operations/eltwise/binary/binary.cpp | 16 ++-- .../ttnn/operations/eltwise/binary/binary.hpp | 12 +-- .../eltwise/binary/binary_composite.hpp | 42 ++++----- .../eltwise/binary/binary_pybind.hpp | 38 ++++---- .../binary/device/binary_composite_op.cpp | 48 +++++----- .../binary/device/binary_device_operation.hpp | 2 +- .../binary_backward/binary_backward.cpp | 32 +++---- .../binary_backward/binary_backward.hpp | 34 +++---- .../binary_backward_pybind.hpp | 24 ++--- .../eltwise/binary_ng/binary_ng.cpp | 12 +-- .../eltwise/binary_ng/binary_ng.hpp | 12 +-- .../eltwise/binary_ng/binary_ng_pybind.cpp | 24 ++--- .../eltwise/ternary/ternary_pybind.hpp | 16 ++-- .../ttnn/operations/eltwise/ternary/where.cpp | 12 +-- .../ttnn/operations/eltwise/ternary/where.hpp | 10 +-- .../ternary_backward/ternary_backward.cpp | 2 +- .../ternary_backward/ternary_backward.hpp | 2 +- .../ternary_backward_pybind.hpp | 4 +- .../unary/device/unary_composite_op.cpp | 8 +- .../ttnn/operations/eltwise/unary/unary.cpp | 36 ++++---- .../ttnn/operations/eltwise/unary/unary.hpp | 34 +++---- .../eltwise/unary/unary_composite.hpp | 4 +- .../operations/eltwise/unary/unary_pybind.hpp | 42 ++++----- .../eltwise/unary_backward/unary_backward.cpp | 58 ++++++++---- .../eltwise/unary_backward/unary_backward.hpp | 18 ++-- .../unary_backward/unary_backward_pybind.hpp | 12 +-- .../ttnn/operations/embedding/embedding.cpp | 4 +- .../ttnn/operations/embedding/embedding.hpp | 2 +- .../operations/embedding/embedding_pybind.hpp | 4 +- .../embedding_backward/embedding_backward.cpp | 2 +- .../embedding_backward/embedding_backward.hpp | 2 +- .../embedding_backward_pybind.cpp | 4 +- .../experimental/auto_format/auto_format.cpp | 4 +- .../cnn/convert_to_chw/convert_to_chw.cpp | 4 +- .../cnn/convert_to_chw/convert_to_chw.hpp | 5 +- .../convert_to_chw/convert_to_chw_pybind.cpp | 4 +- .../experimental/copy/typecast/typecast.cpp | 4 +- .../experimental/copy/typecast/typecast.hpp | 2 +- .../copy/typecast/typecast_pybind.cpp | 4 +- .../matmul/attn_matmul/attn_matmul.cpp | 4 +- .../matmul/attn_matmul/attn_matmul.hpp | 4 +- .../matmul/attn_matmul/attn_matmul_pybind.cpp | 8 +- .../device/attn_matmul_device_operation.hpp | 2 +- .../group_attn_matmul_device_operation.hpp | 2 +- .../group_attn_matmul/group_attn_matmul.cpp | 2 +- .../group_attn_matmul/group_attn_matmul.hpp | 2 +- .../group_attn_matmul_pybind.cpp | 4 +- .../plusone/device/plusone_op.hpp | 2 +- .../experimental/plusone/plusone.cpp | 2 +- .../experimental/plusone/plusone.hpp | 2 +- .../fast_reduce_nc_device_operation.cpp | 4 +- .../fast_reduce_nc_device_operation.hpp | 4 +- .../fast_reduce_nc/fast_reduce_nc.cpp | 2 +- .../fast_reduce_nc/fast_reduce_nc.hpp | 2 +- 
.../fast_reduce_nc/fast_reduce_nc_pybind.cpp | 4 +- .../operations/experimental/reshape/view.cpp | 2 +- .../device/hc_sum_reduce_program_factory.cpp | 2 +- .../ssm/hc_sum_reduce/hc_sum_reduce.cpp | 4 +- .../ssm/hc_sum_reduce/hc_sum_reduce.hpp | 2 +- .../hc_sum_reduce/hc_sum_reduce_pybind.cpp | 4 +- .../ssm/prefix_scan/prefix_scan.cpp | 4 +- .../ssm/prefix_scan/prefix_scan.hpp | 2 +- .../ssm/prefix_scan/prefix_scan_pybind.cpp | 4 +- ...interleave_eltwise_mul_program_factory.cpp | 2 +- .../repeat_and_interleave_eltwise_mul.cpp | 4 +- .../repeat_and_interleave_eltwise_mul.hpp | 2 +- ...peat_and_interleave_eltwise_mul_pybind.cpp | 4 +- .../concatenate_heads/concatenate_heads.hpp | 2 +- .../concatenate_heads_pybind.hpp | 4 +- .../concatenate_heads_device_operation.hpp | 2 +- .../create_qkv_heads/create_qkv_heads.cpp | 4 +- .../create_qkv_heads/create_qkv_heads.hpp | 2 +- .../create_qkv_heads_pybind.cpp | 4 +- ...create_qkv_heads_from_separate_tensors.cpp | 4 +- ...create_qkv_heads_from_separate_tensors.hpp | 2 +- ...qkv_heads_from_separate_tensors_pybind.cpp | 4 +- .../nlp_concat_heads_device_operation.hpp | 2 +- .../nlp_concat_heads/nlp_concat_heads.cpp | 2 +- .../nlp_concat_heads/nlp_concat_heads.hpp | 2 +- .../nlp_concat_heads_pybind.cpp | 4 +- ...p_concat_heads_decode_device_operation.hpp | 2 +- .../nlp_concat_heads_decode.cpp | 2 +- .../nlp_concat_heads_decode.hpp | 2 +- .../nlp_concat_heads_decode_pybind.cpp | 4 +- .../nlp_create_qkv_heads_device_operation.hpp | 2 +- .../nlp_create_qkv_heads.cpp | 2 +- .../nlp_create_qkv_heads.hpp | 2 +- .../nlp_create_qkv_heads_pybind.cpp | 4 +- .../nlp_create_qkv_heads_decode.cpp | 4 +- .../nlp_create_qkv_heads_decode.hpp | 2 +- .../nlp_create_qkv_heads_decode_pybind.cpp | 4 +- ...te_qkv_heads_falcon7b_device_operation.hpp | 2 +- .../nlp_create_qkv_heads_falcon7b.cpp | 2 +- .../nlp_create_qkv_heads_falcon7b.hpp | 2 +- .../nlp_create_qkv_heads_falcon7b_pybind.cpp | 4 +- ...e_qkv_heads_segformer_device_operation.hpp | 2 +- .../nlp_create_qkv_heads_segformer.cpp | 2 +- .../nlp_create_qkv_heads_segformer.hpp | 2 +- .../nlp_create_qkv_heads_segformer_pybind.cpp | 4 +- ..._create_qkv_heads_vit_device_operation.hpp | 2 +- .../nlp_create_qkv_heads_vit.cpp | 2 +- .../nlp_create_qkv_heads_vit.hpp | 2 +- .../nlp_create_qkv_heads_vit_pybind.cpp | 4 +- ...p_kv_cache_load_slice_device_operation.hpp | 2 +- .../nlp_kv_cache_load_slice.cpp | 2 +- .../nlp_kv_cache_load_slice.hpp | 2 +- .../nlp_kv_cache_load_slice_pybind.cpp | 4 +- ...value_and_split_heads_device_operation.hpp | 2 +- .../split_query_key_value_and_split_heads.hpp | 2 +- ...query_key_value_and_split_heads_pybind.hpp | 4 +- .../operations/kv_cache/kv_cache_pybind.cpp | 2 +- ttnn/cpp/ttnn/operations/loss/loss.cpp | 6 +- ttnn/cpp/ttnn/operations/loss/loss.hpp | 6 +- ttnn/cpp/ttnn/operations/loss/loss_pybind.cpp | 8 +- .../operations/matmul/device/matmul_op.cpp | 2 +- .../operations/matmul/device/matmul_op.hpp | 2 +- ttnn/cpp/ttnn/operations/matmul/matmul.cpp | 4 +- .../operations/pool/generic/generic_pools.cpp | 2 +- .../operations/pool/generic/generic_pools.hpp | 2 +- .../pool/generic/generic_pools_pybind.cpp | 4 +- .../operations/reduction/argmax/argmax.cpp | 2 +- .../operations/reduction/argmax/argmax.hpp | 2 +- .../reduction/argmax/argmax_pybind.hpp | 4 +- .../reduction/argmax/device/argmax_op.hpp | 2 +- .../cpp/ttnn/operations/reduction/moe/moe.cpp | 2 +- .../cpp/ttnn/operations/reduction/moe/moe.hpp | 2 +- .../operations/reduction/moe/moe_pybind.hpp | 4 +- .../ttnn/operations/reduction/prod/prod.cpp | 2 
+- .../reduction/sampling/device/sampling_op.hpp | 2 +- .../reduction/sampling/sampling.cpp | 2 +- .../reduction/sampling/sampling.hpp | 2 +- .../reduction/sampling/sampling_pybind.cpp | 4 +- .../ttnn/operations/reduction/topk/topk.hpp | 4 +- .../operations/reduction/topk/topk_pybind.hpp | 4 +- .../operations/sliding_window/halo/halo.cpp | 2 +- .../operations/sliding_window/halo/halo.hpp | 2 +- .../ttnn/operations/transformer/sdpa/sdpa.cpp | 8 +- .../ttnn/operations/transformer/sdpa/sdpa.hpp | 6 +- .../transformer/sdpa/sdpa_pybind.cpp | 12 +-- .../transformer/sdpa_decode/sdpa_decode.cpp | 6 +- .../transformer/sdpa_decode/sdpa_decode.hpp | 4 +- .../sdpa_decode/sdpa_decode_pybind.cpp | 8 +- ttnn/cpp/ttnn/run_operation.cpp | 16 ++-- ttnn/cpp/ttnn/run_operation.hpp | 17 ++-- ttnn/cpp/ttnn/tensor/tensor.cpp | 14 +-- ttnn/cpp/ttnn/tensor/tensor.hpp | 12 +-- ttnn/cpp/ttnn/tensor/tensor_impl.cpp | 46 +++++----- ttnn/cpp/ttnn/tensor/tensor_impl.hpp | 18 ++-- ttnn/cpp/ttnn/tensor/tensor_ops.cpp | 6 +- ttnn/cpp/ttnn/tensor/tensor_ops.hpp | 7 +- ttnn/cpp/ttnn/types.hpp | 1 + 279 files changed, 1063 insertions(+), 980 deletions(-) create mode 100644 ttnn/cpp/pybind11/types.cpp create mode 100644 ttnn/cpp/ttnn/common/queue_id.hpp diff --git a/tests/tt_eager/ops/test_bcast_op.cpp b/tests/tt_eager/ops/test_bcast_op.cpp index 8913161cd05..f96d738337f 100644 --- a/tests/tt_eager/ops/test_bcast_op.cpp +++ b/tests/tt_eager/ops/test_bcast_op.cpp @@ -54,7 +54,7 @@ int main(int argc, char** argv) { ttnn::Shape({1, 1, TILE_HEIGHT, TILE_WIDTH}), DataType::BFLOAT16, Layout::TILE, *device); for (auto bcast_math : magic_enum::enum_values()) { - Tensor c = ttnn::bcast(0, a, b, bcast_math, bcast_dim); + Tensor c = ttnn::bcast(ttnn::DefaultQueueId, a, b, bcast_math, bcast_dim); Tensor d = c.cpu(); //////////////////////////////////////////////////////////////////////////// @@ -69,28 +69,28 @@ int main(int argc, char** argv) { { Tensor a = ttnn::random::random(Shape({1, 1, 32, 4544})).to_layout(Layout::TILE).to_device(device); Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 4544}), DataType::BFLOAT16, Layout::TILE, *device); - Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::H); + Tensor c = ttnn::bcast(ttnn::DefaultQueueId, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::H); Tensor d = c.cpu(); } { Tensor a = ttnn::random::random(Shape({1, 1, 32, 4544})).to_layout(Layout::TILE).to_device(device); Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 4544}), DataType::BFLOAT16, Layout::TILE, *device); - Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::ADD, ttnn::BcastOpDim::H); + Tensor c = ttnn::bcast(ttnn::DefaultQueueId, a, b, ttnn::BcastOpMath::ADD, ttnn::BcastOpDim::H); Tensor d = c.cpu(); } { Tensor a = ttnn::random::random(Shape({1, 71, 32, 32})).to_layout(Layout::TILE).to_device(device); Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 32}), DataType::BFLOAT16, Layout::TILE, *device); - Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::HW); + Tensor c = ttnn::bcast(ttnn::DefaultQueueId, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::HW); Tensor d = c.cpu(); } { Tensor a = ttnn::random::random(Shape({1, 71, 32, 64})).to_layout(Layout::TILE).to_device(device); Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 32}), DataType::BFLOAT16, Layout::TILE, *device); - Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::HW); + Tensor c = ttnn::bcast(ttnn::DefaultQueueId, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::HW); Tensor d = c.cpu(); } }; diff --git 
a/tests/tt_eager/ops/test_fold_op.cpp b/tests/tt_eager/ops/test_fold_op.cpp index 0d8129a2155..fec37c1a120 100644 --- a/tests/tt_eager/ops/test_fold_op.cpp +++ b/tests/tt_eager/ops/test_fold_op.cpp @@ -19,8 +19,7 @@ void run_fold(IDevice* device, const ttnn::Shape& shape) { Tensor input_tensor = ttnn::random::random(shape).to_layout(Layout::ROW_MAJOR).to_device(device); uint32_t stride_h = 2; uint32_t stride_w = 2; - uint8_t queue_id = 0; - Tensor device_output_tensor = ttnn::fold(queue_id, input_tensor, stride_h, stride_w); + Tensor device_output_tensor = ttnn::fold(ttnn::DefaultQueueId, input_tensor, stride_h, stride_w); Tensor output_tensor = device_output_tensor.cpu(); } diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index 78cf7ebcab3..ee3a644e06e 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -12,7 +12,7 @@ #include "tt-metalium/kernel_types.hpp" #include "tt_metal/test_utils/df/df.hpp" #include "tt_metal/test_utils/env_vars.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp" #include "ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp" #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp" @@ -1471,7 +1471,7 @@ bool TestMultiInputReaderKernel( log_info(tt::LogTest, "Finished"); for (auto d : devices) { - tt_metal::Synchronize(d, ttnn::DefaultQueueId); + tt_metal::Synchronize(d, *ttnn::DefaultQueueId); } } return pass; @@ -2826,7 +2826,7 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) { log_info(tt::LogTest, "Waiting for teardown completion"); for (auto d : devices) { - tt_metal::Synchronize(d, ttnn::DefaultQueueId); + tt_metal::Synchronize(d, *ttnn::DefaultQueueId); } log_info(tt::LogTest, "Finished"); } @@ -2930,7 +2930,7 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li log_info(tt::LogTest, "Waiting for teardown completion"); for (auto d : devices) { - tt_metal::Synchronize(d, ttnn::DefaultQueueId); + tt_metal::Synchronize(d, *ttnn::DefaultQueueId); } log_info(tt::LogTest, "Finished"); } @@ -3213,7 +3213,7 @@ void RunWriteThroughputStabilityTestWithPersistentFabric( log_info(tt::LogTest, "Waiting for teardown completion"); for (IDevice* d : devices) { - tt_metal::Synchronize(d, ttnn::DefaultQueueId); + tt_metal::Synchronize(d, *ttnn::DefaultQueueId); } for (size_t i = 0; i < programs.size(); i++) { auto d = worker_devices[i]; diff --git a/tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.cpp b/tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.cpp index 962e47ace39..d338afe5125 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.cpp @@ -13,7 +13,7 @@ namespace test_utils { void test_tensor_on_device(const ttnn::Shape& input_shape, const TensorLayout& layout, tt::tt_metal::IDevice* device) { using namespace tt::tt_metal; - const uint32_t io_cq = 0; + const ttnn::QueueId io_cq = ttnn::DefaultQueueId; const auto input_buf_size_bytes = layout.compute_packed_buffer_size_bytes(input_shape); const auto host_buffer_datum_size_bytes = sizeof(uint32_t); @@ -28,13 +28,13 @@ void test_tensor_on_device(const ttnn::Shape& 
input_shape, const TensorLayout& l } auto tensor = tt::tt_metal::create_device_tensor(TensorSpec(input_shape, layout), device); - ttnn::queue_synchronize(device->command_queue(io_cq)); + ttnn::queue_synchronize(device->command_queue(*io_cq)); ttnn::write_buffer(io_cq, tensor, {host_data}); - ttnn::queue_synchronize(device->command_queue(io_cq)); + ttnn::queue_synchronize(device->command_queue(*io_cq)); ttnn::read_buffer(io_cq, tensor, {readback_data}); - ttnn::queue_synchronize(device->command_queue(io_cq)); + ttnn::queue_synchronize(device->command_queue(*io_cq)); for (int i = 0; i < input_buf_size; i++) { EXPECT_EQ(host_data[i], readback_data[i]); diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp index 26b8fcedb57..297e9816605 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp @@ -24,7 +24,7 @@ void run_create_tensor_test(tt::tt_metal::IDevice* device, const ttnn::Shape& in .buffer_type = BufferType::DRAM, .shard_spec = std::nullopt}; - const uint32_t io_cq = 0; + const ttnn::QueueId io_cq = ttnn::DefaultQueueId; constexpr DataType dtype = DataType::BFLOAT16; constexpr uint32_t datum_size_bytes = 2; diff --git a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp index d2353dbd574..5cf8b13da82 100644 --- a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp +++ b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp @@ -29,8 +29,8 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { uint32_t input_buf_size_datums = 1024 * 1024; uint32_t output_buf_size_datums = 1024 * 32; uint32_t datum_size_bytes = 2; - uint32_t io_cq = 1; // Data reads and writes done through CQ0 - uint32_t workload_dispatch_cq = 0; // Workload dispatched through CQ1 + ttnn::QueueId io_cq = ttnn::QueueId(1); // Data reads and writes done through CQ0 + ttnn::QueueId workload_dispatch_cq = ttnn::QueueId(0); // Workload dispatched through CQ1 ttnn::Shape input_shape({1, 1, 1024, 1024}); auto host_data = std::shared_ptr(new bfloat16[input_buf_size_datums]); @@ -71,14 +71,14 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { // Populate input_tensor with data ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Record the completion of the write event - ttnn::record_event(device_->command_queue(io_cq), write_event); + ttnn::record_event(device_->command_queue(*io_cq), write_event); // Host stalls until write is completed, before sending workload ttnn::event_synchronize(write_event); EXPECT_EQ(ttnn::event_query(write_event), true); // Dispatch workload. Preallocated output_tensor is populated by op/ ttnn::moreh_sum(input_tensor, /*dim*/ 3, false, output_tensor, std::nullopt, std::nullopt); // Record completion of workload - ttnn::record_event(device_->command_queue(workload_dispatch_cq), workload_event); + ttnn::record_event(device_->command_queue(*workload_dispatch_cq), workload_event); ttnn::event_synchronize(workload_event); EXPECT_EQ(ttnn::event_query(workload_event), true); // Read output back, once workload is complete @@ -93,7 +93,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { // Deallocate tensors (tensor gives up buffer). Done asynchronously, so sync on queue after. 
input_tensor.deallocate(); output_tensor.deallocate(); - ttnn::queue_synchronize(device_->command_queue(io_cq)); + ttnn::queue_synchronize(device_->command_queue(*io_cq)); // Buffer only has 2 owners in main thread. EXPECT_EQ(input_buffer.use_count(), 2); EXPECT_EQ(output_buffer.use_count(), 2); @@ -112,8 +112,8 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) { uint32_t buf_size_datums = 1024 * 1024; uint32_t datum_size_bytes = 2; std::vector inputs = {4, 9, 16, 25, 36, 64}; - uint32_t io_cq = 1; - uint32_t workload_dispatch_cq = 0; + ttnn::QueueId io_cq = ttnn::QueueId(1); + ttnn::QueueId workload_dispatch_cq = ttnn::QueueId(0); ttnn::Shape shape{1, 1, 1024, 1024}; auto host_data = std::shared_ptr(new bfloat16[buf_size_datums]); @@ -134,9 +134,9 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) { auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Write using cq 1 - ttnn::record_event(device_->command_queue(io_cq), write_event); // Record write on cq 1 + ttnn::record_event(device_->command_queue(*io_cq), write_event); // Record write on cq 1 // Wait until cq 1 write is complete - ttnn::wait_for_event(device_->command_queue(workload_dispatch_cq), write_event); + ttnn::wait_for_event(device_->command_queue(*workload_dispatch_cq), write_event); // Run operation on cq 0 Tensor output_tensor = ttnn::sqrt(workload_dispatch_cq, input_tensor); @@ -147,9 +147,9 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) { auto dummy_buffer_1 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device_, TensorSpec(shape, tensor_layout)); // Record cq 0 prog execution - ttnn::record_event(device_->command_queue(workload_dispatch_cq), workload_event); + ttnn::record_event(device_->command_queue(*workload_dispatch_cq), workload_event); // Wait until cq 0 prog execution is done - ttnn::wait_for_event(device_->command_queue(io_cq), workload_event); + ttnn::wait_for_event(device_->command_queue(*io_cq), workload_event); // Read using cq 1 ttnn::read_buffer(io_cq, output_tensor, {readback_data}); for (int i = 0; i < buf_size_datums; i++) { diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp index fd9bc559b03..8d5f455a4d2 100644 --- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp +++ b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp @@ -23,7 +23,7 @@ using namespace tt_metal; namespace async_detail { template std::vector run_operation( - uint8_t cq_id, + QueueId cq_id, OpConfig devop, const operation::Tensors& input_tensors, const operation::OptionalConstTensors& optional_input_tensors = {}, @@ -155,7 +155,7 @@ TEST(GalaxyTests, TestAllGatherDeadlock) { auto input_storage = DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); // Push inputs. - ttnn::write_buffer(0, input_tensor, {host_data}); + ttnn::write_buffer(ttnn::DefaultQueueId, input_tensor, {host_data}); // Configure CCL running on this device. uint32_t receiver_device_id = device_ids[(dev_idx) + 1 % num_devices_in_row]; uint32_t sender_device_id = device_ids[(dev_idx + num_devices_in_row - 1) % num_devices_in_row]; @@ -171,13 +171,13 @@ TEST(GalaxyTests, TestAllGatherDeadlock) { input_tensor.memory_config(), ttnn::ccl::Topology::Linear}; // Send CCL to this device. 
All CCLs will complete simultaneously. - output_tensors.push_back(async_detail::run_operation(0, all_gather_op, {input_tensor}).at(0)); + output_tensors.push_back(async_detail::run_operation(ttnn::DefaultQueueId, all_gather_op, {input_tensor}).at(0)); // Expose deadlock: After the CCL is sent to the first device in the tunnel, send enough data to it to // backpressure prefetch_h. This will block the demux, which will prevent the CCL from being sent to // additional chips. If the CCL has been tagged as having multi-device dependencies, deadlock should get // bypassed. if (!dev_idx) { - ttnn::write_buffer(0, input_tensor, {host_data}); + ttnn::write_buffer(ttnn::DefaultQueueId, input_tensor, {host_data}); } dev_idx++; } @@ -186,7 +186,7 @@ TEST(GalaxyTests, TestAllGatherDeadlock) { ASSERT_EQ( tensor.get_logical_shape(), Shape({1, 1, 32, static_cast(16384 * device_ids.size())})); - ttnn::read_buffer(0, tensor, {readback_data}); + ttnn::read_buffer(ttnn::DefaultQueueId, tensor, {readback_data}); for (int j = 0; j < device_ids.size() * 32 * 16384; j++) { ASSERT_EQ(readback_data[j].to_float(), 1); } @@ -266,7 +266,7 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) { auto input_storage = DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); // Push inputs. - ttnn::write_buffer(0, input_tensor, {host_data}); + ttnn::write_buffer(ttnn::DefaultQueueId, input_tensor, {host_data}); // Configure CCL running on this device. uint32_t receiver_device_id = device_ids[(dev_idx + 1) % ring_devices.size()]; uint32_t sender_device_id = device_ids[(dev_idx + ring_devices.size() - 1) % ring_devices.size()]; @@ -281,13 +281,13 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) { input_tensor.memory_config(), ttnn::ccl::Topology::Ring}; // Send CCL to this device. All CCLs will complete simultaneously. - output_tensors.push_back(async_detail::run_operation(0, all_gather_op, {input_tensor}).at(0)); + output_tensors.push_back(async_detail::run_operation(ttnn::DefaultQueueId, all_gather_op, {input_tensor}).at(0)); // Expose deadlock: After the CCL is sent to a device in the first tunnel, send enough data to it to // backpressure prefetch_h. This will block the demux, which will prevent the CCL from being sent to // additional chips on the tunnel. If the CCL has been tagged as having multi-device dependencies, deadlock // should get bypassed. if (dev_idx < 3) { for (int j = 0; j < 16; j++) { - ttnn::write_buffer(0, input_tensor, {host_data}); + ttnn::write_buffer(ttnn::DefaultQueueId, input_tensor, {host_data}); } // } dev_idx++; @@ -295,7 +295,7 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) { // Readback data and verify correctness. 
for (auto& tensor : output_tensors) { ASSERT_EQ(tensor.get_logical_shape(), Shape({1, 2, 256, 256})); - ttnn::read_buffer(0, tensor, {readback_data}); + ttnn::read_buffer(ttnn::DefaultQueueId, tensor, {readback_data}); for (int j = 0; j < 512 * 256; j++) { ASSERT_EQ(readback_data[j].to_float(), ring_devices.size()); } diff --git a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp index 7a4732c90e8..2a83fdd1445 100644 --- a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp @@ -17,7 +17,7 @@ using namespace tt; using namespace tt_metal; using MultiCommandQueueT3KFixture = ttnn::MultiCommandQueueT3KFixture; -Tensor dispatch_ops_to_device(IDevice* dev, Tensor input_tensor, uint8_t cq_id) { +Tensor dispatch_ops_to_device(IDevice* dev, Tensor input_tensor, QueueId cq_id) { using ttnn::operations::unary::UnaryOpType; using ttnn::operations::unary::UnaryWithParam; @@ -71,17 +71,17 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ1) { auto write_event = std::make_shared(); auto workload_event = std::make_shared(); ttnn::write_buffer( - 0, + ttnn::QueueId(0), input_tensor, {host_data, host_data, host_data, host_data, host_data, host_data, host_data, host_data}); ttnn::record_event(device->command_queue(0), write_event); ttnn::wait_for_event(device->command_queue(1), write_event); - auto output_tensor = dispatch_ops_to_device(device, input_tensor, 1); + auto output_tensor = dispatch_ops_to_device(device, input_tensor, ttnn::QueueId(1)); ttnn::record_event(device->command_queue(1), workload_event); ttnn::wait_for_event(device->command_queue(0), workload_event); ttnn::read_buffer( - 0, + ttnn::QueueId(0), output_tensor, {readback_data, readback_data, @@ -139,17 +139,17 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ0) { auto write_event = std::make_shared(); auto workload_event = std::make_shared(); ttnn::write_buffer( - 1, + ttnn::QueueId(1), input_tensor, {host_data, host_data, host_data, host_data, host_data, host_data, host_data, host_data}); ttnn::record_event(device->command_queue(1), write_event); ttnn::wait_for_event(device->command_queue(0), write_event); - auto output_tensor = dispatch_ops_to_device(device, input_tensor, 0); + auto output_tensor = dispatch_ops_to_device(device, input_tensor, ttnn::DefaultQueueId); ttnn::record_event(device->command_queue(0), workload_event); ttnn::wait_for_event(device->command_queue(1), workload_event); // std::this_thread::sleep_for(std::chrono::milliseconds(50)); ttnn::read_buffer( - 1, + ttnn::QueueId(1), output_tensor, {readback_data, readback_data, @@ -208,16 +208,16 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceWithCQ1Only) { auto workload_event = std::make_shared(); ttnn::write_buffer( - 1, + ttnn::QueueId(1), input_tensor, {host_data, host_data, host_data, host_data, host_data, host_data, host_data, host_data}); ttnn::record_event(device->command_queue(1), write_event); ttnn::wait_for_event(device->command_queue(1), write_event); - auto output_tensor = dispatch_ops_to_device(device, input_tensor, 1); + auto output_tensor = dispatch_ops_to_device(device, input_tensor, ttnn::QueueId(1)); ttnn::record_event(device->command_queue(1), workload_event); ttnn::wait_for_event(device->command_queue(1), workload_event); ttnn::read_buffer( - 1, + ttnn::QueueId(1), output_tensor, {readback_data, readback_data, diff --git a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp 
b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp index 1c7c33ee8aa..379505c770b 100644 --- a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp @@ -41,8 +41,8 @@ TEST_F(MultiProducerCommandQueueTest, Stress) { const TensorSpec tensor_spec(tensor_shape, tensor_layout); // Thread 0 uses cq_0, thread 1 uses cq_1 - const uint32_t t0_io_cq = 0; - const uint32_t t1_io_cq = 1; + const ttnn::QueueId t0_io_cq = ttnn::DefaultQueueId; + const ttnn::QueueId t1_io_cq = ttnn::QueueId(1); std::vector t0_host_data(tensor_shape.volume()); std::vector t1_host_data(tensor_shape.volume()); @@ -91,8 +91,8 @@ TEST_F(MultiProducerCommandQueueTest, EventSync) { const TensorLayout tensor_layout(DataType::FLOAT32, PageConfig(Layout::ROW_MAJOR), mem_cfg); const TensorSpec tensor_spec(tensor_shape, tensor_layout); - const uint32_t write_cq = 0; - const uint32_t read_cq = 1; + const ttnn::QueueId write_cq = ttnn::DefaultQueueId; + const ttnn::QueueId read_cq = ttnn::QueueId(1); std::shared_ptr write_event = std::make_shared(); std::shared_ptr read_event = std::make_shared(); @@ -110,10 +110,10 @@ TEST_F(MultiProducerCommandQueueTest, EventSync) { // Create tensor and transfer to device const Tensor host_tensor = Tensor::from_vector(host_data, tensor_spec); - memcpy(device->command_queue(write_cq), device_tensor, host_tensor); + memcpy(device->command_queue(*write_cq), device_tensor, host_tensor); EXPECT_TRUE(is_tensor_on_device(device_tensor)); - ttnn::record_event(device->command_queue(write_cq), write_event); + ttnn::record_event(device->command_queue(*write_cq), write_event); } }); @@ -127,7 +127,7 @@ TEST_F(MultiProducerCommandQueueTest, EventSync) { EXPECT_FALSE(is_tensor_on_device(readback_tensor)); EXPECT_THAT(readback_tensor.to_vector(), Pointwise(FloatEq(), host_data)); - ttnn::record_event(device->command_queue(read_cq), read_event); + ttnn::record_event(device->command_queue(*read_cq), read_event); } }); diff --git a/tt-train/sources/ttml/ttnn_fixed/trivial_ttnn_ops.cpp b/tt-train/sources/ttml/ttnn_fixed/trivial_ttnn_ops.cpp index ad818f6040f..f543f0d98d1 100644 --- a/tt-train/sources/ttml/ttnn_fixed/trivial_ttnn_ops.cpp +++ b/tt-train/sources/ttml/ttnn_fixed/trivial_ttnn_ops.cpp @@ -44,7 +44,7 @@ tt::tt_metal::Tensor softmax(const tt::tt_metal::Tensor& t, int dim) { } tt::tt_metal::Tensor divide(const tt::tt_metal::Tensor& a, const tt::tt_metal::Tensor& b) { - auto inv_b = ttnn::reciprocal(/* queue_id */ 0, b); + auto inv_b = ttnn::reciprocal(ttnn::DefaultQueueId, b); return ttnn::multiply(a, inv_b); } diff --git a/tt_metal/tt_stl/strong_type.hpp b/tt_metal/tt_stl/strong_type.hpp index f69309f8189..9d0af74e595 100644 --- a/tt_metal/tt_stl/strong_type.hpp +++ b/tt_metal/tt_stl/strong_type.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include namespace tt::stl { diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index e9e3e010ef1..74f3ef87d4f 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -650,6 +650,8 @@ set(TTNN_SUBLIBRARIES set(TTNN_SRC) set(PYBIND_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/pybind11/__init__.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/pybind11/types.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/pybind11/events.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/pybind11/global_circular_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/pybind11/global_semaphore.cpp @@ -683,6 +685,7 @@ set(TTNN_PUBLIC_INCLUDE_DIRS set(TTNN_PUBLIC_LINK_LIBRARIES metal_common_libs Metalium::Metal + Metalium::Metal::STL xtensor xtensor-blas xtl @@ -708,7 +711,6 @@ 
if(WITH_PYTHON_BINDINGS) list( APPEND TTNN_BASE_SRCS - ${PROJECT_SOURCE_DIR}/ttnn/cpp/pybind11/__init__.cpp ${TT_LIB_SRCS} ${PYBIND_SRC} ) # TT_LIB_SRCS from tt_eager/tt_lib/CMakeLists.txt for python bindigns diff --git a/ttnn/cpp/pybind11/decorators.hpp b/ttnn/cpp/pybind11/decorators.hpp index 00153d8b791..203d1f9bfb7 100644 --- a/ttnn/cpp/pybind11/decorators.hpp +++ b/ttnn/cpp/pybind11/decorators.hpp @@ -11,6 +11,7 @@ #include "ttnn/decorators.hpp" #include "small_vector_caster.hpp" // NOLINT - for pybind11 SmallVector binding support. #include "ttnn/types.hpp" +#include "types.hpp" namespace py = pybind11; @@ -41,7 +42,7 @@ constexpr auto resolve_primitive_operation_call_method(F) { using traits = function_traits; return [](arg_traits) { - return [](TSelf self, TArgs... args, std::uint8_t queue_id) -> + return [](TSelf self, TArgs... args, QueueId queue_id) -> typename traits::return_t { return self(queue_id, static_cast(args)...); }; }(typename traits::arg_tuple{}); } @@ -84,7 +85,7 @@ void def_call_operator(py_operation_t& py_operation, const pybind_overload_t cq_id, const std::vector& sub_device_ids) { + [](IDevice* device, const QueueId cq_id, const std::vector& sub_device_ids) { // Send finish command to issue queue through worker thread // Worker thread will stall until the device is flushed. device->push_work( - [device, cq_id, &sub_device_ids]() mutable { Synchronize(device, cq_id, sub_device_ids); }); + [device, cq_id, &sub_device_ids]() mutable { Synchronize(device, *cq_id, sub_device_ids); }); // Main thread stalls until worker is complete (full device and worker queue flush). device->synchronize(); }, @@ -609,7 +609,7 @@ void device_module(py::module& m_device) { >>> ttnn.synchronize_device(device) )doc", py::arg("device"), - py::arg("cq_id") = std::nullopt, + py::arg("cq_id") = DefaultQueueId, py::arg("sub_device_ids") = std::vector()); m_device.def("DumpDeviceProfiler", DumpDeviceProfiler, py::arg("device"), R"doc( Dump device side profiling data. 
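A minimal usage sketch of the new strong-typed queue handle, distilled from the multi-CQ tests earlier in this patch; it is illustrative only, not part of the diff, and assumes `device`, `input_tensor`, `host_data`, and `write_event` are set up as in those tests:

    // Queue indices are now wrapped in ttnn::QueueId instead of raw uint8_t.
    ttnn::QueueId write_cq = ttnn::DefaultQueueId;   // queue 0
    ttnn::QueueId work_cq  = ttnn::QueueId(1);       // queue 1

    // Buffer I/O takes the strong type directly ...
    ttnn::write_buffer(write_cq, input_tensor, {host_data});

    // ... while tt-metal interfaces that still expect a raw index get it via operator*.
    ttnn::record_event(device->command_queue(*write_cq), write_event);
    ttnn::wait_for_event(device->command_queue(*work_cq), write_event);

This keeps the overlapping-queue pattern (writes on one CQ, compute on the other) while ruling out passing an arbitrary integer where a command-queue id is expected.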
diff --git a/ttnn/cpp/pybind11/events.cpp b/ttnn/cpp/pybind11/events.cpp index 5cf1d17b149..abc64a7cf2f 100644 --- a/ttnn/cpp/pybind11/events.cpp +++ b/ttnn/cpp/pybind11/events.cpp @@ -8,6 +8,8 @@ #include "pybind11/pybind11.h" #include +#include "ttnn/common/queue_id.hpp" + using namespace tt::tt_metal; namespace ttnn::events { @@ -32,7 +34,7 @@ void py_module(py::module& module) { module.def( "record_event", - py::overload_cast&, const std::vector&>(&record_event), + py::overload_cast&, const std::vector&>(&record_event), py::arg("cq_id"), py::arg("event"), py::arg("sub_device_ids") = std::vector(), @@ -47,7 +49,7 @@ void py_module(py::module& module) { module.def( "wait_for_event", - py::overload_cast&>(&wait_for_event), + py::overload_cast&>(&wait_for_event), py::arg("cq_id"), py::arg("event"), R"doc( @@ -72,7 +74,7 @@ void py_module(py::module& module) { module.def( "record_event", - py::overload_cast&>(&record_event), + py::overload_cast&>(&record_event), py::arg("cq_id"), py::arg("multi_device_event"), py::arg("sub_device_ids") = std::vector(), @@ -86,7 +88,7 @@ void py_module(py::module& module) { module.def( "wait_for_event", - py::overload_cast(&wait_for_event), + py::overload_cast(&wait_for_event), py::arg("cq_id"), py::arg("multi_device_event"), R"doc( diff --git a/ttnn/cpp/pybind11/operations/__init__.hpp b/ttnn/cpp/pybind11/operations/__init__.hpp index 76cd7a8ddeb..42ef0cd581e 100644 --- a/ttnn/cpp/pybind11/operations/__init__.hpp +++ b/ttnn/cpp/pybind11/operations/__init__.hpp @@ -7,6 +7,7 @@ #include #include +#include "pybind11/types.hpp" #include "pybind11/operations/copy.hpp" #include "pybind11/operations/core.hpp" #include "pybind11/operations/creation.hpp" diff --git a/ttnn/cpp/pybind11/operations/copy.hpp b/ttnn/cpp/pybind11/operations/copy.hpp index 008c0ab9601..38da7f9e03c 100644 --- a/ttnn/cpp/pybind11/operations/copy.hpp +++ b/ttnn/cpp/pybind11/operations/copy.hpp @@ -53,7 +53,7 @@ Example:: const DataType dtype, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, dtype, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -61,7 +61,7 @@ Example:: py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, ttnn::pybind_overload_t{ [](const TypecastType& self, @@ -70,7 +70,7 @@ Example:: const DataType output_dtype, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, input_dtype, output_dtype, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -79,7 +79,7 @@ Example:: py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0} + py::arg("queue_id") = DefaultQueueId} ); } diff --git a/ttnn/cpp/pybind11/operations/core.hpp b/ttnn/cpp/pybind11/operations/core.hpp index 1a108704897..6c37b0e0b21 100644 --- a/ttnn/cpp/pybind11/operations/core.hpp +++ b/ttnn/cpp/pybind11/operations/core.hpp @@ -65,7 +65,7 @@ void py_module(py::module& module) { module.def( "to_device", - py::overload_cast&, uint8_t>( + py::overload_cast&, QueueId>( &ttnn::operations::core::to_device), py::arg("tensor"), py::arg("device"), @@ -74,7 +74,7 @@ void py_module(py::module& module) { module.def( "to_device", - py::overload_cast&, 
uint8_t>( + py::overload_cast&, QueueId>( &ttnn::operations::core::to_device), py::arg("tensor"), py::arg("device"), @@ -262,14 +262,14 @@ void py_module(py::module& module) { module.def( "begin_trace_capture", - py::overload_cast(&ttnn::operations::core::begin_trace_capture), + py::overload_cast(&ttnn::operations::core::begin_trace_capture), py::arg("device"), py::kw_only(), py::arg("cq_id") = ttnn::DefaultQueueId); module.def( "end_trace_capture", - py::overload_cast(&ttnn::operations::core::end_trace_capture), + py::overload_cast(&ttnn::operations::core::end_trace_capture), py::arg("device"), py::arg("trace_id"), py::kw_only(), @@ -277,7 +277,7 @@ void py_module(py::module& module) { module.def( "execute_trace", - py::overload_cast(&ttnn::operations::core::execute_trace), + py::overload_cast(&ttnn::operations::core::execute_trace), py::arg("device"), py::arg("trace_id"), py::kw_only(), @@ -292,7 +292,7 @@ void py_module(py::module& module) { module.def( "begin_trace_capture", - [](MeshDevice* device, const uint8_t cq_id) { + [](MeshDevice* device, const QueueId cq_id) { return ttnn::operations::core::begin_trace_capture(device, cq_id); }, py::arg("mesh_device"), @@ -301,7 +301,7 @@ void py_module(py::module& module) { module.def( "end_trace_capture", - [](MeshDevice* device, const uint32_t tid, const uint8_t cq_id) { + [](MeshDevice* device, const uint32_t tid, const QueueId cq_id) { return ttnn::operations::core::end_trace_capture(device, tid, cq_id); }, py::arg("mesh_device"), @@ -311,7 +311,7 @@ void py_module(py::module& module) { module.def( "execute_trace", - [](MeshDevice* device, const uint32_t tid, const uint8_t cq_id, const bool blocking) { + [](MeshDevice* device, const uint32_t tid, const QueueId cq_id, const bool blocking) { return ttnn::operations::core::execute_trace(device, tid, cq_id, blocking); }, py::arg("mesh_device"), diff --git a/ttnn/cpp/pybind11/operations/creation.hpp b/ttnn/cpp/pybind11/operations/creation.hpp index bf0659674b6..54ae7ebfea4 100644 --- a/ttnn/cpp/pybind11/operations/creation.hpp +++ b/ttnn/cpp/pybind11/operations/creation.hpp @@ -8,6 +8,7 @@ #include #include "cpp/pybind11/decorators.hpp" +#include "cpp/pybind11/types.hpp" #include "ttnn/operations/creation.hpp" namespace py = pybind11; @@ -28,7 +29,7 @@ auto create_pybind_full_overload() { const std::optional> device, const std::optional& memory_config, std::optional& optional_output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, ttnn::Shape(shape), fill_value, dtype, layout, device, memory_config, optional_output_tensor); }, @@ -71,7 +72,7 @@ auto create_pybind_full_like_overload() { const std::optional> device, const std::optional& memory_config, std::optional& optional_output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, tensor, fill_value, dtype, layout, device, memory_config, optional_output_tensor); }, py::arg("tensor"), @@ -94,7 +95,7 @@ auto create_pybind_full_like_with_hard_coded_value_overload() { const std::optional> device, const std::optional& memory_config, std::optional& optional_output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, tensor, dtype, layout, device, memory_config, optional_output_tensor); }, py::arg("tensor"), diff --git a/ttnn/cpp/pybind11/pytensor.cpp b/ttnn/cpp/pybind11/pytensor.cpp index 23c47b0f8c3..f6e55603d8a 100644 --- a/ttnn/cpp/pybind11/pytensor.cpp +++ 
b/ttnn/cpp/pybind11/pytensor.cpp @@ -21,7 +21,7 @@ #include "ttnn/tensor/tensor_ops.hpp" #include "tools/profiler/op_profiler.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/tensor/types.hpp" @@ -969,7 +969,7 @@ void pytensor_module(py::module& m_tensor) { )doc") .def( "to", - py::overload_cast(&Tensor::to_device, py::const_), + py::overload_cast(&Tensor::to_device, py::const_), py::arg("device").noconvert(), py::arg("mem_config").noconvert() = MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED}, py::arg("cq_id") = ttnn::DefaultQueueId, @@ -1003,7 +1003,7 @@ void pytensor_module(py::module& m_tensor) { )doc") .def( "to", - py::overload_cast(&Tensor::to_device, py::const_), + py::overload_cast(&Tensor::to_device, py::const_), py::arg("mesh_device").noconvert(), py::arg("mem_config").noconvert() = MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED}, py::arg("cq_id") = ttnn::DefaultQueueId, @@ -1078,7 +1078,7 @@ void pytensor_module(py::module& m_tensor) { )doc") .def( "cpu", - [](const Tensor& self, bool blocking, uint8_t cq_id) { return self.cpu(blocking, cq_id); }, + [](const Tensor& self, bool blocking, QueueId cq_id) { return self.cpu(blocking, cq_id); }, py::arg("blocking") = true, py::arg("cq_id") = ttnn::DefaultQueueId, R"doc( diff --git a/ttnn/cpp/pybind11/types.cpp b/ttnn/cpp/pybind11/types.cpp new file mode 100644 index 00000000000..fb980b4f070 --- /dev/null +++ b/ttnn/cpp/pybind11/types.cpp @@ -0,0 +1,90 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "types.hpp" + +#include + +#include +#include + +#include + +#include "export_enum.hpp" +#include "ttnn/tensor/tensor.hpp" +#include "ttnn/types.hpp" +#include "ttnn/operations/data_movement/bcast/bcast_types.hpp" + +namespace ttnn { +namespace types { + +void py_module_types(py::module& module) { + py::class_(module, "CoreGrid"); + py::class_(module, "Shape"); + py::class_(module, "QueueId") + .def(py::init()) + .def("__int__", [](const ttnn::QueueId& self) { return static_cast(*self); }) + .def( + "__repr__", + [](const ttnn::QueueId& self) { return "QueueId(" + std::to_string(static_cast(*self)) + ")"; }) + .def(py::self == py::self); + + export_enum(module, "BcastOpMath"); + export_enum(module, "BcastOpDim"); + + py::implicitly_convertible(); + + module.attr("DRAM_MEMORY_CONFIG") = py::cast(DRAM_MEMORY_CONFIG); + module.attr("L1_MEMORY_CONFIG") = py::cast(L1_MEMORY_CONFIG); +} + +void py_module(py::module& module) { + auto py_core_coord = static_cast>(module.attr("CoreGrid")); + py_core_coord.def(py::init(), py::kw_only(), py::arg("x"), py::arg("y")) + .def_property_readonly("x", [](const ttnn::CoreGrid& self) { return self.x; }) + .def_property_readonly("y", [](const ttnn::CoreGrid& self) { return self.y; }) + .def_property_readonly("num_cores", [](const ttnn::CoreGrid& self) { return self.x * self.y; }) + .def("__repr__", [](const ttnn::CoreGrid& self) -> std::string { + std::stringstream ss; + ss << self; + return ss.str(); + }); + + auto PyShape = static_cast>(module.attr("Shape")); + PyShape.def(py::init&>(), py::arg("shape")) + .def("__len__", [](const Shape& self) { return self.rank(); }) + .def("__getitem__", [](const Shape& self, std::int64_t index) { return self[index]; }) + .def( + "__iter__", + [](const Shape& self) { + return py::iter(py::cast(ttnn::SmallVector(self.cbegin(), self.cend()))); + }) + .def(pybind11::self == pybind11::self) + .def( + 
"__repr__", + [](const Shape& self) { + std::stringstream ss; + ss << self; + return ss.str(); + }) + .def_property_readonly("rank", [](const Shape& self) -> std::size_t { return self.rank(); }) + .def("to_rank", [](const Shape& self, std::size_t new_rank) { + SmallVector new_shape(new_rank, 1); + + int cur_idx = static_cast(self.rank()) - 1; + int new_idx = static_cast(new_rank) - 1; + for (; cur_idx >= 0 && new_idx >= 0; cur_idx--, new_idx--) { + new_shape[new_idx] = self[cur_idx]; + } + for (; cur_idx >= 0; cur_idx--) { + TT_FATAL(self[cur_idx] == 1, "Can't convert shape rank"); + } + + return ttnn::Shape(std::move(new_shape)); + }); + py::implicitly_convertible, ttnn::Shape>(); +} + +} // namespace types +} // namespace ttnn diff --git a/ttnn/cpp/pybind11/types.hpp b/ttnn/cpp/pybind11/types.hpp index 3ab9a55eadc..3442c817209 100644 --- a/ttnn/cpp/pybind11/types.hpp +++ b/ttnn/cpp/pybind11/types.hpp @@ -4,80 +4,17 @@ #pragma once -#include #include -#include -#include - -#include "export_enum.hpp" #include "small_vector_caster.hpp" // NOLINT - for pybind11 SmallVector binding support. -#include "ttnn/tensor/tensor.hpp" -#include "ttnn/types.hpp" -#include "ttnn/operations/data_movement/bcast/bcast_types.hpp" namespace py = pybind11; namespace ttnn { namespace types { -void py_module_types(py::module& module) { - py::class_(module, "CoreGrid"); - py::class_(module, "Shape"); - - export_enum(module, "BcastOpMath"); - export_enum(module, "BcastOpDim"); - - module.attr("DRAM_MEMORY_CONFIG") = py::cast(DRAM_MEMORY_CONFIG); - module.attr("L1_MEMORY_CONFIG") = py::cast(L1_MEMORY_CONFIG); -} - -void py_module(py::module& module) { - auto py_core_coord = static_cast>(module.attr("CoreGrid")); - py_core_coord.def(py::init(), py::kw_only(), py::arg("x"), py::arg("y")) - .def_property_readonly("x", [](const ttnn::CoreGrid& self) { return self.x; }) - .def_property_readonly("y", [](const ttnn::CoreGrid& self) { return self.y; }) - .def_property_readonly("num_cores", [](const ttnn::CoreGrid& self) { return self.x * self.y; }) - .def("__repr__", [](const ttnn::CoreGrid& self) -> std::string { - std::stringstream ss; - ss << self; - return ss.str(); - }); - - auto PyShape = static_cast>(module.attr("Shape")); - PyShape.def(py::init&>(), py::arg("shape")) - .def("__len__", [](const Shape& self) { return self.rank(); }) - .def("__getitem__", [](const Shape& self, std::int64_t index) { return self[index]; }) - .def( - "__iter__", - [](const Shape& self) { - return py::iter(py::cast(ttnn::SmallVector(self.cbegin(), self.cend()))); - }) - .def(pybind11::self == pybind11::self) - .def( - "__repr__", - [](const Shape& self) { - std::stringstream ss; - ss << self; - return ss.str(); - }) - .def_property_readonly("rank", [](const Shape& self) -> std::size_t { return self.rank(); }) - .def("to_rank", [](const Shape& self, std::size_t new_rank) { - SmallVector new_shape(new_rank, 1); - - int cur_idx = static_cast(self.rank()) - 1; - int new_idx = static_cast(new_rank) - 1; - for (; cur_idx >= 0 && new_idx >= 0; cur_idx--, new_idx--) { - new_shape[new_idx] = self[cur_idx]; - } - for (; cur_idx >= 0; cur_idx--) { - TT_FATAL(self[cur_idx] == 1, "Can't convert shape rank"); - } - - return ttnn::Shape(std::move(new_shape)); - }); - py::implicitly_convertible, ttnn::Shape>(); -} +void py_module_types(py::module& module); +void py_module(py::module& module); } // namespace types } // namespace ttnn diff --git a/ttnn/cpp/ttnn/async_runtime.cpp b/ttnn/cpp/ttnn/async_runtime.cpp index 76a7e25aa18..544ca4a538e 100644 --- 
a/ttnn/cpp/ttnn/async_runtime.cpp +++ b/ttnn/cpp/ttnn/async_runtime.cpp @@ -12,20 +12,20 @@ using namespace tt::tt_metal; namespace ttnn { void write_buffer( - queue_id cq_id, Tensor& dst, std::vector> src, const std::optional& region) { + QueueId cq_id, Tensor& dst, std::vector> src, const std::optional& region) { uint32_t dst_ref_count = dst.tensor_attributes->record_main_thread_ref_count(); for (const auto worker : dst.get_workers()) { auto src_for_device = (src.size() == 1) ? src.at(0) : src.at(worker->id()); worker->push_work([worker, src_for_device, dst, cq_id, region]() { auto shard = tt::tt_metal::get_shard_for_device(dst, worker); - tt::tt_metal::memcpy(worker->command_queue(cq_id), shard, src_for_device.get(), region); + tt::tt_metal::memcpy(worker->command_queue(*cq_id), shard, src_for_device.get(), region); }); } dst.tensor_attributes->update_main_thread_ref_count(dst.workers.at(0), dst_ref_count); } void read_buffer( - queue_id cq_id, + QueueId cq_id, Tensor& src, std::vector> dst, const std::optional& region, @@ -37,7 +37,7 @@ void read_buffer( auto dst_for_device = (dst.size() == 1) ? dst.at(0) : dst.at(worker->id()); worker->push_work([worker, dst_for_device, src, cq_id, region, src_offset, blocking]() { const auto& shard = tt::tt_metal::get_shard_for_device(src, worker); - tt::tt_metal::memcpy(worker->command_queue(cq_id), dst_for_device.get(), shard, region, blocking); + tt::tt_metal::memcpy(worker->command_queue(*cq_id), dst_for_device.get(), shard, region, blocking); }); } if (blocking) { diff --git a/ttnn/cpp/ttnn/async_runtime.hpp b/ttnn/cpp/ttnn/async_runtime.hpp index cbafdd631ff..f7647b28fcf 100644 --- a/ttnn/cpp/ttnn/async_runtime.hpp +++ b/ttnn/cpp/ttnn/async_runtime.hpp @@ -10,16 +10,15 @@ #include "types.hpp" namespace ttnn { -using queue_id = uint8_t; void write_buffer( - queue_id cq_id, + QueueId cq_id, Tensor& dst, std::vector> src, const std::optional& region = std::nullopt); void read_buffer( - queue_id cq_id, + QueueId cq_id, Tensor& src, std::vector> dst, const std::optional& region = std::nullopt, diff --git a/ttnn/cpp/ttnn/common/constants.hpp b/ttnn/cpp/ttnn/common/constants.hpp index 99a826a80a5..bee7ae2bb73 100644 --- a/ttnn/cpp/ttnn/common/constants.hpp +++ b/ttnn/cpp/ttnn/common/constants.hpp @@ -2,21 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #pragma once #define MAX_PACK_UNTILIZE_WIDTH 8 // pack untilize currently does not support > 8 width - -namespace ttnn { - -/* - We have two software command queues available to overlap some work and reduce latency. - For example, Op2 can be prepared in a different queue while the first queue is blocked, waiting for data readout by - Op1. TT-NN operations allow specifying which queue should be used. The default queue is 0, and the possible values - are 0 and 1. -*/ - -constexpr uint8_t DefaultQueueId = 0; - -} // namespace ttnn diff --git a/ttnn/cpp/ttnn/common/queue_id.hpp b/ttnn/cpp/ttnn/common/queue_id.hpp new file mode 100644 index 00000000000..6b5f2cd33b0 --- /dev/null +++ b/ttnn/cpp/ttnn/common/queue_id.hpp @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +namespace ttnn { +/* + Type must be moved to metal. + Background: + We have two software command queues available to overlap some work and reduce latency. + For example, Op2 can be prepared in a different queue while the first queue is blocked, waiting for data readout by + Op1. TT-NN operations allow specifying which queue should be used. 
The default queue is 0, and the possible values + are 0 and 1. +*/ +using QueueId = tt::stl::StrongType; +static const QueueId DefaultQueueId = QueueId(0); + +} // namespace ttnn + +// Exporting to tt::tt_metal namespace because ttnn +// defines some of its own types (think Tensor) in tt::tt_metal namespace. +namespace tt::tt_metal { +using QueueId = ttnn::QueueId; +} diff --git a/ttnn/cpp/ttnn/decorators.hpp b/ttnn/cpp/ttnn/decorators.hpp index 4eac54c7443..f1217df35b8 100644 --- a/ttnn/cpp/ttnn/decorators.hpp +++ b/ttnn/cpp/ttnn/decorators.hpp @@ -8,7 +8,7 @@ #include #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/core.hpp" #include "ttnn/device_operation.hpp" #include "ttnn/operation.hpp" @@ -218,7 +218,7 @@ struct registered_operation_t { template requires PrimitiveOperationConcept - auto invoke(uint8_t queue_id, args_t&&... args) const { + auto invoke(QueueId queue_id, args_t&&... args) const { static_assert( requires { operation_t::invoke(std::forward(args)...); }, "Primitive Operation must implement operator() method to be invoked."); diff --git a/ttnn/cpp/ttnn/device_operation.hpp b/ttnn/cpp/ttnn/device_operation.hpp index df59fb8de64..c9794df5d6e 100644 --- a/ttnn/cpp/ttnn/device_operation.hpp +++ b/ttnn/cpp/ttnn/device_operation.hpp @@ -282,7 +282,7 @@ void launch_on_worker_thread(auto cq_id, auto device_operation_id, const auto& o const auto enqueue_or_launch_program = [=](tt::tt_metal::Program& program) { if (USE_FAST_DISPATCH) { ZoneScopedN("EnqueueProgram"); - auto& queue = device->command_queue(cq_id); + auto& queue = device->command_queue(*cq_id); tt::tt_metal::EnqueueProgram(queue, program, false); } else { ZoneScopedN("LaunchProgram"); @@ -345,7 +345,7 @@ void launch_on_worker_thread(auto cq_id, auto device_operation_id, const auto& o template typename device_operation_t::tensor_return_value_t launch_on_single_device( - uint8_t cq_id, + QueueId cq_id, const typename device_operation_t::operation_attributes_t& operation_attributes, const typename device_operation_t::tensor_args_t& tensor_args) { ZoneScopedN("Launch Device Operation"); @@ -415,7 +415,7 @@ static T make_tensor_return_value_from_shards(auto& old_storage, std::vector& template typename device_operation_t::tensor_return_value_t launch_on_multi_device( - uint8_t cq_id, + QueueId cq_id, const typename device_operation_t::operation_attributes_t& operation_attributes, const typename device_operation_t::tensor_args_t& tensor_args) { ZoneScopedN("Launch Multi Device Operation"); @@ -443,7 +443,7 @@ typename device_operation_t::tensor_return_value_t launch_on_multi_device( template typename device_operation_t::tensor_return_value_t invoke( - uint8_t cq_id, + QueueId cq_id, const typename device_operation_t::operation_attributes_t& operation_attributes, const typename device_operation_t::tensor_args_t& tensor_args) { ZoneScopedN("Run Device Operation"); diff --git a/ttnn/cpp/ttnn/events.cpp b/ttnn/cpp/ttnn/events.cpp index 3a43854739d..54d13fead11 100644 --- a/ttnn/cpp/ttnn/events.cpp +++ b/ttnn/cpp/ttnn/events.cpp @@ -31,28 +31,28 @@ std::shared_ptr create_event(IDevice* device) { return event; } -void record_event(uint8_t cq_id, const std::shared_ptr& event, const std::vector& sub_device_ids) { +void record_event(QueueId cq_id, const std::shared_ptr& event, const std::vector& sub_device_ids) { IDevice* device = event->device; device->push_work([device, event, cq_id, sub_device_ids] { - EnqueueRecordEvent(device->command_queue(cq_id), event, sub_device_ids); + 
EnqueueRecordEvent(device->command_queue(*cq_id), event, sub_device_ids); }); } -void wait_for_event(uint8_t cq_id, const std::shared_ptr& event) { +void wait_for_event(QueueId cq_id, const std::shared_ptr& event) { IDevice* device = event->device; - device->push_work([device, event, cq_id] { EnqueueWaitForEvent(device->command_queue(cq_id), event); }); + device->push_work([device, event, cq_id] { EnqueueWaitForEvent(device->command_queue(*cq_id), event); }); } MultiDeviceEvent create_event(MeshDevice* mesh_device) { return MultiDeviceEvent(mesh_device); } void record_event( - uint8_t cq_id, const MultiDeviceEvent& multi_device_event, const std::vector& sub_device_ids) { + QueueId cq_id, const MultiDeviceEvent& multi_device_event, const std::vector& sub_device_ids) { for (auto& event : multi_device_event.events) { record_event(cq_id, event, sub_device_ids); } } -void wait_for_event(uint8_t cq_id, const MultiDeviceEvent& multi_device_event) { +void wait_for_event(QueueId cq_id, const MultiDeviceEvent& multi_device_event) { for (auto& event : multi_device_event.events) { wait_for_event(cq_id, event); } diff --git a/ttnn/cpp/ttnn/events.hpp b/ttnn/cpp/ttnn/events.hpp index c3b53a73512..1e1eedbaac9 100644 --- a/ttnn/cpp/ttnn/events.hpp +++ b/ttnn/cpp/ttnn/events.hpp @@ -20,14 +20,14 @@ struct MultiDeviceEvent { // Single Device APIs std::shared_ptr create_event(IDevice* device); void record_event( - uint8_t cq_id, + QueueId cq_id, const std::shared_ptr& event, const std::vector& sub_device_ids = {}); -void wait_for_event(uint8_t cq_id, const std::shared_ptr& event); +void wait_for_event(QueueId cq_id, const std::shared_ptr& event); // Multi Device APIs MultiDeviceEvent create_event(MeshDevice* mesh_device); void record_event( - uint8_t cq_id, const MultiDeviceEvent& event, const std::vector& sub_device_ids = {}); -void wait_for_event(uint8_t cq_id, const MultiDeviceEvent& event); + QueueId cq_id, const MultiDeviceEvent& event, const std::vector& sub_device_ids = {}); +void wait_for_event(QueueId cq_id, const MultiDeviceEvent& event); } // namespace ttnn::events diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp index 35eb5ab193f..f3d458c821b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp @@ -286,7 +286,7 @@ Tensor all_gather( if (input_tensor.get_dtype() != DataType::BFLOAT16 && input_tensor.get_dtype() != DataType::FLOAT32) { input_tensor = ttnn::typecast(input_tensor, DataType::BFLOAT16); } - input_tensor = ttnn::pad(0, input_tensor, padding, 0, false, std::nullopt); + input_tensor = ttnn::pad(ttnn::DefaultQueueId, input_tensor, padding, 0, false, std::nullopt); if (original_dtype != input_tensor.get_dtype()) { input_tensor = ttnn::typecast(input_tensor, original_dtype); } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp index 944179cfed6..50b5c017a41 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp @@ -259,7 +259,7 @@ Result conv2d( } Result Conv2dOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Tensor& weight_tensor, IDevice* device, @@ -298,7 +298,7 @@ Result Conv2dOperation::invoke( } Result Conv2dOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Tensor& 
weight_tensor, MeshDevice* device, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp index f70bc6cea31..cee3027fdce 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp @@ -41,7 +41,7 @@ Result conv2d( struct Conv2dOperation { static Result invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Tensor& weight_tensor, IDevice* device, @@ -61,7 +61,7 @@ struct Conv2dOperation { const std::optional& memory_config = std::nullopt); static Result invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Tensor& weight_tensor, MeshDevice* device, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp index 36398f45c4e..ef664e12add 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp @@ -61,7 +61,7 @@ void py_bind_conv2d(py::module& module) { const std::optional& conv_config, const std::optional& compute_config, const std::optional& memory_config, - const uint8_t& queue_id) -> Result { + QueueId queue_id) -> Result { return self( queue_id, input_tensor, @@ -100,7 +100,7 @@ void py_bind_conv2d(py::module& module) { py::arg("conv_config") = std::nullopt, py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, ttnn::pybind_overload_t{ [](const decltype(ttnn::conv2d)& self, @@ -121,7 +121,7 @@ void py_bind_conv2d(py::module& module) { const std::optional& conv_config, const std::optional& compute_config, const std::optional& memory_config, - const uint8_t& queue_id) -> Result { + QueueId queue_id) -> Result { return self( queue_id, input_tensor, @@ -160,7 +160,7 @@ void py_bind_conv2d(py::module& module) { py::arg("conv_config") = std::nullopt, py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); module.def( "prepare_conv_weights", diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp index 74df61c3d76..7c5ab221a0e 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp @@ -355,7 +355,7 @@ Result conv_transpose2d( } Result ConvTranpose2dOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Tensor& weight_tensor, IDevice* device, @@ -398,7 +398,7 @@ Result ConvTranpose2dOperation::invoke( } Result ConvTranpose2dOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Tensor& weight_tensor, MeshDevice* device, diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp index 4f32025bc46..301c0d830cb 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp @@ -18,7 +18,7 @@ using Result = std::tuple& compute_config, const std::optional& memory_config, bool mirror_kernel, - const uint8_t& queue_id) -> Result { + QueueId queue_id) -> Result { return self( queue_id, input_tensor, @@ -152,7 +152,7 @@ 
void py_bind_conv_transpose2d(py::module& module) { py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("mirror_kernel") = true, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, ttnn::pybind_overload_t{ [](const decltype(ttnn::conv_transpose2d)& self, @@ -175,7 +175,7 @@ void py_bind_conv_transpose2d(py::module& module) { const std::optional& compute_config, const std::optional& memory_config, bool mirror_kernel, - const uint8_t& queue_id) -> Result { + QueueId queue_id) -> Result { return self( queue_id, input_tensor, @@ -218,7 +218,7 @@ void py_bind_conv_transpose2d(py::module& module) { py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("mirror_kernel") = true, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace conv_transpose2d diff --git a/ttnn/cpp/ttnn/operations/copy.hpp b/ttnn/cpp/ttnn/operations/copy.hpp index 750568b4c46..7554904252d 100644 --- a/ttnn/cpp/ttnn/operations/copy.hpp +++ b/ttnn/cpp/ttnn/operations/copy.hpp @@ -5,7 +5,7 @@ #pragma once #include "ttnn/decorators.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/eltwise/unary/unary.hpp" #include "ttnn/operations/eltwise/unary/device/unary_device_operation.hpp" @@ -18,7 +18,7 @@ namespace copy { namespace detail { inline Tensor copy_impl( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::vector& op_chain, const std::optional& memory_config = std::nullopt, @@ -52,7 +52,7 @@ inline Tensor copy_impl( struct Typecast { static Tensor invoke( - const uint8_t queue_id, + const QueueId queue_id, const Tensor& input, const DataType& output_dtype, const std::optional& memory_config_arg = std::nullopt, @@ -94,7 +94,7 @@ struct Typecast { // const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) static ttnn::Tensor invoke( - const uint8_t queue_id, + const QueueId queue_id, const Tensor& input_tensor, const DataType& tt_input_dtype, const DataType& tt_output_dtype, diff --git a/ttnn/cpp/ttnn/operations/core/core.cpp b/ttnn/cpp/ttnn/operations/core/core.cpp index 21d90d6cf46..eb8370acf78 100644 --- a/ttnn/cpp/ttnn/operations/core/core.cpp +++ b/ttnn/cpp/ttnn/operations/core/core.cpp @@ -50,7 +50,7 @@ ttnn::Tensor squeeze_from_4D(const ttnn::Tensor& tensor, const int rank) { } ttnn::Tensor to_device( - const ttnn::Tensor& tensor, IDevice* device, const std::optional& memory_config, uint8_t cq_id) { + const ttnn::Tensor& tensor, IDevice* device, const std::optional& memory_config, QueueId cq_id) { auto mem_config = memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG); if (mem_config.is_sharded() and (device->arch() == tt::ARCH::BLACKHOLE)) { auto interleaved_tensor = tensor.to_device(device, ttnn::DRAM_MEMORY_CONFIG, cq_id); @@ -64,7 +64,7 @@ ttnn::Tensor to_device( const ttnn::Tensor& tensor, MeshDevice* mesh_device, const std::optional& memory_config, - uint8_t cq_id) { + QueueId cq_id) { auto mem_config = memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG); // Currently no direct sharded write support in BLACKHOLE due to alignment issue if (mem_config.is_sharded() and (mesh_device->arch() == tt::ARCH::BLACKHOLE)) { @@ -107,11 +107,11 @@ ttnn::Tensor allocate_tensor_on_device(const ttnn::TensorSpec& spec, MeshDevice* return tt::tt_metal::allocate_tensor_on_devices(spec, mesh_device->get_devices()); } -void copy_host_to_device_tensor(const ttnn::Tensor& 
host_tensor, ttnn::Tensor device_tensor, uint8_t cq_id) { +void copy_host_to_device_tensor(const ttnn::Tensor& host_tensor, ttnn::Tensor device_tensor, QueueId cq_id) { tt::tt_metal::write_tensor(std::move(host_tensor), std::move(device_tensor), cq_id); } -ttnn::Tensor from_device(const ttnn::Tensor& tensor, bool blocking, uint8_t cq_id) { +ttnn::Tensor from_device(const ttnn::Tensor& tensor, bool blocking, QueueId cq_id) { // Currently no direct sharded read support in BLACKHOLE due to alignment issue if (tensor.is_sharded() and (tensor.device()->arch() == tt::ARCH::BLACKHOLE)) { auto interleaved_tensor = ttnn::sharded_to_interleaved(cq_id, tensor, ttnn::DRAM_MEMORY_CONFIG, std::nullopt); @@ -128,21 +128,21 @@ Tensor reallocate(const Tensor& input_tensor, const std::optional& } // Trace APIs - Single Device -uint32_t begin_trace_capture(IDevice* device, const uint8_t cq_id) { +uint32_t begin_trace_capture(IDevice* device, const QueueId cq_id) { ZoneScoped; uint32_t tid = Trace::next_id(); - device->begin_trace(cq_id, tid); + device->begin_trace(*cq_id, tid); return tid; } -void end_trace_capture(IDevice* device, const uint32_t tid, const uint8_t cq_id) { +void end_trace_capture(IDevice* device, const uint32_t tid, const QueueId cq_id) { ZoneScoped; - device->end_trace(cq_id, tid); + device->end_trace(*cq_id, tid); } -void execute_trace(IDevice* device, const uint32_t tid, const uint8_t cq_id, bool blocking) { +void execute_trace(IDevice* device, const uint32_t tid, const QueueId cq_id, bool blocking) { ZoneScoped; - device->replay_trace(cq_id, tid, blocking); + device->replay_trace(*cq_id, tid, blocking); } void release_trace(IDevice* device, const uint32_t tid) { diff --git a/ttnn/cpp/ttnn/operations/core/core.hpp b/ttnn/cpp/ttnn/operations/core/core.hpp index 7b2e296b0f6..192d2451a44 100644 --- a/ttnn/cpp/ttnn/operations/core/core.hpp +++ b/ttnn/cpp/ttnn/operations/core/core.hpp @@ -28,13 +28,13 @@ ttnn::Tensor to_device( const ttnn::Tensor& tensor, IDevice* device, const std::optional& memory_config, - uint8_t cq_id = ttnn::DefaultQueueId); + ttnn::QueueId cq_id = ttnn::DefaultQueueId); ttnn::Tensor to_device( const ttnn::Tensor& tensor, MeshDevice* mesh_device, const std::optional& memory_config, - uint8_t cq_id = ttnn::DefaultQueueId); + ttnn::QueueId cq_id = ttnn::DefaultQueueId); ttnn::Tensor allocate_tensor_on_device( const Shape& shape, @@ -54,20 +54,20 @@ ttnn::Tensor allocate_tensor_on_device(const ttnn::TensorSpec& spec, IDevice* de ttnn::Tensor allocate_tensor_on_device(const ttnn::TensorSpec& spec, MeshDevice* device); void copy_host_to_device_tensor( - const ttnn::Tensor& host_tensor, ttnn::Tensor device_tensor, uint8_t cq_id = ttnn::DefaultQueueId); + const ttnn::Tensor& host_tensor, ttnn::Tensor device_tensor, ttnn::QueueId cq_id = ttnn::DefaultQueueId); -ttnn::Tensor from_device(const ttnn::Tensor& tensor, bool blocking = true, uint8_t cq_id = ttnn::DefaultQueueId); +ttnn::Tensor from_device(const ttnn::Tensor& tensor, bool blocking = true, ttnn::QueueId cq_id = ttnn::DefaultQueueId); void deallocate(Tensor& tensor, bool force = true); Tensor reallocate(const Tensor& input_tensor, const std::optional& memory_config); // Trace APIs - Single Device -uint32_t begin_trace_capture(IDevice* device, const uint8_t cq_id); +uint32_t begin_trace_capture(IDevice* device, const QueueId cq_id); -void end_trace_capture(IDevice* device, const uint32_t tid, const uint8_t cq_id); +void end_trace_capture(IDevice* device, const uint32_t tid, const QueueId cq_id); -void 
execute_trace(IDevice* device, const uint32_t tid, const uint8_t cq_id, bool blocking); +void execute_trace(IDevice* device, const uint32_t tid, const QueueId cq_id, bool blocking); void release_trace(IDevice* device, const uint32_t tid); diff --git a/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp b/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp index 83fdad149f5..c88c5c1c629 100644 --- a/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp +++ b/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp @@ -165,7 +165,7 @@ Tensor to_layout_impl( {0, 0}, {0, padded_output_shape[2] - output_shape[2]}, {0, padded_output_shape[3] - output_shape[3]}}; - tensor = ttnn::pad(0, tensor, padding, 0, true, std::nullopt); + tensor = ttnn::pad(ttnn::DefaultQueueId, tensor, padding, 0, true, std::nullopt); return ttnn::tilize(tensor, output_memory_config, dtype, use_multicore_tilize); } else { PadValue pad_value_variant; diff --git a/ttnn/cpp/ttnn/operations/creation.hpp b/ttnn/cpp/ttnn/operations/creation.hpp index 80cd7e023ad..d841ba33081 100644 --- a/ttnn/cpp/ttnn/operations/creation.hpp +++ b/ttnn/cpp/ttnn/operations/creation.hpp @@ -8,7 +8,7 @@ #include #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/core.hpp" #include "ttnn/decorators.hpp" #include "ttnn/distributed/types.hpp" @@ -108,7 +108,7 @@ static Tensor arange_impl( template static Tensor full_impl( - uint8_t queue_id, + QueueId queue_id, const ttnn::Shape& shape, T value, const Layout layout, @@ -134,7 +134,7 @@ static Tensor full_impl( for (auto* buffer : buffers) { if (using_fast_dispatch) { - auto& cmd_queue = buffer->device()->command_queue(queue_id); + auto& cmd_queue = buffer->device()->command_queue(*queue_id); tt::tt_metal::EnqueueWriteBuffer(cmd_queue, *buffer, owned_buffer.data(), /*blocking=*/false); } else { tt::tt_metal::detail::WriteToBuffer(*buffer, owned_buffer.get()); @@ -149,7 +149,7 @@ static Tensor full_impl( template inline ttnn::Tensor full_impl( - uint8_t queue_id, + QueueId queue_id, const ttnn::Shape& shape, const T fill_value, const std::optional& dtype = std::nullopt, @@ -193,7 +193,7 @@ inline ttnn::Tensor full( detail::OptionalAnyDevice device = std::nullopt, const std::optional& memory_config = std::nullopt, std::optional optional_output_tensor = std::nullopt, - uint8_t queue_id = ttnn::DefaultQueueId) { + ttnn::QueueId queue_id = ttnn::DefaultQueueId) { return full_impl( queue_id, shape, @@ -227,7 +227,7 @@ inline constexpr Ones ones{}; template inline ttnn::Tensor full_like_impl( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& tensor, const T fill_value, const std::optional& dtype = std::nullopt, @@ -288,7 +288,7 @@ struct FullLikeWith { static constexpr auto fill_value = FillValue.invoke(); static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& tensor, const std::optional& dtype = std::nullopt, const std::optional& layout = std::nullopt, @@ -351,7 +351,7 @@ struct Full { template requires std::is_same_v or std::is_same_v static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Shape& shape, const FillValueType fill_value, const std::optional& dtype = std::nullopt, @@ -396,7 +396,7 @@ struct FullLike { template requires std::is_same_v or std::is_same_v static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& tensor, const FillValueType fill_value, const std::optional& dtype = std::nullopt, diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.cpp index f426ccd9bb6..2f262f71639 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.cpp @@ -12,7 +12,7 @@ namespace ttnn::operations::data_movement { // Does a broadcast Tensor BcastOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, BcastOpMath bcast_op, @@ -74,6 +74,8 @@ Tensor BcastOperation::invoke( {input_tensor_a, input_tensor_b}, {}, {output_tensor}, + 0, /* pad_value*/ + false, /*pad_c*/ queue_id); }, {input_tensor_a, input_tensor_b}, diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.hpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.hpp index e53289d9031..94b7dbd585f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.hpp @@ -12,7 +12,7 @@ namespace operations::data_movement { struct BcastOperation { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, ttnn::BcastOpMath bcast_op, diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast_pybind.cpp index 10da63289d5..ef3df6fbcc5 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast_pybind.cpp @@ -65,7 +65,7 @@ void py_bind_bcast(py::module& module) { ttnn::BcastOpDim bcast_dim, std::optional output_tensor, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_a, input_tensor_b, bcast_op, bcast_dim, memory_config, output_tensor); }, @@ -76,7 +76,7 @@ void py_bind_bcast(py::module& module) { py::kw_only(), py::arg("output_tensor") = std::nullopt, py::arg("memory_config") = std::nullopt, - py::arg("queue_id").noconvert() = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/common/common.cpp b/ttnn/cpp/ttnn/operations/data_movement/common/common.cpp index f734390b4be..aa5f7d3d5ed 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/common/common.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/common/common.cpp @@ -53,7 +53,7 @@ ttnn::Tensor squeeze_from_ND_to_4D(const ttnn::Tensor& tensor) { } ttnn::Tensor pad_to_tile_vol( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& tensor, const float value, const bool use_multicore, diff --git a/ttnn/cpp/ttnn/operations/data_movement/common/common.hpp b/ttnn/cpp/ttnn/operations/data_movement/common/common.hpp index 4f9e7b72399..23801069b9d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/common/common.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/common/common.hpp @@ -16,7 +16,7 @@ ttnn::Shape squeeze_shape_to_4D(ttnn::Shape output_shape); ttnn::Tensor squeeze_from_ND_to_4D(const ttnn::Tensor& tensor); ttnn::Tensor pad_to_tile_vol( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& tensor, const float value, const bool use_multicore, @@ -151,7 +151,7 @@ class MassagedOperation { }; ttnn::Tensor pad_to_tile_vol( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& tensor, const float value, const bool use_multicore, diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp index 
aca777628a5..478eb4f127f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/types.hpp" #include "ttnn/operations/core/core.hpp" #include @@ -88,7 +88,7 @@ MassagedConcat build_unsqueeze_concat(int input_rank, const MemoryConfig& output } MassagedConcat build_untilize_rm_retilize_concat( - uint8_t queue_id, const MemoryConfig& output_memory_config, ttnn::Shape& logical_output_shape) { + QueueId queue_id, const MemoryConfig& output_memory_config, ttnn::Shape& logical_output_shape) { return MassagedConcat(MassagedConcatParams{ .predicate = [](const std::vector& tensors, int dim, unsigned int groups) -> bool { // untilize_rm_retilize if the concat dim is padded for tilized tensors @@ -167,7 +167,7 @@ MassagedConcat build_untilize_rm_retilize_concat( } MassagedConcat build_prepost_transpose_concat( - uint8_t queue_id, const MemoryConfig& output_memory_config, int dim1, int dim2) { + QueueId queue_id, const MemoryConfig& output_memory_config, int dim1, int dim2) { return MassagedConcat(MassagedConcatParams{ .predicate = [dim1, dim2](const std::vector& tensors, int dim, unsigned int groups) -> bool { bool res = dim1 != dim2; @@ -210,7 +210,7 @@ MassagedConcat build_prepost_transpose_concat( } MassagedConcat build_non_aligned_last_dim_concat( - const std::vector& tensors, uint8_t queue_id, const MemoryConfig& output_memory_config) { + const std::vector& tensors, QueueId queue_id, const MemoryConfig& output_memory_config) { // this is a special case of pre-post transpose concat where we're // concatting on the last dim and the last dims of the input tensors are // not all aligned @@ -249,7 +249,7 @@ MassagedConcat build_non_aligned_last_dim_concat( // Wrapper for TTDNN ttnn::Tensor ConcatOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const std::vector& input_tensors, int dim, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.hpp b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.hpp index 3358087954d..08d06975590 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.hpp @@ -16,7 +16,7 @@ namespace data_movement { struct ConcatOperation { // Wrapper for TTDNN static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const std::vector& input_tensors, int dim, const std::optional& memory_config = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/concat_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/concat/concat_pybind.hpp index 815a44a2f12..1a8649ba645 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/concat_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/concat_pybind.hpp @@ -52,7 +52,7 @@ Keyword Args: std::optional& optional_output_tensor, std::optional& memory_config, const int groups, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, tensors, dim, memory_config, optional_output_tensor, groups); }, py::arg("tensors"), @@ -61,7 +61,7 @@ Keyword Args: py::arg("output_tensor").noconvert() = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("groups") = 1, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/copy/copy.cpp 
b/ttnn/cpp/ttnn/operations/data_movement/copy/copy.cpp index 5256d468e6d..08abb9acd55 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/copy/copy.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/copy/copy.cpp @@ -7,7 +7,7 @@ #include #include "device/copy_device_operation.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/decorators.hpp" #include "ttnn/run_operation.hpp" @@ -15,7 +15,7 @@ using namespace tt::tt_metal; namespace ttnn::operations::data_movement { -ttnn::Tensor CopyOperation::invoke(uint8_t queue_id, const Tensor& src_tensor, const Tensor& dst_tensor) { +ttnn::Tensor CopyOperation::invoke(QueueId queue_id, const Tensor& src_tensor, const Tensor& dst_tensor) { operation::run( CopyDeviceOperation{dst_tensor.memory_config(), dst_tensor.get_dtype()}, {src_tensor, dst_tensor}, @@ -30,7 +30,7 @@ ttnn::Tensor CopyOperation::invoke(const Tensor& src_tensor, const Tensor& dst_t } ttnn::Tensor AssignOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const MemoryConfig& output_mem_config, std::optional output_dtype, @@ -49,7 +49,7 @@ ttnn::Tensor AssignOperation::invoke( return invoke(ttnn::DefaultQueueId, input, output_mem_config, output_dtype); } -ttnn::Tensor AssignOperation::invoke(uint8_t queue_id, const Tensor& input_a, const Tensor& input_b) { +ttnn::Tensor AssignOperation::invoke(QueueId queue_id, const Tensor& input_a, const Tensor& input_b) { operation::run( CopyDeviceOperation{input_b.memory_config(), input_b.get_dtype()}, {input_a, input_b}, {}, {}, queue_id); return input_b; diff --git a/ttnn/cpp/ttnn/operations/data_movement/copy/copy.hpp b/ttnn/cpp/ttnn/operations/data_movement/copy/copy.hpp index 85cb979f2f3..990cd20fa53 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/copy/copy.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/copy/copy.hpp @@ -12,14 +12,14 @@ namespace ttnn { namespace operations::data_movement { struct CopyOperation { - static ttnn::Tensor invoke(uint8_t queue_id, const Tensor& src_tensor, const Tensor& dst_tensor); + static ttnn::Tensor invoke(QueueId queue_id, const Tensor& src_tensor, const Tensor& dst_tensor); static ttnn::Tensor invoke(const Tensor& src_tensor, const Tensor& dst_tensor); }; struct AssignOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const MemoryConfig& output_mem_config, std::optional output_dtype = std::nullopt, @@ -30,7 +30,7 @@ struct AssignOperation { const MemoryConfig& output_mem_config, std::optional output_dtype = std::nullopt); - static ttnn::Tensor invoke(uint8_t queue_id, const Tensor& input_a, const Tensor& input_b); + static ttnn::Tensor invoke(QueueId queue_id, const Tensor& input_a, const Tensor& input_b); static ttnn::Tensor invoke(const Tensor& input_a, const Tensor& input_b); }; diff --git a/ttnn/cpp/ttnn/operations/data_movement/copy/copy_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/copy/copy_pybind.cpp index 1c06a6f557b..cf317e9c5b7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/copy/copy_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/copy/copy_pybind.cpp @@ -70,11 +70,11 @@ void py_bind_copy(py::module& module) { [](const decltype(ttnn::copy)& self, const ttnn::Tensor& input_a, const ttnn::Tensor& input_b, - uint8_t queue_id) { return self(queue_id, input_a, input_b); }, + QueueId queue_id) { return self(queue_id, input_a, input_b); }, py::arg("input_a").noconvert(), py::arg("input_b").noconvert(), py::kw_only(), - py::arg("queue_id") = 0}); + 
py::arg("queue_id") = DefaultQueueId}); } void py_bind_assign(py::module& module) { @@ -108,21 +108,21 @@ void py_bind_assign(py::module& module) { const ttnn::MemoryConfig& memory_config, const std::optional dtype, std::optional& optional_output_tensor, - uint8_t queue_id) { return self(queue_id, input, memory_config, dtype, optional_output_tensor); }, + QueueId queue_id) { return self(queue_id, input, memory_config, dtype, optional_output_tensor); }, py::arg("input_tensor").noconvert(), py::kw_only(), py::arg("memory_config"), py::arg("dtype") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, ttnn::pybind_overload_t{ [](const decltype(ttnn::assign)& self, const ttnn::Tensor& input_a, const ttnn::Tensor& input_b, - uint8_t queue_id) { return self(queue_id, input_a, input_b); }, + QueueId queue_id) { return self(queue_id, input_a, input_b); }, py::arg("input_a").noconvert(), py::arg("input_b").noconvert(), - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/expand/expand.cpp b/ttnn/cpp/ttnn/operations/data_movement/expand/expand.cpp index 7bc7afaadbb..1ca49682e06 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/expand/expand.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/expand/expand.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "expand.hpp" #include @@ -35,9 +35,8 @@ ttnn::Tensor ExpandOperation::invoke( const ttnn::Tensor& tensor, const tt::stl::Span shape_vector, const std::optional& memory_config, - const std::optional& queue_id) { - const uint32_t queue_id_value = queue_id.value_or(0); - return ttnn::repeat(tensor, create_repetition_vector(tensor, shape_vector), memory_config, queue_id_value); + const QueueId& queue_id) { + return ttnn::repeat(tensor, create_repetition_vector(tensor, shape_vector), memory_config, queue_id); } } // namespace ttnn::operations::expand diff --git a/ttnn/cpp/ttnn/operations/data_movement/expand/expand.hpp b/ttnn/cpp/ttnn/operations/data_movement/expand/expand.hpp index b172769e54f..e452b5000f1 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/expand/expand.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/expand/expand.hpp @@ -13,7 +13,7 @@ struct ExpandOperation { const ttnn::Tensor& input, const tt::stl::Span shape_vector, const std::optional& memory_config, - const std::optional& queue_id); + const QueueId& queue_id = DefaultQueueId); }; } // namespace ttnn::operations::expand diff --git a/ttnn/cpp/ttnn/operations/data_movement/expand/expand_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/expand/expand_pybind.cpp index bfe4b5a357b..c35b781fdf8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/expand/expand_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/expand/expand_pybind.cpp @@ -25,12 +25,12 @@ void py_bind_expand(py::module& module, const data_movement_operation_t& operati const ttnn::Tensor& input_tensor, const ttnn::SmallVector output_shape, const std::optional& memory_config, - const uint8_t queue_id) { return self(input_tensor, output_shape, memory_config, queue_id); }, + const QueueId queue_id) { return self(input_tensor, output_shape, memory_config, queue_id); }, py::arg("input_tensor"), py::arg("output_shape"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 
0, + py::arg("queue_id") = DefaultQueueId, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp index 5fdc70bbed6..3b5d0a3dbcd 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp @@ -6,7 +6,7 @@ #include "device/fill_pad_op.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/decorators.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/core/core.hpp" #include @@ -15,7 +15,7 @@ using namespace tt::tt_metal; namespace ttnn::operations::data_movement { ttnn::Tensor FillPadOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, float fill_value, const std::optional& memory_config) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.hpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.hpp index 89d98772946..0213d996ea7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.hpp @@ -12,7 +12,7 @@ namespace data_movement { struct FillPadOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, float fill_value, const std::optional& memory_config = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad_pybind.cpp index 7e47ea964bd..60c62920555 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad_pybind.cpp @@ -52,12 +52,12 @@ void bind_fill_pad_op(py::module& module) { const Tensor& input_tensor, const float fill_value, const std::optional& memory_config, - uint8_t queue_id) { return self(queue_id, input_tensor, fill_value, memory_config); }, + QueueId queue_id) { return self(queue_id, input_tensor, fill_value, memory_config); }, py::arg("input_tensor"), py::arg("fill_value"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp index 544b53dc7cb..00de17b432d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp @@ -6,14 +6,14 @@ #include "device/fill_rm_op.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/decorators.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" using namespace tt::tt_metal; namespace ttnn::operations::data_movement { ttnn::Tensor FillRMOperation::invoke( - uint8_t queue_id, + QueueId queue_id, uint32_t N, uint32_t C, uint32_t H, @@ -45,7 +45,7 @@ ttnn::Tensor FillRMOperation::invoke( } ttnn::Tensor FillOnesRMOperation::invoke( - uint8_t queue_id, + QueueId queue_id, uint32_t N, uint32_t C, uint32_t H, diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.hpp b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.hpp index 112844883b3..ddebbc6e4bb 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.hpp @@ -12,7 +12,7 @@ namespace data_movement { struct FillRMOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + 
QueueId queue_id, uint32_t N, uint32_t C, uint32_t H, @@ -39,7 +39,7 @@ struct FillRMOperation { struct FillOnesRMOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, uint32_t N, uint32_t C, uint32_t H, diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm_pybind.cpp index ac0062c5490..74bf1adde9f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm_pybind.cpp @@ -95,7 +95,7 @@ void bind_fill_rm_op(py::module& module) { const float val_hi, const float val_lo, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, N, C, H, W, hOnes, wOnes, any, val_hi, val_lo, memory_config); }, py::arg("N"), @@ -109,7 +109,7 @@ void bind_fill_rm_op(py::module& module) { py::arg("val_lo"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } void bind_fill_ones_rm_op(py::module& module) { @@ -169,7 +169,7 @@ void bind_fill_ones_rm_op(py::module& module) { uint32_t wOnes, const Tensor& any, const std::optional& memory_config, - uint8_t queue_id) { return self(queue_id, N, C, H, W, hOnes, wOnes, any, memory_config); }, + QueueId queue_id) { return self(queue_id, N, C, H, W, hOnes, wOnes, any, memory_config); }, py::arg("N"), py::arg("C"), py::arg("H"), @@ -179,7 +179,7 @@ void bind_fill_ones_rm_op(py::module& module) { py::arg("any"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/fold.cpp b/ttnn/cpp/ttnn/operations/data_movement/fold/fold.cpp index 2c3f553681b..ca3d56d8f77 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fold/fold.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fold/fold.cpp @@ -21,7 +21,7 @@ namespace ttnn::operations::data_movement { std::vector fold_with_transpose_( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const std::optional& output_shape, uint32_t stride_h, @@ -145,7 +145,7 @@ ttnn::MemoryConfig create_sharded_memory_config( } std::vector fold_with_transpose_sharded_( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const std::optional& output_shape, uint32_t stride_h, @@ -283,7 +283,7 @@ std::vector fold_with_transpose_sharded_( } Tensor FoldOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, uint32_t stride_h, uint32_t stride_w, @@ -331,7 +331,7 @@ Tensor FoldOperation::invoke( uint32_t pad_w, const std::optional grid_size, const std::optional& override_memory_config) { - uint8_t queue_id = 0; + QueueId queue_id = DefaultQueueId; return invoke( queue_id, input_tensor, diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/fold.hpp b/ttnn/cpp/ttnn/operations/data_movement/fold/fold.hpp index 4aaf7fd86e3..7b52bd73666 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fold/fold.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fold/fold.hpp @@ -30,7 +30,7 @@ struct FoldOperation { const std::optional grid_size = std::nullopt, const std::optional& override_memory_config = std::nullopt); static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, uint32_t stride_h, uint32_t stride_w, diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/fold_pybind.cpp 
b/ttnn/cpp/ttnn/operations/data_movement/fold/fold_pybind.cpp index 76af97891b8..4980a8e11e5 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fold/fold_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fold/fold_pybind.cpp @@ -39,7 +39,7 @@ void bind_fold_operation(py::module& module) { uint32_t pad_w, std::optional grid_size, std::optional override_memory_config, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return op( queue_id, input, @@ -64,7 +64,7 @@ void bind_fold_operation(py::module& module) { py::arg("grid_size") = std::nullopt, py::arg("override_memory_config") = std::nullopt, py::kw_only(), - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp index 2c1097e6fba..370eace29bf 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp @@ -4,14 +4,14 @@ #include "ttnn/operations/data_movement/indexed_fill/indexed_fill.hpp" #include "ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" using namespace tt::tt_metal; namespace ttnn::operations::data_movement { ttnn::Tensor IndexedFillOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& batch_id, const ttnn::Tensor& input_tensor_a, const ttnn::Tensor& input_tensor_b, diff --git a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.hpp b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.hpp index 7eb61b36f0f..f07b71b8e31 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.hpp @@ -13,7 +13,7 @@ namespace data_movement { struct IndexedFillOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& batch_id, const ttnn::Tensor& input_tensor_a, const ttnn::Tensor& input_tensor_b, diff --git a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill_pybind.cpp index 7c8fddc0475..3e90c40a7d7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill_pybind.cpp @@ -51,7 +51,7 @@ void bind_indexed_fill(pybind11::module& module) { const ttnn::Tensor& input_tensor_b, const std::optional& memory_config, int64_t dim, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, batch_id, input_tensor_a, input_tensor_b, memory_config, dim); }, pybind11::arg("batch_id").noconvert(), @@ -60,7 +60,7 @@ void bind_indexed_fill(pybind11::module& module) { pybind11::kw_only(), pybind11::arg("memory_config") = std::nullopt, pybind11::arg("dim") = 0, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); } } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/move/move.cpp b/ttnn/cpp/ttnn/operations/data_movement/move/move.cpp index 279d00bd1f8..a33d54247ae 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/move/move.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/move/move.cpp @@ -5,7 +5,7 @@ #include "ttnn/operations/data_movement/move/move.hpp" #include 
"device/move_device_operation.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/decorators.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/distributed/api.hpp" @@ -34,7 +34,7 @@ bool can_deallocate(const Tensor& input_tensor, bool from_multi_device = false) input_tensor.get_storage()); } -static inline Tensor move(uint8_t queue_id, const Tensor& input_tensor, const std::optional& mem_config) { +static inline Tensor move(QueueId queue_id, const Tensor& input_tensor, const std::optional& mem_config) { TT_ASSERT(input_tensor.is_allocated(), "Expected input tensor to be allocated"); auto input_mem_config = input_tensor.memory_config(); auto input_address = input_tensor.buffer()->address(); @@ -124,7 +124,7 @@ static inline Tensor move(uint8_t queue_id, const Tensor& input_tensor, const st } static inline Tensor move_sharded( - uint8_t queue_id, const Tensor& input_tensor, const std::optional& mem_config) { + QueueId queue_id, const Tensor& input_tensor, const std::optional& mem_config) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; bool from_multi_device = distributed::is_multi_device_tensor(input_tensor); operation::launch_op( @@ -186,7 +186,7 @@ static inline Tensor move_sharded( } ttnn::Tensor MoveOperation::invoke( - uint8_t queue_id, const Tensor& input_tensor, const std::optional& output_mem_config) { + QueueId queue_id, const Tensor& input_tensor, const std::optional& output_mem_config) { if (input_tensor.memory_config().is_sharded()) { return move_sharded(queue_id, input_tensor, output_mem_config); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/move/move.hpp b/ttnn/cpp/ttnn/operations/data_movement/move/move.hpp index e37e6c8d1b2..6b0bc470db6 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/move/move.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/move/move.hpp @@ -12,7 +12,7 @@ namespace operations::data_movement { struct MoveOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& output_mem_config = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/data_movement/move/move_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/move/move_pybind.cpp index 10fce7841db..bfa65f7a4fa 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/move/move_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/move/move_pybind.cpp @@ -32,11 +32,11 @@ void py_bind_move(pybind11::module& module) { [](const decltype(ttnn::move)& self, const ttnn::Tensor& input_tensor, const std::optional& memory_config, - uint8_t queue_id) { return self(queue_id, input_tensor, memory_config); }, + QueueId queue_id) { return self(queue_id, input_tensor, memory_config); }, pybind11::arg("input_tensor").noconvert(), pybind11::kw_only(), pybind11::arg("memory_config") = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp index dab51f930e1..2a75c0bf822 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp @@ -6,14 +6,14 @@ #include "device/non_zero_indices_op.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/decorators.hpp" -#include 
"ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" using namespace tt::tt_metal; namespace ttnn::operations::data_movement { std::vector NonZeroIndicesOperation::invoke( - uint8_t queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config_arg) { + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config_arg) { auto memory_config = memory_config_arg.value_or(input_tensor.memory_config()); return operation::run_without_autoformat(NonZeroIndices{memory_config}, {input_tensor}, {}, {}, queue_id); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.hpp b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.hpp index 5558841345c..2b9933836a4 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.hpp @@ -13,7 +13,7 @@ namespace operations::data_movement { struct NonZeroIndicesOperation { static std::vector invoke( - uint8_t queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config); + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config); static std::vector invoke( const ttnn::Tensor& input_tensor, const std::optional& memory_config); diff --git a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices_pybind.cpp index afc539ab4b2..0cff2af5be7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices_pybind.cpp @@ -45,11 +45,11 @@ void bind_non_zero(py::module& module) { [](const OperationType& self, const ttnn::Tensor& input_tensor, const std::optional& memory_config, - uint8_t queue_id) { return self(queue_id, input_tensor, memory_config); }, + QueueId queue_id) { return self(queue_id, input_tensor, memory_config); }, py::arg("input_tensor").noconvert(), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp index 7b030405da2..b5232f2c464 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp @@ -4,7 +4,7 @@ #include "pad.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/data_movement/common/common.hpp" @@ -31,7 +31,7 @@ ttnn::Shape update_original_shape(const ttnn::Shape& padded_shape, const ttnn::S } static ttnn::Tensor pad_impl( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, std::span output_padded_shape, std::span input_tensor_start, @@ -157,7 +157,7 @@ static ttnn::Tensor pad_impl( } static ttnn::Tensor pad_impl( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, ttnn::SmallVector> padding, const float value, @@ -228,7 +228,7 @@ static ttnn::Tensor pad_impl( // Any rank tensor supported ttnn::Tensor ExecutePad::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span> padding, const float value, @@ -263,7 +263,7 @@ ttnn::Tensor ExecutePad::invoke( #define PAD_OVERLOAD_DIM_IMPL(ShapeType) \ 
ttnn::Tensor ExecutePad::invoke( \ - uint8_t queue_id, \ + QueueId queue_id, \ const ttnn::Tensor& input_tensor, \ const ShapeType& output_padded_shape, \ const ShapeType& input_tensor_start, \ diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp index 4407f008faa..127494f01a6 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp @@ -7,6 +7,7 @@ #include "ttnn/tensor/types.hpp" #include #include "ttnn/decorators.hpp" +#include "ttnn/common/queue_id.hpp" namespace ttnn { namespace operations::data_movement { @@ -14,7 +15,7 @@ namespace operations::data_movement { // We overload over Array1D-8D #define PAD_OVERLOAD_DIM(ShapeType) \ static ttnn::Tensor invoke( \ - uint8_t, \ + QueueId, \ const ttnn::Tensor&, \ const ShapeType&, \ const ShapeType&, \ @@ -38,7 +39,7 @@ struct ExecutePad { // This function signature is similar to pytorch's signature // Any rank tensor supported static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span> padding, const float value, diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad_pybind.hpp index f229e5fbd33..1e96645f42f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad_pybind.hpp @@ -55,14 +55,14 @@ void bind_pad(py::module& module) { const float value, const bool use_multicore, const std::optional& memory_config, - uint8_t queue_id) { return self(queue_id, input_tensor, padding, value, use_multicore, memory_config); }, + QueueId queue_id) { return self(queue_id, input_tensor, padding, value, use_multicore, memory_config); }, py::arg("input_tensor"), py::arg("padding"), py::arg("value"), py::kw_only(), py::arg("use_multicore") = true, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }, ttnn::pybind_overload_t{ [](const OperationType& self, @@ -72,7 +72,7 @@ void bind_pad(py::module& module) { const float value, const bool use_multicore, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -89,7 +89,7 @@ void bind_pad(py::module& module) { py::kw_only(), py::arg("use_multicore") = false, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }, ttnn::pybind_overload_t{ [](const OperationType& self, @@ -99,7 +99,7 @@ void bind_pad(py::module& module) { const float value, const bool use_multicore, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -116,7 +116,7 @@ void bind_pad(py::module& module) { py::kw_only(), py::arg("use_multicore") = false, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }, ttnn::pybind_overload_t{ [](const OperationType& self, @@ -126,7 +126,7 @@ void bind_pad(py::module& module) { const float value, const bool use_multicore, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -143,7 +143,7 @@ void bind_pad(py::module& module) { py::kw_only(), py::arg("use_multicore") = false, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }, ttnn::pybind_overload_t{ [](const OperationType& self, @@ 
-153,7 +153,7 @@ void bind_pad(py::module& module) { const float value, const bool use_multicore, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -170,7 +170,7 @@ void bind_pad(py::module& module) { py::kw_only(), py::arg("use_multicore") = false, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }, ttnn::pybind_overload_t{ [](const OperationType& self, @@ -180,7 +180,7 @@ void bind_pad(py::module& module) { const float value, const bool use_multicore, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -197,7 +197,7 @@ void bind_pad(py::module& module) { py::kw_only(), py::arg("use_multicore") = false, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }, ttnn::pybind_overload_t{ [](const OperationType& self, @@ -207,7 +207,7 @@ void bind_pad(py::module& module) { const float value, const bool use_multicore, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -224,7 +224,7 @@ void bind_pad(py::module& module) { py::kw_only(), py::arg("use_multicore") = false, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }, ttnn::pybind_overload_t{ [](const OperationType& self, @@ -234,7 +234,7 @@ void bind_pad(py::module& module) { const float value, const bool use_multicore, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -251,7 +251,7 @@ void bind_pad(py::module& module) { py::kw_only(), py::arg("use_multicore") = false, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }, ttnn::pybind_overload_t{ [](const OperationType& self, @@ -261,7 +261,7 @@ void bind_pad(py::module& module) { const float value, const bool use_multicore, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -278,7 +278,7 @@ void bind_pad(py::module& module) { py::kw_only(), py::arg("use_multicore") = false, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp index bac6e8da401..f0c4ee555ed 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp @@ -4,7 +4,7 @@ #include "permute.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/data_movement/transpose/transpose.hpp" #include "ttnn/operations/data_movement/permute/device/permute_device_operation.hpp" @@ -176,7 +176,7 @@ bool is_permute_nop(const ttnn::Tensor& a, const ttnn::SmallVector& di } // namespace detail ttnn::Tensor ExecutePermute::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::SmallVector& dims, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.hpp b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.hpp index 2b3fb1cdabc..bcf8f732a02 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.hpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.hpp @@ -11,7 +11,7 @@ namespace operations::data_movement { struct ExecutePermute { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const SmallVector& dims, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.cpp index be6adbf880b..db90d90c13e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.cpp @@ -42,7 +42,7 @@ void bind_permute(py::module& module) { const ttnn::Tensor& input_tensor, const ttnn::SmallVector& dims, const std::optional& memory_config, - uint8_t queue_id, + QueueId queue_id, const std::optional& pad_value) { return self(queue_id, input_tensor, dims, memory_config, pad_value); }, @@ -50,7 +50,7 @@ void bind_permute(py::module& module) { py::arg("dims"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, py::arg("pad_value") = 0.0f, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.cpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.cpp index 3558ee8ce63..0cf3f74f8ef 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.cpp @@ -8,7 +8,7 @@ #include #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.hpp" #include "ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp" @@ -38,7 +38,7 @@ ttnn::Tensor repeat_upper_dims_rm( const ttnn::Tensor& tensor, const uint32_t dim, const uint32_t repetitions, - uint8_t queue_id, + QueueId queue_id, const MemoryConfig& output_mem_config) { // collapse upper dims to 4D or append 1s // collapse lower dims or insert 1s @@ -72,7 +72,7 @@ ttnn::Tensor repeat_upper_dims_rm( } ttnn::Tensor repeat_last_dim_rm( - const ttnn::Tensor& tensor, const uint32_t repetitions, uint8_t queue_id, const MemoryConfig& output_mem_config) { + const ttnn::Tensor& tensor, const uint32_t repetitions, QueueId queue_id, const MemoryConfig& output_mem_config) { // collapse to 2D // op // un-collapse @@ -140,7 +140,7 @@ ttnn::Tensor RepeatOperation::invoke( const ttnn::Tensor& tensor, const ttnn::SmallVector& provided_repetition_vector, const std::optional& provided_output_mem_config, - uint8_t queue_id) { + QueueId queue_id) { auto [working_tensor, repetition_vector] = detail::match_input_rank(tensor, provided_repetition_vector); MemoryConfig output_mem_config = provided_output_mem_config.value_or(tensor.memory_config()); auto working_output_mem_config = output_mem_config; diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.hpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.hpp index 76b780faf2c..75facd8a7de 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.hpp @@ -14,7 +14,7 @@ struct RepeatOperation { const ttnn::Tensor& input_tensor, const ttnn::SmallVector& repetition_vector, const std::optional& provided_output_mem_config, - uint8_t queue_id); + QueueId queue_id); static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, const ttnn::Shape& repeat_dims); }; diff 
--git a/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat_pybind.cpp index e2a3883c737..999a2fbb270 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat_pybind.cpp @@ -24,12 +24,12 @@ void bind_repeat(py::module& module, const data_movement_operation_t& operation, const ttnn::Tensor& input_tensor, const ttnn::SmallVector& repetition_vector, const std::optional& memory_config, - uint8_t queue_id) { return self(input_tensor, repetition_vector, memory_config, queue_id); }, + QueueId queue_id) { return self(input_tensor, repetition_vector, memory_config, queue_id); }, py::arg("input_tensor"), py::arg("repeat_dims"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp index bf7422ba621..8b472f5ebbb 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "reshape.hpp" #include @@ -57,7 +57,7 @@ static Tensor manual_insertion( } // namespace detail ttnn::Tensor ReshapeOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_output_shape, const ttnn::Shape& padded_output_shape, @@ -97,7 +97,7 @@ ttnn::Tensor ReshapeOperation::invoke( } ttnn::Tensor ReshapeOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_output_shape, const std::optional& memory_config_arg) { @@ -129,7 +129,7 @@ ttnn::Tensor ReshapeOperation::invoke(const ttnn::Tensor& input_tensor, const tt } ttnn::Tensor ReshapeOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span shape_vector, const std::optional& memory_config_arg) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp index d4070309224..1ed0cd2f89a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp @@ -11,13 +11,13 @@ namespace operations::data_movement { struct ReshapeOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape, const std::optional& memory_config_arg); static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, const std::optional& memory_config_arg); @@ -37,7 +37,7 @@ struct ReshapeOperation { static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape); static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span shape_vector, const std::optional& memory_config_arg); diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape_pybind.cpp index 
e2774d475f5..29cdc165953 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape_pybind.cpp @@ -29,7 +29,7 @@ void bind_reshape(pybind11::module& module, const data_movement_operation_t& ope int Y, int X, const std::optional& memory_config, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, ttnn::SmallVector{W, Z, Y, X}, memory_config); }, py::arg("input_tensor"), @@ -39,7 +39,7 @@ void bind_reshape(pybind11::module& module, const data_movement_operation_t& ope py::arg("X"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index 63ee669be3a..2f3b2f33d2c 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -2,8 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 - -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "reshape.hpp" #include "reshape_common.hpp" @@ -37,7 +36,7 @@ ttnn::Tensor convert_tile_to_rm( const uint32_t tile_first_dim, const uint32_t tile_second_dim, const MemoryConfig& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const PadValue& pad_value) { // Convert the 3D->3D reshaping to row major and back to tile TT_FATAL( @@ -66,7 +65,7 @@ ttnn::Tensor convert_tensor_to_rm_reshape_convert_back_to_orig_layout( const uint32_t tile_first_dim, const uint32_t tile_second_dim, const MemoryConfig& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const PadValue& pad_value) { //This function turns ND -> MD into 2D->MD for row major and 3D->MD for tiled using a 0 cost view const auto layout = tensor.get_layout(); @@ -131,7 +130,7 @@ ttnn::Tensor fix_shape_and_perform_reshape_on_3D_TILE( const uint32_t tile_first_dim, const uint32_t tile_second_dim, const MemoryConfig& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const PadValue& pad_value) { //This function turns a TILE 3D->MD into an equivalent 3D->3D conversion and then turns the 3D output back to MD using a 0 cost view //Collapse into the third last dimension @@ -165,7 +164,7 @@ ttnn::Tensor fix_shape_and_perform_reshape_on_2D_RM( const uint32_t tile_first_dim, const uint32_t tile_second_dim, const MemoryConfig& memory_config, - const uint8_t queue_id) { + const QueueId queue_id) { //This function turns a RM 2D->MD into an equivalent 2D->2D conversion and then turns the 2D output back to MD using a 0 cost view TT_FATAL((logical_shape.rank() != 0), "Can't do reshape to rank 0 tensor"); //Collapse into the second last dimension @@ -193,7 +192,7 @@ ttnn::Tensor perform_reshape_on_2D_RM( const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape, const MemoryConfig& memory_config, - const uint8_t queue_id) { + const QueueId queue_id) { auto temp_tensor = tensor; auto intermediate_mem_config = tensor.memory_config(); auto intermediate_out_memory_config = memory_config; @@ -350,7 +349,7 @@ ttnn::Tensor ReshapeViewOperation::invoke( const ttnn::Shape& logical_input_shape, const ttnn::Shape& padded_input_shape, const std::optional& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& 
pad_value) { MemoryConfig mem_config = memory_config.value_or(tensor.memory_config()); auto layout = tensor.get_layout(); @@ -436,31 +435,33 @@ ttnn::Tensor ReshapeViewOperation::invoke( const ttnn::Tensor& tensor, const ttnn::Shape& shape, const std::optional& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& pad_value) { return invoke(tensor, shape, shape, memory_config, queue_id, pad_value); } ttnn::Tensor ReshapeViewOperation::invoke(const ttnn::Tensor& tensor, const ttnn::Shape& shape) { - return invoke(tensor, shape, shape, std::nullopt, 0, std::nullopt); + return invoke(tensor, shape, shape, std::nullopt, DefaultQueueId, std::nullopt); } ttnn::Tensor ReshapeViewOperation::invoke( const ttnn::Tensor& tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape) { - return invoke(tensor, logical_shape, padded_shape, std::nullopt, 0, std::nullopt); + return invoke(tensor, logical_shape, padded_shape, std::nullopt, DefaultQueueId, std::nullopt); } ttnn::Tensor ReshapeViewOperation::invoke( const ttnn::Tensor& tensor, tt::stl::Span shape_vector, const std::optional& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& pad_value) { - return invoke(tensor, tt::tt_metal::infer_dims_for_reshape(tensor, shape_vector),memory_config,queue_id,pad_value); + return invoke( + tensor, tt::tt_metal::infer_dims_for_reshape(tensor, shape_vector), memory_config, queue_id, pad_value); } ttnn::Tensor ReshapeViewOperation::invoke(const ttnn::Tensor& tensor, tt::stl::Span shape_vector) { - return invoke(tensor, tt::tt_metal::infer_dims_for_reshape(tensor, shape_vector),std::nullopt,0,std::nullopt); + return invoke( + tensor, tt::tt_metal::infer_dims_for_reshape(tensor, shape_vector), std::nullopt, DefaultQueueId, std::nullopt); } } // ttnn::operations::data_movement namespace diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp index 78392f8bcdd..587657e34ce 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp @@ -20,7 +20,7 @@ ttnn::Tensor convert_tensor_to_rm_reshape_convert_back_to_orig_layout( const uint32_t tile_first_dim, const uint32_t tile_second_dim, const MemoryConfig& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const PadValue& pad_value); ttnn::Tensor fix_shape_and_perform_reshape_on_2D_RM( const ttnn::Tensor& tensor, @@ -29,7 +29,7 @@ ttnn::Tensor fix_shape_and_perform_reshape_on_2D_RM( const uint32_t tile_first_dim, const uint32_t tile_second_dim, const MemoryConfig& memory_config, - const uint8_t queue_id); + const QueueId queue_id); ttnn::Tensor fix_shape_and_perform_reshape_on_3D_TILE( const ttnn::Tensor& tensor, const ttnn::Shape& logical_shape, @@ -37,14 +37,14 @@ ttnn::Tensor fix_shape_and_perform_reshape_on_3D_TILE( const uint32_t tile_first_dim, const uint32_t tile_second_dim, const MemoryConfig& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const PadValue& pad_value); ttnn::Tensor perform_reshape_on_2D_RM( const ttnn::Tensor& tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape, const MemoryConfig& memory_config, - const uint8_t queue_id); + const QueueId queue_id); ttnn::Tensor convert_tile_to_rm( const ttnn::Tensor& tensor, const ttnn::Shape& logical_shape, @@ -52,7 +52,7 @@ ttnn::Tensor convert_tile_to_rm( const uint32_t tile_first_dim, const uint32_t 
tile_second_dim, const MemoryConfig& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const PadValue& pad_value); } @@ -72,20 +72,20 @@ struct ReshapeViewOperation { const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, const std::optional& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& pad_value); static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape, const std::optional& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& pad_value); static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, tt::stl::Span shape_vector, const std::optional& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& pad_value); static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape); static ttnn::Tensor invoke( diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape_pybind.cpp index f5ea3024c0b..2be6c179474 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape_pybind.cpp @@ -28,13 +28,13 @@ void bind_reshape_view(pybind11::module& module, const data_movement_operation_t const ttnn::Tensor& input_tensor, const ttnn::Shape& shape, const std::optional& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& pad_value) -> ttnn::Tensor { return self(input_tensor, shape); }, py::arg("input_tensor"), py::arg("shape"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, py::arg("pad_value") = std::nullopt}, ttnn::pybind_overload_t{ [](const data_movement_operation_t& self, @@ -42,7 +42,7 @@ void bind_reshape_view(pybind11::module& module, const data_movement_operation_t const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape, const std::optional& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& pad_value) -> ttnn::Tensor { return self(input_tensor, logical_shape, padded_shape); }, @@ -51,20 +51,20 @@ void bind_reshape_view(pybind11::module& module, const data_movement_operation_t py::arg("padded_shape"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, py::arg("pad_value") = std::nullopt}, ttnn::pybind_overload_t{ [](const data_movement_operation_t& self, const ttnn::Tensor& input_tensor, const ttnn::SmallVector shape, const std::optional& memory_config, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& pad_value) -> ttnn::Tensor { return self(input_tensor, shape); }, py::arg("input_tensor"), py::arg("shape"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, py::arg("pad_value") = std::nullopt}); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp index 7535a51625c..a56e110eb02 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp @@ -2,7 +2,7 @@ // // 
SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "device/interleaved_to_sharded_op.hpp" #include "interleaved_to_sharded.hpp" @@ -13,7 +13,7 @@ using namespace tt::tt_metal; namespace ttnn::operations::data_movement { ttnn::Tensor InterleavedToShardedOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const MemoryConfig& sharded_memory_config, const std::optional& data_type_arg, @@ -28,7 +28,7 @@ ttnn::Tensor InterleavedToShardedOperation::invoke( } ttnn::Tensor InterleavedToShardedOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::variant& grid, const std::array shard_shape, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp index cf33a13756c..1dcf2072a62 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp @@ -12,13 +12,13 @@ namespace operations::data_movement { struct InterleavedToShardedOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const MemoryConfig& sharded_memory_config, const std::optional& data_type_arg, const std::optional& keep_l1_aligned = std::nullopt); static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::variant& grid, const std::array shard_shape, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded_pybind.cpp index 694e46c202b..2bd8dd04974 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded_pybind.cpp @@ -29,7 +29,7 @@ void bind_interleaved_to_sharded( tt::tt_metal::TensorMemoryLayout shard_scheme, tt::tt_metal::ShardOrientation shard_orientation, const std::optional& output_dtype, - uint8_t queue_id, + QueueId queue_id, const std::optional& keep_l1_aligned) -> ttnn::Tensor { return self( queue_id, @@ -48,7 +48,7 @@ void bind_interleaved_to_sharded( py::arg("shard_orientation"), py::arg("output_dtype") = std::nullopt, py::kw_only(), - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, py::arg("keep_l1_aligned") = false, }, @@ -57,7 +57,7 @@ void bind_interleaved_to_sharded( const ttnn::Tensor& input_tensor, const MemoryConfig& sharded_memory_config, const std::optional& output_dtype, - uint8_t queue_id, + QueueId queue_id, const std::optional& keep_l1_aligned) -> ttnn::Tensor { return self(queue_id, input_tensor, sharded_memory_config, output_dtype, keep_l1_aligned); }, @@ -65,7 +65,7 @@ void bind_interleaved_to_sharded( py::arg("sharded_memory_config"), py::arg("output_dtype") = std::nullopt, py::kw_only(), - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, py::arg("keep_l1_aligned") = false, }); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.cpp index 547dfab9ea2..e92c5efedde 100644 --- 
a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.cpp @@ -11,7 +11,7 @@ using namespace tt::tt_metal; namespace ttnn::operations::data_movement { ttnn::Tensor ReshardOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const MemoryConfig& memory_config, const std::optional& optional_output_tensor) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.hpp index 101aceee271..d46d3602cd0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.hpp @@ -12,7 +12,7 @@ namespace operations::data_movement { struct ReshardOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const MemoryConfig& memory_config, const std::optional& optional_output_tensor); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard_pybind.cpp index af64f207b8b..a0e8ee9a72c 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard_pybind.cpp @@ -25,14 +25,14 @@ void bind_reshard(pybind11::module& module, const data_movement_sharded_operatio const ttnn::Tensor& input_tensor, const MemoryConfig& output_memory_config, const std::optional& output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, output_memory_config, output_tensor); }, py::arg("input_tensor").noconvert(), py::arg("output_memory_config"), py::arg("output_tensor").noconvert() = std::nullopt, py::kw_only(), - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp index ffac44cfca6..58d5bb7a599 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "device/sharded_to_interleaved_op.hpp" #include "sharded_to_interleaved.hpp" @@ -12,7 +12,7 @@ using namespace tt::tt_metal; namespace ttnn::operations::data_movement { ttnn::Tensor ShardedToInterleavedOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const MemoryConfig& memory_config, const std::optional& output_dtype, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.hpp index b06e2d3bf6e..f610dc971d9 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.hpp @@ -11,7 +11,7 @@ namespace operations::data_movement { struct ShardedToInterleavedOperation { static ttnn::Tensor invoke( - uint8_t queue_id, 
+ QueueId queue_id, const ttnn::Tensor& input_tensor, const MemoryConfig& memory_config, const std::optional& output_dtype, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved_pybind.cpp index 2c3dfe5db6d..c5fa01ae1eb 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved_pybind.cpp @@ -27,7 +27,7 @@ void bind_sharded_to_interleaved( const ttnn::Tensor& input_tensor, const std::optional& memory_config, const std::optional& output_dtype, - uint8_t queue_id, + QueueId queue_id, const std::optional& is_l1_aligned) -> ttnn::Tensor { return self( queue_id, @@ -40,7 +40,7 @@ void bind_sharded_to_interleaved( py::arg("memory_config") = std::nullopt, py::arg("output_dtype") = std::nullopt, py::kw_only(), - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, py::arg("is_l1_aligned") = false, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp index 13b9ee1ec58..c386d335b88 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "device/interleaved_to_sharded_partial_op.hpp" #include "interleaved_to_sharded_partial.hpp" @@ -11,7 +11,7 @@ namespace ttnn::operations::data_movement { ttnn::Tensor InterleavedToShardedPartialOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::variant& grid, const std::array& shard_shape, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.hpp index bf482c35d1b..8f80f19e233 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.hpp @@ -12,7 +12,7 @@ namespace operations::data_movement { struct InterleavedToShardedPartialOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::variant& grid, const std::array& shard_shape, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial_pybind.cpp index 459e9c1b4cc..876a455b2cc 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial_pybind.cpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial_pybind.cpp @@ -31,7 +31,7 @@ void bind_interleaved_to_sharded_partial( tt::tt_metal::TensorMemoryLayout shard_scheme, tt::tt_metal::ShardOrientation shard_orientation, const std::optional& output_dtype, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor, @@ -52,7 +52,7 @@ void bind_interleaved_to_sharded_partial( py::arg("shard_orientation"), py::kw_only(), py::arg("output_dtype") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.cpp index 3755265de73..aeb20e1b9f8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "device/sharded_to_interleaved_partial_op.hpp" #include "sharded_to_interleaved_partial.hpp" @@ -10,7 +10,7 @@ namespace ttnn::operations::data_movement { ttnn::Tensor ShardedToInterleavedPartialOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Tensor& cache_tensor, int64_t& num_slices, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.hpp index de5c6eac85e..a7d2b0040be 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.hpp @@ -11,7 +11,7 @@ namespace operations::data_movement { struct ShardedToInterleavedPartialOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Tensor& cache_tensor, int64_t& num_slices, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial_pybind.cpp index e13e6cadd9b..a1aa82fd9fe 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial_pybind.cpp @@ -28,7 +28,7 @@ void bind_sharded_to_interleaved_partial( int64_t& slice_index, const std::optional& memory_config, const std::optional& output_dtype, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, cache_tensor, num_slices, slice_index, memory_config, output_dtype); }, py::arg("input_tensor").noconvert(), @@ -38,7 +38,7 @@ void bind_sharded_to_interleaved_partial( py::kw_only(), 
py::arg("memory_config") = std::nullopt, py::arg("output_dtype") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/slice.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/slice.cpp index 3bcdff4b4d8..6fcb9702889 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/slice.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/slice.cpp @@ -2,13 +2,12 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" #include "slice.hpp" #include "device/slice_op.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/core/core.hpp" #include "cpp/ttnn/operations/creation.hpp" -#include "ttnn/common/constants.hpp" #include "cpp/ttnn/operations/data_movement/copy/copy.hpp" #include "cpp/ttnn/operations/data_movement/unsqueeze/unsqueeze.hpp" #include "cpp/ttnn/operations/data_movement/common/common.hpp" @@ -17,7 +16,7 @@ namespace ttnn::operations::data_movement { template ttnn::Tensor SliceOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span begins, tt::stl::Span ends, @@ -216,7 +215,7 @@ ttnn::Tensor SliceOperation::invoke( // Specialization for uint32_t and N=4 template <> ttnn::Tensor SliceOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::array& begins, const std::array& ends, @@ -334,7 +333,7 @@ ttnn::Tensor SliceOperation::invoke( template ttnn::Tensor SliceOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::array& output_tensor_start, const std::array& output_tensor_end, @@ -360,7 +359,7 @@ ttnn::Tensor SliceOperation::invoke( } template ttnn::Tensor SliceOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span begins, tt::stl::Span ends, @@ -377,7 +376,7 @@ template ttnn::Tensor SliceOperation::invoke( const std::optional& optional_output_tensor); template ttnn::Tensor SliceOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span begins, tt::stl::Span ends, @@ -402,7 +401,7 @@ template ttnn::Tensor SliceOperation::invoke( const std::optional& optional_output_tensor); template ttnn::Tensor SliceOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::array& output_tensor_start, const std::array& output_tensor_end, diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/slice.hpp b/ttnn/cpp/ttnn/operations/data_movement/slice/slice.hpp index 7582ff4fed1..8874d79535e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/slice.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/slice.hpp @@ -13,7 +13,7 @@ namespace data_movement { struct SliceOperation { template static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span begins, tt::stl::Span ends, @@ -32,7 +32,7 @@ struct SliceOperation { template static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::SmallVector& begins, const ttnn::SmallVector& ends, @@ -68,7 +68,7 @@ struct SliceOperation { template static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::array& output_tensor_start, const std::array& output_tensor_end, diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/slice_pybind.hpp 
b/ttnn/cpp/ttnn/operations/data_movement/slice/slice_pybind.hpp index 1d983ce0ea3..bb69b16a838 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/slice_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/slice_pybind.hpp @@ -57,7 +57,7 @@ void bind_slice(py::module& module) { const std::optional>& step, const std::optional& memory_config, const std::optional& optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { const auto step_value = step.value_or(ttnn::SmallVector(slice_end.size(), 1)); return self( queue_id, input_tensor, slice_start, slice_end, step_value, memory_config, optional_output_tensor); @@ -69,7 +69,7 @@ void bind_slice(py::module& module) { py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }, ttnn::pybind_overload_t{ @@ -80,7 +80,7 @@ void bind_slice(py::module& module) { const std::array& step, const std::optional& memory_config, const std::optional& optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, begins, ends, step, memory_config, optional_output_tensor); }, py::arg("input_tensor"), @@ -90,7 +90,7 @@ void bind_slice(py::module& module) { py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/split/split.cpp b/ttnn/cpp/ttnn/operations/data_movement/split/split.cpp index 886b2ac5b33..a64713f8fee 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/split/split.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/split/split.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/run_operation.hpp" #include "device/split_op.hpp" @@ -140,7 +140,7 @@ std::vector split_dim_n_chunks_tiled( } // namespace detail std::vector SplitOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, int64_t& num_splits, int64_t& dim, diff --git a/ttnn/cpp/ttnn/operations/data_movement/split/split.hpp b/ttnn/cpp/ttnn/operations/data_movement/split/split.hpp index 11f08fc5211..070c82b2193 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/split/split.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/split/split.hpp @@ -11,7 +11,7 @@ namespace operations::data_movement { struct SplitOperation { static std::vector invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, int64_t& num_splits, int64_t& dim, diff --git a/ttnn/cpp/ttnn/operations/data_movement/split/split_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/split/split_pybind.hpp index fd0230edea7..369b6800330 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/split/split_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/split/split_pybind.hpp @@ -48,13 +48,13 @@ void bind_split(py::module& module) { int64_t& num_splits, int64_t& dim, const std::optional& memory_config, - uint8_t queue_id) { return self(queue_id, input_tensor, num_splits, dim, memory_config); }, + QueueId queue_id) { return self(queue_id, input_tensor, num_splits, dim, memory_config); }, py::arg("input_tensor"), py::arg("num_splits"), py::arg("dim") = 0, py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = 
DefaultQueueId, }); } } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp index 3093285af82..e3c1dc27251 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp @@ -5,7 +5,7 @@ #include "tilize.hpp" #include "device/tilize_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/data_movement/common/common.hpp" #include "ttnn/operations/data_movement/reshape_view/reshape.hpp" @@ -38,7 +38,7 @@ MassagedTilize build_ndiml_tilize(BaseTilizeType base_tilize) { } ttnn::Tensor ExecuteTilize::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config, std::optional output_dtype, diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.hpp index 61ad37c1c32..79216f62ecf 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.hpp @@ -11,7 +11,7 @@ namespace operations::data_movement { struct ExecuteTilize { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config = std::nullopt, std::optional output_dtype = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize_pybind.hpp index ea55e0a39a5..d4ee1197956 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize_pybind.hpp @@ -47,13 +47,13 @@ void bind_tilize(py::module& module) { const std::optional& memory_config, std::optional output_dtype, bool use_multicore, - uint8_t queue_id) { return self(queue_id, input_tensor, memory_config, output_dtype, use_multicore); }, + QueueId queue_id) { return self(queue_id, input_tensor, memory_config, output_dtype, use_multicore); }, py::arg("input_tensor"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("dtype") = std::nullopt, py::arg("use_multicore") = true, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.cpp index 5e6946986b9..06e0d00dce9 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.cpp @@ -5,7 +5,7 @@ #include "tilize_with_val_padding.hpp" #include "device/tilize_with_val_padding_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/data_movement/common/common.hpp" #include "ttnn/operations/data_movement/reshape_view/reshape.hpp" @@ -58,7 +58,7 @@ ttnn::Shape squeeze_output_shape(const ttnn::Shape& output_shape) { } ttnn::Tensor ExecuteTilizeWithValPadding::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& output_padded_shape, const PadValue pad_value, @@ -94,7 +94,7 @@ ttnn::Tensor 
ExecuteTilizeWithValPadding::invoke( } ttnn::Tensor ExecuteTilizeWithValPadding::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::SmallVector& output_padded_shape, const PadValue pad_value, @@ -123,7 +123,7 @@ ttnn::Tensor ExecuteTilizeWithValPadding::invoke( } ttnn::Tensor ExecuteTilizeWithZeroPadding::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config, std::optional output_dtype, diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp index 512573585d6..a20e8764914 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp @@ -7,7 +7,7 @@ #include "device/tilize_with_val_padding_op.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/decorators.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "tilize_with_val_padding_common.hpp" namespace ttnn { @@ -16,7 +16,7 @@ namespace operations::data_movement { struct ExecuteTilizeWithValPadding { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::SmallVector& output_padded_shape, const PadValue pad_value, @@ -33,7 +33,7 @@ struct ExecuteTilizeWithValPadding { bool use_multicore = true); static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& output_padded_shape, const PadValue pad_value, @@ -52,7 +52,7 @@ struct ExecuteTilizeWithValPadding { struct ExecuteTilizeWithZeroPadding { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config = std::nullopt, std::optional output_dtype = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding_pybind.hpp index 0150e2d31c5..394049b44a8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding_pybind.hpp @@ -51,7 +51,7 @@ void bind_tilize_with_val_padding(py::module& module) { const std::optional& memory_config, std::optional output_dtype, bool use_multicore, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, output_padded_shape, value, memory_config, output_dtype, use_multicore); }, @@ -62,7 +62,7 @@ void bind_tilize_with_val_padding(py::module& module) { py::arg("memory_config") = std::nullopt, py::arg("dtype") = std::nullopt, py::arg("use_multicore") = true, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, } ); @@ -100,13 +100,13 @@ void bind_tilize_with_zero_padding(py::module& module) { const std::optional& memory_config, std::optional output_dtype, bool use_multicore, - uint8_t queue_id) { return self(queue_id, input_tensor, memory_config, output_dtype, use_multicore); }, + QueueId queue_id) { return self(queue_id, input_tensor, memory_config, output_dtype, use_multicore); }, py::arg("input_tensor"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_dtype") = std::nullopt, py::arg("use_multicore") = true, 
- py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp index b6a3d76e011..bf70fc59e17 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "ttnn/run_operation.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/decorators.hpp" #include "device/transpose_op.hpp" #include "ttnn/operations/data_movement/permute/permute.hpp" @@ -80,7 +80,7 @@ ttnn::Tensor transpose_nd( } // namespace detail ttnn::Tensor ExecuteTranspose::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const int64_t& dim1, const int64_t& dim2, diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.hpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.hpp index 85b90e3b4d6..226854d9bcd 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.hpp @@ -11,7 +11,7 @@ namespace operations::data_movement { struct ExecuteTranspose { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const int64_t& dim1, const int64_t& dim2, diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.cpp index 3ecb9bdfad1..f5da37f16ed 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.cpp @@ -42,7 +42,7 @@ void bind_transpose(py::module& module) { const int64_t& dim1, const int64_t& dim2, const std::optional& memory_config, - uint8_t queue_id, + QueueId queue_id, const std::optional& pad_value) { return self(queue_id, input_tensor, dim1, dim2, memory_config, pad_value); }, @@ -51,7 +51,7 @@ void bind_transpose(py::module& module) { py::arg("dim2"), py::kw_only(), py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, py::arg("pad_value") = 0.0f, }); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp index e9a86bdc64e..8b5801c5da8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp @@ -5,7 +5,7 @@ #include "untilize.hpp" #include "device/untilize_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/data_movement/common/common.hpp" #include "ttnn/operations/data_movement/reshape_view/reshape.hpp" @@ -38,7 +38,7 @@ MassagedUntilize build_ndiml_untilize(BaseUntilizeType base_untilize) { } ttnn::Tensor ExecuteUntilize::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config, bool use_multicore, diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.hpp index 851f0295071..7fe0bc03784 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.hpp @@ -12,7 +12,7 @@ namespace 
operations::data_movement { struct ExecuteUntilize { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config = std::nullopt, bool use_multicore = true, diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize_pybind.hpp index e3a5488cd08..3668e1dc776 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize_pybind.hpp @@ -50,7 +50,7 @@ void bind_untilize(py::module& module) { bool use_multicore, bool use_pack_untilize, const std::optional&& sub_core_grids, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, memory_config, use_multicore, use_pack_untilize, sub_core_grids); }, py::arg("input_tensor"), @@ -59,7 +59,7 @@ void bind_untilize(py::module& module) { py::arg("use_multicore") = true, py::arg("use_pack_untilize") = true, py::arg("sub_core_grids") = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.cpp index 55a5bb43539..db8aac052ad 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.cpp @@ -5,7 +5,7 @@ #include "untilize_with_halo_v2.hpp" #include "device/untilize_with_halo_v2_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" using namespace tt::tt_metal; @@ -13,7 +13,7 @@ using namespace tt::tt_metal; namespace ttnn::operations::data_movement { ttnn::Tensor ExecuteUntilizeWithHaloV2::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const Tensor& padding_config, const Tensor& local_config, diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.hpp index 36002808d06..cd45ec80b51 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.hpp @@ -11,7 +11,7 @@ namespace operations::data_movement { struct ExecuteUntilizeWithHaloV2 { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const Tensor& padding_config, const Tensor& local_config, diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2_pybind.hpp index 7bcbf0ead69..be4f4dc535e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2_pybind.hpp @@ -55,7 +55,7 @@ void bind_untilize_with_halo_v2(py::module& module) { const std::optional& memory_config, const bool remote_read, const bool transpose_mcast, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -80,7 +80,7 @@ void bind_untilize_with_halo_v2(py::module& module) { py::arg("memory_config") = 
std::nullopt, py::arg("remote_read") = false, py::arg("transpose_mcast") = false, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp index a483d77caf9..ea73fd0fe0f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp @@ -5,7 +5,7 @@ #include "untilize_with_unpadding.hpp" #include "device/untilize_with_unpadding_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/data_movement/common/common.hpp" @@ -58,7 +58,7 @@ MassagedUntilizeVal build_ndiml_untilize_val(BaseUntilizeValType base_untilize) } ttnn::Tensor ExecuteUntilizeWithUnpadding::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& output_tensor_end, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp index 4169ca7bc0c..802959dc319 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp @@ -11,7 +11,7 @@ namespace operations::data_movement { struct ExecuteUntilizeWithUnpadding { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& output_tensor_end, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding_pybind.hpp index 4aa2df927a6..df1d1edb7f5 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding_pybind.hpp @@ -48,7 +48,7 @@ void bind_untilize_with_unpadding(py::module& module) { const std::optional& memory_config, bool use_multicore, bool use_pack_untilize, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, output_tensor_end, memory_config, use_multicore, use_pack_untilize); }, py::arg("input_tensor"), @@ -57,7 +57,7 @@ void bind_untilize_with_unpadding(py::module& module) { py::arg("memory_config") = std::nullopt, py::arg("use_multicore") = true, py::arg("use_pack_untilize") = true, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp index 3ef20924155..fb6033d77eb 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp @@ -24,7 +24,7 @@ constexpr bool is_associative(BinaryOpType op) { // Tensor - Scalar inline Tensor binary_impl( - uint8_t queue_id, + QueueId queue_id, BinaryOpType binary_op_type, const ttnn::Tensor& input_tensor, const float scalar, @@ -70,7 +70,7 @@ inline Tensor binary_impl( // Scalar - Tensor inline Tensor binary_impl( - 
uint8_t queue_id, + QueueId queue_id, BinaryOpType binary_op_type, const float scalar, const ttnn::Tensor& input_tensor, @@ -147,7 +147,7 @@ auto preprocess_inputs(const Tensor& input_tensor_a_arg, const Tensor& input_ten template Tensor BinaryOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& output_dtype, @@ -192,7 +192,7 @@ Tensor BinaryOperation::invoke( template Tensor BinaryOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_a, float scalar, const std::optional& output_dtype, @@ -236,7 +236,7 @@ Tensor BinaryOperation::invoke( template Tensor RelationalBinary::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& output_dtype, @@ -306,7 +306,7 @@ Tensor RelationalBinary::invoke( template Tensor RelationalBinary::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_a, const float scalar, const std::optional& dtype, @@ -320,7 +320,7 @@ Tensor RelationalBinary::invoke( // scalar - tensor combination not available on Pytorch for this op template Tensor RelationalBinary::invoke( - uint8_t queue_id, + QueueId queue_id, const float scalar, const ttnn::Tensor& input_tensor_a, const std::optional& dtype, @@ -390,7 +390,7 @@ Tensor InplaceBinaryOperation::invoke( template Tensor BinaryOperationSfpu::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& output_dtype, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp index 514b3df54ca..5691a36e937 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp @@ -21,7 +21,7 @@ namespace binary { template struct BinaryOperation { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& output_dtype = std::nullopt, @@ -40,7 +40,7 @@ struct BinaryOperation { const std::optional& input_tensor_a_activation = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_a, float scalar, const std::optional& output_dtype = std::nullopt, @@ -62,7 +62,7 @@ struct BinaryOperation { template struct RelationalBinary { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& output_dtype = std::nullopt, @@ -90,7 +90,7 @@ struct RelationalBinary { const std::optional& input_tensor_a_activation = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_a, const float scalar, const std::optional& dtype = std::nullopt, @@ -101,7 +101,7 @@ struct RelationalBinary { // scalar - tensor combination not available on Pytorch for this op static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const float scalar, const ttnn::Tensor& input_tensor_a, const std::optional& dtype = std::nullopt, @@ -139,7 +139,7 @@ struct InplaceBinaryOperation { template struct BinaryOperationSfpu { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& output_dtype = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp 
b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp index 1981218a5e4..6af5bc49a0d 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp @@ -5,7 +5,7 @@ #pragma once #include "ttnn/decorators.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/eltwise/binary/device/binary_composite_op.hpp" #include "ttnn/operations/eltwise/binary/device/binary_device_operation.hpp" @@ -29,7 +29,7 @@ namespace binary { */ struct ExecutePower { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, uint32_t exponent, const std::optional& memory_config = std::nullopt, @@ -42,7 +42,7 @@ struct ExecutePower { const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, float exponent, const std::optional& memory_config = std::nullopt, @@ -55,7 +55,7 @@ struct ExecutePower { const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, float input_a, const Tensor& exponent, const std::optional& memory_config = std::nullopt, @@ -68,7 +68,7 @@ struct ExecutePower { const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const Tensor& exponent, const std::optional& memory_config = std::nullopt, @@ -148,7 +148,7 @@ struct ExecuteDiv { std::optional optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, bool accurate_mode = false, @@ -157,7 +157,7 @@ struct ExecuteDiv { std::optional optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, float value, bool accurate_mode = false, @@ -169,7 +169,7 @@ struct ExecuteDiv { template struct ExecuteBiasGelu { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& output_dtype = std::nullopt, @@ -208,7 +208,7 @@ struct ExecuteBiasGelu { } static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_a, const float bias, const std::optional& dtype = std::nullopt, @@ -325,7 +325,7 @@ struct ExecutePrelu { struct ExecuteRsub { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& output_dtype = std::nullopt, @@ -344,7 +344,7 @@ struct ExecuteRsub { const std::optional& input_tensor_a_activation = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, float input_b, const std::optional& memory_config = std::nullopt, @@ -359,7 +359,7 @@ struct ExecuteRsub { struct ExecuteBitwiseAnd { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& memory_config = std::nullopt, @@ -372,7 +372,7 @@ struct ExecuteBitwiseAnd { const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, @@ -387,7 +387,7 @@ struct ExecuteBitwiseAnd { 
struct ExecuteBitwiseOr { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& memory_config = std::nullopt, @@ -400,7 +400,7 @@ struct ExecuteBitwiseOr { const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, @@ -415,7 +415,7 @@ struct ExecuteBitwiseOr { struct ExecuteBitwiseXor { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& memory_config = std::nullopt, @@ -428,7 +428,7 @@ struct ExecuteBitwiseXor { const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, @@ -443,7 +443,7 @@ struct ExecuteBitwiseXor { struct ExecuteBitwiseLeftShift { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& memory_config = std::nullopt, @@ -456,7 +456,7 @@ struct ExecuteBitwiseLeftShift { const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, @@ -471,7 +471,7 @@ struct ExecuteBitwiseLeftShift { struct ExecuteBitwiseRightShift { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, const std::optional& memory_config = std::nullopt, @@ -484,7 +484,7 @@ struct ExecuteBitwiseRightShift { const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp index 000d0726c22..cbda641693b 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp @@ -160,7 +160,7 @@ void bind_binary_operation( const std::optional& output_tensor, const std::optional& activations, const std::optional& input_tensor_a_activation, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor_a, @@ -179,7 +179,7 @@ void bind_binary_operation( py::arg("output_tensor") = std::nullopt, py::arg("activations") = std::nullopt, py::arg("input_tensor_a_activation") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, // tensor and tensor ttnn::pybind_overload_t{ @@ -191,7 +191,7 @@ void bind_binary_operation( const std::optional& output_tensor, const std::optional& activations, const std::optional& input_tensor_a_activation, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor_a, @@ -210,7 +210,7 @@ void bind_binary_operation( py::arg("output_tensor") = std::nullopt, py::arg("activations") = std::nullopt, py::arg("input_tensor_a_activation") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -283,7 +283,7 @@ void bind_binary_unary_operation( const 
float scalar, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor_a, scalar, memory_config, output_tensor); }, py::arg("input_tensor_a"), @@ -291,7 +291,7 @@ void bind_binary_unary_operation( py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, // tensor and tensor ttnn::pybind_overload_t{ @@ -303,7 +303,7 @@ void bind_binary_unary_operation( const std::optional& output_tensor, const std::optional& activations, const std::optional& input_tensor_a_activation, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor_a, @@ -322,7 +322,7 @@ void bind_binary_unary_operation( py::arg("output_tensor") = std::nullopt, py::arg("activations") = std::nullopt, py::arg("input_tensor_a_activation") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -393,7 +393,7 @@ void bind_bitwise_binary_ops_operation( const int32_t scalar, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor_a, scalar, memory_config, output_tensor); }, py::arg("input_tensor_a"), @@ -401,7 +401,7 @@ void bind_bitwise_binary_ops_operation( py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, // tensor and tensor ttnn::pybind_overload_t{ @@ -410,7 +410,7 @@ void bind_bitwise_binary_ops_operation( const ttnn::Tensor& input_tensor_b, const std::optional& memory_config, const std::optional& output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor_a, @@ -423,7 +423,7 @@ void bind_bitwise_binary_ops_operation( py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -896,7 +896,7 @@ void bind_div( const std::optional round_mode, const std::optional& memory_config, const std::optional& output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor_a, input_tensor_b, accurate_mode, round_mode, memory_config, output_tensor); }, @@ -917,7 +917,7 @@ void bind_div( const std::optional round_mode, const std::optional& memory_config, const std::optional& output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor_a, value, accurate_mode, round_mode, memory_config, output_tensor); }, py::arg("input_tensor_a"), @@ -1322,7 +1322,7 @@ void bind_power(py::module& module, const binary_operation_t& operation, const s uint32_t exponent, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t queue_id) -> ttnn::Tensor { + const QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, exponent, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -1330,7 +1330,7 @@ void bind_power(py::module& module, const binary_operation_t& operation, const s py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = 
DefaultQueueId}, // float exponent ttnn::pybind_overload_t{ @@ -1339,7 +1339,7 @@ void bind_power(py::module& module, const binary_operation_t& operation, const s float exponent, const std::optional& memory_config, std::optional output_tensor, - const uint8_t queue_id) -> ttnn::Tensor { + const QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, exponent, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -1356,7 +1356,7 @@ void bind_power(py::module& module, const binary_operation_t& operation, const s const Tensor& exponent, const std::optional& memory_config, std::optional output_tensor, - const uint8_t queue_id) -> ttnn::Tensor { + const QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, exponent, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -1373,7 +1373,7 @@ void bind_power(py::module& module, const binary_operation_t& operation, const s const Tensor& exponent, const std::optional& memory_config, std::optional output_tensor, - const uint8_t queue_id) -> ttnn::Tensor { + const QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input, exponent, memory_config, output_tensor); }, py::arg("input"), diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp index 1a33886f931..49b23d539e1 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp @@ -180,7 +180,7 @@ Tensor _atan2(const Tensor& input_a, const Tensor& input_b, const std::optional< } Tensor ExecuteDiv::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, float value, bool accurate_mode, @@ -212,7 +212,7 @@ Tensor ExecuteDiv::invoke( } Tensor ExecuteDiv::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_a, const Tensor& input_b, bool accurate_mode, @@ -479,9 +479,15 @@ Tensor _floor_div(const Tensor& input_a, const Tensor& input_b, const std::optio Tensor _scatter(const Tensor& input_a, const Tensor& input_b, const std::optional& output_mem_config) { tt::tt_metal::Array4D start_index = {0, 0, 0, 0}; Tensor index_pad = ttnn::pad( - 0, ttnn::ones_like(input_a), input_b.get_padded_shape().to_array_4D(), start_index, 0, false, std::nullopt); - Tensor temp_a = - ttnn::pad(0, input_a, input_b.get_padded_shape().to_array_4D(), start_index, 0, false, std::nullopt); + ttnn::DefaultQueueId, + ttnn::ones_like(input_a), + input_b.get_padded_shape().to_array_4D(), + start_index, + 0, + false, + std::nullopt); + Tensor temp_a = ttnn::pad( + ttnn::DefaultQueueId, input_a, input_b.get_padded_shape().to_array_4D(), start_index, 0, false, std::nullopt); return ttnn::where(index_pad, temp_a, input_b); } @@ -584,7 +590,7 @@ Tensor ExecuteLCM::invoke( // power - floating point exponent Tensor ExecutePower::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_a, float exponent, const std::optional& output_mem_config, @@ -630,7 +636,7 @@ Tensor ExecutePower::invoke( // power - integer exponent Tensor ExecutePower::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, uint32_t exponent, const std::optional& output_mem_config, @@ -649,7 +655,7 @@ Tensor ExecutePower::invoke( // power - tensor exponent Tensor ExecutePower::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const Tensor& exponent, const std::optional& output_mem_config, @@ -669,7 +675,7 @@ Tensor ExecutePower::invoke( // power - scalar 
input Tensor ExecutePower::invoke( - uint8_t queue_id, + QueueId queue_id, float input_a, const Tensor& exponent, const std::optional& output_mem_config, @@ -688,7 +694,7 @@ Tensor ExecutePower::invoke( } Tensor ExecuteRsub::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& output_dtype, @@ -728,7 +734,7 @@ Tensor ExecuteRsub::invoke( } Tensor ExecuteRsub::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const float input_b, const std::optional& memory_config, @@ -753,7 +759,7 @@ Tensor ExecuteRsub::invoke( // Bitwise AND Tensor ExecuteBitwiseAnd::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& memory_config, @@ -777,7 +783,7 @@ Tensor ExecuteBitwiseAnd::invoke( } Tensor ExecuteBitwiseAnd::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const int32_t input_b, const std::optional& memory_config, @@ -803,7 +809,7 @@ Tensor ExecuteBitwiseAnd::invoke( // Bitwise OR Tensor ExecuteBitwiseOr::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& memory_config, @@ -827,7 +833,7 @@ Tensor ExecuteBitwiseOr::invoke( } Tensor ExecuteBitwiseOr::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const int32_t input_b, const std::optional& memory_config, @@ -853,7 +859,7 @@ Tensor ExecuteBitwiseOr::invoke( // Bitwise XOR Tensor ExecuteBitwiseXor::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& memory_config, @@ -877,7 +883,7 @@ Tensor ExecuteBitwiseXor::invoke( } Tensor ExecuteBitwiseXor::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const int32_t input_b, const std::optional& memory_config, @@ -903,7 +909,7 @@ Tensor ExecuteBitwiseXor::invoke( // Bitwise Left Shift Tensor ExecuteBitwiseLeftShift::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& memory_config, @@ -922,7 +928,7 @@ Tensor ExecuteBitwiseLeftShift::invoke( } Tensor ExecuteBitwiseLeftShift::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const int32_t input_b, const std::optional& memory_config, @@ -943,7 +949,7 @@ Tensor ExecuteBitwiseLeftShift::invoke( // Bitwise Right Shift Tensor ExecuteBitwiseRightShift::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& memory_config, @@ -962,7 +968,7 @@ Tensor ExecuteBitwiseRightShift::invoke( } Tensor ExecuteBitwiseRightShift::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const int32_t input_b, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.hpp index cd9dbf8effe..03f10bf35f4 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.hpp @@ -10,7 +10,7 @@ #include #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/core.hpp" #include "ttnn/decorators.hpp" #include "ttnn/device_operation.hpp" diff --git 
a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.cpp index 6fc64c269f3..9bcf23a6973 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.cpp @@ -19,10 +19,10 @@ #include "ttnn/operations/eltwise/complex_unary/complex_unary.hpp" #include #include "cpp/ttnn/common/constants.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/eltwise/ternary/where.hpp" #include "ttnn/operations/creation.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/eltwise/binary_backward/binary_backward.hpp" #include "tools/profiler/op_profiler.hpp" #include @@ -75,7 +75,7 @@ std::vector ExecuteBackwardAtan2::invoke( } std::vector> ExecuteAddalphaBW::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const Tensor& other, @@ -122,7 +122,7 @@ std::vector> ExecuteAddalphaBW::invoke( } std::vector> ExecuteBackwardSubAlpha::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const Tensor& other, @@ -169,7 +169,7 @@ std::vector> ExecuteBackwardSubAlpha::invoke( } std::vector> ExecuteBackwardAdd::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, float alpha, @@ -192,7 +192,7 @@ std::vector> ExecuteBackwardAdd::invoke( } std::vector> ExecuteBackwardAdd::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const Tensor& other, @@ -252,7 +252,7 @@ std::vector ExecuteBackwardAdd::invoke( } std::vector> ExecuteBackwardSub::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, float alpha, @@ -275,7 +275,7 @@ std::vector> ExecuteBackwardSub::invoke( } std::vector> ExecuteBackwardSub::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const Tensor& other, @@ -551,7 +551,7 @@ std::vector ExecuteBackwardSquaredDifference::invoke( } std::vector> ExecuteBackwardAssign::invoke( - uint8_t cq_id, + QueueId cq_id, const Tensor& grad, const Tensor& input, const Tensor& other, @@ -576,7 +576,7 @@ std::vector> ExecuteBackwardAssign::invoke( } std::vector> ExecuteBackwardAssign::invoke( - uint8_t cq_id, + QueueId cq_id, const Tensor& grad, const Tensor& input, const std::optional& output_mem_config, @@ -614,7 +614,7 @@ std::vector> ExecuteBackwardAssign::invoke( } std::vector> ExecuteBackwardConcat::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const Tensor& other, @@ -680,7 +680,7 @@ std::vector> ExecuteBackwardConcat::invoke( } std::vector> ExecuteBackwardRsub::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const Tensor& other, @@ -819,7 +819,7 @@ std::vector ExecuteBackwardMin::invoke( } std::vector> ExecuteBackwardDiv::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, float scalar, @@ -871,7 +871,7 @@ std::vector> ExecuteBackwardDiv::invoke( } std::vector> ExecuteBackwardDiv::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const Tensor& other, @@ -1030,7 +1030,7 @@ std::vector ExecuteBackwardDiv::invoke( } std::vector> ExecuteBackwardMul::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, float scalar, @@ -1070,7 +1070,7 @@ 
std::vector ExecuteBackwardMul::invoke( } std::vector> ExecuteBackwardMul::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const Tensor& other, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.hpp index 526a10a9dd4..ce55f56178b 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.hpp @@ -5,7 +5,7 @@ #pragma once -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/device_operation.hpp" #include "ttnn/operations/eltwise/complex_binary/device/complex_binary_op.hpp" #include "ttnn/operations/eltwise/complex/complex.hpp" @@ -88,7 +88,7 @@ struct ExecuteBackwardMin { struct ExecuteBackwardMul { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, float scalar, @@ -96,7 +96,7 @@ struct ExecuteBackwardMul { std::optional input_grad = std::nullopt); static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const Tensor& other_tensor_arg, @@ -130,14 +130,14 @@ struct ExecuteBackwardMul { struct ExecuteBackwardAssign { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, std::optional input_a_grad = std::nullopt); static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const Tensor& other_tensor_arg, @@ -180,7 +180,7 @@ struct ExecuteBackwardBiasGelu { struct ExecuteBackwardLT { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const Tensor& other_tensor_arg, @@ -190,7 +190,7 @@ struct ExecuteBackwardLT { std::optional other_grad = std::nullopt); static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, float other, @@ -216,7 +216,7 @@ struct ExecuteBackwardLT { struct ExecuteBackwardAdd { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, float scalar, @@ -224,7 +224,7 @@ struct ExecuteBackwardAdd { std::optional input_grad = std::nullopt); static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, @@ -259,7 +259,7 @@ struct ExecuteBackwardAdd { struct ExecuteBackwardSub { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, float scalar, @@ -267,7 +267,7 @@ struct ExecuteBackwardSub { std::optional input_grad = std::nullopt); static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, @@ -302,7 +302,7 @@ struct ExecuteBackwardSub { struct ExecuteBackwardDiv { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, float scalar, @@ -311,7 +311,7 @@ struct ExecuteBackwardDiv { std::optional input_grad = std::nullopt); static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, 
const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const Tensor& other_tensor_arg, @@ -376,7 +376,7 @@ struct ExecuteBackwardFmod { struct ExecuteAddalphaBW { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, @@ -399,7 +399,7 @@ struct ExecuteAddalphaBW { struct ExecuteBackwardSubAlpha { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, @@ -422,7 +422,7 @@ struct ExecuteBackwardSubAlpha { struct ExecuteBackwardRsub { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, @@ -443,7 +443,7 @@ struct ExecuteBackwardRsub { struct ExecuteBackwardConcat { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp index 73e73224f56..80104158eea 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp @@ -174,7 +174,7 @@ void bind_binary_backward_concat( const std::optional& memory_config, const std::optional& input_grad, const std::optional& other_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self( grad_tensor, input_tensor_a, @@ -278,7 +278,7 @@ void bind_binary_backward_addalpha( const std::optional& memory_config, const std::optional& input_a_grad, const std::optional& input_b_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self( queue_id, grad_tensor, @@ -476,7 +476,7 @@ void bind_binary_backward_sub_alpha( const std::optional& memory_config, const std::optional& input_grad, const std::optional& other_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self( queue_id, grad_tensor, @@ -563,7 +563,7 @@ void bind_binary_backward_rsub( const std::optional& memory_config, const std::optional& input_grad, const std::optional& other_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self( queue_id, grad_tensor, @@ -654,7 +654,7 @@ void bind_binary_bw_mul( const float scalar, const std::optional& memory_config, const std::optional& input_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self(queue_id, grad_tensor, input_tensor_a, scalar, memory_config, input_grad); }, py::arg("grad_tensor"), @@ -675,7 +675,7 @@ void bind_binary_bw_mul( const std::optional& memory_config, const std::optional& input_grad, const std::optional& other_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self( queue_id, grad_tensor, @@ -784,7 +784,7 @@ void bind_binary_bw( const float scalar, const std::optional& memory_config, const std::optional& input_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self(queue_id, grad_tensor, input_tensor_a, scalar, memory_config, input_grad); }, py::arg("grad_tensor"), @@ -805,7 +805,7 @@ void bind_binary_bw( const std::optional& 
memory_config, const std::optional& input_grad, const std::optional& other_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self( queue_id, grad_tensor, @@ -920,7 +920,7 @@ void bind_binary_bw_div( const std::optional round_mode, const std::optional& memory_config, const std::optional& input_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self(queue_id, grad_tensor, input_tensor_a, scalar, round_mode, memory_config, input_grad); }, py::arg("grad_tensor"), @@ -943,7 +943,7 @@ void bind_binary_bw_div( const std::optional& memory_config, const std::optional& input_grad, const std::optional& other_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self( queue_id, grad_tensor, @@ -1136,7 +1136,7 @@ void bind_binary_backward_assign( const ttnn::Tensor& input_tensor, const std::optional& memory_config, const std::optional& input_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self(queue_id, grad_tensor, input_tensor, memory_config, input_grad); }, py::arg("grad_tensor"), @@ -1156,7 +1156,7 @@ void bind_binary_backward_assign( const std::optional& memory_config, const std::optional& input_a_grad, const std::optional& input_b_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self( queue_id, grad_tensor, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp index b74c23672ad..99c1a77dab0 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp @@ -18,7 +18,7 @@ namespace ttnn::operations::binary_ng { template Tensor BinaryNg::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& output_dtype, @@ -103,7 +103,7 @@ Tensor BinaryNg::invoke( template Tensor BinaryNg::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, float scalar, const std::optional& output_dtype, @@ -185,7 +185,7 @@ Tensor BinaryNg::invoke( template Tensor InplaceBinaryNg::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, tt::stl::Span lhs_activations, @@ -224,7 +224,7 @@ Tensor InplaceBinaryNg::invoke( template Tensor InplaceBinaryNg::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const float scalar, tt::stl::Span lhs_activations, @@ -263,7 +263,7 @@ Tensor InplaceBinaryNg::invoke( template Tensor BinaryNgBitwise::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& memory_config, @@ -301,7 +301,7 @@ Tensor BinaryNgBitwise::invoke( template Tensor BinaryNgBitwise::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, float scalar, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.hpp index 54767414f3c..29c7b1d9481 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.hpp @@ -15,7 +15,7 @@ namespace ttnn::operations::binary_ng { template struct BinaryNg { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const 
std::optional& output_dtype = std::nullopt, @@ -36,7 +36,7 @@ struct BinaryNg { tt::stl::Span post_activations = {}); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, float scalar, const std::optional& output_dtype = std::nullopt, @@ -60,7 +60,7 @@ struct BinaryNg { template struct BinaryNgBitwise { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& memory_config = std::nullopt, @@ -73,7 +73,7 @@ struct BinaryNgBitwise { std::optional optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, float scalar, const std::optional& memory_config = std::nullopt, @@ -89,7 +89,7 @@ struct BinaryNgBitwise { template struct InplaceBinaryNg { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, tt::stl::Span lhs_activations = {}, @@ -104,7 +104,7 @@ struct InplaceBinaryNg { tt::stl::Span post_activations = {}); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, float scalar, tt::stl::Span lhs_activations = {}, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng_pybind.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng_pybind.cpp index aa4239d894d..c5a3d8eb462 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng_pybind.cpp @@ -27,7 +27,7 @@ void bind_binary_ng_operation(py::module& module, T op, const std::string& docst const ttnn::SmallVector& lhs_activations, const ttnn::SmallVector& rhs_activations, const ttnn::SmallVector& post_activations, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor_a, @@ -48,7 +48,7 @@ void bind_binary_ng_operation(py::module& module, T op, const std::string& docst py::arg("lhs_activations") = ttnn::SmallVector(), py::arg("rhs_activations") = ttnn::SmallVector(), py::arg("post_activations") = ttnn::SmallVector(), - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, // tensor and tensor ttnn::pybind_overload_t{ @@ -61,7 +61,7 @@ void bind_binary_ng_operation(py::module& module, T op, const std::string& docst const ttnn::SmallVector& lhs_activations, const ttnn::SmallVector& rhs_activations, const ttnn::SmallVector& post_activations, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor_a, @@ -82,7 +82,7 @@ void bind_binary_ng_operation(py::module& module, T op, const std::string& docst py::arg("lhs_activations") = ttnn::SmallVector(), py::arg("rhs_activations") = ttnn::SmallVector(), py::arg("post_activations") = ttnn::SmallVector(), - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -99,7 +99,7 @@ void bind_binary_ng_bitwise_ops(py::module& module, T op, const std::string& doc const float scalar, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor_a, scalar, memory_config, output_tensor); }, py::arg("input_tensor_a"), @@ -107,7 +107,7 @@ void bind_binary_ng_bitwise_ops(py::module& module, T op, const std::string& doc py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + 
py::arg("queue_id") = DefaultQueueId}, // tensor and tensor ttnn::pybind_overload_t{ @@ -116,7 +116,7 @@ void bind_binary_ng_bitwise_ops(py::module& module, T op, const std::string& doc const ttnn::Tensor& input_tensor_b, const std::optional& memory_config, const std::optional& output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor_a, input_tensor_b, memory_config, output_tensor); }, py::arg("input_tensor_a"), @@ -124,7 +124,7 @@ void bind_binary_ng_bitwise_ops(py::module& module, T op, const std::string& doc py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -142,7 +142,7 @@ void bind_inplace_binary_ng_operation(py::module& module, T op, const std::strin const ttnn::SmallVector& lhs_activations, const ttnn::SmallVector& rhs_activations, const ttnn::SmallVector& post_activations, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor_a, scalar, lhs_activations, rhs_activations, post_activations); }, py::arg("input_tensor_a"), @@ -151,7 +151,7 @@ void bind_inplace_binary_ng_operation(py::module& module, T op, const std::strin py::arg("lhs_activations") = ttnn::SmallVector(), py::arg("rhs_activations") = ttnn::SmallVector(), py::arg("post_activations") = ttnn::SmallVector(), - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, // tensor and tensor ttnn::pybind_overload_t{ @@ -161,7 +161,7 @@ void bind_inplace_binary_ng_operation(py::module& module, T op, const std::strin const ttnn::SmallVector& lhs_activations, const ttnn::SmallVector& rhs_activations, const ttnn::SmallVector& post_activations, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor_a, input_tensor_b, lhs_activations, rhs_activations, post_activations); }, @@ -171,7 +171,7 @@ void bind_inplace_binary_ng_operation(py::module& module, T op, const std::strin py::arg("lhs_activations") = ttnn::SmallVector(), py::arg("rhs_activations") = ttnn::SmallVector(), py::arg("post_activations") = ttnn::SmallVector(), - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/eltwise/ternary/ternary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/ternary/ternary_pybind.hpp index 8448bfb3817..dc023985e91 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/ternary/ternary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/ternary/ternary_pybind.hpp @@ -145,7 +145,7 @@ void bind_ternary_where(py::module& module, const ternary_operation_t& operation const Tensor& false_value, const std::optional& memory_config, std::optional output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, predicate, true_value, false_value, memory_config, output_tensor); }, py::arg("predicate"), @@ -154,7 +154,7 @@ void bind_ternary_where(py::module& module, const ternary_operation_t& operation py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, ttnn::pybind_overload_t{ [](const ternary_operation_t& self, const Tensor& predicate, @@ -162,7 +162,7 @@ void bind_ternary_where(py::module& module, const ternary_operation_t& operation const Tensor& false_value, const std::optional& memory_config, std::optional output_tensor, - uint8_t 
queue_id) { + QueueId queue_id) { return self(queue_id, predicate, true_value, false_value, memory_config, output_tensor); }, py::arg("predicate"), @@ -171,7 +171,7 @@ void bind_ternary_where(py::module& module, const ternary_operation_t& operation py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, ttnn::pybind_overload_t{ [](const ternary_operation_t& self, const Tensor& predicate, @@ -179,7 +179,7 @@ void bind_ternary_where(py::module& module, const ternary_operation_t& operation const float false_value, const std::optional& memory_config, std::optional output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, predicate, true_value, false_value, memory_config, output_tensor); }, py::arg("predicate"), @@ -188,7 +188,7 @@ void bind_ternary_where(py::module& module, const ternary_operation_t& operation py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, ttnn::pybind_overload_t{ [](const ternary_operation_t& self, const Tensor& predicate, @@ -196,7 +196,7 @@ void bind_ternary_where(py::module& module, const ternary_operation_t& operation const float false_value, const std::optional& memory_config, std::optional output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, predicate, true_value, false_value, memory_config, output_tensor); }, py::arg("predicate"), @@ -205,7 +205,7 @@ void bind_ternary_where(py::module& module, const ternary_operation_t& operation py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template diff --git a/ttnn/cpp/ttnn/operations/eltwise/ternary/where.cpp b/ttnn/cpp/ttnn/operations/eltwise/ternary/where.cpp index 9a19d67d299..91b28f64641 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/ternary/where.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/ternary/where.cpp @@ -8,7 +8,7 @@ #include #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/decorators.hpp" #include "ttnn/operations/eltwise/binary/binary.hpp" @@ -26,7 +26,7 @@ namespace ternary_utils { using FloatOrTensor = std::variant; Tensor where_impl( - uint8_t queue_id, + QueueId queue_id, const Tensor& predicate, const FloatOrTensor& value_true, const FloatOrTensor& value_false, @@ -54,7 +54,7 @@ Tensor where_impl( } // namespace ternary_utils Tensor WhereOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& predicate, const Tensor& value_true, const Tensor& value_false, @@ -70,7 +70,7 @@ Tensor WhereOperation::invoke( } Tensor WhereOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& predicate, const float value_true, const Tensor& value_false, @@ -86,7 +86,7 @@ Tensor WhereOperation::invoke( } Tensor WhereOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& predicate, const Tensor& value_true, const float value_false, @@ -102,7 +102,7 @@ Tensor WhereOperation::invoke( } Tensor WhereOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& predicate, const float value_true, const float value_false, diff --git a/ttnn/cpp/ttnn/operations/eltwise/ternary/where.hpp b/ttnn/cpp/ttnn/operations/eltwise/ternary/where.hpp index 1900d994a23..db96323468a 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/ternary/where.hpp +++ 
b/ttnn/cpp/ttnn/operations/eltwise/ternary/where.hpp @@ -7,7 +7,7 @@ #include #include "ttnn/decorators.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" namespace ttnn { @@ -17,7 +17,7 @@ namespace ternary { struct WhereOperation { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& predicate, const Tensor& value_true, const Tensor& value_false, @@ -25,7 +25,7 @@ struct WhereOperation { std::optional output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& predicate, const float value_true, const Tensor& value_false, @@ -33,7 +33,7 @@ struct WhereOperation { std::optional output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& predicate, const Tensor& value_true, const float value_false, @@ -41,7 +41,7 @@ struct WhereOperation { std::optional output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& predicate, const float value_true, const float value_false, diff --git a/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.cpp b/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.cpp index a6d96d9ede4..4dd0884b2b9 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.cpp @@ -70,7 +70,7 @@ std::vector AddcdivBackwardOperation::invoke( } std::vector WhereBackwardOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& condition, const Tensor& input, diff --git a/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.hpp b/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.hpp index 70e18af2587..5ef16a8aaab 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.hpp @@ -35,7 +35,7 @@ struct AddcdivBackwardOperation { struct WhereBackwardOperation { static std::vector invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, diff --git a/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward_pybind.hpp index 78d4c27dc53..33c7d260f98 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward_pybind.hpp @@ -271,7 +271,7 @@ void bind_ternary_backward_optional_output( const std::vector& are_required_outputs, const std::optional& input_a_grad, const std::optional& input_b_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self( queue_id, grad_tensor, @@ -292,7 +292,7 @@ void bind_ternary_backward_optional_output( py::arg("are_required_outputs") = std::vector{true, true}, py::arg("input_a_grad") = std::nullopt, py::arg("input_b_grad") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp index 92461a79793..7cee4b3445c 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp @@ 
-365,7 +365,7 @@ Tensor _swish(const Tensor& a, const std::optional& output_mem_con } Tensor ExecuteTrunc::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const std::optional& output_mem_config, std::optional output_tensor) { @@ -410,7 +410,7 @@ Tensor _variance_impl( return ttnn::sum(sqr_y_minus_mean_y, dims, true, std::nullopt, std::nullopt, scale); } Tensor _variance_impl(const Tensor& y, const Tensor& mean_y, const std::optional& output_mem_config) { - Tensor y_minus_mean_y = ttnn::bcast(0, y, mean_y, ttnn::BcastOpMath::SUB, ttnn::BcastOpDim::HW); + Tensor y_minus_mean_y = ttnn::bcast(ttnn::DefaultQueueId, y, mean_y, ttnn::BcastOpMath::SUB, ttnn::BcastOpDim::HW); return _variance_impl(y, mean_y, y_minus_mean_y, output_mem_config); } @@ -445,7 +445,7 @@ Tensor _std_overload(const Tensor& y, const std::optional& output_ Tensor _normalize(const Tensor& y, const std::optional& output_mem_config) { ttnn::SmallVector dims = {2, 3}; Tensor mean_y = ttnn::mean(y, dims, true); - Tensor y_minus_mean_y = ttnn::bcast(0, y, mean_y, ttnn::BcastOpMath::SUB, ttnn::BcastOpDim::HW); + Tensor y_minus_mean_y = ttnn::bcast(ttnn::DefaultQueueId, y, mean_y, ttnn::BcastOpMath::SUB, ttnn::BcastOpDim::HW); Tensor std_y = _std(y, mean_y, y_minus_mean_y, output_mem_config); Tensor recip_std_y = ttnn::reciprocal(std_y, output_mem_config); Tensor z = ttnn::multiply(y_minus_mean_y, recip_std_y, std::nullopt, output_mem_config); @@ -760,7 +760,7 @@ Tensor _polygamma(const Tensor& input_a, int32_t k, const std::optional& round_mode, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp index 06d086e1aeb..96451895ee0 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp @@ -4,7 +4,7 @@ #include "unary.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "device/unary_device_operation.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/pool/downsample/device/downsample_op.hpp" @@ -17,7 +17,7 @@ namespace ttnn::operations::unary { namespace detail { inline Tensor unary_impl( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::vector& op_chain, const std::optional& memory_config = std::nullopt, @@ -51,7 +51,7 @@ inline Tensor unary_impl( template Tensor ExecuteUnary::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config, const std::optional& optional_output_tensor) { @@ -126,7 +126,7 @@ template struct ExecuteUnary; template Tensor ExecuteUnaryWithFastAndApproximateMode::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const bool parameter, const std::optional& memory_config, @@ -161,7 +161,7 @@ template struct ExecuteUnaryWithFastAndApproximateMode; template Tensor ExecuteUnaryWithFloatParameter::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const float parameter, const std::optional& memory_config, @@ -202,7 +202,7 @@ template struct ExecuteUnaryWithFloatParameter; template struct ExecuteUnaryWithFloatParameter; Tensor Sigmoid_accurate::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const std::optional& memory_config, const std::optional& optional_output_tensor) { @@ -233,7 +233,7 @@ Tensor Sigmoid_accurate::invoke( } Tensor Unary_chain::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::vector& ops_chain, const std::optional& 
memory_config, @@ -252,7 +252,7 @@ Tensor Unary_chain::invoke( } Tensor Softplus::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const float beta, const float threshold, @@ -283,7 +283,7 @@ Tensor Softplus::invoke( } Tensor Prelu::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, float value, const std::optional& memory_config, @@ -302,7 +302,7 @@ Tensor Prelu::invoke( } Tensor Identity::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config, const std::optional& optional_output_tensor) { @@ -328,7 +328,7 @@ Tensor Identity::invoke( } Tensor Abs::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config, const std::optional& optional_output_tensor) { @@ -356,7 +356,7 @@ Tensor Abs::invoke(const ComplexTensor& input_tensor, const MemoryConfig& output } Tensor Floor::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config, const std::optional& optional_output_tensor) { @@ -382,7 +382,7 @@ Tensor Floor::invoke( } Tensor Ceil::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config, const std::optional& optional_output_tensor) { @@ -409,7 +409,7 @@ Tensor Ceil::invoke( template Tensor ExecuteUnaryWithIntegerParameter::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, T parameter, const std::optional& memory_config, @@ -445,7 +445,7 @@ template struct ExecuteUnaryWithIntegerParameter Tensor SymmetricBinop::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, T param, const std::optional& memory_config, @@ -460,7 +460,7 @@ Tensor SymmetricBinop::invoke( template Tensor SymmetricBinop::invoke( - uint8_t queue_id, + QueueId queue_id, T param, const Tensor& input_tensor, const std::optional& memory_config, @@ -507,7 +507,7 @@ template struct SymmetricBinop; template Tensor AsymmetricBinop::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, float param, const std::optional& memory_config, @@ -522,7 +522,7 @@ Tensor AsymmetricBinop::invoke( template Tensor AsymmetricBinop::invoke( - uint8_t queue_id, + QueueId queue_id, float param, const Tensor& input_tensor, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp index 5bffdd7e54c..c1f555a8a83 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp @@ -24,7 +24,7 @@ struct ExecuteUnaryInvokeResult { template struct ExecuteUnary { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); @@ -41,7 +41,7 @@ struct ExecuteUnary { template struct ExecuteUnaryWithFastAndApproximateMode { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const bool parameter = false, const std::optional& memory_config = std::nullopt, @@ -57,7 +57,7 @@ struct ExecuteUnaryWithFastAndApproximateMode { template struct ExecuteUnaryWithFloatParameter { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const float parameter, const std::optional& memory_config = std::nullopt, @@ -72,7 +72,7 @@ struct ExecuteUnaryWithFloatParameter { struct Sigmoid_accurate { static Tensor invoke( - 
uint8_t queue_id, + QueueId queue_id, const Tensor& input, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); @@ -85,7 +85,7 @@ struct Sigmoid_accurate { struct Unary_chain { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::vector& ops_chain, const std::optional& memory_config = std::nullopt, @@ -100,7 +100,7 @@ struct Unary_chain { struct Softplus { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const float beta, const float threshold, @@ -117,7 +117,7 @@ struct Softplus { struct Prelu { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, float value, const std::optional& memory_config = std::nullopt, @@ -132,7 +132,7 @@ struct Prelu { struct Identity { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); @@ -145,7 +145,7 @@ struct Identity { struct Abs { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); @@ -160,7 +160,7 @@ struct Abs { struct Floor { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); @@ -173,7 +173,7 @@ struct Floor { struct Ceil { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); @@ -193,7 +193,7 @@ struct Dropout { const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const uint32_t seed, const float probability, @@ -205,7 +205,7 @@ struct Dropout { template struct ExecuteUnaryWithIntegerParameter { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, T parameter, const std::optional& memory_config = std::nullopt, @@ -221,14 +221,14 @@ struct ExecuteUnaryWithIntegerParameter { template struct SymmetricBinop { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, T param, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, T param, const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, @@ -250,14 +250,14 @@ struct SymmetricBinop { template struct AsymmetricBinop { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, float param, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, float param, const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp index b7c2535aa01..06a785003e3 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp @@ -22,7 +22,7 @@ struct ExecuteUnaryCompositeOp { struct ExecuteTrunc { 
static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, std::optional optional_output_tensor = std::nullopt); @@ -112,7 +112,7 @@ struct ExecuteUnaryCompositeOpWithInt { struct ExecuteRdiv { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, float value, const std::optional& round_mode = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp index 4d494c685ec..e1b7b607d3d 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp @@ -180,14 +180,14 @@ void bind_unary_operation( const Tensor& input_tensor, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, memory_config, output_tensor); }, py::arg("input_tensor"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } @@ -250,14 +250,14 @@ void bind_unary_operation_overload_complex( const Tensor& input_tensor, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, memory_config, output_tensor); }, py::arg("input_tensor"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, ttnn::pybind_overload_t{ [](const unary_operation_t& self, @@ -328,14 +328,14 @@ void bind_unary_operation_overload_complex_return_complex( const Tensor& input_tensor, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, memory_config, output_tensor); }, py::arg("input_tensor"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}, + py::arg("queue_id") = DefaultQueueId}, ttnn::pybind_overload_t{ [](const unary_operation_t& self, @@ -404,7 +404,7 @@ void bind_unary_operation_with_fast_and_approximate_mode(py::module& module, con const bool parameter, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, parameter, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -412,7 +412,7 @@ void bind_unary_operation_with_fast_and_approximate_mode(py::module& module, con py::arg("fast_and_approximate_mode") = false, py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -483,7 +483,7 @@ void bind_unary_operation_with_float_parameter( const float parameter, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, parameter, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -491,7 +491,7 @@ void bind_unary_operation_with_float_parameter( py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - 
py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } @@ -640,7 +640,7 @@ void bind_unary_rdiv( const std::optional parameter_b, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, parameter_a, parameter_b, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -649,7 +649,7 @@ void bind_unary_rdiv( py::arg(parameter_name_b.c_str()) = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -705,7 +705,7 @@ void bind_softplus(py::module& module, const unary_operation_t& operation) { const float threshold, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t queue_id) { + const QueueId queue_id) { return self(queue_id, input, beta, threshold, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -714,7 +714,7 @@ void bind_softplus(py::module& module, const unary_operation_t& operation) { py::arg("threshold") = 20.0f, py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -766,14 +766,14 @@ void bind_sigmoid_accurate(py::module& module, const unary_operation_t& operatio const Tensor& input_tensor, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t queue_id) -> ttnn::Tensor { + const QueueId queue_id) -> ttnn::Tensor { return self(queue_id, input_tensor, memory_config, output_tensor); }, py::arg("input_tensor"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -829,7 +829,7 @@ void bind_unary_chain(py::module& module, const unary_operation_t& operation) { const FusedActivations& ops_chain, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t queue_id) { + const QueueId queue_id) { return self(queue_id, input_tensor, ops_chain, memory_config, output_tensor); }, py::arg("input_tensor"), @@ -837,7 +837,7 @@ void bind_unary_chain(py::module& module, const unary_operation_t& operation) { py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -890,14 +890,14 @@ void bind_identity(py::module& module, const unary_operation_t& operation) { const Tensor& input_tensor, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t queue_id) { + const QueueId queue_id) { return self(queue_id, input_tensor, memory_config, output_tensor); }, py::arg("input_tensor"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -1371,7 +1371,7 @@ void bind_unary_composite_trunc(py::module& module, const unary_operation_t& ope const Tensor& input_tensor, const std::optional& memory_config, const std::optional& output_tensor, - const uint8_t& queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, memory_config, output_tensor); }, py::arg("input_tensor"), diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp index 
5acf6919b8c..3f63d85c7f6 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp @@ -6,7 +6,7 @@ #include #include "ttnn/operations/data_movement/bcast/bcast.hpp" #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/eltwise/unary/unary.hpp" #include "ttnn/operations/eltwise/binary/binary.hpp" #include "ttnn/operations/moreh/moreh_sum/moreh_sum.hpp" @@ -237,7 +237,7 @@ std::vector ExecuteUnaryBackwardRdiv::invoke( // unary_pow: // grad_input = grad * exponent * torch.pow(input, exponent - 1) std::vector> ExecuteUnaryBackwardPow::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, float exponent, @@ -290,7 +290,7 @@ std::vector> ExecuteUnaryBackwardPow::invoke( } std::vector> ExecuteUnaryBackwardExp::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const std::optional& output_mem_config, @@ -335,7 +335,7 @@ std::vector> ExecuteUnaryBackwardExp::invoke( } std::vector> ExecuteUnaryBackwardTanh::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const std::optional& output_mem_config, @@ -360,7 +360,7 @@ std::vector> ExecuteUnaryBackwardTanh::invoke( } std::vector> ExecuteUnaryBackwardSqrt::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const std::optional& output_mem_config, @@ -551,7 +551,7 @@ std::vector ExecuteUnaryBackwardSigmoid::invoke( } std::vector> ExecuteUnaryBackwardRsqrt::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const std::optional& output_mem_config, @@ -612,7 +612,7 @@ std::vector> ExecuteUnaryBackwardRsqrt::invoke( } std::vector> ExecuteUnaryBackwardNeg::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const std::optional& output_mem_config, @@ -644,7 +644,7 @@ std::vector ExecuteUnaryBackwardRelu::invoke( // self: zeros_like(grad) // result: at::fill(self_t, 0) std::vector> ExecuteUnaryBackwardFill::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const std::optional& output_mem_config, @@ -984,7 +984,7 @@ std::vector _abs_bw( // Silu // result: grad * sigmoid_result * (1 + input * (1 - sigmoid_result)) std::vector> ExecuteUnaryBackwardSilu::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const std::optional& output_mem_config, @@ -1673,7 +1673,7 @@ std::vector ExecuteUnaryBackwardDeg2rad::invoke( } std::vector> ExecuteUnaryBackwardGelu::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad, const Tensor& input, const string& approximate, @@ -1913,13 +1913,23 @@ std::vector ExecuteUnaryBackwardProd::invoke( temp = ttnn::operations::unary_backward::change_layout_to_tile(temp, output_memory_config); } if (dim == 3 || dim == -1) { - Tensor grad_result = - ttnn::bcast(0, reciprocal_input, temp, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::W, output_memory_config); + Tensor grad_result = ttnn::bcast( + ttnn::DefaultQueueId, + reciprocal_input, + temp, + ttnn::BcastOpMath::MUL, + ttnn::BcastOpDim::W, + output_memory_config); grad_tensor.emplace_back(grad_result); return grad_tensor; } else if (dim == 2 || dim == -2) { - Tensor grad_result = - ttnn::bcast(0, reciprocal_input, temp, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::H, output_memory_config); + Tensor grad_result = ttnn::bcast( + 
ttnn::DefaultQueueId, + reciprocal_input, + temp, + ttnn::BcastOpMath::MUL, + ttnn::BcastOpDim::H, + output_memory_config); grad_tensor.emplace_back(grad_result); return grad_tensor; } else if (dim == 1 || dim == -3) { @@ -1927,7 +1937,7 @@ std::vector ExecuteUnaryBackwardProd::invoke( if (reciprocal_input.padded_shape()[1] % 32 != 0) { ttnn::SmallVector> padding = { {0, 0}, {0, 32 - (reciprocal_input.padded_shape()[1] % 32)}, {0, 0}, {0, 0}}; - tensor_1_temp = ttnn::pad(0, reciprocal_input, padding, 0, true, std::nullopt); + tensor_1_temp = ttnn::pad(ttnn::DefaultQueueId, reciprocal_input, padding, 0, true, std::nullopt); } ttnn::SmallVector after_permute_dims = {0, 2, 3, 1}; Tensor tensor_1 = ttnn::permute(tensor_1_temp, after_permute_dims, output_memory_config); @@ -1940,7 +1950,13 @@ std::vector ExecuteUnaryBackwardProd::invoke( after_permute_dims = {0, 3, 1, 2}; Tensor result = permute( - ttnn::bcast(0, tensor_1, tensor_2, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::W, output_memory_config), + ttnn::bcast( + ttnn::DefaultQueueId, + tensor_1, + tensor_2, + ttnn::BcastOpMath::MUL, + ttnn::BcastOpDim::W, + output_memory_config), after_permute_dims, output_memory_config); Tensor grad_result = result; @@ -1959,7 +1975,7 @@ std::vector ExecuteUnaryBackwardProd::invoke( if (reciprocal_input.padded_shape()[0] % 32 != 0) { ttnn::SmallVector> padding = { {0, (32 - (reciprocal_input.padded_shape()[0] % 32))}, {0, 0}, {0, 0}, {0, 0}}; - tensor_1_temp = ttnn::pad(0, reciprocal_input, padding, 0, false, std::nullopt); + tensor_1_temp = ttnn::pad(ttnn::DefaultQueueId, reciprocal_input, padding, 0, false, std::nullopt); } ttnn::SmallVector after_permute_dims = {3, 1, 2, 0}; Tensor tensor_1 = ttnn::permute(tensor_1_temp, after_permute_dims, output_memory_config); @@ -1971,7 +1987,13 @@ std::vector ExecuteUnaryBackwardProd::invoke( tensor_2, tensor_1.device(), tensor_1.get_layout(), tensor_1.memory_config()); Tensor result = ttnn::permute( - ttnn::bcast(0, tensor_1, tensor_2, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::W, output_memory_config), + ttnn::bcast( + ttnn::DefaultQueueId, + tensor_1, + tensor_2, + ttnn::BcastOpMath::MUL, + ttnn::BcastOpDim::W, + output_memory_config), after_permute_dims, output_memory_config); Tensor grad_result = result; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp index 996d1181357..813c39314f4 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp @@ -16,7 +16,7 @@ Tensor change_layout_to_tile(const Tensor& temp, const MemoryConfig& output_mem_ struct ExecuteUnaryBackwardNeg { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, @@ -438,7 +438,7 @@ struct ExecuteUnaryBackwardErf { struct ExecuteUnaryBackwardRsqrt { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, @@ -502,7 +502,7 @@ struct ExecuteUnaryBackwardRepeat { struct ExecuteUnaryBackwardPow { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, float parameter, @@ -519,7 +519,7 @@ struct ExecuteUnaryBackwardPow { struct ExecuteUnaryBackwardExp { static std::vector> invoke( - uint8_t 
queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, @@ -534,7 +534,7 @@ struct ExecuteUnaryBackwardExp { struct ExecuteUnaryBackwardTanh { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, @@ -549,7 +549,7 @@ struct ExecuteUnaryBackwardTanh { struct ExecuteUnaryBackwardSqrt { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, @@ -564,7 +564,7 @@ struct ExecuteUnaryBackwardSqrt { struct ExecuteUnaryBackwardSilu { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, @@ -579,7 +579,7 @@ struct ExecuteUnaryBackwardSilu { struct ExecuteUnaryBackwardFill { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, @@ -625,7 +625,7 @@ struct ExecuteUnaryBackwardAbs { struct ExecuteUnaryBackwardGelu { static std::vector> invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& grad_tensor_arg, const Tensor& input_tensor_arg, const string& parameter_a, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward_pybind.hpp index 1c58cde3342..86c8347c8b0 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward_pybind.hpp @@ -222,7 +222,7 @@ void bind_unary_backward_rsqrt( const ttnn::Tensor& input_tensor, const std::optional& memory_config, const std::optional& input_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self(queue_id, grad_tensor, input_tensor, memory_config, input_grad); }, py::arg("grad_tensor"), @@ -866,7 +866,7 @@ void bind_unary_backward_unary_optional_float( float parameter, const std::optional& memory_config, const std::optional& input_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self(queue_id, grad_tensor, input_tensor, parameter, memory_config, input_grad); }, py::arg("grad_tensor"), @@ -875,7 +875,7 @@ void bind_unary_backward_unary_optional_float( py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("input_grad") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } template @@ -1012,7 +1012,7 @@ void bind_unary_backward_optional( const ttnn::Tensor& input_tensor, const std::optional& memory_config, const std::optional& input_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self(queue_id, grad_tensor, input_tensor, memory_config, input_grad); }, py::arg("grad_tensor"), @@ -1083,7 +1083,7 @@ void bind_unary_backward_neg( const ttnn::Tensor& input_tensor, const std::optional& memory_config, const std::optional& input_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self(queue_id, grad_tensor, input_tensor, memory_config, input_grad); }, py::arg("grad_tensor"), @@ -1227,7 +1227,7 @@ void bind_unary_backward_gelu( string parameter_a, const std::optional& 
memory_config, const std::optional& input_grad, - const uint8_t& queue_id) -> std::vector> { + QueueId queue_id) -> std::vector> { return self(queue_id, grad_tensor, input_tensor, parameter_a, memory_config, input_grad); }, py::arg("grad_tensor"), diff --git a/ttnn/cpp/ttnn/operations/embedding/embedding.cpp b/ttnn/cpp/ttnn/operations/embedding/embedding.cpp index f10c793c4fb..ab546a0fa70 100644 --- a/ttnn/cpp/ttnn/operations/embedding/embedding.cpp +++ b/ttnn/cpp/ttnn/operations/embedding/embedding.cpp @@ -6,7 +6,7 @@ #include #include "ttnn/operations/core/core.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/embedding/device/embedding_device_operation.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/data_movement/unsqueeze/unsqueeze.hpp" @@ -14,7 +14,7 @@ namespace ttnn::operations::embedding { ttnn::Tensor EmbeddingOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_arg, const Tensor& weight_arg, const std::optional& pad_token, diff --git a/ttnn/cpp/ttnn/operations/embedding/embedding.hpp b/ttnn/cpp/ttnn/operations/embedding/embedding.hpp index e41e513434a..c2eac637ab1 100644 --- a/ttnn/cpp/ttnn/operations/embedding/embedding.hpp +++ b/ttnn/cpp/ttnn/operations/embedding/embedding.hpp @@ -15,7 +15,7 @@ namespace embedding { struct EmbeddingOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_arg, const Tensor& weight_arg, const std::optional& pad_token = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/embedding/embedding_pybind.hpp b/ttnn/cpp/ttnn/operations/embedding/embedding_pybind.hpp index b384c2916a4..26317cc9696 100644 --- a/ttnn/cpp/ttnn/operations/embedding/embedding_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/embedding/embedding_pybind.hpp @@ -74,7 +74,7 @@ void py_module(py::module& module) { const std::optional dtype, std::optional& optional_output_tensor, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -95,7 +95,7 @@ void py_module(py::module& module) { py::arg("dtype").noconvert() = std::nullopt, py::arg("output_tensor").noconvert() = std::nullopt, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::embedding diff --git a/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.cpp b/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.cpp index 488b88a9a75..99825d65a61 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.cpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.cpp @@ -14,7 +14,7 @@ namespace ttnn::operations::embedding_backward { Tensor EmbeddingBackwardOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_arg, const Tensor& weight_tensor_arg, const Tensor& output_gradient_tensor_arg, diff --git a/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.hpp b/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.hpp index 6a1859fdde1..bfea45afdd0 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.hpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.hpp @@ -14,7 +14,7 @@ namespace embedding_backward { struct EmbeddingBackwardOperation { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_arg, const Tensor& weight_tensor_arg, const Tensor& 
output_gradient_tensor_arg, diff --git a/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward_pybind.cpp b/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward_pybind.cpp index 98df95aef60..c58b52cb640 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward_pybind.cpp @@ -70,7 +70,7 @@ void py_bind_embedding_backward(py::module& module) { const std::optional dtype, std::optional& optional_output_tensor, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -87,7 +87,7 @@ void py_bind_embedding_backward(py::module& module) { py::arg("dtype").noconvert() = std::nullopt, py::arg("output_tensor").noconvert() = std::nullopt, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::embedding_backward diff --git a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp index 5f60337d2ab..71129d82a7b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp @@ -92,7 +92,7 @@ Tensor AutoFormat::format_input_tensor( } else if (!convert_layout && pad_input) { if (formatted_input.get_layout() == Layout::ROW_MAJOR || formatted_input.get_layout() == Layout::TILE) { return ttnn::pad( - 0, + DefaultQueueId, (const ttnn::Tensor)formatted_input, padded_shape.to_array_4D(), tt::tt_metal::Array4D({0, 0, 0, 0}), @@ -113,7 +113,7 @@ Tensor AutoFormat::format_input_tensor( } else if (formatted_input.get_layout() == Layout::TILE && target_layout == Layout::ROW_MAJOR) { formatted_input = ttnn::untilize(formatted_input, mem_config); return ttnn::pad( - 0, + DefaultQueueId, (const ttnn::Tensor)formatted_input, padded_shape.to_array_4D(), tt::tt_metal::Array4D({0, 0, 0, 0}), diff --git a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp index 300c112e968..1ea1da85ce0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp @@ -5,12 +5,12 @@ #include "convert_to_chw.hpp" #include "device/convert_to_chw_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" namespace ttnn::operations::experimental::cnn { ttnn::Tensor ExecuteConvertToCHW::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& a, const std::optional& memory_config, const std::optional& dtype) { diff --git a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.hpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.hpp index 16404d4c511..8dd15d4d3f3 100644 --- a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.hpp @@ -11,7 +11,10 @@ namespace ttnn::operations::experimental::cnn { struct ExecuteConvertToCHW { static ttnn::Tensor invoke( - uint8_t queue_id, const Tensor& a, const std::optional& memory_config = std::nullopt, const std::optional& dtype = std::nullopt); + QueueId queue_id, + const Tensor& a, + const std::optional& memory_config = std::nullopt, + const std::optional& dtype = std::nullopt); static ttnn::Tensor 
invoke(const Tensor& a, const std::optional& memory_config = std::nullopt, const std::optional& dtype = std::nullopt); }; diff --git a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw_pybind.cpp index a605c9655c8..bd637966797 100644 --- a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw_pybind.cpp @@ -28,12 +28,12 @@ void bind_convert_to_chw(py::module& module) { const ttnn::Tensor& input, const std::optional& memory_config, const std::optional dtype, - uint8_t queue_id) { return self(queue_id, input, memory_config, dtype); }, + QueueId queue_id) { return self(queue_id, input, memory_config, dtype); }, py::arg("input"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("dtype") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::cnn::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.cpp b/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.cpp index 8e399c4c76a..5dba4ac14f4 100644 --- a/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/decorators.hpp" #include "typecast.hpp" @@ -11,7 +11,7 @@ namespace ttnn::operations::experimental::copy { ttnn::Tensor TypecastOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const DataType& dtype, const std::optional& output_mem_config, diff --git a/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.hpp b/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.hpp index 87884e865fd..fcd8ef371fe 100644 --- a/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.hpp @@ -13,7 +13,7 @@ namespace operations::experimental::copy { struct TypecastOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const DataType& dtype, const std::optional& output_mem_config = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast_pybind.cpp index 6540a57ec8f..83fe64915e0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast_pybind.cpp @@ -42,12 +42,12 @@ void py_bind_typecast(py::module& module) { const ttnn::DataType dtype, const std::optional& memory_config, const std::optional& optional_output_tensor, - uint8_t queue_id) { return self(queue_id, input_tensor, dtype, memory_config, optional_output_tensor); }, + QueueId queue_id) { return self(queue_id, input_tensor, dtype, memory_config, optional_output_tensor); }, py::arg("input_tensor").noconvert(), py::arg("dtype").noconvert(), py::arg("memory_config") = std::nullopt, py::arg("optional_output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::copy::detail diff --git 
a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.cpp index 75ec4161d59..0788ebf8fdc 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.cpp @@ -12,7 +12,7 @@ namespace ttnn::operations::experimental::matmul { ttnn::Tensor AttnMatmulOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const CoreCoord& compute_with_storage_grid_size, @@ -60,7 +60,7 @@ ttnn::Tensor AttnMatmulOperation::invoke( // TODO: Should we support option to read directly from cache (with optional transpose_hw)? ttnn::Tensor AttnMatmulFromCacheOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const uint32_t num_tokens, diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.hpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.hpp index 9bd8819ace8..97b01b3f687 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.hpp @@ -14,7 +14,7 @@ namespace operations::experimental::matmul { // KV heads = 1) a special case of group_attn_matmul and run the same op struct AttnMatmulOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const CoreCoord& compute_with_storage_grid_size, @@ -35,7 +35,7 @@ struct AttnMatmulOperation { struct AttnMatmulFromCacheOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const uint32_t num_tokens, diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul_pybind.cpp index dc5e33aa796..eff4087f40c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul_pybind.cpp @@ -26,7 +26,7 @@ void bind_attn_matmul(pybind11::module& module) { std::optional compute_kernel_config, const std::optional& memory_config, std::optional optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_a, @@ -45,7 +45,7 @@ void bind_attn_matmul(pybind11::module& module) { pybind11::arg("compute_kernel_config").noconvert() = std::nullopt, pybind11::arg("memory_config") = std::nullopt, pybind11::arg("output_tensor") = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); } void bind_attn_matmul_from_cache(pybind11::module& module) { @@ -66,7 +66,7 @@ void bind_attn_matmul_from_cache(pybind11::module& module) { const std::optional& memory_config, std::optional dtype, std::optional compute_kernel_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_a, @@ -87,7 +87,7 @@ void bind_attn_matmul_from_cache(pybind11::module& module) { pybind11::arg("memory_config") = std::nullopt, pybind11::arg("dtype") = std::nullopt, pybind11::arg("compute_kernel_config") = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::matmul::detail diff --git 
a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.hpp index 2427cc9c7a4..2db317e53aa 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" #include "ttnn/operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.hpp index 3646bbf74d3..14531364344 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" #include "ttnn/operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.cpp index 21c5678f9a2..a4b967fc04c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.cpp @@ -12,7 +12,7 @@ namespace ttnn::operations::experimental::matmul { ttnn::Tensor GroupAttnMatmulOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const CoreCoord& compute_with_storage_grid_size, diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.hpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.hpp index a6cbeb1dd4d..74faf842112 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.hpp @@ -14,7 +14,7 @@ namespace operations::experimental::matmul { // KV heads = 1) a special case of group_attn_matmul and run the same op struct GroupAttnMatmulOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_a, const Tensor& input_tensor_b, const CoreCoord& compute_with_storage_grid_size, diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp index 96ef379d6a9..ec5c3b375bb 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp @@ -28,7 +28,7 @@ void bind_group_attn_matmul(pybind11::module& module) { std::optional output_dtype, std::optional compute_kernel_config, std::optional optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_a, @@ -47,7 +47,7 @@ void 
bind_group_attn_matmul(pybind11::module& module) { pybind11::arg("dtype").noconvert() = std::nullopt, pybind11::arg("compute_kernel_config").noconvert() = std::nullopt, pybind11::arg("optional_output_tensor").noconvert() = std::nullopt, - pybind11::arg("queue_id").noconvert() = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::matmul::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.hpp b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.hpp index 3fe32c30642..fe5649ae288 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp index e58b96c3616..9ff82db36ce 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp @@ -11,7 +11,7 @@ namespace ttnn::operations::experimental { -ttnn::Tensor PlusOneOperation::invoke(uint8_t queue_id, const Tensor& input_tensor) { +ttnn::Tensor PlusOneOperation::invoke(QueueId queue_id, const Tensor& input_tensor) { return operation::run(PlusOne{}, {input_tensor}, {}, {}, queue_id).at(0); } diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.hpp b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.hpp index 8e6b1cc69f4..4ffeafeb2aa 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.hpp @@ -12,7 +12,7 @@ namespace ttnn { namespace operations::experimental { struct PlusOneOperation { - static ttnn::Tensor invoke(uint8_t queue_id, const Tensor& input_tensor); + static ttnn::Tensor invoke(QueueId queue_id, const Tensor& input_tensor); static ttnn::Tensor invoke(const Tensor& input_tensor); }; diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.cpp index 84849800527..c5ff7ca5b85 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.cpp @@ -13,7 +13,7 @@ namespace ttnn::operations::experimental::reduction::detail { Tensor _fast_reduce_nc( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input, const int32_t& dim, const std::optional& output, @@ -99,7 +99,7 @@ operation::ProgramWithCallbacks FastReduceNCDeviceOperation::create_program( } Tensor fast_reduce_nc( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input, tt::stl::Span dims, const std::optional& output, diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.hpp index 9e5de56181f..99d44575f9e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.hpp @@ 
-6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/core/core.hpp" @@ -28,7 +28,7 @@ struct FastReduceNCDeviceOperation { }; Tensor fast_reduce_nc( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input, tt::stl::Span dims, const std::optional& output = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.cpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.cpp index e08e3e02c64..68659d1c35d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.cpp @@ -12,7 +12,7 @@ namespace ttnn { namespace operations::experimental::reduction { ttnn::Tensor FastReduceNCOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input, tt::stl::Span dims, const std::optional& output, diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.hpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.hpp index 9f0220f1ebc..a8a771c8a22 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.hpp @@ -14,7 +14,7 @@ namespace operations::experimental::reduction { struct FastReduceNCOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input, tt::stl::Span dims, const std::optional& output, diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc_pybind.cpp index e0dd667fdb4..20fdbd17ed0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc_pybind.cpp @@ -24,14 +24,14 @@ void bind_fast_reduce_nc(pybind11::module& module) { const std::optional& output, const ttnn::MemoryConfig& memory_config, std::optional compute_kernel_config, - uint8_t queue_id) { return self(queue_id, input, dims, output, memory_config, compute_kernel_config); }, + QueueId queue_id) { return self(queue_id, input, dims, output, memory_config, compute_kernel_config); }, pybind11::arg("input").noconvert(), pybind11::kw_only(), pybind11::arg("dims").noconvert() = ttnn::SmallVector(), pybind11::arg("output").noconvert() = std::nullopt, pybind11::arg("memory_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, pybind11::arg("compute_kernel_config").noconvert() = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::reduction::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp b/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp index d5712222b3c..1a7aaf2fa0d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp @@ -4,7 +4,7 @@ #include "view.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" #include #include diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.cpp index 06c36105c48..0118bdf4cdc 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.cpp @@ -4,7 +4,7 @@ #include "hc_sum_reduce_program_factory.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include namespace ttnn::operations::experimental::ssm::detail { diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp index 36955ec9e83..c8b56723fdf 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp @@ -5,14 +5,14 @@ #include "hc_sum_reduce.hpp" #include "device/hc_sum_reduce_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" using namespace tt::tt_metal; namespace ttnn::operations::experimental::ssm { ttnn::Tensor ExecuteHCSumReduce::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const std::optional& memory_config, const std::optional dtype, diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.hpp index def47be1df5..cc4b999db4e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.hpp @@ -11,7 +11,7 @@ namespace ttnn::operations::experimental::ssm { struct ExecuteHCSumReduce { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input, const std::optional& memory_config = std::nullopt, const std::optional dtype = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce_pybind.cpp index 2431302e48e..c69b183ee58 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce_pybind.cpp @@ -32,13 +32,13 @@ void bind_hc_sum_reduce(py::module& module) { const std::optional& memory_config, const std::optional dtype, const std::optional math_fidelity, - uint8_t queue_id) { return self(queue_id, input, memory_config, dtype, math_fidelity); }, + QueueId queue_id) { return self(queue_id, input, memory_config, dtype, math_fidelity); }, py::arg("input"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("dtype") = std::nullopt, py::arg("math_fidelity") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::ssm::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp index 71235041dd4..70c9eb21f5d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp @@ -5,14 +5,14 @@ #include "prefix_scan.hpp" #include "device/prefix_scan_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" using namespace tt::tt_metal; namespace ttnn::operations::experimental::ssm { 
ttnn::Tensor ExecutePrefixScan::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& a, const Tensor& bx, const Tensor& h_prev, diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.hpp index 71c6d9dc5af..7191853626d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.hpp @@ -11,7 +11,7 @@ namespace ttnn::operations::experimental::ssm { struct ExecutePrefixScan { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& a, const Tensor& bx, const Tensor& h_prev, diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan_pybind.cpp index 8a6f8506eb9..4451a71685b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan_pybind.cpp @@ -32,7 +32,7 @@ void bind_prefix_scan(py::module& module) { const std::optional& memory_config, const std::optional dtype, const std::optional math_fidelity, - uint8_t queue_id) { return self(queue_id, a, bx, h_prev, memory_config, dtype, math_fidelity); }, + QueueId queue_id) { return self(queue_id, a, bx, h_prev, memory_config, dtype, math_fidelity); }, py::arg("a"), py::arg("bx"), py::arg("h_prev"), @@ -40,7 +40,7 @@ void bind_prefix_scan(py::module& module) { py::arg("memory_config") = std::nullopt, py::arg("dtype") = std::nullopt, py::arg("math_fidelity") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::ssm::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.cpp index 746b5cc8d7a..af5752dfe1e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.cpp @@ -4,7 +4,7 @@ #include "repeat_and_interleave_eltwise_mul_program_factory.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include namespace ttnn::operations::experimental::ssm::detail { diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp index 7f60bbaa80f..52fabc138df 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp @@ -5,14 +5,14 @@ #include "repeat_and_interleave_eltwise_mul.hpp" #include "device/repeat_and_interleave_eltwise_mul_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" using namespace tt::tt_metal; namespace ttnn::operations::experimental::ssm { ttnn::Tensor ExecuteRepeatAndInterleaveEltwiseMul::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& a, const Tensor& 
b, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.hpp index 7b5eed045f4..446b568947f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.hpp @@ -12,7 +12,7 @@ namespace ttnn::operations::experimental::ssm { struct ExecuteRepeatAndInterleaveEltwiseMul { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& a, const Tensor& b, const std::optional& memory_config = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul_pybind.cpp index be99bd40725..112a1b7ebcf 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul_pybind.cpp @@ -32,14 +32,14 @@ void bind_repeat_and_interleave_eltwise_mul(py::module& module) { const std::optional& memory_config, const std::optional dtype, const std::optional math_fidelity, - uint8_t queue_id) { return self(queue_id, a, b, memory_config, dtype, math_fidelity); }, + QueueId queue_id) { return self(queue_id, a, b, memory_config, dtype, math_fidelity); }, py::arg("a"), py::arg("b"), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("dtype") = std::nullopt, py::arg("math_fidelity") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::ssm::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads.hpp index a6bf0cf13a1..38b6905baec 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads.hpp @@ -13,7 +13,7 @@ namespace operations::experimental::transformer { struct ConcatenateHeadsOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const CoreCoord& compute_with_storage_grid_size, const std::optional& memory_config = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads_pybind.hpp index 1c59b9091ae..468d2f3b7b6 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads_pybind.hpp @@ -40,7 +40,7 @@ void bind_concatenate_heads(py::module& module) { const CoreCoord& compute_with_storage_grid_size, const std::optional& memory_config, std::optional optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, compute_with_storage_grid_size, memory_config, optional_output_tensor); }, 
@@ -49,7 +49,7 @@ void bind_concatenate_heads(py::module& module) { py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.hpp index 33e98d9d223..cc62e8f8e48 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp index c2cd68f72c4..43d3a084faf 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp @@ -8,12 +8,12 @@ #include "device/create_qkv_heads_device_operation.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/core/core.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" namespace ttnn::operations::experimental::transformer { std::tuple CreateQKVHeadsOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const uint32_t num_q_heads, const std::optional num_kv_heads, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.hpp index a04f88f481e..48d68afa3dd 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.hpp @@ -12,7 +12,7 @@ namespace operations::experimental::transformer { struct CreateQKVHeadsOperation { static std::tuple invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const uint32_t num_q_heads, const std::optional num_kv_heads, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads_pybind.cpp index 483597182ab..681536849a6 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads_pybind.cpp @@ -25,7 +25,7 @@ void bind_create_qkv_heads_template(pybind11::module& module, const transformer_ const bool transpose_k_heads, const std::optional& memory_config, std::optional> optional_output_tensors, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_q, @@ -42,7 +42,7 @@ void bind_create_qkv_heads_template(pybind11::module& module, const transformer_ pybind11::arg("transpose_k_heads").noconvert() = true, pybind11::arg("memory_config").noconvert() = std::nullopt, pybind11::arg("output_tensors").noconvert() = std::nullopt, - 
pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); }; void bind_create_qkv_heads(pybind11::module& module) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.cpp index e4ee61ce496..bc78ada8d7b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.cpp @@ -8,12 +8,12 @@ #include "device/create_qkv_heads_from_separate_tensors_device_operation.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/core/core.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" namespace ttnn::operations::experimental::transformer { std::tuple CreateQKVHeadsSeparateTensorsOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_q, const Tensor& input_tensor_kv, const uint32_t num_q_heads, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.hpp index a4d36cf5505..d48235bb338 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.hpp @@ -12,7 +12,7 @@ namespace operations::experimental::transformer { struct CreateQKVHeadsSeparateTensorsOperation { static std::tuple invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const Tensor& input_tensor_kv, const uint32_t num_q_heads, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors_pybind.cpp index 99b8a07c7b7..2dfa333c095 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors_pybind.cpp @@ -27,7 +27,7 @@ void bind_create_qkv_heads_from_separate_tensors_template( const bool transpose_k_heads, const std::optional& memory_config, std::optional> optional_output_tensors, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_q, @@ -46,7 +46,7 @@ void bind_create_qkv_heads_from_separate_tensors_template( pybind11::arg("transpose_k_heads").noconvert() = true, pybind11::arg("memory_config").noconvert() = std::nullopt, pybind11::arg("output_tensors").noconvert() = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); }; void bind_create_qkv_heads_from_separate_tensors(pybind11::module& module) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.hpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.hpp index 9a8f89518ec..ddfafc6d76a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.cpp index 61e678c3cd2..2c7b211b3c9 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.cpp @@ -12,7 +12,7 @@ namespace ttnn::operations::experimental::transformer { ttnn::Tensor NLPConcatHeadsOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config, std::optional optional_output_tensor) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.hpp index 097a2ccd438..c178bebb842 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.hpp @@ -11,7 +11,7 @@ namespace operations::experimental::transformer { struct NLPConcatHeadsOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, std::optional optional_output_tensor = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads_pybind.cpp index a6988201ac4..d9a8d029e78 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads_pybind.cpp @@ -23,12 +23,12 @@ void bind_nlp_concat_heads(py::module& module) { const ttnn::Tensor& input_tensor, const std::optional& memory_config, std::optional optional_output_tensor, - uint8_t queue_id) { return self(queue_id, input_tensor, memory_config, optional_output_tensor); }, + QueueId queue_id) { return self(queue_id, input_tensor, memory_config, optional_output_tensor); }, py::arg("input_tensor").noconvert(), py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.hpp index 9758dd76658..6f928ceee9d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.hpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.cpp index 901b9c98988..1c58a79db04 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.cpp @@ -12,7 +12,7 @@ namespace ttnn::operations::experimental::transformer { ttnn::Tensor NLPConcatHeadsDecodeOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const uint32_t num_heads, const std::optional& memory_config, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.hpp index 2296fafe9e0..5089f9e6708 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.hpp @@ -11,7 +11,7 @@ namespace operations::experimental::transformer { struct NLPConcatHeadsDecodeOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const uint32_t num_heads, const std::optional& memory_config = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode_pybind.cpp index c0d333dabcc..b0fce2bdc0e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode_pybind.cpp @@ -24,7 +24,7 @@ void bind_nlp_concat_heads_decode(py::module& module) { const uint32_t num_heads, const std::optional& memory_config, std::optional optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, num_heads, memory_config, optional_output_tensor); }, py::arg("input_tensor").noconvert(), @@ -32,7 +32,7 @@ void bind_nlp_concat_heads_decode(py::module& module) { py::arg("num_heads").noconvert(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.hpp index 215c925db7a..86cfc534c5e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.hpp @@ -9,7 +9,7 @@ #include "ttnn/run_operation.hpp" 
#include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/device_operation.hpp" #include "ttnn/decorators.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads.cpp index 7d2376758b0..bb1df667e7a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads.cpp @@ -9,7 +9,7 @@ namespace ttnn::operations::experimental::transformer { std::tuple NlpCreateHeadsOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_q, const std::optional& input_tensor_kv, const uint32_t num_q_heads, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads.hpp index 0db5a18b772..0726da7d96c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads.hpp @@ -13,7 +13,7 @@ namespace operations::experimental::transformer { struct NlpCreateHeadsOperation { static std::tuple invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_q, const std::optional& input_tensor_kv, const uint32_t num_q_heads, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads_pybind.cpp index eb3d0b5ea65..61eb5f1283b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads_pybind.cpp @@ -26,7 +26,7 @@ void bind_nlp_create_qkv_heads_template(pybind11::module& module, const transfor const bool transpose_k_heads, const std::optional& memory_config, std::optional>>& optional_output_tensors, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_q, @@ -45,7 +45,7 @@ void bind_nlp_create_qkv_heads_template(pybind11::module& module, const transfor pybind11::arg("transpose_k_heads").noconvert() = true, pybind11::arg("memory_config").noconvert() = std::nullopt, pybind11::arg("output_tensors").noconvert() = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); }; void bind_nlp_create_qkv_heads(pybind11::module& module) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.cpp index 736df093c67..502f186e2d3 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.cpp @@ -8,12 +8,12 @@ #include "device/nlp_create_qkv_heads_decode_device_operation.hpp" #include "ttnn/run_operation.hpp" #include "ttnn/operations/core/core.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" namespace 
ttnn::operations::experimental::transformer { std::tuple NLPCreateHeadsDecodeOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const uint32_t num_heads, const std::optional num_kv_heads, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.hpp index cd32396e2a5..c7e78004ac8 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.hpp @@ -12,7 +12,7 @@ namespace operations::experimental::transformer { struct NLPCreateHeadsDecodeOperation { static std::tuple invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const uint32_t num_heads, const std::optional num_kv_heads, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode_pybind.cpp index d005aaef90d..57edef12c29 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode_pybind.cpp @@ -27,7 +27,7 @@ void bind_nlp_create_qkv_heads_decode(pybind11::module& module) { const std::optional slice_size, const std::optional& memory_config, std::optional> optional_output_tensors, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -48,7 +48,7 @@ void bind_nlp_create_qkv_heads_decode(pybind11::module& module) { pybind11::arg("slice_size").noconvert() = std::nullopt, pybind11::arg("memory_config") = std::nullopt, pybind11::arg("output_tensors") = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.hpp index aa454997871..1f9e2ecfe52 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.hpp @@ -9,7 +9,7 @@ #include "ttnn/run_operation.hpp" #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/device_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp index b23db330c65..e899b817946 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp @@ -9,7 +9,7 @@ namespace 
ttnn::operations::experimental::transformer { std::tuple NLPCreateHeadsFalcon7bOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_q, const std::optional& memory_config, std::optional>> optional_output_tensors) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.hpp index d422af8fdaf..5f7db851efb 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.hpp @@ -13,7 +13,7 @@ namespace operations::experimental::transformer { struct NLPCreateHeadsFalcon7bOperation { static std::tuple invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_q, const std::optional& memory_config, std::optional>> optional_output_tensors = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b_pybind.cpp index 320914c1388..7d7e802480f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b_pybind.cpp @@ -21,11 +21,11 @@ void bind_nlp_create_qkv_heads_falcon7b(pybind11::module& module) { const ttnn::Tensor& input_tensor_q, const std::optional& memory_config, std::optional>>& optional_output_tensors, - uint8_t queue_id) { return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); }, + QueueId queue_id) { return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); }, pybind11::arg("input").noconvert(), pybind11::kw_only(), pybind11::arg("memory_config").noconvert() = std::nullopt, pybind11::arg("output_tensors").noconvert() = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); }; } // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp index 09b08e2ddf1..37acf28eb27 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp @@ -9,7 +9,7 @@ #include "ttnn/run_operation.hpp" #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/device_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp index b8e44768999..2bc5c409dbf 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp @@ -9,7 +9,7 @@ namespace ttnn::operations::experimental::transformer { std::tuple NLPCreateHeadsSegformerOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_q, const std::optional& memory_config, std::optional>> optional_output_tensors) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp index 384dbbc5571..67837e650e0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp @@ -13,7 +13,7 @@ namespace operations::experimental::transformer { struct NLPCreateHeadsSegformerOperation { static std::tuple invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_q, const std::optional& memory_config, std::optional>> optional_output_tensors = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp index c0c16fe9335..baec17c263f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp @@ -21,11 +21,11 @@ void bind_nlp_create_qkv_heads_segformer(pybind11::module& module) { const ttnn::Tensor& input_tensor_q, const std::optional& memory_config, std::optional>>& optional_output_tensors, - uint8_t queue_id) { return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); }, + QueueId queue_id) { return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); }, pybind11::arg("input").noconvert(), pybind11::kw_only(), pybind11::arg("memory_config").noconvert() = std::nullopt, pybind11::arg("output_tensors").noconvert() = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); }; } // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp index f49dcb773f8..fb938937037 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp @@ -9,7 +9,7 @@ #include "ttnn/run_operation.hpp" #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/device_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp index 1d370aeb57c..a7577184fcd 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp @@ -9,7 +9,7 @@ namespace ttnn::operations::experimental::transformer { std::tuple NLPCreateHeadsVitOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_q, const std::optional& memory_config, std::optional>> optional_output_tensors) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp index 2651a9dc9bb..41fe48dba69 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp @@ -13,7 +13,7 @@ namespace operations::experimental::transformer { struct NLPCreateHeadsVitOperation { static std::tuple invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor_q, const std::optional& memory_config, std::optional>> optional_output_tensors = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp index 00ed867cec6..ace277ccc2c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp @@ -21,11 +21,11 @@ void bind_nlp_create_qkv_heads_vit(pybind11::module& module) { const ttnn::Tensor& input_tensor_q, const std::optional& memory_config, std::optional>>& optional_output_tensors, - uint8_t queue_id) { return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); }, + QueueId queue_id) { return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); }, pybind11::arg("input").noconvert(), pybind11::kw_only(), pybind11::arg("memory_config").noconvert() = std::nullopt, pybind11::arg("output_tensors").noconvert() = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); }; } // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.hpp index 2e831409dd7..9788f53f272 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.cpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.cpp index a787caff380..6df2cde1478 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.cpp @@ -12,7 +12,7 @@ namespace ttnn::operations::experimental::transformer { ttnn::Tensor NLPKVCacheLoadSliceOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const uint32_t seq_len_start, const uint32_t seq_len_end, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.hpp index 042458091bd..6f51a88bf8e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.hpp @@ -11,7 +11,7 @@ namespace operations::experimental::transformer { struct NLPKVCacheLoadSliceOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const uint32_t seq_len_start, const uint32_t seq_len_end, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice_pybind.cpp index 8f979d12e4f..b1c6b69b74d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice_pybind.cpp @@ -24,7 +24,7 @@ void bind_nlp_kv_cache_load_slice(pybind11::module& module) { const uint32_t seq_len_end, const std::optional& memory_config, std::optional optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, seq_len_start, seq_len_end, memory_config, optional_output_tensor); }, pybind11::arg("input_tensor").noconvert(), @@ -33,7 +33,7 @@ void bind_nlp_kv_cache_load_slice(pybind11::module& module) { pybind11::arg("seq_len_end").noconvert(), pybind11::arg("memory_config") = std::nullopt, pybind11::arg("output_tensor") = std::nullopt, - pybind11::arg("queue_id") = 0}); + pybind11::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.hpp index d0bedcbd258..6ed228d4bcb 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" diff --git 
a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads.hpp index 2235fdf86f0..0c3a12c4be0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads.hpp @@ -13,7 +13,7 @@ namespace operations::experimental::transformer { struct SplitFusedQKVAndSplitHeadsOperation { static std::tuple invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const CoreCoord& compute_with_storage_grid_size, const std::optional& memory_config = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads_pybind.hpp index 0341a2f11b5..fc59a6c394a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads_pybind.hpp @@ -41,7 +41,7 @@ void bind_split_qkv(py::module& module) { const std::optional& memory_config, const uint32_t num_heads, std::optional>> optional_output_tensors, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -56,7 +56,7 @@ void bind_split_qkv(py::module& module) { py::arg("memory_config") = std::nullopt, py::arg("num_heads") = 16, py::arg("output_tensors") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/kv_cache/kv_cache_pybind.cpp b/ttnn/cpp/ttnn/operations/kv_cache/kv_cache_pybind.cpp index 752e56037eb..17fddfe861a 100644 --- a/ttnn/cpp/ttnn/operations/kv_cache/kv_cache_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/kv_cache/kv_cache_pybind.cpp @@ -88,7 +88,7 @@ void bind_update_cache_for_token_(py::module& module, const kv_cache_operation_t py::arg("cache"), py::arg("input"), py::arg("update_index"), - py::arg("batch_offset") = 0}); + py::arg("batch_offset") = DefaultQueueId}); } template diff --git a/ttnn/cpp/ttnn/operations/loss/loss.cpp b/ttnn/cpp/ttnn/operations/loss/loss.cpp index a2bc571df73..122497e838b 100644 --- a/ttnn/cpp/ttnn/operations/loss/loss.cpp +++ b/ttnn/cpp/ttnn/operations/loss/loss.cpp @@ -23,7 +23,7 @@ using ttnn::operations::unary::UnaryOpType; using ttnn::operations::unary::UnaryWithParam; Tensor loss_function( - uint8_t queue_id, + QueueId queue_id, const Tensor& ref, const Tensor& prediction, const LossFunction loss_kind, @@ -56,7 +56,7 @@ Tensor loss_function( } // namespace loss_utils Tensor MseLossOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& ref, const Tensor& prediction, const LossReductionMode mode, @@ -67,7 +67,7 @@ Tensor MseLossOperation::invoke( } Tensor MaeLossOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& ref, const Tensor& prediction, const LossReductionMode mode, diff --git a/ttnn/cpp/ttnn/operations/loss/loss.hpp b/ttnn/cpp/ttnn/operations/loss/loss.hpp 
index f4c8f2f7ecf..156192bac6f 100644 --- a/ttnn/cpp/ttnn/operations/loss/loss.hpp +++ b/ttnn/cpp/ttnn/operations/loss/loss.hpp @@ -9,7 +9,7 @@ #include "loss_types.hpp" #include "ttnn/decorators.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" namespace ttnn { @@ -17,7 +17,7 @@ namespace operations::loss { struct MseLossOperation { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& ref, const Tensor& prediction, const LossReductionMode mode = LossReductionMode::NONE, @@ -36,7 +36,7 @@ struct MseLossOperation { struct MaeLossOperation { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& ref, const Tensor& prediction, const LossReductionMode mode = LossReductionMode::NONE, diff --git a/ttnn/cpp/ttnn/operations/loss/loss_pybind.cpp b/ttnn/cpp/ttnn/operations/loss/loss_pybind.cpp index ea67e97b9b6..5773399fd00 100644 --- a/ttnn/cpp/ttnn/operations/loss/loss_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/loss/loss_pybind.cpp @@ -60,7 +60,7 @@ void bind_mse_loss_function(py::module& module) { const LossReductionMode mode, const std::optional& memory_config, std::optional optional_output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, ref, prediction, mode, memory_config, optional_output_tensor); }, py::arg("input_reference"), @@ -69,7 +69,7 @@ void bind_mse_loss_function(py::module& module) { py::arg("reduction") = LossReductionMode::NONE, py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } void bind_mae_loss_function(py::module& module) { @@ -111,7 +111,7 @@ void bind_mae_loss_function(py::module& module) { const LossReductionMode mode, const std::optional& memory_config, std::optional optional_output_tensor, - uint8_t queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self(queue_id, ref, prediction, mode, memory_config, optional_output_tensor); }, py::arg("input_reference"), @@ -120,7 +120,7 @@ void bind_mae_loss_function(py::module& module) { py::arg("reduction") = LossReductionMode::NONE, py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp index 979eebd4233..b027c70e19c 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp @@ -1349,7 +1349,7 @@ Tensor matmul( const Tensor& input_tensor_b, const std::optional& bias, const struct Matmul& parameters, - const uint8_t queue_id, + const QueueId queue_id, const std::optional& optional_output_tensor) { std::vector> optional_input_tensors = {}; std::vector output_tensors; diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp index 4dcfbd275c3..969d458b52e 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp @@ -237,7 +237,7 @@ Tensor matmul( const Tensor& input_tensor_b, const std::optional& bias = std::nullopt, const struct Matmul& parameters = Matmul{}, - const uint8_t queue_id = 0, + const QueueId queue_id = DefaultQueueId, const std::optional& optional_output_tensor = std::nullopt); } // namespace matmul diff --git 
a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp index 72b50ca7de2..e3a7b866bc1 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp @@ -4,7 +4,7 @@ #include "matmul.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/eltwise/unary/unary.hpp" #include "ttnn/operations/data_movement/transpose/transpose.hpp" @@ -82,7 +82,7 @@ ttnn::Tensor bound_matmul( input_tensor_b_adjusted, post_process_bias ? std::nullopt : bias, parameters, - 0, + DefaultQueueId, optional_output_tensor = optional_output_tensor); if (post_process_bias) { diff --git a/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.cpp b/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.cpp index 5206983a317..0c62e5c91f8 100644 --- a/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.cpp +++ b/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.cpp @@ -32,7 +32,7 @@ uint32_t get_bf16_pool_init_value(Pool2DType pool_type) { template Tensor Pool2DOp::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, uint32_t batch_size, uint32_t input_h, diff --git a/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.hpp b/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.hpp index 91e5e11ebb9..d172c7b1d57 100644 --- a/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.hpp +++ b/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.hpp @@ -18,7 +18,7 @@ namespace operations::pool { template struct Pool2DOp { static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, uint32_t batch_size, uint32_t input_h, diff --git a/ttnn/cpp/ttnn/operations/pool/generic/generic_pools_pybind.cpp b/ttnn/cpp/ttnn/operations/pool/generic/generic_pools_pybind.cpp index 9b21cd943fa..7393450922a 100644 --- a/ttnn/cpp/ttnn/operations/pool/generic/generic_pools_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/pool/generic/generic_pools_pybind.cpp @@ -88,7 +88,7 @@ void bind_max_pool2d_operation(py::module& module) { const std::optional& memory_config, const std::optional applied_shard_scheme, bool ceil_mode, - const uint8_t& queue_id) -> ttnn::Tensor { + QueueId queue_id) -> ttnn::Tensor { return self( queue_id, input_tensor, @@ -117,7 +117,7 @@ void bind_max_pool2d_operation(py::module& module) { py::arg("memory_config") = std::nullopt, py::arg("applied_shard_scheme") = std::nullopt, py::arg("ceil_mode") = false, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } void py_module(py::module& module) { bind_max_pool2d_operation(module); } diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp index ec68ae046c2..d43c7df809a 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp @@ -14,7 +14,7 @@ namespace ttnn::operations::reduction { ttnn::Tensor ArgMaxOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const std::optional dim, const bool use_muticore, diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.hpp b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.hpp index 62d52047919..a708b177af9 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.hpp @@ -13,7 +13,7 @@ namespace operations::reduction { struct ArgMaxOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + 
QueueId queue_id, const Tensor& input_tensor, const std::optional dim = std::nullopt, const bool use_muticore = false, diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax_pybind.hpp b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax_pybind.hpp index 4f6468aa689..3bda8500c9d 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax_pybind.hpp @@ -58,7 +58,7 @@ void bind_reduction_argmax_operation(py::module& module) { const bool use_multicore, const std::optional& memory_config, std::optional optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, dim, use_multicore, memory_config, optional_output_tensor); }, py::arg("input_tensor").noconvert(), @@ -67,7 +67,7 @@ void bind_reduction_argmax_operation(py::module& module) { py::arg("use_multicore") = false, py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::reduction::detail diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.hpp b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.hpp index d0b6fa0b858..bd6bc5ed104 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp b/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp index dbf98519483..a230aea3d7d 100644 --- a/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp @@ -16,7 +16,7 @@ namespace ttnn::operations::reduction { ttnn::Tensor MoeOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const Tensor& expert_mask_tensor, const Tensor& topk_mask_tensor, diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/moe.hpp b/ttnn/cpp/ttnn/operations/reduction/moe/moe.hpp index 003f127dd0e..41275826aac 100644 --- a/ttnn/cpp/ttnn/operations/reduction/moe/moe.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/moe.hpp @@ -14,7 +14,7 @@ namespace operations::reduction { struct MoeOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const Tensor& expert_mask_tensor, const Tensor& topk_mask_tensor, diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/moe_pybind.hpp b/ttnn/cpp/ttnn/operations/reduction/moe/moe_pybind.hpp index 8c5a2ad83b1..a5719dbcc75 100644 --- a/ttnn/cpp/ttnn/operations/reduction/moe/moe_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/moe_pybind.hpp @@ -56,7 +56,7 @@ void bind_reduction_moe_operation(py::module& module) { const uint16_t k, const std::optional& memory_config, std::optional optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor, @@ -73,7 +73,7 @@ void bind_reduction_moe_operation(py::module& module) { py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::reduction::detail diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp index efff9da104a..2d9e1d84a4a 100644 --- 
a/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp @@ -11,7 +11,7 @@ #include "ttnn/operations/data_movement/permute/permute.hpp" #include "ttnn/operations/functions.hpp" #include "ttnn/types.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "cpp/ttnn/operations/data_movement/squeeze/squeeze.hpp" #include "ttnn/operations/core/core.hpp" diff --git a/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_op.hpp b/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_op.hpp index 3591c6d0f51..0841fb17245 100644 --- a/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_op.hpp @@ -6,7 +6,7 @@ #include -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.cpp b/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.cpp index 72bba1e3685..25102a5c799 100644 --- a/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.cpp @@ -14,7 +14,7 @@ namespace ttnn::operations::reduction { ttnn::Tensor SamplingOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_values_tensor, const Tensor& input_indices_tensor, const std::vector& k, diff --git a/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.hpp b/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.hpp index 5f8436dd727..e53f1325d2b 100644 --- a/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.hpp @@ -13,7 +13,7 @@ namespace operations::reduction { struct SamplingOperation { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_values_tensor, const Tensor& input_indices_tensor, const std::vector& k, diff --git a/ttnn/cpp/ttnn/operations/reduction/sampling/sampling_pybind.cpp b/ttnn/cpp/ttnn/operations/reduction/sampling/sampling_pybind.cpp index ebd412ece4d..81c6dd1daf6 100644 --- a/ttnn/cpp/ttnn/operations/reduction/sampling/sampling_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/sampling/sampling_pybind.cpp @@ -91,7 +91,7 @@ void bind_reduction_sampling_operation(py::module& module) { const uint32_t seed, const std::optional& sub_core_grids, std::optional optional_output_tensor, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_values_tensor, @@ -110,7 +110,7 @@ void bind_reduction_sampling_operation(py::module& module) { py::arg("seed").noconvert() = 0, py::arg("sub_core_grids") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::reduction::detail diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/topk.hpp b/ttnn/cpp/ttnn/operations/reduction/topk/topk.hpp index 7ee5ec9ec1f..9c4b17f659b 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/topk.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/topk.hpp @@ -4,7 +4,7 @@ #pragma once -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/decorators.hpp" #include "ttnn/operations/core/core.hpp" @@ -24,7 +24,7 @@ namespace operations::reduction { struct ExecuteTopK { static inline std::vector invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const uint16_t k, const int8_t dim, diff --git 
a/ttnn/cpp/ttnn/operations/reduction/topk/topk_pybind.hpp b/ttnn/cpp/ttnn/operations/reduction/topk/topk_pybind.hpp index 018cf18d1c5..4f622d6927b 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/topk_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/topk_pybind.hpp @@ -68,7 +68,7 @@ void bind_reduction_topk_operation(py::module& module) { const bool sorted, std::optional> optional_output_tensors, const std::optional& memory_config, - uint8_t queue_id) { + QueueId queue_id) { return self(queue_id, input_tensor, k, dim, largest, sorted, memory_config, optional_output_tensors); }, py::arg("input_tensor").noconvert(), @@ -79,7 +79,7 @@ void bind_reduction_topk_operation(py::module& module) { py::kw_only(), py::arg("out") = std::nullopt, py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::reduction::detail diff --git a/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.cpp b/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.cpp index 2e0afd9ce43..b054af4cc72 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.cpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.cpp @@ -8,7 +8,7 @@ #include "device/halo_device_operation.hpp" namespace ttnn::operations::sliding_window::halo { Tensor HaloOperation::invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const SlidingWindowConfig& config, uint32_t pad_val, diff --git a/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.hpp b/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.hpp index f7fef773713..31df09955ea 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.hpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.hpp @@ -12,7 +12,7 @@ namespace ttnn::operations::sliding_window::halo { struct HaloOperation { // This how the user can call the operation static Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const Tensor& input_tensor, const SlidingWindowConfig& config, uint32_t pad_val = 0x0, diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp index ee52f3c299a..6d8bc2723fe 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp @@ -8,7 +8,7 @@ #include "device/sdpa_op.hpp" #include "device/joint_sdpa_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" using namespace tt::tt_metal; @@ -16,7 +16,7 @@ using namespace tt::tt_metal; namespace ttnn::operations::transformer { ttnn::Tensor ExecuteScaledDotProductAttention::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, @@ -71,7 +71,7 @@ ttnn::Tensor ExecuteScaledDotProductAttention::invoke( } ttnn::Tensor ExecuteChunkedScaledDotProductAttention::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, @@ -126,7 +126,7 @@ ttnn::Tensor ExecuteChunkedScaledDotProductAttention::invoke( } std::tuple ExecuteJointAttention::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.hpp index abea10a2d59..b89488c9d02 100644 --- 
a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.hpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.hpp @@ -13,7 +13,7 @@ namespace operations::transformer { struct ExecuteScaledDotProductAttention { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, @@ -38,7 +38,7 @@ struct ExecuteScaledDotProductAttention { struct ExecuteChunkedScaledDotProductAttention { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, @@ -63,7 +63,7 @@ struct ExecuteChunkedScaledDotProductAttention { struct ExecuteJointAttention { static std::tuple invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa_pybind.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa_pybind.cpp index 9bde1cb8d49..4f7e0ff1340 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa_pybind.cpp @@ -56,7 +56,7 @@ void py_bind_sdpa(py::module& module) { const std::optional& memory_config, std::optional program_config, std::optional compute_kernel_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_q, @@ -79,7 +79,7 @@ void py_bind_sdpa(py::module& module) { py::arg("memory_config").noconvert() = std::nullopt, py::arg("program_config").noconvert() = std::nullopt, py::arg("compute_kernel_config").noconvert() = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); auto chunked_doc = @@ -124,7 +124,7 @@ void py_bind_sdpa(py::module& module) { const std::optional& memory_config, std::optional program_config, std::optional compute_kernel_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_q, @@ -147,7 +147,7 @@ void py_bind_sdpa(py::module& module) { py::arg("memory_config").noconvert() = std::nullopt, py::arg("program_config").noconvert() = std::nullopt, py::arg("compute_kernel_config").noconvert() = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); auto joint_doc = R"doc( @@ -200,7 +200,7 @@ void py_bind_sdpa(py::module& module) { SDPAProgramConfig program_config, std::optional scale, std::optional compute_kernel_config, - uint8_t queue_id) { + QueueId queue_id) { auto outputs = self( queue_id, input_tensor_q, @@ -226,6 +226,6 @@ void py_bind_sdpa(py::module& module) { py::arg("program_config").noconvert(), py::arg("scale").noconvert() = std::nullopt, py::arg("compute_kernel_config").noconvert() = std::nullopt, - py::arg("queue_id") = 0}); + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::transformer diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.cpp index f328a3c6412..2cb5f89a540 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.cpp @@ -7,7 +7,7 @@ #include #include "device/sdpa_decode_op.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" using namespace tt::tt_metal; @@ -33,7 +33,7 @@ inline uint32_t get_chunk_size(uint32_t s) { namespace 
ttnn::operations::transformer { ttnn::Tensor ExecuteScaledDotProductAttentionDecode::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, @@ -114,7 +114,7 @@ ttnn::Tensor ExecuteScaledDotProductAttentionDecode::invoke( } ttnn::Tensor ExecutePagedScaledDotProductAttentionDecode::invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.hpp index edc25eb804b..b3389b07a20 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.hpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.hpp @@ -13,7 +13,7 @@ namespace operations::transformer { struct ExecuteScaledDotProductAttentionDecode { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, @@ -42,7 +42,7 @@ struct ExecuteScaledDotProductAttentionDecode { struct ExecutePagedScaledDotProductAttentionDecode { static ttnn::Tensor invoke( - uint8_t queue_id, + QueueId queue_id, const ttnn::Tensor& input_tensor_q, const ttnn::Tensor& input_tensor_k, const ttnn::Tensor& input_tensor_v, diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode_pybind.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode_pybind.cpp index ef9981731b2..2f588990077 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode_pybind.cpp @@ -66,7 +66,7 @@ void py_bind_sdpa_decode(py::module& module) { const std::optional& memory_config, std::optional program_config, std::optional compute_kernel_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_q, @@ -93,7 +93,7 @@ void py_bind_sdpa_decode(py::module& module) { py::arg("memory_config").noconvert() = std::nullopt, py::arg("program_config").noconvert() = std::nullopt, py::arg("compute_kernel_config").noconvert() = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); using PagedOperationType = decltype(ttnn::transformer::paged_scaled_dot_product_attention_decode); @@ -114,7 +114,7 @@ void py_bind_sdpa_decode(py::module& module) { const std::optional& memory_config, std::optional program_config, std::optional compute_kernel_config, - uint8_t queue_id) { + QueueId queue_id) { return self( queue_id, input_tensor_q, @@ -141,7 +141,7 @@ void py_bind_sdpa_decode(py::module& module) { py::arg("memory_config").noconvert() = std::nullopt, py::arg("program_config").noconvert() = std::nullopt, py::arg("compute_kernel_config").noconvert() = std::nullopt, - py::arg("queue_id") = 0, + py::arg("queue_id") = DefaultQueueId, }); } } // namespace ttnn::operations::transformer diff --git a/ttnn/cpp/ttnn/run_operation.cpp b/ttnn/cpp/ttnn/run_operation.cpp index 83a4ce29a53..022ac257070 100644 --- a/ttnn/cpp/ttnn/run_operation.cpp +++ b/ttnn/cpp/ttnn/run_operation.cpp @@ -278,7 +278,7 @@ OutputTensors run( const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, const OptionalTensors& optional_output_tensors, - uint8_t cq_id) { + QueueId cq_id) { if constexpr (std::is_same_v) { return ttnn::prim::old_infra_device_operation( cq_id, 
std::move(operation), input_tensors, optional_input_tensors, optional_output_tensors); @@ -293,14 +293,14 @@ template Tensors run( const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, const OptionalTensors& optional_output_tensors, - uint8_t cq_id); + QueueId cq_id); template OptionalTensors run( DeviceOperation&& operation, const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, const OptionalTensors& optional_output_tensors, - uint8_t cq_id); + QueueId cq_id); template OutputTensors run_without_autoformat( @@ -308,7 +308,7 @@ OutputTensors run_without_autoformat( const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, const OptionalTensors& optional_output_tensors, - uint8_t cq_id) { + QueueId cq_id) { using ttnn::operations::experimental::auto_format::AutoFormat; ZoneScoped; IDevice* device = detail::get_device(input_tensors, optional_input_tensors); @@ -340,14 +340,14 @@ template Tensors run_without_autoformat( const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, const OptionalTensors& optional_output_tensors, - uint8_t cq_id); + QueueId cq_id); template OptionalTensors run_without_autoformat( DeviceOperation&& operation, const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors, const OptionalTensors& optional_output_tensors, - uint8_t cq_id); + QueueId cq_id); std::vector extract_padded_shapes( const std::vector& tensor_specs, @@ -373,7 +373,7 @@ Tensors run_with_autoformat( const OptionalTensors& optional_output_tensors, const float pad_value, const bool pad_c, - uint8_t cq_id) { + QueueId cq_id) { using ttnn::operations::experimental::auto_format::AutoFormat; ZoneScoped; IDevice* device = detail::get_device(input_tensors, optional_input_tensors); @@ -445,7 +445,7 @@ Tensors run_with_autoformat( const OptionalConstTensors& optional_input_tensors, const std::vector>& optional_input_formatting, const OptionalTensors& optional_output_tensors, - uint8_t cq_id) { + ttnn::QueueId cq_id) { using ttnn::operations::experimental::auto_format::AutoFormat; ZoneScoped; IDevice* device = detail::get_device(input_tensors, optional_input_tensors); diff --git a/ttnn/cpp/ttnn/run_operation.hpp b/ttnn/cpp/ttnn/run_operation.hpp index e8f5cb0c420..aa1a44367c0 100644 --- a/ttnn/cpp/ttnn/run_operation.hpp +++ b/ttnn/cpp/ttnn/run_operation.hpp @@ -9,6 +9,7 @@ #include "ttnn/operations/experimental/auto_format/auto_format.hpp" #include "ttnn/operation.hpp" +#include "ttnn/common/queue_id.hpp" #include #include @@ -24,7 +25,7 @@ OutputTensors run( const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors = {}, const OptionalTensors& optional_output_tensors = {}, - uint8_t cq_id = 0); + ttnn::QueueId cq_id = ttnn::DefaultQueueId); template inline auto run( @@ -32,7 +33,7 @@ inline auto run( const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors = {}, const OptionalTensors& optional_output_tensors = {}, - uint8_t cq_id = 0) -> ProgramOutputTensors { + ttnn::QueueId cq_id = ttnn::DefaultQueueId) -> ProgramOutputTensors { using OutputTensors = ProgramOutputTensors; if constexpr (detail::is_device_operation()) { auto operation = DeviceOperation(concrete_op); @@ -49,14 +50,14 @@ OutputTensors run_without_autoformat( const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors = {}, const OptionalTensors& optional_output_tensors = {}, - uint8_t cq_id = 0); + ttnn::QueueId cq_id = ttnn::DefaultQueueId); template 
inline auto run_without_autoformat( ConcreteOperation&& concrete_op, const std::vector& input_tensors, const std::vector>& optional_input_tensors = {}, const std::vector>& optional_output_tensors = {}, - uint8_t cq_id = 0) -> ProgramOutputTensors { + ttnn::QueueId cq_id = ttnn::DefaultQueueId) -> ProgramOutputTensors { using OutputTensors = ProgramOutputTensors; auto operation = DeviceOperation(concrete_op); return run_without_autoformat( @@ -70,7 +71,7 @@ Tensors run_with_autoformat( const OptionalTensors& optional_output_tensors = {}, const float pad_value = 0, const bool pad_c = false, - uint8_t cq_id = 0); + ttnn::QueueId cq_id = ttnn::DefaultQueueId); template inline auto run_with_autoformat( @@ -80,7 +81,7 @@ inline auto run_with_autoformat( const std::vector>& optional_output_tensors = {}, const float pad_value = 0, const bool pad_c = false, - uint8_t cq_id = 0) -> Tensors { + ttnn::QueueId cq_id = ttnn::DefaultQueueId) -> Tensors { using OutputTensors = ProgramOutputTensors; auto operation = DeviceOperation(concrete_op); return run_with_autoformat( @@ -95,7 +96,7 @@ Tensors run_with_autoformat( const OptionalConstTensors& optional_input_tensors = {}, const std::vector>& optional_input_formatting = {}, const OptionalTensors& optional_output_tensors = {}, - uint8_t cq_id = 0); + ttnn::QueueId cq_id = ttnn::DefaultQueueId); template inline auto run_with_autoformat( ConcreteOperation&& concrete_op, @@ -105,7 +106,7 @@ inline auto run_with_autoformat( const std::vector>& optional_input_tensors = {}, const std::vector>& optional_input_formatting = {}, const OptionalTensors& optional_output_tensors = {}, - uint8_t cq_id = 0) -> ProgramOutputTensors { + ttnn::QueueId cq_id = ttnn::DefaultQueueId) -> ProgramOutputTensors { using OutputTensors = ProgramOutputTensors; auto operation = DeviceOperation(concrete_op); return run_with_autoformat( diff --git a/ttnn/cpp/ttnn/tensor/tensor.cpp b/ttnn/cpp/ttnn/tensor/tensor.cpp index dd21761699d..1e5e153417b 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor.cpp @@ -735,20 +735,20 @@ template std::vector Tensor::to_vector() const; template std::vector Tensor::to_vector() const; template std::vector Tensor::to_vector() const; -Tensor Tensor::to_device(IDevice* target_device, const MemoryConfig& mem_config, uint8_t cq_id) const { +Tensor Tensor::to_device(IDevice* target_device, const MemoryConfig& mem_config, QueueId cq_id) const { return tensor_ops::tensor_to_device(*this, target_device, mem_config, cq_id); } -Tensor Tensor::to_device(distributed::MeshDevice* mesh_device, const MemoryConfig& mem_config, uint8_t cq_id) const { +Tensor Tensor::to_device(distributed::MeshDevice* mesh_device, const MemoryConfig& mem_config, QueueId cq_id) const { std::vector workers_to_use = ttnn::distributed::get_mapped_devices(*this, *mesh_device); return tensor_ops::tensor_to_device(*this, workers_to_use, mem_config, cq_id); } -Tensor Tensor::to_device(const std::vector& workers, const MemoryConfig& mem_config, uint8_t cq_id) const { +Tensor Tensor::to_device(const std::vector& workers, const MemoryConfig& mem_config, QueueId cq_id) const { return tensor_ops::tensor_to_device(*this, workers, mem_config, cq_id); } -Tensor Tensor::cpu(bool blocking, uint8_t cq_id) const { return tensor_ops::tensor_cpu(*this, blocking, cq_id); } +Tensor Tensor::cpu(bool blocking, QueueId cq_id) const { return tensor_ops::tensor_cpu(*this, blocking, cq_id); } Tensor Tensor::extract_shard(const CoreCoord& core) const { ZoneScoped; @@ -1020,7 +1020,7 @@ Tensor 
allocate_tensor_on_mesh(const TensorSpec& tensor_spec, distributed::MeshD return Tensor(std::move(multi_device_storage), tensor_spec); } -void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id) { +void write_tensor(const Tensor& host_tensor, Tensor device_tensor, QueueId cq_id) { // Top level wrapper to copy a host tensor to a preallocated device tensor TT_ASSERT(device_tensor.workers.size(), "Workers must be specified for device_tensor in write_tensor"); @@ -1069,7 +1069,7 @@ void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id }, async_safe_tensor.get_storage()); EnqueueWriteBuffer( - worker->command_queue(cq_id), + worker->command_queue(*cq_id), device_storage.get_buffer(), host_data, /*blocking=*/false); @@ -1084,7 +1084,7 @@ void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id void* host_data = std::visit( [](auto&& b) -> void* { return b.begin(); }, host_storage.get_buffer(worker_index)); EnqueueWriteBuffer( - worker->command_queue(cq_id), + worker->command_queue(*cq_id), device_storage.get_buffer_for_device(worker), host_data, /*blocking=*/false); diff --git a/ttnn/cpp/ttnn/tensor/tensor.hpp b/ttnn/cpp/ttnn/tensor/tensor.hpp index 79f4adcdd26..ce8aedb3e2d 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor.hpp @@ -16,7 +16,7 @@ #include #include #include "ttnn/any_device.hpp" -#include "ttnn/common/constants.hpp" +#include "ttnn/common/queue_id.hpp" #include "ttnn/distributed/distributed_tensor_config.hpp" #include "ttnn/tensor/types.hpp" #include "ttnn/tensor/storage.hpp" @@ -177,17 +177,17 @@ class Tensor { Tensor to_device( IDevice* target_device, const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}, - uint8_t cq_id = ttnn::DefaultQueueId) const; + ttnn::QueueId cq_id = ttnn::DefaultQueueId) const; Tensor to_device( distributed::MeshDevice* mesh_device, const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}, - uint8_t cq_id = ttnn::DefaultQueueId) const; + ttnn::QueueId cq_id = ttnn::DefaultQueueId) const; Tensor to_device( const std::vector& workers, const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}, - uint8_t cq_id = ttnn::DefaultQueueId) const; + ttnn::QueueId cq_id = ttnn::DefaultQueueId) const; Tensor to_layout(Layout target_layout, IDevice* worker = nullptr) const; @@ -195,7 +195,7 @@ class Tensor { Tensor pad(const ttnn::Shape& output_padded_shape, const ttnn::Shape& input_tensor_start, float pad_value) const; - Tensor cpu(bool blocking = true, uint8_t cq_id = ttnn::DefaultQueueId) const; + Tensor cpu(bool blocking = true, ttnn::QueueId cq_id = ttnn::DefaultQueueId) const; Tensor unpad(const ttnn::Shape& output_tensor_start, const ttnn::Shape& output_tensor_end) const; @@ -382,7 +382,7 @@ Tensor allocate_tensor_on_devices(const TensorSpec& spec, const std::vector( // ====================================================================================== template -Tensor to_host_helper(const Tensor& tensor, bool blocking = true, uint8_t cq_id = ttnn::DefaultQueueId) { +Tensor to_host_helper(const Tensor& tensor, bool blocking = true, ttnn::QueueId cq_id = ttnn::DefaultQueueId) { TT_ASSERT(tensor.is_allocated(), "Buffer must be allocated on device!"); auto device_buffer = tensor.device_buffer(); auto device = tensor.device(); @@ -530,7 +530,7 @@ Tensor to_host_helper(const Tensor& tensor, bool blocking = true, uint8_t cq_id const char* 
TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { data_vec.resize(size_in_bytes / sizeof(T)); - read_data_from_device_buffer(device->command_queue(cq_id), device_buffer, data_vec.data(), blocking); + read_data_from_device_buffer(device->command_queue(*cq_id), device_buffer, data_vec.data(), blocking); } else { read_data_from_device_buffer(device_buffer, data_vec); } @@ -539,7 +539,7 @@ Tensor to_host_helper(const Tensor& tensor, bool blocking = true, uint8_t cq_id } template -Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id) { +Tensor to_host(const Tensor& tensor, bool blocking, ttnn::QueueId cq_id) { if (tensor.storage_type() == StorageType::DEVICE) { return to_host_helper(tensor, blocking, cq_id); } else if (tensor.storage_type() == StorageType::MULTI_DEVICE) { @@ -558,20 +558,20 @@ Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id) { } } -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); +template Tensor to_host(const Tensor& tensor, bool blocking, ttnn::QueueId cq_id); +template Tensor to_host(const Tensor& tensor, bool blocking, ttnn::QueueId cq_id); +template Tensor to_host(const Tensor& tensor, bool blocking, ttnn::QueueId cq_id); +template Tensor to_host(const Tensor& tensor, bool blocking, ttnn::QueueId cq_id); +template Tensor to_host(const Tensor& tensor, bool blocking, ttnn::QueueId cq_id); +template Tensor to_host(const Tensor& tensor, bool blocking, ttnn::QueueId cq_id); template <> -Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id) { +Tensor to_host(const Tensor& tensor, bool blocking, ttnn::QueueId cq_id) { return to_host(tensor, blocking, cq_id); } template <> -Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id) { +Tensor to_host(const Tensor& tensor, bool blocking, ttnn::QueueId cq_id) { return to_host(tensor, blocking, cq_id); } @@ -662,7 +662,7 @@ std::shared_ptr initialize_data_on_device( BufferType& data_to_write, IDevice* device, const TensorSpec& tensor_spec, - uint8_t cq_id = ttnn::DefaultQueueId) { + ttnn::QueueId cq_id = ttnn::DefaultQueueId) { ZoneScoped; TT_ASSERT(device != nullptr); @@ -670,7 +670,7 @@ std::shared_ptr initialize_data_on_device( const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { - write_data_to_device_buffer(device->command_queue(cq_id), data_to_write, device_buffer); + write_data_to_device_buffer(device->command_queue(*cq_id), data_to_write, device_buffer); } else { write_data_to_device_buffer(data_to_write, *device_buffer); } @@ -679,7 +679,7 @@ std::shared_ptr initialize_data_on_device( template std::shared_ptr to_device_buffer( - const Storage& storage, IDevice* device, const TensorSpec& tensor_spec, uint8_t cq_id) { + const Storage& storage, IDevice* device, const TensorSpec& tensor_spec, ttnn::QueueId cq_id) { return std::visit( tt::stl::overloaded{ [&device, &tensor_spec, cq_id](const StorageType& storage) { @@ -705,7 +705,7 @@ std::shared_ptr to_device_buffer( // 
====================================================================================== template -Tensor to_device(const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, uint8_t cq_id) { +Tensor to_device(const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, ttnn::QueueId cq_id) { TT_FATAL(tensor.storage_type() != StorageType::DEVICE, "Tensor is already on device!"); TT_FATAL(target_device != nullptr, "Need target device in order to move tensor to device!"); TT_FATAL(tensor.is_allocated(), "Need data to exist in order to move it to device"); @@ -717,27 +717,27 @@ Tensor to_device(const Tensor& tensor, IDevice* target_device, const MemoryConfi } template Tensor to_device( - const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, uint8_t cq_id); + const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, ttnn::QueueId cq_id); template Tensor to_device( - const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, uint8_t cq_id); + const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, ttnn::QueueId cq_id); template Tensor to_device( - const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, uint8_t cq_id); + const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, ttnn::QueueId cq_id); template Tensor to_device( - const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, uint8_t cq_id); + const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, ttnn::QueueId cq_id); template Tensor to_device( - const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, uint8_t cq_id); + const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, ttnn::QueueId cq_id); template Tensor to_device( - const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, uint8_t cq_id); + const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, ttnn::QueueId cq_id); template <> Tensor to_device( - const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, uint8_t cq_id) { + const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, ttnn::QueueId cq_id) { return to_device(tensor, target_device, memory_config, cq_id); } template <> Tensor to_device( - const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, uint8_t cq_id) { + const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, ttnn::QueueId cq_id) { return to_device(tensor, target_device, memory_config, cq_id); } diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp index 2a4654b8aac..cf34ac215c2 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp @@ -8,17 +8,19 @@ #include #include -#include "tt-metalium/mesh_device.hpp" +#include +#include +#include +#include +#include +#include + #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor_utils.hpp" #include "ttnn/tensor/types.hpp" #include "ttnn/tensor/layout/tensor_layout.hpp" -#include -#include -#include -#include -#include +#include "ttnn/types.hpp" namespace tt { @@ -189,7 +191,7 @@ void read_data_from_device_buffer(std::shared_ptr device_buffer, std::ve // ====================================================================================== template -Tensor to_host(const 
Tensor& tensor, bool blocking = true, uint8_t cq_id = ttnn::DefaultQueueId); +Tensor to_host(const Tensor& tensor, bool blocking = true, QueueId cq_id = ttnn::DefaultQueueId); // TODO: #17215 - This will eventually subsume `to_host`, when "mesh buffer" backed tensors become the default. template @@ -200,7 +202,7 @@ Tensor to_device( const Tensor& tensor, IDevice* target_device, const MemoryConfig& memory_config, - uint8_t cq_id = ttnn::DefaultQueueId); + QueueId cq_id = ttnn::DefaultQueueId); // TODO: #17215 - This will eventually subsume `to_device`, when "mesh buffer" backed tensors become the default. template diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp index 5896e7b6f3a..5f250738ed4 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp @@ -28,7 +28,7 @@ namespace tt::tt_metal::tensor_ops { Tensor tensor_to_device( - const Tensor& input_tensor, IDevice* target_device, const MemoryConfig& mem_config, uint8_t cq_id) { + const Tensor& input_tensor, IDevice* target_device, const MemoryConfig& mem_config, QueueId cq_id) { ZoneScoped; GraphTracker::instance().track_function_start("Tensor::to", input_tensor, target_device, mem_config); // Tensor can be using borrowed storage. If so, when running in async mode, copy this tensor to owned storage. @@ -65,7 +65,7 @@ Tensor tensor_to_device( } Tensor tensor_to_device( - const Tensor& input_tensor, const std::vector& workers, const MemoryConfig& mem_config, uint8_t cq_id) { + const Tensor& input_tensor, const std::vector& workers, const MemoryConfig& mem_config, QueueId cq_id) { ZoneScoped; GraphTracker::instance().track_function_start("Tensor::to", input_tensor, workers, mem_config); TT_FATAL( @@ -98,7 +98,7 @@ Tensor tensor_to_device( return device_tensor; } -Tensor tensor_cpu(const Tensor& input_tensor, bool blocking, uint8_t cq_id) { +Tensor tensor_cpu(const Tensor& input_tensor, bool blocking, QueueId cq_id) { ZoneScoped; GraphTracker::instance().track_function_start("Tensor::cpu", input_tensor, blocking); auto workers = input_tensor.get_workers(blocking); diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.hpp b/ttnn/cpp/ttnn/tensor/tensor_ops.hpp index 9deb78bad6f..598b75c4c78 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_ops.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_ops.hpp @@ -4,6 +4,7 @@ #pragma once #include "types.hpp" +#include "ttnn/common/queue_id.hpp" namespace tt::tt_metal { struct Tensor; @@ -21,16 +22,16 @@ class IDevice; namespace tt::tt_metal::tensor_ops { Tensor tensor_to_device( - const Tensor& input_tensor, IDevice* target_device, const MemoryConfig& mem_config, uint8_t cq_id); + const Tensor& input_tensor, IDevice* target_device, const MemoryConfig& mem_config, QueueId cq_id); Tensor tensor_to_device( - const Tensor& input_tensor, const std::vector& workers, const MemoryConfig& mem_config, uint8_t cq_id); + const Tensor& input_tensor, const std::vector& workers, const MemoryConfig& mem_config, QueueId cq_id); Tensor tensor_to_layout(const Tensor& input_tensor, Layout target_layout, IDevice* worker); Tensor tensor_to_layout(const Tensor& input_tensor, Layout target_layout, distributed::MeshDevice* mesh_device); -Tensor tensor_cpu(const Tensor& input_tensor, bool blocking, uint8_t cq_id); +Tensor tensor_cpu(const Tensor& input_tensor, bool blocking, QueueId cq_id); void tensor_print(const Tensor& input_tensor); diff --git a/ttnn/cpp/ttnn/types.hpp b/ttnn/cpp/ttnn/types.hpp index 740b3db00ff..aa19295ec5f 100644 --- a/ttnn/cpp/ttnn/types.hpp +++ 
b/ttnn/cpp/ttnn/types.hpp @@ -9,6 +9,7 @@ #include #include #include + #include "ttnn/distributed/types.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/types.hpp" From b7a29954ba70f7eeabb590ca60a2c0d696ca69f6 Mon Sep 17 00:00:00 2001 From: Joseph Chu Date: Thu, 6 Feb 2025 23:52:09 +0000 Subject: [PATCH 018/316] #0: Fix failing Llama TG tests by preserving old behavior for ShardTensorToMesh Previously, when we had a MxN MeshDevice, a mesh_mapper of ShardTensorToMesh would behave differently based on whether `mesh_type` passed into the MeshDevice was MeshType::RowMajor, MeshType::Ring. With the removal of `MeshType` from MeshDevice specification, this changed the default behavior for users constructing a MeshDevice with default mesh_type=MeshType::RowMajor. This change now preserves the old behavior so that shards are distributed in row-major instead of a line. --- conftest.py | 1 + ttnn/cpp/ttnn/distributed/api.cpp | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 510905dd8f7..4be5deca442 100644 --- a/conftest.py +++ b/conftest.py @@ -258,6 +258,7 @@ def pcie_mesh_device(request, silicon_arch_name, silicon_arch_wormhole_b0, devic **updated_device_params, offset=ttnn.MeshOffset(0, 1), ) + mesh_device.reshape(ttnn.MeshShape(1, 4)) logger.debug(f"multidevice with {mesh_device.get_num_devices()} devices is created") yield mesh_device diff --git a/ttnn/cpp/ttnn/distributed/api.cpp b/ttnn/cpp/ttnn/distributed/api.cpp index 831c1f4cbd5..bd0fd35a206 100644 --- a/ttnn/cpp/ttnn/distributed/api.cpp +++ b/ttnn/cpp/ttnn/distributed/api.cpp @@ -153,7 +153,6 @@ std::vector get_mapped_devices(const Tensor& tensor, MeshDevice& mesh_ [&](const ShardTensor2D& s) { return mesh_device.get_view().get_devices(MeshShape{s.shard_mesh.y, s.shard_mesh.x}); }, - [&](const ShardTensor& s) { return get_workers_for_tensor(mesh_device.get_view().get_line_devices()); }, [&](const auto&) { return get_workers_for_tensor(mesh_device.get_devices()); }}, host_storage.strategy); } else if (std::holds_alternative(tensor.get_storage())) { From 023102891d119f48e564092c143f6e299d92d435 Mon Sep 17 00:00:00 2001 From: Umair Date: Thu, 6 Feb 2025 23:51:41 +0000 Subject: [PATCH 019/316] #0: skip credit handshake when no words have been received. --- tt_fabric/hw/inc/tt_fabric.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tt_fabric/hw/inc/tt_fabric.h b/tt_fabric/hw/inc/tt_fabric.h index c84ba88094a..04fa643b82c 100644 --- a/tt_fabric/hw/inc/tt_fabric.h +++ b/tt_fabric/hw/inc/tt_fabric.h @@ -404,6 +404,9 @@ typedef struct fvc_producer_state { FORCE_INLINE uint32_t get_num_words_available() { if constexpr (fvc_mode == FVC_MODE_ROUTER) { uint32_t new_words = *words_received; + if (new_words == 0) { + return words_inbound; + } *words_received_local_update = (-new_words) << REMOTE_DEST_BUF_WORDS_FREE_INC; words_inbound += new_words; uint32_t temp = inbound_wrptr.ptr + new_words; From 351d7552eaa1d0f3444612d2befc18926f57b8d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Fri, 7 Feb 2025 22:10:00 +0100 Subject: [PATCH 020/316] [UMD] Remove virtual_to_umd_coord_mapping_ (#17678) ### Ticket Related to https://github.com/tenstorrent/tt-metal/issues/17002 ### Problem description Change virtual_to_umd_coord_mapping_ to new CoreCoord API ### What's changed - Remove virtual_to_umd_coord_mapping_ in favor of translate_coord_to(core, TRANSLATED, get_coord_system_used()). 
As a reminder, get_coord_system_used is PHYSICAL for grayskull and VIRTUAL for others - Also, used get_coord_at instead when there was an API accepting CoreCoord in UMD directly. - Fill virtual_worker_cores_ and virtual_eth_cores_ by get_cores() api ### Testing I've manually added get_cores in TRANSLATED coords to verify that the collection is the same as the old way this was filled. Also, tested that translate_coord_to(core, TRANSLATED, get_coord_system_used()) exactly returns the same map as virtual_to_umd_coord_mapping_. This was tested both on wormhole and grayskull ### Checklist - [x] All post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13195340513 - [x] Blackhole post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13195342107 - [ ] (Single-card) Model perf tests : https://github.com/tenstorrent/tt-metal/actions/runs/13195343430 - [ ] (Single-card) Device perf regressions : https://github.com/tenstorrent/tt-metal/actions/runs/13195344720 - [ ] (T3K) T3000 unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13195346328 - [ ] (T3K) T3000 demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13195347735 - [ ] (TG) TG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13195349258 - [ ] (TG) TG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13195351136 - [x] (TGG) TGG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13195352473 - [x] (TGG) TGG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13195353601 --- tt_metal/api/tt-metalium/tt_cluster.hpp | 8 ++-- tt_metal/llrt/tt_cluster.cpp | 61 +++++++++---------------- 2 files changed, 25 insertions(+), 44 deletions(-) diff --git a/tt_metal/api/tt-metalium/tt_cluster.hpp b/tt_metal/api/tt-metalium/tt_cluster.hpp index dbadf61613b..ff71e87ca00 100644 --- a/tt_metal/api/tt-metalium/tt_cluster.hpp +++ b/tt_metal/api/tt-metalium/tt_cluster.hpp @@ -106,8 +106,8 @@ class Cluster { std::optional> get_tlb_data(const tt_cxy_pair &target) const { tt::umd::Cluster *device = dynamic_cast(driver_.get()); - tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); - return device->get_tlb_data_from_target(umd_target); + tt::umd::CoreCoord target_coord = get_soc_desc(target.chip).get_coord_at(target, CoordSystem::TRANSLATED); + return device->get_tlb_data_from_target(target.chip, target_coord); } std::function get_fast_pcie_static_tlb_write_callable( @@ -121,8 +121,8 @@ class Cluster { // Allows for fast writes when targeting same device core by only doing the lookup once and avoiding repeated stack traversals tt::Writer get_static_tlb_writer(tt_cxy_pair target) const { tt::umd::Cluster *device = dynamic_cast(driver_.get()); - tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); - return device->get_static_tlb_writer(umd_target); + tt::umd::CoreCoord target_coord = get_soc_desc(target.chip).get_coord_at(target, CoordSystem::TRANSLATED); + return device->get_static_tlb_writer(target.chip, target_coord); } std::uint32_t get_numa_node_for_device(uint32_t device_id) const { diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 59de00cd515..f699180ee89 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -334,27 +334,15 @@ const metal_SocDescriptor &Cluster::get_soc_desc(chip_id_t chip) const { } void Cluster::generate_virtual_to_umd_coord_mapping() { - // UMD APIs currently use a coordinate system that is not Physical, Virtual or Logical. 
- // TT-Metal uses Virtual Coordinates when programming txns on device. - // This mapping allows Cluster APIs to be consistent with the rest of TT-Metal, while correctly - // using UMD under the hood. - // This will be kept around until UMD supports generic coordinates in its APIs, at which point TT-Metal - // virtual coordinates can be passed to UMD directly. for (auto chip_id : this->cluster_desc_->get_all_chips()) { this->virtual_worker_cores_[chip_id] = {}; + for (const tt::umd::CoreCoord& core : + get_soc_desc(chip_id).get_cores(CoreType::TENSIX, CoordSystem::TRANSLATED)) { + this->virtual_worker_cores_[chip_id].insert({core.x, core.y}); + } this->virtual_eth_cores_[chip_id] = {}; - for (tt::umd::CoreCoord core : this->get_soc_desc(chip_id).get_all_cores(CoordSystem::PHYSICAL)) { - CoreCoord virtual_coords = - this->get_virtual_coordinate_from_physical_coordinates(chip_id, {core.x, core.y}); - tt_cxy_pair virtual_core = tt_cxy_pair(chip_id, virtual_coords.x, virtual_coords.y); - tt_cxy_pair umd_core = - this->get_soc_desc(chip_id).convert_to_umd_coordinates(tt_cxy_pair(chip_id, core.x, core.y)); - this->virtual_to_umd_coord_mapping_[virtual_core] = umd_core; - if (core.core_type == CoreType::TENSIX) { - this->virtual_worker_cores_[chip_id].insert(virtual_coords); - } else if (core.core_type == CoreType::ETH) { - this->virtual_eth_cores_[chip_id].insert(virtual_coords); - } + for (const tt::umd::CoreCoord& core : get_soc_desc(chip_id).get_cores(CoreType::ETH, CoordSystem::TRANSLATED)) { + this->virtual_eth_cores_[chip_id].insert({core.x, core.y}); } } } @@ -465,8 +453,9 @@ CoreCoord Cluster::get_physical_coordinate_from_logical_coordinates( CoreCoord Cluster::get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const { const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip); - auto phys_eth_core = this->virtual_to_umd_coord_mapping_.at(tt_cxy_pair(chip, core.x, core.y)); - return soc_desc.get_logical_ethernet_core_from_physical(phys_eth_core); + tt::umd::CoreCoord logical_core = + get_soc_desc(chip).translate_coord_to(core, CoordSystem::TRANSLATED, CoordSystem::LOGICAL); + return {logical_core.x, logical_core.y}; } uint32_t Cluster::get_harvested_rows(chip_id_t chip) const { @@ -495,14 +484,14 @@ int Cluster::get_device_aiclk(const chip_id_t &chip_id) const { void Cluster::deassert_risc_reset_at_core(const tt_cxy_pair &core) const { const metal_SocDescriptor &soc_desc = this->get_soc_desc(core.chip); - tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); - this->driver_->deassert_risc_reset_at_core(umd_core); + tt::umd::CoreCoord core_coord = soc_desc.get_coord_at(core, CoordSystem::TRANSLATED); + this->driver_->deassert_risc_reset_at_core(core.chip, core_coord); } void Cluster::assert_risc_reset_at_core(const tt_cxy_pair &core) const { const metal_SocDescriptor &soc_desc = this->get_soc_desc(core.chip); - tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); - this->driver_->assert_risc_reset_at_core(umd_core); + tt::umd::CoreCoord core_coord = soc_desc.get_coord_at(core, CoordSystem::TRANSLATED); + this->driver_->assert_risc_reset_at_core(core.chip, core_coord); } void Cluster::write_dram_vec(std::vector &vec, tt_target_dram dram, uint64_t addr, bool small_access) const { @@ -550,13 +539,9 @@ void Cluster::write_core( tt::watcher_sanitize_host_noc_write(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {core.x, core.y}, addr, sz_in_bytes); } - TT_FATAL( - 
this->virtual_to_umd_coord_mapping_.find(core) != this->virtual_to_umd_coord_mapping_.end(), - "Cannot find UMD core for virtual core {}", - core.str()); - tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); + tt::umd::CoreCoord core_coord = soc_desc.get_coord_at(core, CoordSystem::TRANSLATED); - this->driver_->write_to_device(mem_ptr, sz_in_bytes, umd_core, addr, "LARGE_WRITE_TLB"); + this->driver_->write_to_device(mem_ptr, sz_in_bytes, core.chip, core_coord, addr, "LARGE_WRITE_TLB"); if (this->cluster_desc_->is_chip_remote(chip_id)) { this->driver_->wait_for_non_mmio_flush(chip_id); } @@ -570,13 +555,9 @@ void Cluster::read_core( if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { tt::watcher_sanitize_host_noc_read(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {core.x, core.y}, addr, size_in_bytes); } - TT_FATAL( - this->virtual_to_umd_coord_mapping_.find(core) != this->virtual_to_umd_coord_mapping_.end(), - "Cannot find UMD core for virtual core {}", - core.str()); - tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); + tt::umd::CoreCoord core_coord = soc_desc.get_coord_at(core, CoordSystem::TRANSLATED); - this->driver_->read_from_device(mem_ptr, umd_core, addr, size_in_bytes, "LARGE_READ_TLB"); + this->driver_->read_from_device(mem_ptr, core.chip, core_coord, addr, size_in_bytes, "LARGE_READ_TLB"); } void Cluster::read_core( @@ -593,8 +574,8 @@ void Cluster::write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64 if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { tt::watcher_sanitize_host_noc_write(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {target.x, target.y}, addr, size_in_bytes); } - tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); - this->driver_->write_to_device(mem_ptr, size_in_bytes, umd_target, addr, "REG_TLB"); + tt::umd::CoreCoord target_coord = soc_desc.get_coord_at(target, CoordSystem::TRANSLATED); + this->driver_->write_to_device(mem_ptr, size_in_bytes, target.chip, target_coord, addr, "REG_TLB"); if (this->cluster_desc_->is_chip_remote(chip_id)) { this->driver_->wait_for_non_mmio_flush(chip_id); } @@ -608,8 +589,8 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { tt::watcher_sanitize_host_noc_read(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {target.x, target.y}, addr, size_in_bytes); } - tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); - this->driver_->read_from_device(mem_ptr, umd_target, addr, size_in_bytes, "REG_TLB"); + tt::umd::CoreCoord target_coord = soc_desc.get_coord_at(target, CoordSystem::TRANSLATED); + this->driver_->read_from_device(mem_ptr, target.chip, target_coord, addr, size_in_bytes, "REG_TLB"); } void Cluster::write_sysmem( From 86ca0bc3f47cae60a03b24dbd3e3d65e2de640d4 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Fri, 7 Feb 2025 16:16:09 -0500 Subject: [PATCH 021/316] Use the same linker preference in all toolchains (#17735) ### Ticket None ### Problem description We only set the linker preference in 1of3 toolchains. ### What's changed Use the same linker preference in all our toolchains. 
--- cmake/x86_64-linux-clang-17-libcpp-toolchain.cmake | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmake/x86_64-linux-clang-17-libcpp-toolchain.cmake b/cmake/x86_64-linux-clang-17-libcpp-toolchain.cmake index dd35de0487d..4dc15e0c413 100644 --- a/cmake/x86_64-linux-clang-17-libcpp-toolchain.cmake +++ b/cmake/x86_64-linux-clang-17-libcpp-toolchain.cmake @@ -10,3 +10,15 @@ set(CMAKE_SHARED_LINKER_FLAGS_INIT "-lc++ -lc++abi") # Use for configure time set(ENABLE_LIBCXX TRUE CACHE INTERNAL "Using clang's libc++") + +# Our build is super slow; put a band-aid on it by choosing a linker that can cope better. +# We really need to fix out code, though. +find_program(MOLD ld.mold) +if(MOLD) + set(CMAKE_LINKER_TYPE MOLD) +else() + find_program(LLD ld.lld-17) + if(LLD) + set(CMAKE_LINKER_TYPE LLD) + endif() +endif() From 6f3a381647a725695fff1a74d79013b3b6fb4497 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Fri, 7 Feb 2025 16:21:43 -0500 Subject: [PATCH 022/316] Run clang-tidy scan in a single container (#17734) ### Ticket None ### Problem description The workflow is a little convoluted with Docker at the Step level. We can now use Docker at the Job level which makes it more clear. Also, I'm seeing a strange error on a branch, so I'm hoping this will help surface what's going wrong when I combine these two. ### What's changed Refactored the Clang Tidy job to make use of Container: at the job level and linearlize the steps (also a bit of de-duplication). ### Checklist - [x] [Incremental Clang Tidy is incremental](https://github.com/tenstorrent/tt-metal/actions/runs/13207178433/job/36873010043) - [x] [Full scan Clang Tidy runs correctly](https://github.com/tenstorrent/tt-metal/actions/runs/13206712180/job/36871445213) --- .github/workflows/code-analysis.yaml | 186 ++++++++++++--------------- 1 file changed, 82 insertions(+), 104 deletions(-) diff --git a/.github/workflows/code-analysis.yaml b/.github/workflows/code-analysis.yaml index b78af4fb6c1..b096bb0c5e0 100644 --- a/.github/workflows/code-analysis.yaml +++ b/.github/workflows/code-analysis.yaml @@ -10,7 +10,7 @@ on: version: required: false type: string - default: "20.04" + default: "22.04" architecture: required: false type: string @@ -28,7 +28,7 @@ on: version: required: false type: string - default: "20.04" + default: "22.04" architecture: required: false type: string @@ -48,51 +48,53 @@ jobs: architecture: ${{ inputs.architecture }} clang-tidy: + name: 🤖 Clang Tidy needs: build-docker-image - env: - ARCH_NAME: wormhole_b0 - IMAGE_PARAMS: "${{ inputs.distro }}-${{ inputs.version }}-${{ inputs.architecture }}" runs-on: - build - in-service + container: + image: ${{ needs.build-docker-image.outputs.ci-build-tag }} + env: + CCACHE_TEMPDIR: /tmp/ccache + CARGO_HOME: /tmp/.cargo + TT_FROM_PRECOMPILED_DIR: /work + volumes: + - ${{ github.workspace }}/docker-job:/work # Subdir to workaround https://github.com/actions/runner/issues/691 + - /home/ubuntu/.ccache-ci:/github/home/.ccache # HOME is hardcoded for no clear reason: https://github.com/actions/runner/issues/863 + - /mnt/MLPerf/ccache:/mnt/MLPerf/ccache + # Group 1457 is for the shared ccache drive + # tmpfs is for efficiency + options: > + --group-add 1457 + --tmpfs /tmp + defaults: + run: + shell: bash + working-directory: /work # https://github.com/actions/runner/issues/878 steps: - name: Verify ccache availability - shell: bash run: | if [ ! 
-d "/mnt/MLPerf/ccache" ]; then echo "::error title=ccache-mlperf-not-mounted::NFS drive is not mounted; build machine not properly provisioned." exit 1 fi - if [ ! -d "$HOME/.ccache-ci" ]; then + if [ ! -d "$HOME/.ccache" ]; then echo "::error title=ccache-not-provisioned::Ccache is not properly provisioned." exit 1 fi - - name: Check out repo - uses: actions/checkout@v4 - - name: Set up dynamic env vars for build + + - name: Create ccache tmpdir run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - echo "RUNNER_UID=$(id -u)" >> $GITHUB_ENV - echo "RUNNER_GID=$(id -g)" >> $GITHUB_ENV - - name: Generate docker tag - id: generate-docker-tag - uses: ./.github/actions/generate-docker-tag - with: - image: tt-metalium/${{ env.IMAGE_PARAMS }} - - name: Docker login - uses: docker/login-action@v3 - with: - registry: https://ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Pull docker image - run: docker pull ${{ env.TT_METAL_DOCKER_IMAGE_TAG }} + mkdir -p /tmp/ccache - name: Check out repo uses: actions/checkout@v4 with: fetch-depth: 0 + fetch-tags: true submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end clean: true - name: Determine merge base @@ -109,103 +111,79 @@ jobs: with: ref: ${{ env.MERGE_BASE }} fetch-depth: 0 + fetch-tags: true submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end clean: true - - name: Create baseline + - name: Create shim + run: | + # Suppress clang-tidy to first get an up-to-date build tree + ln -sf /usr/bin/true ./clang-tidy-shim + + - name: 🔧 CMake configure + run: | + cmake --preset clang-tidy -DCMAKE_CXX_CLANG_TIDY="$(pwd)/clang-tidy-shim;--warnings-as-errors=*" -DCMAKE_C_CLANG_TIDY="$(pwd)/clang-tidy-shim;--warnings-as-errors=*" + + - name: Prepare baseline ccache summary if: github.ref_name != 'main' && !inputs.full-scan - uses: tenstorrent/docker-run-action@v5 - with: - image: ${{ env.TT_METAL_DOCKER_IMAGE_TAG }} - options: | - --rm - --tmpfs /tmp - -u ${{ env.RUNNER_UID }}:${{ env.RUNNER_GID }} - --group-add 1457 - -v ${{ github.workspace }}:${{ github.workspace }} - -v /etc/passwd:/etc/passwd:ro - -v /etc/shadow:/etc/shadow:ro - -v /etc/bashrc:/etc/bashrc:ro - -v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache - -v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache - -e ARCH_NAME=${{ env.ARCH_NAME }} - -e CARGO_HOME=${{ github.workspace }}/.cargo - -w ${{ github.workspace }} - run: | - set -eu # basic shell hygiene - - # /tmp is a tmpfs; more efficient than persisted storage - mkdir -p /tmp/ccache - export CCACHE_TEMPDIR=/tmp/ccache - - # Zero out the stats so we can see how we did this build - # NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache - ccache -z - - # Suppress clang-tidy to first get an up-to-date build tree - ln -sf /usr/bin/true ./clang-tidy-shim - - cmake --preset clang-tidy -DCMAKE_CXX_CLANG_TIDY="$(pwd)/clang-tidy-shim;--warnings-as-errors=*" -DCMAKE_C_CLANG_TIDY="$(pwd)/clang-tidy-shim;--warnings-as-errors=*" - nice -n 19 cmake --build --preset clang-tidy - - mkdir -p out - ccache -s > out/ccache.stats + run: | + # Zero out the stats so we can see how we did this build + # NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache + ccache -z + + - name: 🛠️ Baseline Build + if: github.ref_name != 
'main' && !inputs.full-scan + run: | + nice -n 19 cmake --build --preset clang-tidy - name: Publish Ccache summary if: github.ref_name != 'main' && !inputs.full-scan run: | - echo '## CCache Summary (baseline)' >> $GITHUB_STEP_SUMMARY + echo '## CCache Summary' >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY - cat out/ccache.stats >> $GITHUB_STEP_SUMMARY + ccache -s >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY - name: Checkout repo uses: actions/checkout@v4 with: + fetch-depth: 0 + fetch-tags: true submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end clean: false - - name: Analyze code with clang-tidy - uses: tenstorrent/docker-run-action@v5 - with: - image: ${{ env.TT_METAL_DOCKER_IMAGE_TAG }} - options: | - --rm - --tmpfs /tmp - -u ${{ env.RUNNER_UID }}:${{ env.RUNNER_GID }} - --group-add 1457 - -v ${{ github.workspace }}:${{ github.workspace }} - -v /etc/passwd:/etc/passwd:ro - -v /etc/shadow:/etc/shadow:ro - -v /etc/bashrc:/etc/bashrc:ro - -v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache - -v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache - -e ARCH_NAME=${{ env.ARCH_NAME }} - -e CARGO_HOME=${{ github.workspace }}/.cargo - -w ${{ github.workspace }} - run: | - set -eu # basic shell hygiene - - # /tmp is a tmpfs; more efficient than persisted storage - mkdir -p /tmp/ccache - export CCACHE_TEMPDIR=/tmp/ccache - - # Zero out the stats so we can see how we did this build - # NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache - ccache -z - - # Restore shim to legit clang-tidy - # Symlink tomfoolery here so that Ninja believes the build command has not changed from the previous run - ln -sf $(which clang-tidy-17) ./clang-tidy-shim - - # Keep this line _exactly_ the same as the one in the "Create baseline" or it will not be incremental - cmake --preset clang-tidy -DCMAKE_CXX_CLANG_TIDY="$(pwd)/clang-tidy-shim;--warnings-as-errors=*" -DCMAKE_C_CLANG_TIDY="$(pwd)/clang-tidy-shim;--warnings-as-errors=*" - nice -n 19 cmake --build --preset clang-tidy - mkdir -p out - ccache -s > out/ccache.stats + - name: Restore shim + run: | + # Restore shim to legit clang-tidy + # Symlink tomfoolery here so that Ninja believes the build command has not changed from the previous run + ln -sf $(which clang-tidy-17) ./clang-tidy-shim + + - name: Prepare ccache summary + run: | + # Zero out the stats so we can see how we did this build + # NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache + ccache -z + + - name: 🔍 Analyze code with clang-tidy + run: | + nice -n 19 cmake --build --preset clang-tidy + - name: Publish Ccache summary run: | echo '## CCache Summary' >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY - cat out/ccache.stats >> $GITHUB_STEP_SUMMARY + ccache -s >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY + + - name: Cleanup + if: always() + run: | + # We are forced to checkout the repo into a subdir of the host's workdir; this pollutes the host + # with root-owned files. Be sure to clean up after ourselves in case we're on a non-ephemeral runner. 
+ echo "pre rm" + ls -al /__w/tt-metal/tt-metal + rm -rf /__w/tt-metal/tt-metal/docker-job + echo "post rm" + ls -al /__w/tt-metal/tt-metal From fee23688865aab3d5d85e9c5ec73a2d939a38cc1 Mon Sep 17 00:00:00 2001 From: Wenbin Lyu Date: Fri, 7 Feb 2025 15:43:15 -0600 Subject: [PATCH 023/316] Fix undefined QueueId in ttnn events (#17739) ### Ticket None ### Problem description `QueueId` is undefined in `ttnn/cpp/ttnn/events.cpp/hpp`. ### What's changed Include the appropriate header for `QueueId`. ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) --- ttnn/cpp/ttnn/events.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ttnn/cpp/ttnn/events.hpp b/ttnn/cpp/ttnn/events.hpp index 1e1eedbaac9..b07435706b8 100644 --- a/ttnn/cpp/ttnn/events.hpp +++ b/ttnn/cpp/ttnn/events.hpp @@ -1,10 +1,11 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024-2025 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 #pragma once #include +#include "ttnn/common/queue_id.hpp" #include "ttnn/distributed/types.hpp" #include "tt-metalium/device.hpp" From 74d36b9f41f580ed2882fd9ed122cf4ec04a2a43 Mon Sep 17 00:00:00 2001 From: William Ly Date: Fri, 7 Feb 2025 16:51:08 -0500 Subject: [PATCH 024/316] [skip ci] #10718: Fix produce_data workflow crash when job log not found (#17738) ### Ticket [10718](https://github.com/tenstorrent/tt-metal/issues/10718) ### Problem description Crashes when job log not found https://github.com/tenstorrent/tt-metal/actions/runs/13208296697/job/36876544774 offending job: https://github.com/tenstorrent/tt-metal/actions/runs/13204550781/job/36870214350 ### What's changed Add `|| true` after `gh api` command ### Checklist - [x] New/Existing tests provide coverage for changes Same workflow run on fix branch https://github.com/tenstorrent/tt-metal/actions/runs/13208367694/job/36876767080 --- .../data_collection/github/download_cicd_logs_and_artifacts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/data_collection/github/download_cicd_logs_and_artifacts.sh b/infra/data_collection/github/download_cicd_logs_and_artifacts.sh index 4e05809206a..1c5d3852a8d 100755 --- a/infra/data_collection/github/download_cicd_logs_and_artifacts.sh +++ b/infra/data_collection/github/download_cicd_logs_and_artifacts.sh @@ -34,7 +34,7 @@ download_logs_for_all_jobs() { job_id=$(echo "$job" | jq -r '.id') job_conclusion=$(echo "$job" | jq -r '.conclusion') echo "[info] download logs for job with id $job_id, attempt number $attempt_number" - gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log + gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log || true # Only download annotations for failed jobs if [[ "$job_conclusion" == "failure" ]]; then From 9f987a07a8bffebcc3886c53dd8015a19800fc30 Mon Sep 17 00:00:00 2001 From: Salar Hosseini Date: Fri, 7 Feb 2025 21:38:26 +0000 Subject: [PATCH 025/316] [Old-llama70b-vLLM] Remove 2x4 device assertion since t3k mesh now opens with 1x8 Signed-off-by: Salar Hosseini --- models/demos/t3000/llama2_70b/tt/generator_vllm.py | 3 --- 1 file changed, 3 
deletions(-) diff --git a/models/demos/t3000/llama2_70b/tt/generator_vllm.py b/models/demos/t3000/llama2_70b/tt/generator_vllm.py index 3855efcb8e5..64d5f405d22 100644 --- a/models/demos/t3000/llama2_70b/tt/generator_vllm.py +++ b/models/demos/t3000/llama2_70b/tt/generator_vllm.py @@ -42,9 +42,6 @@ class TTArgs: llama_version=llama_version, ) - mesh_rows = t3k_mesh_device.shape.num_rows - mesh_cols = t3k_mesh_device.shape.num_cols - assert mesh_rows == 2 and mesh_cols == 4, f"Invalid mesh device shape: {mesh_rows}x{mesh_cols}" check_mesh_device(t3k_mesh_device, model_config) # initialize arg classes From 404af336881657378ff01a1c8c8e298219306bb3 Mon Sep 17 00:00:00 2001 From: Kalaivani Baskar <156762498+KalaivaniMCW@users.noreply.github.com> Date: Sat, 8 Feb 2025 05:56:24 +0530 Subject: [PATCH 026/316] #16733: binary pow sfpu operation (#17228) ### Ticket Link to Github Issue #16733 ### Problem description Incorrect result for certain values in `ttnn.pow` required LLK side fixes merged in #17267 ### What's changed In binary device operation, based on `dtype` we choose the FPU (bfloat16) or SFPU (float32) operation for compute. Binary Pow is an exception here, regardless of dtype the operation runs on SFPU. For all SFPU ops - UnpackToDestMode is set to `UnpackToDestFp32`, hence adding another check to set this only if input dtype's not bfloat16. **Observation:** As Radomir confirmed with the HW team, In bfloat16 dtype, NaN values should become Inf when they get packed out. This is a HW limitation. For float32, NaN values are unaffected. ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13179137515 - [x] Blackhole Post commit (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/13182941436 - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- .../unit_tests/operations/eltwise/test_pow.py | 101 ++++++++++++++++++ ...ement_wise_multi_core_sfpu_pgm_factory.cpp | 30 ++++-- .../device/binary_ng_device_operation.cpp | 10 +- .../device/binary_ng_program_factory.cpp | 32 ++++-- 4 files changed, 150 insertions(+), 23 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_pow.py b/tests/ttnn/unit_tests/operations/eltwise/test_pow.py index 9d19ca9fc86..c2574a0a870 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_pow.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_pow.py @@ -182,3 +182,104 @@ def test_binary_sfpu_pow_neg( pcc = ttnn.pearson_correlation_coefficient(torch_output_tensor, output) assert pcc >= 0.99 + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "dtype_a", + [ + "float32", + "bfloat16", + ], +) +@pytest.mark.parametrize( + "dtype_b", + [ + "float32", + "bfloat16", + ], +) +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.pow, + ttnn.experimental.pow, + ], +) +def test_binary_pow(device, dtype_a, dtype_b, ttnn_function): + torch_dtype_a = getattr(torch, dtype_a) + ttnn_dtype_a = getattr(ttnn, dtype_a) + torch_dtype_b = getattr(torch, dtype_b) + ttnn_dtype_b = getattr(ttnn, dtype_b) + x_torch = torch.tensor([[0.98828125, 0.47851562, 1.1875, -1.59375]], dtype=torch_dtype_a) + y_torch = torch.tensor([[0.0751953125, 0.53125, -0.6640625, 0.1533203125]], dtype=torch_dtype_b) + 
golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn_dtype_a, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn_dtype_b, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_pow = ttnn_function(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_pow) + # output - bfloat16 + # Due to HW limitations for bfloat16 dtype, NaN value gets packed as inf. + # z_tt_pow ttnn.Tensor([[ 0.99609, 0.67969, ..., 0.89844, inf]]) + # z_torch tensor([[1.0000, 0.6758, 0.8906, nan]], dtype=torch.bfloat16) + # output - float32 + # z_tt_pow ttnn.Tensor([[ 0.99930, 0.68274, ..., 0.90147, nan]]) + # z_torch tensor([[0.9991, 0.6760, 0.8922, nan]]) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.99 + assert status + + +@skip_for_grayskull() +@pytest.mark.parametrize( + "input_shapes", + ( + [32, 64], + [1, 128, 96], + [5, 3, 64, 128], + ), +) +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "dtype_a", + [ + "float32", + "bfloat16", + ], +) +@pytest.mark.parametrize( + "dtype_b", + [ + "float32", + "bfloat16", + ], +) +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.pow, + ttnn.experimental.pow, + ], +) +def test_binary_sfpu_pow_bug(device, input_shapes, dtype_a, dtype_b, ttnn_function): + if (ttnn_function == ttnn.pow) and (dtype_a != dtype_b): + pytest.skip("Mixed datatypes not supported in ttnn.pow") + torch.manual_seed(0) + torch_dtype_a = getattr(torch, dtype_a) + ttnn_dtype_a = getattr(ttnn, dtype_a) + torch_dtype_b = getattr(torch, dtype_b) + ttnn_dtype_b = getattr(ttnn, dtype_b) + torch_input_tensor_a = torch.randn(input_shapes, dtype=torch_dtype_a) + torch_input_tensor_b = torch.randn(input_shapes, dtype=torch_dtype_b) + golden_fn = ttnn.get_golden_function(ttnn_function) + torch_output_tensor = golden_fn(torch_input_tensor_a, torch_input_tensor_b) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, dtype=ttnn_dtype_a, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, dtype=ttnn_dtype_b, layout=ttnn.TILE_LAYOUT, device=device) + + output = ttnn_function(input_tensor_a, input_tensor_b) + output = ttnn.to_torch(output) + + pcc = ttnn.pearson_correlation_coefficient(torch_output_tensor, output) + assert pcc >= 0.999 diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_sfpu_pgm_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_sfpu_pgm_factory.cpp index 286378d2652..ecd0f9258e9 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_sfpu_pgm_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_sfpu_pgm_factory.cpp @@ -7,7 +7,7 @@ #include "binary_device_operation.hpp" #include "cpp/ttnn/operations/eltwise/binary/device/eltwise_multi_core_program_factory_common.hpp" #include "ttnn/operations/eltwise/unary/common/unary_op_types.hpp" - +#include "ttnn/operations/eltwise/binary/common/binary_op_types.hpp" #include #include @@ -28,6 +28,8 @@ BinaryDeviceOperation::ElementWiseMultiCoreSfpu::create( const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; + auto a_dtype = a.get_dtype(); + auto b_dtype = b.has_value() ? 
b->get_dtype() : a_dtype; auto& output = tensor_return_value; const auto& op_type = operation_attributes.binary_op_type; @@ -36,9 +38,9 @@ BinaryDeviceOperation::ElementWiseMultiCoreSfpu::create( Program program{}; - tt::DataFormat src0_cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::DataFormat src0_cb_data_format = tt_metal::datatype_to_dataformat_converter(a_dtype); uint32_t src0_single_tile_size = tt_metal::detail::TileSize(src0_cb_data_format); - tt::DataFormat src1_cb_data_format = tt_metal::datatype_to_dataformat_converter(b->get_dtype()); + tt::DataFormat src1_cb_data_format = tt_metal::datatype_to_dataformat_converter(b_dtype); uint32_t src1_single_tile_size = tt_metal::detail::TileSize(src1_cb_data_format); tt::DataFormat dst_cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype()); uint32_t dst_single_tile_size = tt_metal::detail::TileSize(dst_cb_data_format); @@ -101,7 +103,7 @@ BinaryDeviceOperation::ElementWiseMultiCoreSfpu::create( auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_src1_config); std::map eltwise_defines = utils::get_defines_fp32( - op_type, a.get_dtype(), b->get_dtype(), fused_activations, operation_attributes.input_tensor_a_activation); + op_type, a_dtype, b_dtype, fused_activations, operation_attributes.input_tensor_a_activation); uint32_t src0interim_cb_index = tt::CBIndex::c_3; if (eltwise_defines.find("SFPU_OP_INIT_PRE_IN0_0") != eltwise_defines.end()) { @@ -172,11 +174,21 @@ BinaryDeviceOperation::ElementWiseMultiCoreSfpu::create( (dst_cb_data_format == tt::DataFormat::UInt32); std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); - unpack_to_dest_mode[src0_cb_index] = UnpackToDestMode::UnpackToDestFp32; - unpack_to_dest_mode[src1_cb_index] = UnpackToDestMode::UnpackToDestFp32; - unpack_to_dest_mode[src0interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; - unpack_to_dest_mode[src1interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; - + if (op_type != BinaryOpType::POWER) { + unpack_to_dest_mode[src0_cb_index] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[src1_cb_index] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[src0interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[src1interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; + } else { + unpack_to_dest_mode[src0_cb_index] = + (a_dtype == DataType::FLOAT32) ? UnpackToDestMode::UnpackToDestFp32 : UnpackToDestMode::Default; + unpack_to_dest_mode[src1_cb_index] = + (b_dtype == DataType::FLOAT32) ? UnpackToDestMode::UnpackToDestFp32 : UnpackToDestMode::Default; + unpack_to_dest_mode[src0interim_cb_index] = + (a_dtype == DataType::FLOAT32) ? UnpackToDestMode::UnpackToDestFp32 : UnpackToDestMode::Default; + unpack_to_dest_mode[src1interim_cb_index] = + (b_dtype == DataType::FLOAT32) ? 
UnpackToDestMode::UnpackToDestFp32 : UnpackToDestMode::Default; + } auto eltwise_binary_kernel_id = tt_metal::CreateKernel( program, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_device_operation.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_device_operation.cpp index 6dfdcc53a72..4c65a5473f3 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_device_operation.cpp @@ -389,10 +389,10 @@ BinaryNgDeviceOperation::invoke( input_tensor_b.get_logical_shape()[-2], input_tensor_b.get_logical_shape()[-1]); - DataType dtype1 = input_tensor_a.get_dtype(); - DataType dtype2 = input_tensor_a.get_dtype(); + DataType dtype_a = input_tensor_a.get_dtype(); + DataType dtype_b = input_tensor_b.get_dtype(); bool device_check = input_tensor_a.device()->arch() != tt::ARCH::GRAYSKULL; - bool is_sfpu_op = (utils::is_binary_sfpu_op(binary_op_type, dtype1, dtype2) && device_check); + bool is_sfpu_op = (utils::is_binary_sfpu_op(binary_op_type, dtype_a, dtype_b) && device_check); return { operation_attributes_t{ @@ -422,9 +422,9 @@ BinaryNgDeviceOperation::invoke( tt::stl::Span lhs_activations, tt::stl::Span rhs_activations, tt::stl::Span post_activations) { - DataType dtype1 = input_tensor_a.get_dtype(); + DataType dtype_a = input_tensor_a.get_dtype(); bool device_check = input_tensor_a.device()->arch() != tt::ARCH::GRAYSKULL; - bool is_sfpu_op = (utils::is_binary_sfpu_op(binary_op_type, dtype1, dtype1) && device_check); + bool is_sfpu_op = (utils::is_binary_sfpu_op(binary_op_type, dtype_a, dtype_a) && device_check); return { operation_attributes_t{ binary_op_type, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp index 92bb3c8ea55..6c886ef4733 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp @@ -358,6 +358,8 @@ BinaryNgDeviceOperation::ProgramFactory::cached_program_t BinaryNgDeviceOperatio const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; + const auto a_dtype = a.get_dtype(); + const auto b_dtype = b.has_value() ? b->get_dtype() : a_dtype; auto is_sfpu_op = operation_attributes.is_sfpu; auto program = CreateProgram(); @@ -371,9 +373,9 @@ BinaryNgDeviceOperation::ProgramFactory::cached_program_t BinaryNgDeviceOperatio uint32_t b_num_tiles_per_shard = has_sharding ? shard_specs->b_shard_spec.numel() / tile_hw : 0; uint32_t c_num_tiles_per_shard = has_sharding ? shard_specs->c_shard_spec.numel() / tile_hw : 0; - auto a_data_format = datatype_to_dataformat_converter(a.get_dtype()); + auto a_data_format = datatype_to_dataformat_converter(a_dtype); auto b_data_format = b.has_value() ? datatype_to_dataformat_converter(b->get_dtype()) - : is_sfpu_op ? datatype_to_dataformat_converter(a.get_dtype()) + : is_sfpu_op ? datatype_to_dataformat_converter(a_dtype) : DataFormat::Float16_b; auto c_data_format = datatype_to_dataformat_converter(c.get_dtype()); @@ -394,7 +396,7 @@ BinaryNgDeviceOperation::ProgramFactory::cached_program_t BinaryNgDeviceOperatio const auto op_config = is_sfpu_op ? 
OpConfig(op_type, std::in_place_type) : OpConfig(op_type, std::in_place_type); - auto compute_kernel_defines = op_config.as_defines(a.get_dtype()); + auto compute_kernel_defines = op_config.as_defines(a_dtype); { ttnn::SmallVector lhs_activations = operation_attributes.lhs_activations; @@ -487,12 +489,12 @@ BinaryNgDeviceOperation::ProgramFactory::cached_program_t BinaryNgDeviceOperatio auto kernel_config = CMAKE_UNIQUE_NAMESPACE::BinaryNgKernelConfig(operation_attributes.subtile_broadcast_type); std::map dataflow_defines; - if (is_sfpu_op && a.get_dtype() == DataType::FLOAT32) { + if (is_sfpu_op && a_dtype == DataType::FLOAT32) { dataflow_defines["FILL_TILE_WITH_FIRST_COLUMN"] = "fill_tile_with_first_column"; dataflow_defines["FILL_TILE_WITH_FIRST_ROW"] = "fill_tile_with_first_row"; dataflow_defines["FILL_TILE_WITH_FIRST_ELEMENT"] = "fill_tile_with_first_element"; dataflow_defines["FILL_WITH_VALUE_FLOAT"] = "fill_with_val<1024, float>"; - } else if (is_sfpu_op && a.get_dtype() == DataType::INT32) { + } else if (is_sfpu_op && a_dtype == DataType::INT32) { dataflow_defines["FILL_TILE_WITH_FIRST_COLUMN"] = "fill_tile_with_first_column"; dataflow_defines["FILL_TILE_WITH_FIRST_ROW"] = "fill_tile_with_first_row"; dataflow_defines["FILL_TILE_WITH_FIRST_ELEMENT"] = "fill_tile_with_first_element"; @@ -541,11 +543,23 @@ BinaryNgDeviceOperation::ProgramFactory::cached_program_t BinaryNgDeviceOperatio uint32_t src1interim_cb_index = tt::CBIndex::c_4; std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + if (is_sfpu_op) { - unpack_to_dest_mode[src0_cb_index] = UnpackToDestMode::UnpackToDestFp32; - unpack_to_dest_mode[src1_cb_index] = UnpackToDestMode::UnpackToDestFp32; - unpack_to_dest_mode[src0interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; - unpack_to_dest_mode[src1interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; + if (op_type != BinaryOpType::POWER) { + unpack_to_dest_mode[src0_cb_index] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[src1_cb_index] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[src0interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[src1interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; + } else { + unpack_to_dest_mode[src0_cb_index] = + (a_dtype == DataType::FLOAT32) ? UnpackToDestMode::UnpackToDestFp32 : UnpackToDestMode::Default; + unpack_to_dest_mode[src1_cb_index] = + (b_dtype == DataType::FLOAT32) ? UnpackToDestMode::UnpackToDestFp32 : UnpackToDestMode::Default; + unpack_to_dest_mode[src0interim_cb_index] = + (a_dtype == DataType::FLOAT32) ? UnpackToDestMode::UnpackToDestFp32 : UnpackToDestMode::Default; + unpack_to_dest_mode[src1interim_cb_index] = + (b_dtype == DataType::FLOAT32) ? UnpackToDestMode::UnpackToDestFp32 : UnpackToDestMode::Default; + } } compute_kernel_defines["BCAST_INPUT"] = kernel_config.bcast_input_str(); From 65caa8835307d0b14a64060b31b538ca0c2f9ff0 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Fri, 7 Feb 2025 17:22:58 -0800 Subject: [PATCH 027/316] Automatically generate an overload w/o QueueId (#17640) ### Ticket https://github.com/tenstorrent/tt-metal/issues/10605 ### Problem description This is an experiment aimed at reducing the amount of code OP developers write. ### What's changed If an OP provides an `::invoke` that accepts QueueId as the first argument, the decorator of a registered operation will handle invocations where `queue_id` is not provided by passing in the `DefaultQueueId`.
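For illustration only, here is a minimal, self-contained sketch of that dispatch idea; the simplified `HasInvoke` concept, the `registered` wrapper, and `AddOp` below are stand-ins for this description, not the actual tt-metal implementation:

```cpp
#include <iostream>
#include <utility>

struct QueueId { int value; };
inline constexpr QueueId DefaultQueueId{0};

// True when Op::invoke(Args...) is a valid call.
template <typename Op, typename... Args>
concept HasInvoke = requires(Args&&... args) {
    { Op::invoke(std::forward<Args>(args)...) };
};

// A hypothetical op whose only invoke() takes the queue id first.
struct AddOp {
    static int invoke(QueueId q, int a, int b) {
        std::cout << "running on queue " << q.value << "\n";
        return a + b;
    }
};

// Registered-operation wrapper: forwards the call as-is when it is already
// valid, otherwise retries with DefaultQueueId prepended.
template <typename Op>
struct registered {
    template <typename... Args>
    auto operator()(Args&&... args) const {
        if constexpr (HasInvoke<Op, Args...>) {
            return Op::invoke(std::forward<Args>(args)...);
        } else {
            static_assert(HasInvoke<Op, QueueId, Args...>,
                          "Op::invoke is not callable with or without a QueueId");
            return Op::invoke(DefaultQueueId, std::forward<Args>(args)...);
        }
    }
};

int main() {
    registered<AddOp> add;
    std::cout << add(QueueId{7}, 1, 2) << "\n";  // caller supplies the queue id
    std::cout << add(1, 2) << "\n";              // DefaultQueueId is injected
}
```

The real decorator in the diff below selects among `operator()` overloads with the `HasInvoke`/`FirstArgIs` concepts rather than an `if constexpr`, but the effect is the same: ops no longer need a second `invoke` overload whose only purpose is to default the queue id.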
While a reduction in code, it comes at a cost of compilation time. Besides that, the idea of passing queue_id into ops in this way is not something that I want to support. This PR is opened to facilitate a conversation. ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13208462636) --- ttnn/cpp/ttnn/decorators.hpp | 85 ++++--- .../ccl/all_gather/device/all_gather_op.cpp | 2 +- .../core/to_layout/to_layout_op.cpp | 2 +- .../data_movement/concat/concat.cpp | 9 - .../data_movement/concat/concat.hpp | 7 - .../data_movement/fill_pad/fill_pad.cpp | 5 - .../data_movement/fill_pad/fill_pad.hpp | 5 - .../data_movement/fill_rm/fill_rm.cpp | 26 -- .../data_movement/fill_rm/fill_rm.hpp | 22 -- .../indexed_fill/indexed_fill.cpp | 9 - .../indexed_fill/indexed_fill.hpp | 7 - .../non_zero_indices/non_zero_indices.cpp | 5 - .../non_zero_indices/non_zero_indices.hpp | 3 - .../reshape_on_device/reshape.cpp | 37 +-- .../reshape_on_device/reshape.hpp | 26 +- .../data_movement/tilize/tilize.cpp | 8 - .../data_movement/tilize/tilize.hpp | 6 - .../data_movement/untilize/untilize.cpp | 9 - .../data_movement/untilize/untilize.hpp | 7 - .../untilize_with_unpadding.cpp | 9 - .../untilize_with_unpadding.hpp | 7 - .../eltwise/binary/binary_composite.hpp | 153 ------------ .../binary/device/binary_composite_op.cpp | 212 ----------------- .../binary_backward/binary_backward.cpp | 224 ------------------ .../binary_backward/binary_backward.hpp | 136 ----------- .../ttnn/operations/eltwise/unary/unary.cpp | 206 ---------------- .../ttnn/operations/eltwise/unary/unary.hpp | 100 -------- .../eltwise/unary_backward/unary_backward.cpp | 80 +------ .../eltwise/unary_backward/unary_backward.hpp | 56 ----- .../experimental/auto_format/auto_format.cpp | 4 +- .../cnn/convert_to_chw/convert_to_chw.cpp | 5 - .../cnn/convert_to_chw/convert_to_chw.hpp | 1 - .../experimental/plusone/plusone.cpp | 2 - .../experimental/plusone/plusone.hpp | 2 - .../fast_reduce_nc/fast_reduce_nc.cpp | 9 - .../fast_reduce_nc/fast_reduce_nc.hpp | 7 - .../ssm/hc_sum_reduce/hc_sum_reduce.cpp | 8 - .../ssm/hc_sum_reduce/hc_sum_reduce.hpp | 6 - .../ssm/prefix_scan/prefix_scan.cpp | 10 - .../ssm/prefix_scan/prefix_scan.hpp | 8 - .../repeat_and_interleave_eltwise_mul.cpp | 9 - .../repeat_and_interleave_eltwise_mul.hpp | 7 - .../operations/reduction/argmax/argmax.cpp | 9 - .../operations/reduction/argmax/argmax.hpp | 7 - .../ttnn/operations/reduction/prod/prod.cpp | 5 +- ttnn/cpp/ttnn/tensor/tensor_impl.hpp | 1 + 46 files changed, 68 insertions(+), 1495 deletions(-) diff --git a/ttnn/cpp/ttnn/decorators.hpp b/ttnn/cpp/ttnn/decorators.hpp index f1217df35b8..f571ed9c86e 100644 --- a/ttnn/cpp/ttnn/decorators.hpp +++ b/ttnn/cpp/ttnn/decorators.hpp @@ -201,6 +201,15 @@ concept PrimitiveOperationConcept = device_operation::DeviceOperationConcept concept CompositeOperationConcept = !PrimitiveOperationConcept; +template +concept HasInvoke = requires { + { Op::invoke(std::declval()...) }; +}; + +template +concept FirstArgIs = + sizeof...(Args) > 0 && std::same_as>>, T>; + template struct registered_operation_t { static constexpr auto is_primitive = PrimitiveOperationConcept; @@ -216,6 +225,45 @@ struct registered_operation_t { return detail::python_fully_qualified_name(std::string{cpp_fully_qualified_name}); } + // --- operator() Overloads --- + + // (1) Overload when the first argument is a QueueId. + template + requires std::same_as, QueueId> + auto operator()(First&& first, Rest&&... 
rest) const { + return traced_invoke(std::forward(first), std::forward(rest)...); + } + + // (2a) Overload when no QueueId is provided AND the operation is invocable without a QueueId. + template + requires(sizeof...(Args) == 0 || (!FirstArgIs && HasInvoke)) + auto operator()(Args&&... args) const { + return traced_invoke(std::forward(args)...); + } + + // (2b) Overload when no QueueId is provided but the operation is NOT invocable without a QueueId, + // so we inject DefaultQueueId. + template + requires( + sizeof...(Args) == 0 || (!FirstArgIs && !HasInvoke && + HasInvoke)) + auto operator()(Args&&... args) const { + return traced_invoke(DefaultQueueId, std::forward(args)...); + } + +private: + template + auto traced_invoke(args_t&&... args) const { + tt::log_debug(tt::LogOp, "Started C++ ttnn operation: {}", std::string_view{cpp_fully_qualified_name}); + tt::tt_metal::GraphTracker::instance().track_function_start(cpp_fully_qualified_name, args...); + + auto output = invoke(std::forward(args)...); + + tt::tt_metal::GraphTracker::instance().track_function_end(output); + tt::log_debug(tt::LogOp, "Finished C++ ttnn operation: {}", std::string_view{cpp_fully_qualified_name}); + return output; + } + template requires PrimitiveOperationConcept auto invoke(QueueId queue_id, args_t&&... args) const { @@ -234,6 +282,12 @@ struct registered_operation_t { return invoke(DefaultQueueId, std::forward(args)...); } + template + requires(CompositeOperationConcept) + auto invoke(args_t&&... args) const { + return invoke_composite(std::forward(args)...); + } + template requires(not auto_launch_op) auto invoke_composite(args_t&&... args) const { @@ -300,30 +354,6 @@ struct registered_operation_t { "Tensor(s)."); } } - - template - requires(CompositeOperationConcept) - auto invoke(args_t&&... args) const { - return invoke_composite(std::forward(args)...); - } - - template - auto operator()(args_t&&... args) const { - tt::log_debug(tt::LogOp, "Started C++ ttnn operation: {}", std::string_view{cpp_fully_qualified_name}); - tt::tt_metal::GraphTracker::instance().track_function_start(cpp_fully_qualified_name, args...); - auto output = invoke(std::forward(args)...); - - // Should every output tensor be tracked? - /* - if (GraphTracker::instance().is_enabled()) { - output = tt::stl::reflection::transform_object_of_type(tt::tt_metal::set_tensor_id, output); - } - */ - - tt::tt_metal::GraphTracker::instance().track_function_end(output); - tt::log_debug(tt::LogOp, "Finished C++ ttnn operation: {}", std::string_view{cpp_fully_qualified_name}); - return output; - } }; template @@ -393,13 +423,6 @@ constexpr auto register_operation_with_auto_launch_op() { return register_operation_impl(); } -namespace detail { -template -struct lambda_operation_t { - static auto invoke(auto&&... 
args) { return lambda_t(std::forward(args)...); } -}; -} // namespace detail - } // namespace decorators using ttnn::decorators::register_operation; diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp index f3d458c821b..b763cab08f4 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp @@ -286,7 +286,7 @@ Tensor all_gather( if (input_tensor.get_dtype() != DataType::BFLOAT16 && input_tensor.get_dtype() != DataType::FLOAT32) { input_tensor = ttnn::typecast(input_tensor, DataType::BFLOAT16); } - input_tensor = ttnn::pad(ttnn::DefaultQueueId, input_tensor, padding, 0, false, std::nullopt); + input_tensor = ttnn::pad(input_tensor, padding, 0, false, std::nullopt); if (original_dtype != input_tensor.get_dtype()) { input_tensor = ttnn::typecast(input_tensor, original_dtype); } diff --git a/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp b/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp index c88c5c1c629..87968b85b31 100644 --- a/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp +++ b/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp @@ -165,7 +165,7 @@ Tensor to_layout_impl( {0, 0}, {0, padded_output_shape[2] - output_shape[2]}, {0, padded_output_shape[3] - output_shape[3]}}; - tensor = ttnn::pad(ttnn::DefaultQueueId, tensor, padding, 0, true, std::nullopt); + tensor = ttnn::pad(tensor, padding, 0, true, std::nullopt); return ttnn::tilize(tensor, output_memory_config, dtype, use_multicore_tilize); } else { PadValue pad_value_variant; diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp index 478eb4f127f..d0192a1a4b6 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp @@ -329,15 +329,6 @@ ttnn::Tensor ConcatOperation::invoke( return res; } -ttnn::Tensor ConcatOperation::invoke( - const std::vector& input_tensors, - int dim, - const std::optional& memory_config, - const std::optional& optional_output_tensor, - unsigned int groups) { - return invoke(DefaultQueueId, input_tensors, dim, memory_config, std::move(optional_output_tensor), groups); -} - } // namespace data_movement } // namespace operations } // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.hpp b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.hpp index 08d06975590..23ff42804ae 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.hpp @@ -22,13 +22,6 @@ struct ConcatOperation { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt, unsigned int groups = 1); - - static ttnn::Tensor invoke( - const std::vector& input_tensors, - int dim, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt, - unsigned int groups = 1); }; } // namespace data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp index 3b5d0a3dbcd..85a08a96718 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp @@ -50,9 +50,4 @@ ttnn::Tensor FillPadOperation::invoke( .at(0); } -ttnn::Tensor 
FillPadOperation::invoke( - const ttnn::Tensor& input_tensor, float fill_value, const std::optional& memory_config_arg) { - return invoke(DefaultQueueId, input_tensor, fill_value, memory_config_arg); -} - } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.hpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.hpp index 0213d996ea7..5233ccf85fb 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.hpp @@ -16,11 +16,6 @@ struct FillPadOperation { const ttnn::Tensor& input_tensor, float fill_value, const std::optional& memory_config = std::nullopt); - - static ttnn::Tensor invoke( - const ttnn::Tensor& input_tensor, - float fill_value, - const std::optional& memory_config = std::nullopt); }; } // namespace data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp index 00de17b432d..b80ee00f20a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp @@ -30,20 +30,6 @@ ttnn::Tensor FillRMOperation::invoke( .at(0); } -ttnn::Tensor FillRMOperation::invoke( - uint32_t N, - uint32_t C, - uint32_t H, - uint32_t W, - uint32_t hFill, - uint32_t wFill, - const ttnn::Tensor& any, - float val_hi, - float val_lo, - const std::optional& memory_config_arg) { - return invoke(DefaultQueueId, N, C, H, W, hFill, wFill, any, val_hi, val_lo, memory_config_arg); -} - ttnn::Tensor FillOnesRMOperation::invoke( QueueId queue_id, uint32_t N, @@ -60,16 +46,4 @@ ttnn::Tensor FillOnesRMOperation::invoke( .at(0); } -ttnn::Tensor FillOnesRMOperation::invoke( - uint32_t N, - uint32_t C, - uint32_t H, - uint32_t W, - uint32_t hFill, - uint32_t wFill, - const ttnn::Tensor& any, - const std::optional& memory_config_arg) { - return invoke(DefaultQueueId, N, C, H, W, hFill, wFill, any, memory_config_arg); -} - } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.hpp b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.hpp index ddebbc6e4bb..7a70d6c5a71 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.hpp @@ -23,18 +23,6 @@ struct FillRMOperation { float val_hi, float val_lo, const std::optional& memory_config = std::nullopt); - - static ttnn::Tensor invoke( - uint32_t N, - uint32_t C, - uint32_t H, - uint32_t W, - uint32_t hFill, - uint32_t wFill, - const ttnn::Tensor& any, - float val_hi, - float val_lo, - const std::optional& memory_config = std::nullopt); }; struct FillOnesRMOperation { @@ -48,16 +36,6 @@ struct FillOnesRMOperation { uint32_t wFill, const ttnn::Tensor& any, const std::optional& memory_config = std::nullopt); - - static ttnn::Tensor invoke( - uint32_t N, - uint32_t C, - uint32_t H, - uint32_t W, - uint32_t hFill, - uint32_t wFill, - const ttnn::Tensor& any, - const std::optional& memory_config = std::nullopt); }; } // namespace data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp index 370eace29bf..1d81ecd0884 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp @@ -23,13 +23,4 @@ ttnn::Tensor IndexedFillOperation::invoke( 
.at(0); } -ttnn::Tensor IndexedFillOperation::invoke( - const ttnn::Tensor& batch_id, - const ttnn::Tensor& input_tensor_a, - const ttnn::Tensor& input_tensor_b, - const std::optional& memory_config, - int64_t dim) { - return invoke(DefaultQueueId, batch_id, input_tensor_a, input_tensor_b, memory_config, dim); -} - } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.hpp b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.hpp index f07b71b8e31..fe80391e3b5 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.hpp @@ -19,13 +19,6 @@ struct IndexedFillOperation { const ttnn::Tensor& input_tensor_b, const std::optional& memory_config = std::nullopt, int64_t dim = 0); - - static ttnn::Tensor invoke( - const ttnn::Tensor& batch_id, - const ttnn::Tensor& input_tensor_a, - const ttnn::Tensor& input_tensor_b, - const std::optional& memory_config = std::nullopt, - int64_t dim = 0); }; } // namespace data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp index 2a75c0bf822..2a67e247b00 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp @@ -18,9 +18,4 @@ std::vector NonZeroIndicesOperation::invoke( return operation::run_without_autoformat(NonZeroIndices{memory_config}, {input_tensor}, {}, {}, queue_id); } -std::vector NonZeroIndicesOperation::invoke( - const ttnn::Tensor& input_tensor, const std::optional& memory_config_arg) { - return invoke(DefaultQueueId, input_tensor, memory_config_arg); -} - } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.hpp b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.hpp index 2b9933836a4..52feb94c11c 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.hpp @@ -14,9 +14,6 @@ namespace operations::data_movement { struct NonZeroIndicesOperation { static std::vector invoke( QueueId queue_id, const ttnn::Tensor& input_tensor, const std::optional& memory_config); - - static std::vector invoke( - const ttnn::Tensor& input_tensor, const std::optional& memory_config); }; } // namespace operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp index 8b472f5ebbb..e3d9ca247d9 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/common/queue_id.hpp" +#include "ttnn/common/constants.hpp" #include "ttnn/run_operation.hpp" #include "reshape.hpp" #include @@ -104,30 +104,6 @@ ttnn::Tensor ReshapeOperation::invoke( return invoke(queue_id, input_tensor, logical_output_shape, logical_output_shape, memory_config_arg); } -ttnn::Tensor ReshapeOperation::invoke( - const ttnn::Tensor& input_tensor, - const ttnn::Shape& logical_shape, - const ttnn::Shape& padded_shape, - const std::optional& memory_config) { - return invoke(DefaultQueueId, input_tensor, 
logical_shape, padded_shape, memory_config); -} - -ttnn::Tensor ReshapeOperation::invoke( - const ttnn::Tensor& input_tensor, - const ttnn::Shape& logical_shape, - const std::optional& memory_config) { - return invoke(input_tensor, logical_shape, logical_shape, memory_config); -} - -ttnn::Tensor ReshapeOperation::invoke( - const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape) { - return invoke(DefaultQueueId, input_tensor, logical_shape, padded_shape, std::nullopt); -} - -ttnn::Tensor ReshapeOperation::invoke(const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape) { - return invoke(input_tensor, logical_shape, logical_shape); -} - ttnn::Tensor ReshapeOperation::invoke( QueueId queue_id, const ttnn::Tensor& input_tensor, @@ -136,15 +112,4 @@ ttnn::Tensor ReshapeOperation::invoke( return invoke(queue_id, input_tensor, infer_dims_for_reshape(input_tensor, shape_vector), memory_config_arg); } -ttnn::Tensor ReshapeOperation::invoke( - const ttnn::Tensor& input_tensor, - tt::stl::Span shape_vector, - const std::optional& memory_config_arg) { - return invoke(DefaultQueueId, input_tensor, shape_vector, memory_config_arg); -} - -ttnn::Tensor ReshapeOperation::invoke(const ttnn::Tensor& input_tensor, tt::stl::Span shape_vector) { - return invoke(input_tensor, shape_vector, std::nullopt); -} - } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp index 1ed0cd2f89a..19fcee6c90d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp @@ -15,37 +15,19 @@ struct ReshapeOperation { const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape, - const std::optional& memory_config_arg); - static ttnn::Tensor invoke( - QueueId queue_id, - const ttnn::Tensor& input_tensor, - const ttnn::Shape& logical_shape, - const std::optional& memory_config_arg); + const std::optional& memory_config_arg = std::nullopt); static ttnn::Tensor invoke( + QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, - const ttnn::Shape& padded_shape, - const std::optional& memory_config); - static ttnn::Tensor invoke( - const ttnn::Tensor& input_tensor, - const ttnn::Shape& logical_shape, - const std::optional& memory_config); - - static ttnn::Tensor invoke( - const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape); - static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape); + const std::optional& memory_config_arg = std::nullopt); static ttnn::Tensor invoke( QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span shape_vector, - const std::optional& memory_config_arg); - static ttnn::Tensor invoke( - const ttnn::Tensor& input_tensor, - tt::stl::Span shape_vector, - const std::optional& memory_config_arg); - static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, tt::stl::Span shape_vector); + const std::optional& memory_config_arg = std::nullopt); }; } // namespace operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp index e3c1dc27251..95deb5b3156 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp @@ -58,12 +58,4 @@ ttnn::Tensor ExecuteTilize::invoke( return build_ndiml_tilize(base_tilize)(input_tensor); } -ttnn::Tensor ExecuteTilize::invoke( - const ttnn::Tensor& input_tensor, - const std::optional& memory_config, - std::optional output_dtype, - bool use_multicore) { - return invoke(DefaultQueueId, input_tensor, memory_config, output_dtype, use_multicore); -} - } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.hpp index 79216f62ecf..b424051277b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.hpp @@ -16,12 +16,6 @@ struct ExecuteTilize { const std::optional& memory_config = std::nullopt, std::optional output_dtype = std::nullopt, bool use_multicore = false); - - static ttnn::Tensor invoke( - const ttnn::Tensor& input_tensor, - const std::optional& memory_config = std::nullopt, - std::optional output_dtype = std::nullopt, - bool use_multicore = false); }; } // namespace operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp index 8b5801c5da8..c3b6c94a94a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp @@ -65,13 +65,4 @@ ttnn::Tensor ExecuteUntilize::invoke( return build_ndiml_untilize(base_untilize)(input_tensor); } -ttnn::Tensor ExecuteUntilize::invoke( - const ttnn::Tensor& input_tensor, - const std::optional& memory_config, - bool use_multicore, - bool use_pack_untilize, - const std::optional& sub_core_grids) { - return invoke(DefaultQueueId, input_tensor, memory_config, use_multicore, use_pack_untilize, sub_core_grids); -} - } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.hpp index 7fe0bc03784..ef3c2610de3 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.hpp @@ -18,13 +18,6 @@ struct ExecuteUntilize { bool use_multicore = true, bool use_pack_untilize = true, const std::optional& sub_core_grids = std::nullopt); - - static ttnn::Tensor invoke( - const ttnn::Tensor& input_tensor, - const std::optional& memory_config = std::nullopt, - bool use_multicore = true, - bool use_pack_untilize = true, - const std::optional& sub_core_grids = std::nullopt); }; } // namespace operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp index ea73fd0fe0f..24dea61f3bb 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp @@ -99,13 +99,4 @@ ttnn::Tensor ExecuteUntilizeWithUnpadding::invoke( return build_ndiml_untilize_val(base_untilize)(input_tensor); } -ttnn::Tensor ExecuteUntilizeWithUnpadding::invoke( - const ttnn::Tensor& input_tensor, - const ttnn::Shape& output_tensor_end, - const std::optional& memory_config, - bool use_multicore, - bool use_pack_untilize) { - return invoke(DefaultQueueId, input_tensor, 
output_tensor_end, memory_config, use_multicore, use_pack_untilize); -} - } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp index 802959dc319..b0fb7ec38b1 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp @@ -17,13 +17,6 @@ struct ExecuteUntilizeWithUnpadding { const std::optional& memory_config, bool use_multicore = true, bool use_pack_untilize = true); - - static ttnn::Tensor invoke( - const ttnn::Tensor& input_tensor, - const ttnn::Shape& output_tensor_end, - const std::optional& memory_config, - bool use_multicore = true, - bool use_pack_untilize = true); }; } // namespace operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp index 6af5bc49a0d..399413fbb28 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp @@ -35,12 +35,6 @@ struct ExecutePower { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor, - uint32_t exponent, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( QueueId queue_id, const Tensor& input_tensor, @@ -48,12 +42,6 @@ struct ExecutePower { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor, - float exponent, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( QueueId queue_id, float input_a, @@ -61,24 +49,12 @@ struct ExecutePower { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - float input_a, - const Tensor& exponent, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( QueueId queue_id, const Tensor& input_tensor, const Tensor& exponent, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - const Tensor& exponent, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; template @@ -131,22 +107,6 @@ struct ExecuteDivLikeOps { }; struct ExecuteDiv { - static Tensor invoke( - const Tensor& input_tensor_a, - const Tensor& input_tensor_b, - bool accurate_mode = false, - const std::optional& round_mode = std::nullopt, - const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - float value, - bool accurate_mode = false, - const std::optional& round_mode = std::nullopt, - const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt); - static Tensor invoke( QueueId queue_id, const Tensor& input_tensor_a, @@ -188,25 +148,6 @@ struct ExecuteBiasGelu { 
input_tensor_a_activation); } - static Tensor invoke( - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const std::optional& output_dtype = std::nullopt, - const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt, - std::optional activations = std::nullopt, - std::optional input_tensor_a_activation = std::nullopt) { - return BinaryOperation::invoke( - DefaultQueueId, - input_tensor_a_arg, - input_tensor_b_arg, - output_dtype, - memory_config, - optional_output_tensor, - activations, - input_tensor_a_activation); - } - static Tensor invoke( QueueId queue_id, const ttnn::Tensor& input_tensor_a, @@ -223,25 +164,6 @@ struct ExecuteBiasGelu { memory_config, optional_output_tensor); } - - static Tensor invoke( - const ttnn::Tensor& input_tensor_a, - const float bias, - const std::optional& dtype = std::nullopt, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt, - std::optional activations = std::nullopt, - std::optional input_tensor_a_activation = std::nullopt) { - return invoke( - DefaultQueueId, - input_tensor_a, - bias, - dtype, - memory_config, - optional_output_tensor, - activations, - input_tensor_a_activation); - } }; template @@ -334,27 +256,12 @@ struct ExecuteRsub { const std::optional& activations = std::nullopt, const std::optional& input_tensor_a_activation = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const std::optional& output_dtype = std::nullopt, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt, - const std::optional& activations = std::nullopt, - const std::optional& input_tensor_a_activation = std::nullopt); - static Tensor invoke( QueueId queue_id, const Tensor& input_tensor, float input_b, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - float input_b, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct ExecuteBitwiseAnd { @@ -365,24 +272,12 @@ struct ExecuteBitwiseAnd { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - int32_t input_b, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct ExecuteBitwiseOr { @@ -393,24 +288,12 @@ struct ExecuteBitwiseOr { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, const 
std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - int32_t input_b, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct ExecuteBitwiseXor { @@ -421,24 +304,12 @@ struct ExecuteBitwiseXor { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - int32_t input_b, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct ExecuteBitwiseLeftShift { @@ -449,24 +320,12 @@ struct ExecuteBitwiseLeftShift { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - int32_t input_b, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct ExecuteBitwiseRightShift { @@ -477,24 +336,12 @@ struct ExecuteBitwiseRightShift { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( QueueId queue_id, const Tensor& input_tensor, int32_t input_b, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - int32_t input_b, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; } // namespace binary diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp index 49b23d539e1..a4dac8812f1 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp @@ -200,17 +200,6 @@ Tensor ExecuteDiv::invoke( return output_tensor.value(); } -Tensor ExecuteDiv::invoke( - const Tensor& input, - float value, - bool accurate_mode, - const std::optional& round_mode, - const std::optional& output_mem_config, - std::optional output_tensor) { - return ExecuteDiv::invoke( - DefaultQueueId, input, value, accurate_mode, round_mode, output_mem_config, std::move(output_tensor)); -} - Tensor ExecuteDiv::invoke( QueueId queue_id, const Tensor& input_a, @@ -307,17 +296,6 @@ Tensor 
ExecuteDiv::invoke( } } -Tensor ExecuteDiv::invoke( - const Tensor& input_a, - const Tensor& input_b, - bool accurate_mode, - const std::optional& round_mode, - const std::optional& output_mem_config, - std::optional output_tensor) { - return ExecuteDiv::invoke( - DefaultQueueId, input_a, input_b, accurate_mode, round_mode, output_mem_config, std::move(output_tensor)); -} - Tensor _div_no_nan_overload(const Tensor& input_a, float value, const std::optional& output_mem_config) { if (value == 0) { return ttnn::zeros_like(input_a); @@ -625,15 +603,6 @@ Tensor ExecutePower::invoke( return result; } -// power - floating point exponent -Tensor ExecutePower::invoke( - const Tensor& input_a, - float exponent, - const std::optional& output_mem_config, - const std::optional& output_tensor) { - return ExecutePower::invoke(DefaultQueueId, input_a, exponent, output_mem_config, std::move(output_tensor)); -} - // power - integer exponent Tensor ExecutePower::invoke( QueueId queue_id, @@ -644,15 +613,6 @@ Tensor ExecutePower::invoke( return ttnn::power(queue_id, input, exponent, output_mem_config, output_tensor); } -// power - integer exponent -Tensor ExecutePower::invoke( - const Tensor& input, - uint32_t exponent, - const std::optional& output_mem_config, - const std::optional& output_tensor) { - return ExecutePower::invoke(DefaultQueueId, input, exponent, output_mem_config, std::move(output_tensor)); -} - // power - tensor exponent Tensor ExecutePower::invoke( QueueId queue_id, @@ -664,15 +624,6 @@ Tensor ExecutePower::invoke( queue_id, input, exponent, std::nullopt, output_mem_config, output_tensor); } -// power - tensor exponent -Tensor ExecutePower::invoke( - const Tensor& input, - const Tensor& exponent, - const std::optional& output_mem_config, - const std::optional& output_tensor) { - return ExecutePower::invoke(DefaultQueueId, input, exponent, output_mem_config, std::move(output_tensor)); -} - // power - scalar input Tensor ExecutePower::invoke( QueueId queue_id, @@ -684,15 +635,6 @@ Tensor ExecutePower::invoke( return ExecutePower::invoke(queue_id, input, exponent, output_mem_config, std::move(output_tensor)); } -// power - scalar input -Tensor ExecutePower::invoke( - float input_a, - const Tensor& exponent, - const std::optional& output_mem_config, - const std::optional& output_tensor) { - return ExecutePower::invoke(DefaultQueueId, input_a, exponent, output_mem_config, std::move(output_tensor)); -} - Tensor ExecuteRsub::invoke( QueueId queue_id, const Tensor& input_tensor_a, @@ -713,26 +655,6 @@ Tensor ExecuteRsub::invoke( input_tensor_a_activation); } -Tensor ExecuteRsub::invoke( - const Tensor& input_tensor_a, - const Tensor& input_tensor_b, - const std::optional& output_dtype, - const std::optional& memory_config, - const std::optional& optional_output_tensor, - const std::optional& activations, - const std::optional& input_tensor_a_activation) { - - return ExecuteRsub::invoke( - ttnn::DefaultQueueId, - input_tensor_a, - input_tensor_b, - output_dtype, - memory_config, - optional_output_tensor, - activations, - input_tensor_a_activation); -} - Tensor ExecuteRsub::invoke( QueueId queue_id, const Tensor& input_tensor_a, @@ -743,20 +665,6 @@ Tensor ExecuteRsub::invoke( queue_id, input_tensor_a, input_b, memory_config, optional_output_tensor); } -Tensor ExecuteRsub::invoke( - const Tensor& input_tensor_a, - const float input_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - - return ExecuteRsub::invoke( - ttnn::DefaultQueueId, - input_tensor_a, - 
input_b, - memory_config, - std::move(optional_output_tensor)); -} - // Bitwise AND Tensor ExecuteBitwiseAnd::invoke( QueueId queue_id, @@ -768,20 +676,6 @@ Tensor ExecuteBitwiseAnd::invoke( queue_id, input_tensor_a, input_tensor_b, std::nullopt, memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseAnd::invoke( - const Tensor& input_tensor_a, - const Tensor& input_tensor_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - - return ExecuteBitwiseAnd::invoke( - ttnn::DefaultQueueId, - input_tensor_a, - input_tensor_b, - memory_config, - optional_output_tensor); -} - Tensor ExecuteBitwiseAnd::invoke( QueueId queue_id, const Tensor& input_tensor_a, @@ -793,20 +687,6 @@ Tensor ExecuteBitwiseAnd::invoke( queue_id, input_tensor_a, input_b, memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseAnd::invoke( - const Tensor& input_tensor_a, - const int32_t input_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - - return ExecuteBitwiseAnd::invoke( - ttnn::DefaultQueueId, - input_tensor_a, - input_b, - memory_config, - std::move(optional_output_tensor)); -} - // Bitwise OR Tensor ExecuteBitwiseOr::invoke( QueueId queue_id, @@ -818,20 +698,6 @@ Tensor ExecuteBitwiseOr::invoke( queue_id, input_tensor_a, input_tensor_b, std::nullopt, memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseOr::invoke( - const Tensor& input_tensor_a, - const Tensor& input_tensor_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - - return ExecuteBitwiseOr::invoke( - ttnn::DefaultQueueId, - input_tensor_a, - input_tensor_b, - memory_config, - optional_output_tensor); -} - Tensor ExecuteBitwiseOr::invoke( QueueId queue_id, const Tensor& input_tensor_a, @@ -843,20 +709,6 @@ Tensor ExecuteBitwiseOr::invoke( queue_id, input_tensor_a, input_b, memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseOr::invoke( - const Tensor& input_tensor_a, - const int32_t input_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - - return ExecuteBitwiseOr::invoke( - ttnn::DefaultQueueId, - input_tensor_a, - input_b, - memory_config, - std::move(optional_output_tensor)); -} - // Bitwise XOR Tensor ExecuteBitwiseXor::invoke( QueueId queue_id, @@ -868,20 +720,6 @@ Tensor ExecuteBitwiseXor::invoke( queue_id, input_tensor_a, input_tensor_b, std::nullopt, memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseXor::invoke( - const Tensor& input_tensor_a, - const Tensor& input_tensor_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - - return ExecuteBitwiseXor::invoke( - ttnn::DefaultQueueId, - input_tensor_a, - input_tensor_b, - memory_config, - optional_output_tensor); -} - Tensor ExecuteBitwiseXor::invoke( QueueId queue_id, const Tensor& input_tensor_a, @@ -893,20 +731,6 @@ Tensor ExecuteBitwiseXor::invoke( queue_id, input_tensor_a, input_b, memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseXor::invoke( - const Tensor& input_tensor_a, - const int32_t input_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - - return ExecuteBitwiseXor::invoke( - ttnn::DefaultQueueId, - input_tensor_a, - input_b, - memory_config, - std::move(optional_output_tensor)); -} - // Bitwise Left Shift Tensor ExecuteBitwiseLeftShift::invoke( QueueId queue_id, @@ -918,15 +742,6 @@ Tensor ExecuteBitwiseLeftShift::invoke( queue_id, input_tensor_a, input_tensor_b, std::nullopt, 
memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseLeftShift::invoke( - const Tensor& input_tensor_a, - const Tensor& input_tensor_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return ExecuteBitwiseLeftShift::invoke( - ttnn::DefaultQueueId, input_tensor_a, input_tensor_b, memory_config, optional_output_tensor); -} - Tensor ExecuteBitwiseLeftShift::invoke( QueueId queue_id, const Tensor& input_tensor_a, @@ -938,15 +753,6 @@ Tensor ExecuteBitwiseLeftShift::invoke( queue_id, input_tensor_a, input_b, memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseLeftShift::invoke( - const Tensor& input_tensor_a, - const int32_t input_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return ExecuteBitwiseLeftShift::invoke( - ttnn::DefaultQueueId, input_tensor_a, input_b, memory_config, std::move(optional_output_tensor)); -} - // Bitwise Right Shift Tensor ExecuteBitwiseRightShift::invoke( QueueId queue_id, @@ -958,15 +764,6 @@ Tensor ExecuteBitwiseRightShift::invoke( queue_id, input_tensor_a, input_tensor_b, std::nullopt, memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseRightShift::invoke( - const Tensor& input_tensor_a, - const Tensor& input_tensor_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return ExecuteBitwiseRightShift::invoke( - ttnn::DefaultQueueId, input_tensor_a, input_tensor_b, memory_config, optional_output_tensor); -} - Tensor ExecuteBitwiseRightShift::invoke( QueueId queue_id, const Tensor& input_tensor_a, @@ -978,13 +775,4 @@ Tensor ExecuteBitwiseRightShift::invoke( queue_id, input_tensor_a, input_b, memory_config, optional_output_tensor); } -Tensor ExecuteBitwiseRightShift::invoke( - const Tensor& input_tensor_a, - const int32_t input_b, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return ExecuteBitwiseRightShift::invoke( - ttnn::DefaultQueueId, input_tensor_a, input_b, memory_config, std::move(optional_output_tensor)); -} - } // namespace ttnn::operations::binary diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.cpp index 9bcf23a6973..49073c1b796 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.cpp @@ -100,27 +100,6 @@ std::vector> ExecuteAddalphaBW::invoke( return result; } -std::vector> ExecuteAddalphaBW::invoke( - const Tensor& grad, - const Tensor& input, - const Tensor& other, - float alpha, - const std::vector& are_required_outputs, - const std::optional& output_mem_config, - std::optional input_grad, - std::optional other_grad) { - return ExecuteAddalphaBW::invoke( - ttnn::DefaultQueueId, - grad, - input, - other, - alpha, - are_required_outputs, - output_mem_config, - std::move(input_grad), - std::move(other_grad)); -} - std::vector> ExecuteBackwardSubAlpha::invoke( QueueId queue_id, const Tensor& grad, @@ -147,27 +126,6 @@ std::vector> ExecuteBackwardSubAlpha::invoke( return result; } -std::vector> ExecuteBackwardSubAlpha::invoke( - const Tensor& grad, - const Tensor& input, - const Tensor& other, - float alpha, - const std::vector& are_required_outputs, - const std::optional& output_mem_config, - std::optional input_grad, - std::optional other_grad) { - return ExecuteBackwardSubAlpha::invoke( - DefaultQueueId, - grad, - input, - other, - alpha, - 
are_required_outputs, - output_mem_config, - std::move(input_grad), - std::move(other_grad)); -} - std::vector> ExecuteBackwardAdd::invoke( QueueId queue_id, const Tensor& grad, @@ -182,15 +140,6 @@ std::vector> ExecuteBackwardAdd::invoke( return result; } -std::vector> ExecuteBackwardAdd::invoke( - const Tensor& grad, - const Tensor& input, - float alpha, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteBackwardAdd::invoke(DefaultQueueId, grad, input, alpha, output_mem_config, std::move(input_grad)); -} - std::vector> ExecuteBackwardAdd::invoke( QueueId queue_id, const Tensor& grad, @@ -214,25 +163,6 @@ std::vector> ExecuteBackwardAdd::invoke( return result; } -std::vector> ExecuteBackwardAdd::invoke( - const Tensor& grad, - const Tensor& input, - const Tensor& other, - const std::vector& are_required_outputs, - const std::optional& output_mem_config, - std::optional input_grad, - std::optional other_grad) { - return ExecuteBackwardAdd::invoke( - DefaultQueueId, - grad, - input, - other, - are_required_outputs, - output_mem_config, - std::move(input_grad), - std::move(other_grad)); -} - std::vector ExecuteBackwardAdd::invoke( const ComplexTensor& grad, const ComplexTensor& input, @@ -265,15 +195,6 @@ std::vector> ExecuteBackwardSub::invoke( return result; } -std::vector> ExecuteBackwardSub::invoke( - const Tensor& grad, - const Tensor& input, - float alpha, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteBackwardSub::invoke(DefaultQueueId, grad, input, alpha, output_mem_config, std::move(input_grad)); -} - std::vector> ExecuteBackwardSub::invoke( QueueId queue_id, const Tensor& grad, @@ -287,24 +208,6 @@ std::vector> ExecuteBackwardSub::invoke( queue_id, grad, input, other, 1.0f, are_required_outputs, output_mem_config, input_grad, other_grad); } -std::vector> ExecuteBackwardSub::invoke( - const Tensor& grad, - const Tensor& input, - const Tensor& other, - const std::vector& are_required_outputs, - const std::optional& output_mem_config, - std::optional input_grad, - std::optional other_grad) { - return ExecuteBackwardSub::invoke( - DefaultQueueId, - grad, - input, - other, - are_required_outputs, - output_mem_config, - std::move(input_grad), - std::move(other_grad)); -} std::vector ExecuteBackwardSub::invoke( const ComplexTensor& grad, const ComplexTensor& input, @@ -586,33 +489,6 @@ std::vector> ExecuteBackwardAssign::invoke( return grad_tensor; } -std::vector> ExecuteBackwardAssign::invoke( - const Tensor& grad, - const Tensor& input, - const Tensor& other, - const std::vector& are_required_outputs, - const std::optional& output_mem_config, - std::optional input_grad, - std::optional other_grad) { - return ExecuteBackwardAssign::invoke( - ttnn::DefaultQueueId, - grad, - input, - other, - are_required_outputs, - output_mem_config, - std::move(input_grad), - std::move(other_grad)); -} - -std::vector> ExecuteBackwardAssign::invoke( - const Tensor& grad, - const Tensor& input, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteBackwardAssign::invoke(ttnn::DefaultQueueId, grad, input, output_mem_config, std::move(input_grad)); -} - std::vector> ExecuteBackwardConcat::invoke( QueueId queue_id, const Tensor& grad, @@ -658,27 +534,6 @@ std::vector> ExecuteBackwardConcat::invoke( return grad_tensor; } -std::vector> ExecuteBackwardConcat::invoke( - const Tensor& grad, - const Tensor& input, - const Tensor& other, - int dim, - const std::vector& are_required_outputs, - const 
std::optional& memory_config, - std::optional input_grad, - std::optional other_grad) { - return ExecuteBackwardConcat::invoke( - ttnn::DefaultQueueId, - grad, - input, - other, - dim, - are_required_outputs, - memory_config, - std::move(input_grad), - std::move(other_grad)); -} - std::vector> ExecuteBackwardRsub::invoke( QueueId queue_id, const Tensor& grad, @@ -703,25 +558,6 @@ std::vector> ExecuteBackwardRsub::invoke( return result; } -std::vector> ExecuteBackwardRsub::invoke( - const Tensor& grad, - const Tensor& input, - const Tensor& other, - const std::vector& are_required_outputs, - const std::optional& output_mem_config, - std::optional input_grad, - std::optional other_grad) { - return ExecuteBackwardRsub::invoke( - DefaultQueueId, - grad, - input, - other, - are_required_outputs, - output_mem_config, - std::move(input_grad), - std::move(other_grad)); -} - std::vector ExecuteBackwardBiasGelu::invoke( const Tensor& grad, const Tensor& input_a, @@ -859,17 +695,6 @@ std::vector> ExecuteBackwardDiv::invoke( return result; } -std::vector> ExecuteBackwardDiv::invoke( - const Tensor& grad, - const Tensor& input, - float scalar, - const std::optional& round_mode, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteBackwardDiv::invoke( - DefaultQueueId, grad, input, scalar, round_mode, output_mem_config, std::move(input_grad)); -} - std::vector> ExecuteBackwardDiv::invoke( QueueId queue_id, const Tensor& grad, @@ -969,27 +794,6 @@ std::vector> ExecuteBackwardDiv::invoke( return result; } -std::vector> ExecuteBackwardDiv::invoke( - const Tensor& grad, - const Tensor& input, - const Tensor& other, - const std::optional& round_mode, - const std::vector& are_required_outputs, - const std::optional& output_mem_config, - std::optional input_grad, - std::optional other_grad) { - return ExecuteBackwardDiv::invoke( - DefaultQueueId, - grad, - input, - other, - std::move(round_mode), - are_required_outputs, - output_mem_config, - std::move(input_grad), - std::move(other_grad)); -} - std::vector ExecuteBackwardDiv::invoke( const ComplexTensor& grad, const ComplexTensor& input, @@ -1045,15 +849,6 @@ std::vector> ExecuteBackwardMul::invoke( return result; } -std::vector> ExecuteBackwardMul::invoke( - const Tensor& grad, - const Tensor& input, - float scalar, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteBackwardMul::invoke(DefaultQueueId, grad, input, scalar, output_mem_config, std::move(input_grad)); -} - std::vector ExecuteBackwardMul::invoke( const ComplexTensor& grad, const ComplexTensor& input, @@ -1092,23 +887,4 @@ std::vector> ExecuteBackwardMul::invoke( } return result; } - -std::vector> ExecuteBackwardMul::invoke( - const Tensor& grad, - const Tensor& input, - const Tensor& other, - const std::vector& are_required_outputs, - const std::optional& output_mem_config, - std::optional input_grad, - std::optional other_grad) { - return ExecuteBackwardMul::invoke( - DefaultQueueId, - grad, - input, - other, - are_required_outputs, - output_mem_config, - std::move(input_grad), - std::move(other_grad)); -} } // namespace ttnn::operations::binary_backward diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.hpp index ce55f56178b..eacd021580b 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.hpp @@ -105,22 +105,6 @@ struct 
ExecuteBackwardMul { std::optional input_grad = std::nullopt, std::optional other_grad = std::nullopt); - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - float scalar, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const Tensor& other_tensor_arg, - const std::vector& are_required_outputs = std::vector{true, true}, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt, - std::optional other_grad = std::nullopt); - static std::vector invoke( const ComplexTensor& grad_tensor_arg, const ComplexTensor& input_tensor_a_arg, @@ -145,21 +129,6 @@ struct ExecuteBackwardAssign { const std::optional& memory_config = std::nullopt, std::optional input_a_grad = std::nullopt, std::optional input_b_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const std::optional& memory_config = std::nullopt, - std::optional input_a_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const Tensor& other_tensor_arg, - const std::vector& are_required_outputs = std::vector{true, true}, - const std::optional& memory_config = std::nullopt, - std::optional input_a_grad = std::nullopt, - std::optional input_b_grad = std::nullopt); }; struct ExecuteBackwardBiasGelu { @@ -196,22 +165,6 @@ struct ExecuteBackwardLT { float other, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const Tensor& other_tensor_arg, - const std::optional& memory_config = std::nullopt, - const std::vector& are_required_outputs = std::vector{true, true}, - std::optional input_grad = std::nullopt, - std::optional other_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - float other, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; struct ExecuteBackwardAdd { @@ -233,22 +186,6 @@ struct ExecuteBackwardAdd { std::optional input_grad = std::nullopt, std::optional other_grad = std::nullopt); - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - float scalar, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const std::vector& are_required_outputs = std::vector{true, true}, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt, - std::optional other_grad = std::nullopt); - static std::vector invoke( const ComplexTensor& grad_tensor_arg, const ComplexTensor& input_tensor_a_arg, @@ -276,22 +213,6 @@ struct ExecuteBackwardSub { std::optional input_grad = std::nullopt, std::optional other_grad = std::nullopt); - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - float scalar, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const 
std::vector& are_required_outputs = std::vector{true, true}, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt, - std::optional other_grad = std::nullopt); - static std::vector invoke( const ComplexTensor& grad_tensor_arg, const ComplexTensor& input_tensor_a_arg, @@ -321,24 +242,6 @@ struct ExecuteBackwardDiv { std::optional input_grad = std::nullopt, std::optional other_grad = std::nullopt); - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - float scalar, - const std::optional& round_mode = std::nullopt, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const Tensor& other_tensor_arg, - const std::optional& round_mode = std::nullopt, - const std::vector& are_required_outputs = std::vector{true, true}, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt, - std::optional other_grad = std::nullopt); - static std::vector invoke( const ComplexTensor& grad_tensor_arg, const ComplexTensor& input_tensor_a_arg, @@ -385,16 +288,6 @@ struct ExecuteAddalphaBW { const std::optional& memory_config = std::nullopt, std::optional input_a_grad = std::nullopt, std::optional input_b_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - float parameter, - const std::vector& are_required_outputs = std::vector{true, true}, - const std::optional& memory_config = std::nullopt, - std::optional input_a_grad = std::nullopt, - std::optional input_b_grad = std::nullopt); }; struct ExecuteBackwardSubAlpha { @@ -408,16 +301,6 @@ struct ExecuteBackwardSubAlpha { const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt, std::optional other_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - float alpha, - const std::vector& are_required_outputs = std::vector{true, true}, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt, - std::optional other_grad = std::nullopt); }; struct ExecuteBackwardRsub { @@ -430,15 +313,6 @@ struct ExecuteBackwardRsub { const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt, std::optional other_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - const std::vector& are_required_outputs = std::vector{true, true}, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt, - std::optional other_grad = std::nullopt); }; struct ExecuteBackwardConcat { @@ -452,16 +326,6 @@ struct ExecuteBackwardConcat { const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt, std::optional other_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_a_arg, - const Tensor& input_tensor_b_arg, - int dim, - const std::vector& are_required_outputs = std::vector{true, true}, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt, - std::optional other_grad = std::nullopt); }; } // namespace operations::binary_backward diff --git 
a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp index 96451895ee0..85d5dc6c6b0 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp @@ -59,15 +59,6 @@ Tensor ExecuteUnary::invoke( queue_id, input_tensor, {UnaryWithParam{unary_op_types}...}, memory_config, optional_output_tensor); } -template -Tensor ExecuteUnary::invoke( - const Tensor& input_tensor, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, input_tensor, {UnaryWithParam{unary_op_types}...}, memory_config, optional_output_tensor); -} - template <> ComplexTensor ExecuteUnary::invoke( const ComplexTensor& input, const MemoryConfig& output_mem_config) { @@ -139,20 +130,6 @@ Tensor ExecuteUnaryWithFastAndApproximateMode::invoke( optional_output_tensor); } -template -Tensor ExecuteUnaryWithFastAndApproximateMode::invoke( - const Tensor& input_tensor, - const bool parameter, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, - input_tensor, - {UnaryWithParam{unary_op_type, static_cast(parameter)}}, - memory_config, - optional_output_tensor); -} - template struct ExecuteUnaryWithFastAndApproximateMode; template struct ExecuteUnaryWithFastAndApproximateMode; template struct ExecuteUnaryWithFastAndApproximateMode; @@ -174,20 +151,6 @@ Tensor ExecuteUnaryWithFloatParameter::invoke( optional_output_tensor); } -template -Tensor ExecuteUnaryWithFloatParameter::invoke( - const Tensor& input_tensor, - const float parameter, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, - input_tensor, - {UnaryWithParam{unary_op_type, static_cast(parameter)}}, - memory_config, - optional_output_tensor); -} - template struct ExecuteUnaryWithFloatParameter; template struct ExecuteUnaryWithFloatParameter; template struct ExecuteUnaryWithFloatParameter; @@ -217,21 +180,6 @@ Tensor Sigmoid_accurate::invoke( optional_output_tensor); } -Tensor Sigmoid_accurate::invoke( - const Tensor& input, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, - input, - {UnaryWithParam(UnaryOpType::NEG), - UnaryWithParam(UnaryOpType::EXP, 1.0f), - UnaryWithParam(UnaryOpType::ADD_UNARY_SFPU, 1.0f), - UnaryWithParam(UnaryOpType::RECIP)}, - memory_config, - optional_output_tensor); -} - Tensor Unary_chain::invoke( QueueId queue_id, const Tensor& input_tensor, @@ -242,15 +190,6 @@ Tensor Unary_chain::invoke( return detail::unary_impl(queue_id, input_tensor, ops_chain, memory_config, optional_output_tensor); } -Tensor Unary_chain::invoke( - const Tensor& input_tensor, - const std::vector& ops_chain, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - TT_FATAL(ops_chain.size() > 0, "Op chain cannot be empty"); - return detail::unary_impl(DefaultQueueId, input_tensor, ops_chain, memory_config, optional_output_tensor); -} - Tensor Softplus::invoke( QueueId queue_id, const Tensor& input, @@ -267,21 +206,6 @@ Tensor Softplus::invoke( optional_output_tensor); } -Tensor Softplus::invoke( - const Tensor& input, - const float beta, - const float threshold, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - TT_ASSERT(input.device()->arch() != tt::ARCH::GRAYSKULL, "Softplus is not 
currently supported on Grayskull"); - return detail::unary_impl( - DefaultQueueId, - input, - {UnaryWithParam{UnaryOpType::SOFTPLUS, {beta, threshold}}}, - memory_config, - optional_output_tensor); -} - Tensor Prelu::invoke( QueueId queue_id, const Tensor& input, @@ -292,15 +216,6 @@ Tensor Prelu::invoke( queue_id, input, {UnaryWithParam{UnaryOpType::PRELU_SFPU, value}}, memory_config, optional_output_tensor); } -Tensor Prelu::invoke( - const Tensor& input, - float value, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, input, {UnaryWithParam{UnaryOpType::PRELU_SFPU, value}}, memory_config, optional_output_tensor); -} - Tensor Identity::invoke( QueueId queue_id, const Tensor& input_tensor, @@ -314,19 +229,6 @@ Tensor Identity::invoke( return detail::unary_impl(queue_id, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); } -Tensor Identity::invoke( - const Tensor& input_tensor, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - UnaryOpType op_type = UnaryOpType::IDENTITY; - if (input_tensor.get_dtype() == DataType::UINT32) { - op_type = UnaryOpType::IDENTITY_UINT32; - } - - return detail::unary_impl( - DefaultQueueId, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); -} - Tensor Abs::invoke( QueueId queue_id, const Tensor& input_tensor, @@ -339,18 +241,6 @@ Tensor Abs::invoke( return detail::unary_impl(queue_id, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); } -Tensor Abs::invoke( - const Tensor& input_tensor, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - UnaryOpType op_type = UnaryOpType::ABS; - if (input_tensor.get_dtype() == DataType::INT32) { - op_type = UnaryOpType::ABS_INT32; - } - return detail::unary_impl( - DefaultQueueId, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); -} - Tensor Abs::invoke(const ComplexTensor& input_tensor, const MemoryConfig& output_mem_config) { return ttnn::hypot(input_tensor[0], input_tensor[1], output_mem_config); } @@ -368,19 +258,6 @@ Tensor Floor::invoke( return detail::unary_impl(queue_id, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); } -Tensor Floor::invoke( - const Tensor& input_tensor, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - UnaryOpType op_type = UnaryOpType::FLOOR; - if (input_tensor.get_dtype() == DataType::FLOAT32) { - op_type = UnaryOpType::FLOOR_FLOAT32; - } - - return detail::unary_impl( - DefaultQueueId, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); -} - Tensor Ceil::invoke( QueueId queue_id, const Tensor& input_tensor, @@ -394,19 +271,6 @@ Tensor Ceil::invoke( return detail::unary_impl(queue_id, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); } -Tensor Ceil::invoke( - const Tensor& input_tensor, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - UnaryOpType op_type = UnaryOpType::CEIL; - if (input_tensor.get_dtype() == DataType::FLOAT32) { - op_type = UnaryOpType::CEIL_FLOAT32; - } - - return detail::unary_impl( - DefaultQueueId, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); -} - template Tensor ExecuteUnaryWithIntegerParameter::invoke( QueueId queue_id, @@ -422,20 +286,6 @@ Tensor ExecuteUnaryWithIntegerParameter::invoke( 
optional_output_tensor); } -template -Tensor ExecuteUnaryWithIntegerParameter::invoke( - const Tensor& input_tensor, - T parameter, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, - input_tensor, - {UnaryWithParam{unary_op_type, static_cast(parameter)}}, - memory_config, - optional_output_tensor); -} - template struct ExecuteUnaryWithIntegerParameter; template struct ExecuteUnaryWithIntegerParameter; template struct ExecuteUnaryWithIntegerParameter; @@ -473,34 +323,6 @@ Tensor SymmetricBinop::invoke( optional_output_tensor); } -template -Tensor SymmetricBinop::invoke( - const Tensor& input_tensor, - T param, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, - input_tensor, - {UnaryWithParam(unary_op_type, static_cast(param))}, - memory_config, - optional_output_tensor); -} - -template -Tensor SymmetricBinop::invoke( - T param, - const Tensor& input_tensor, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, - input_tensor, - {UnaryWithParam(unary_op_type, static_cast(param))}, - memory_config, - optional_output_tensor); -} - // Explicit template instantiation template struct SymmetricBinop; template struct SymmetricBinop; @@ -535,34 +357,6 @@ Tensor AsymmetricBinop::invoke( optional_output_tensor); } -template -Tensor AsymmetricBinop::invoke( - const Tensor& input_tensor, - float param, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, - input_tensor, - {UnaryWithParam(unary_op_type, static_cast(param))}, - memory_config, - optional_output_tensor); -} - -template -Tensor AsymmetricBinop::invoke( - float param, - const Tensor& input_tensor, - const std::optional& memory_config, - const std::optional& optional_output_tensor) { - return detail::unary_impl( - DefaultQueueId, - input_tensor, - {UnaryWithParam(unary_op_rev_type, static_cast(param))}, - memory_config, - optional_output_tensor); -} - template struct AsymmetricBinop; template struct AsymmetricBinop; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp index c1f555a8a83..933644706ae 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp @@ -29,11 +29,6 @@ struct ExecuteUnary { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static typename ExecuteUnaryInvokeResult::type invoke( const ComplexTensor& input_tensor, const MemoryConfig& memory_config); }; @@ -46,12 +41,6 @@ struct ExecuteUnaryWithFastAndApproximateMode { const bool parameter = false, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - const bool parameter = false, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; template @@ -62,12 +51,6 @@ struct ExecuteUnaryWithFloatParameter { const float parameter, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = 
std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - const float parameter, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct Sigmoid_accurate { @@ -76,11 +59,6 @@ struct Sigmoid_accurate { const Tensor& input, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct Unary_chain { @@ -90,12 +68,6 @@ struct Unary_chain { const std::vector& ops_chain, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - const std::vector& ops_chain, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct Softplus { @@ -106,13 +78,6 @@ struct Softplus { const float threshold, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input, - const float beta, - const float threshold, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct Prelu { @@ -122,12 +87,6 @@ struct Prelu { float value, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input, - float value, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct Identity { @@ -136,11 +95,6 @@ struct Identity { const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct Abs { @@ -150,11 +104,6 @@ struct Abs { const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke( - const Tensor& input_tensor, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - static Tensor invoke(const ComplexTensor& input_tensor, const MemoryConfig& memory_config); }; @@ -164,11 +113,6 @@ struct Floor { const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct Ceil { @@ -177,11 +121,6 @@ struct Ceil { const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; struct Dropout { static Tensor invoke( @@ -191,15 +130,6 @@ struct Dropout { const float scale, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - QueueId queue_id, - const Tensor& input, - 
const uint32_t seed, - const float probability, - const float scale, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; template @@ -210,12 +140,6 @@ struct ExecuteUnaryWithIntegerParameter { T parameter, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - T parameter, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; template @@ -233,18 +157,6 @@ struct SymmetricBinop { const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - T param, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - T param, - const Tensor& input_tensor, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; template @@ -262,18 +174,6 @@ struct AsymmetricBinop { const Tensor& input_tensor, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - const Tensor& input_tensor, - float param, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); - - static Tensor invoke( - float param, - const Tensor& input_tensor, - const std::optional& memory_config = std::nullopt, - const std::optional& optional_output_tensor = std::nullopt); }; } // namespace unary diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp index 3f63d85c7f6..6e6a4280680 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp @@ -279,16 +279,6 @@ std::vector> ExecuteUnaryBackwardPow::invoke( return grad_tensor; } -std::vector> ExecuteUnaryBackwardPow::invoke( - const Tensor& grad, - const Tensor& input, - float exponent, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteUnaryBackwardPow::invoke( - DefaultQueueId, grad, input, exponent, output_mem_config, std::move(input_grad)); -} - std::vector> ExecuteUnaryBackwardExp::invoke( QueueId queue_id, const Tensor& grad, @@ -326,14 +316,6 @@ std::vector> ExecuteUnaryBackwardExp::invoke( return grad_tensor; } -std::vector> ExecuteUnaryBackwardExp::invoke( - const Tensor& grad, - const Tensor& input, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteUnaryBackwardExp::invoke(DefaultQueueId, grad, input, output_mem_config, std::move(input_grad)); -} - std::vector> ExecuteUnaryBackwardTanh::invoke( QueueId queue_id, const Tensor& grad, @@ -351,14 +333,6 @@ std::vector> ExecuteUnaryBackwardTanh::invoke( return grad_tensor; } -std::vector> ExecuteUnaryBackwardTanh::invoke( - const Tensor& grad, - const Tensor& input, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteUnaryBackwardTanh::invoke(DefaultQueueId, grad, input, output_mem_config, std::move(input_grad)); -} - std::vector> ExecuteUnaryBackwardSqrt::invoke( QueueId queue_id, const Tensor& grad, @@ -417,14 +391,6 @@ std::vector> ExecuteUnaryBackwardSqrt::invoke( return grad_tensor; } 
-std::vector> ExecuteUnaryBackwardSqrt::invoke( - const Tensor& grad, - const Tensor& input, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteUnaryBackwardSqrt::invoke(DefaultQueueId, grad, input, output_mem_config, std::move(input_grad)); -} - std::vector ExecuteUnaryBackwardMultigammaln::invoke( const Tensor& grad, const Tensor& input, const std::optional& output_mem_config) { std::vector grad_tensor; @@ -603,14 +569,6 @@ std::vector> ExecuteUnaryBackwardRsqrt::invoke( return result; } -std::vector> ExecuteUnaryBackwardRsqrt::invoke( - const Tensor& grad, - const Tensor& input, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteUnaryBackwardRsqrt::invoke(DefaultQueueId, grad, input, output_mem_config, std::move(input_grad)); -} - std::vector> ExecuteUnaryBackwardNeg::invoke( QueueId queue_id, const Tensor& grad, @@ -623,14 +581,6 @@ std::vector> ExecuteUnaryBackwardNeg::invoke( return result; } -std::vector> ExecuteUnaryBackwardNeg::invoke( - const Tensor& grad, - const Tensor& input, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteUnaryBackwardNeg::invoke(DefaultQueueId, grad, input, output_mem_config, std::move(input_grad)); -} - std::vector ExecuteUnaryBackwardRelu::invoke( const Tensor& grad, const Tensor& input, const std::optional& output_mem_config) { std::vector grad_tensor; @@ -657,14 +607,6 @@ std::vector> ExecuteUnaryBackwardFill::invoke( return result; } -std::vector> ExecuteUnaryBackwardFill::invoke( - const Tensor& grad, - const Tensor& input, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteUnaryBackwardFill::invoke(DefaultQueueId, grad, input, output_mem_config, std::move(input_grad)); -} - std::vector ExecuteUnaryBackwardHardsigmoid::invoke( const Tensor& grad, const Tensor& input, const std::optional& output_mem_config) { std::vector grad_tensor; @@ -1007,14 +949,6 @@ std::vector> ExecuteUnaryBackwardSilu::invoke( return result; } -std::vector> ExecuteUnaryBackwardSilu::invoke( - const Tensor& grad, - const Tensor& input, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteUnaryBackwardSilu::invoke(DefaultQueueId, grad, input, output_mem_config, std::move(input_grad)); -} - // Selu // result: torch.where(input > 0, grad * lambd, grad * lambd * alpha * torch.exp(input)) std::vector ExecuteUnaryBackwardSelu::invoke( @@ -1760,16 +1694,6 @@ std::vector> ExecuteUnaryBackwardGelu::invoke( return result; } -std::vector> ExecuteUnaryBackwardGelu::invoke( - const Tensor& grad, - const Tensor& input, - const string& approximate, - const std::optional& output_mem_config, - std::optional input_grad) { - return ExecuteUnaryBackwardGelu::invoke( - DefaultQueueId, grad, input, std::move(approximate), output_mem_config, std::move(input_grad)); -} - std::vector ExecuteUnaryBackwardRepeat::invoke( const Tensor& grad, const Tensor& input, @@ -1937,7 +1861,7 @@ std::vector ExecuteUnaryBackwardProd::invoke( if (reciprocal_input.padded_shape()[1] % 32 != 0) { ttnn::SmallVector> padding = { {0, 0}, {0, 32 - (reciprocal_input.padded_shape()[1] % 32)}, {0, 0}, {0, 0}}; - tensor_1_temp = ttnn::pad(ttnn::DefaultQueueId, reciprocal_input, padding, 0, true, std::nullopt); + tensor_1_temp = ttnn::pad(reciprocal_input, padding, 0, true, std::nullopt); } ttnn::SmallVector after_permute_dims = {0, 2, 3, 1}; Tensor tensor_1 = ttnn::permute(tensor_1_temp, after_permute_dims, output_memory_config); @@ -1975,7 +1899,7 
@@ std::vector ExecuteUnaryBackwardProd::invoke( if (reciprocal_input.padded_shape()[0] % 32 != 0) { ttnn::SmallVector> padding = { {0, (32 - (reciprocal_input.padded_shape()[0] % 32))}, {0, 0}, {0, 0}, {0, 0}}; - tensor_1_temp = ttnn::pad(ttnn::DefaultQueueId, reciprocal_input, padding, 0, false, std::nullopt); + tensor_1_temp = ttnn::pad(reciprocal_input, padding, 0, false, std::nullopt); } ttnn::SmallVector after_permute_dims = {3, 1, 2, 0}; Tensor tensor_1 = ttnn::permute(tensor_1_temp, after_permute_dims, output_memory_config); diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp index 813c39314f4..9fc4942a7f1 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp @@ -21,12 +21,6 @@ struct ExecuteUnaryBackwardNeg { const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; struct ExecuteUnaryBackwardThreshold { @@ -443,12 +437,6 @@ struct ExecuteUnaryBackwardRsqrt { const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; struct ExecuteUnaryBackwardClamp { @@ -508,13 +496,6 @@ struct ExecuteUnaryBackwardPow { float parameter, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - float parameter, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; struct ExecuteUnaryBackwardExp { @@ -524,12 +505,6 @@ struct ExecuteUnaryBackwardExp { const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; struct ExecuteUnaryBackwardTanh { @@ -539,12 +514,6 @@ struct ExecuteUnaryBackwardTanh { const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; struct ExecuteUnaryBackwardSqrt { @@ -554,12 +523,6 @@ struct ExecuteUnaryBackwardSqrt { const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; struct ExecuteUnaryBackwardSilu { @@ -569,12 +532,6 @@ struct ExecuteUnaryBackwardSilu { const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static 
std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; struct ExecuteUnaryBackwardFill { @@ -584,12 +541,6 @@ struct ExecuteUnaryBackwardFill { const Tensor& input_tensor_arg, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; struct ExecuteUnaryBackwardProd { @@ -631,13 +582,6 @@ struct ExecuteUnaryBackwardGelu { const string& parameter_a, const std::optional& memory_config = std::nullopt, std::optional input_grad = std::nullopt); - - static std::vector> invoke( - const Tensor& grad_tensor_arg, - const Tensor& input_tensor_arg, - const string& parameter_a, - const std::optional& memory_config = std::nullopt, - std::optional input_grad = std::nullopt); }; } // namespace operations::unary_backward diff --git a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp index 71129d82a7b..9a3a24b2d80 100644 --- a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp @@ -188,7 +188,7 @@ Tensor AutoFormat::format_output_tensor( auto ends = std::array({shape[0], shape[1], shape[2], shape[3]}); auto step = std::array({1, 1, 1, 1}); - formatted_output = ttnn::slice(DefaultQueueId, formatted_output, begins, ends, step, mem_config); + formatted_output = ttnn::slice(formatted_output, begins, ends, step, mem_config); return formatted_output; // Output is tile but shape cannot be tile. 
We leave in RM } else if (formatted_output.get_layout() == Layout::TILE && AutoFormat::legal_rm_shape(shape)) { @@ -212,7 +212,7 @@ Tensor AutoFormat::format_output_tensor( auto begins = std::array({0, 0, 0, 0}); auto ends = std::array({shape[0], shape[1], shape[2], shape[3]}); auto step = std::array({1, 1, 1, 1}); - formatted_output = ttnn::slice(DefaultQueueId, formatted_output, begins, ends, step, mem_config); + formatted_output = ttnn::slice(formatted_output, begins, ends, step, mem_config); formatted_output = ttnn::tilize(formatted_output, mem_config); return formatted_output; } diff --git a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp index 1ea1da85ce0..df87c6d4368 100644 --- a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp @@ -18,9 +18,4 @@ ttnn::Tensor ExecuteConvertToCHW::invoke( return operation::run(program, {a}, {}, {}, queue_id).at(0); } -ttnn::Tensor ExecuteConvertToCHW::invoke( - const Tensor& a, const std::optional& memory_config, const std::optional& dtype) { - return invoke(DefaultQueueId, a, memory_config, dtype); -} - } // namespace ttnn::operations::experimental::cnn diff --git a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.hpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.hpp index 8dd15d4d3f3..1cf28862a14 100644 --- a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.hpp @@ -15,7 +15,6 @@ struct ExecuteConvertToCHW { const Tensor& a, const std::optional& memory_config = std::nullopt, const std::optional& dtype = std::nullopt); - static ttnn::Tensor invoke(const Tensor& a, const std::optional& memory_config = std::nullopt, const std::optional& dtype = std::nullopt); }; } // namespace ttnn::operations::experimental::cnn diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp index 9ff82db36ce..a090a3b241d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp @@ -15,6 +15,4 @@ ttnn::Tensor PlusOneOperation::invoke(QueueId queue_id, const Tensor& input_tens return operation::run(PlusOne{}, {input_tensor}, {}, {}, queue_id).at(0); } -ttnn::Tensor PlusOneOperation::invoke(const Tensor& input_tensor) { return invoke(DefaultQueueId, input_tensor); } - } // namespace ttnn::operations::experimental diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.hpp b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.hpp index 4ffeafeb2aa..a74bce46923 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.hpp @@ -13,8 +13,6 @@ namespace operations::experimental { struct PlusOneOperation { static ttnn::Tensor invoke(QueueId queue_id, const Tensor& input_tensor); - - static ttnn::Tensor invoke(const Tensor& input_tensor); }; } // namespace operations::experimental diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.cpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.cpp index 68659d1c35d..b209afd6d8a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.cpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.cpp @@ -21,15 +21,6 @@ ttnn::Tensor FastReduceNCOperation::invoke( return detail::fast_reduce_nc(queue_id, input, dims, output, memory_config, compute_kernel_config); } -ttnn::Tensor FastReduceNCOperation::invoke( - const ttnn::Tensor& input, - tt::stl::Span dims, - const std::optional& output, - const ttnn::MemoryConfig& memory_config, - std::optional compute_kernel_config) { - return FastReduceNCOperation::invoke(DefaultQueueId, input, dims, output, memory_config, compute_kernel_config); -} - } // namespace operations::experimental::reduction } // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.hpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.hpp index a8a771c8a22..f1f776f91fc 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc.hpp @@ -20,13 +20,6 @@ struct FastReduceNCOperation { const std::optional& output, const ttnn::MemoryConfig& memory_config, std::optional compute_kernel_config); - - static ttnn::Tensor invoke( - const ttnn::Tensor& input, - tt::stl::Span dims, - const std::optional& output, - const ttnn::MemoryConfig& memory_config, - std::optional compute_kernel_config); }; } // namespace operations::experimental::reduction diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp index c8b56723fdf..f06c81e4b20 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp @@ -24,12 +24,4 @@ ttnn::Tensor ExecuteHCSumReduce::invoke( return operation::run(program, {input}, {}, {}, queue_id).at(0); } -ttnn::Tensor ExecuteHCSumReduce::invoke( - const Tensor& input, - const std::optional& memory_config, - const std::optional dtype, - const std::optional math_fidelity) { - return invoke(DefaultQueueId, input, memory_config, dtype, math_fidelity); -} - } // namespace ttnn::operations::experimental::ssm diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.hpp index cc4b999db4e..47d25eaaaa2 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.hpp @@ -16,12 +16,6 @@ struct ExecuteHCSumReduce { const std::optional& memory_config = std::nullopt, const std::optional dtype = std::nullopt, const std::optional math_fidelity = std::nullopt); - - static ttnn::Tensor invoke( - const Tensor& input, - const std::optional& memory_config = std::nullopt, - const std::optional dtype = std::nullopt, - const std::optional math_fidelity = std::nullopt); }; } // namespace ttnn::operations::experimental::ssm diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp index 70c9eb21f5d..360b1a52ffc 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp @@ -26,14 +26,4 @@ ttnn::Tensor ExecutePrefixScan::invoke( return operation::run(program, {a, bx, h_prev}, {}, {}, queue_id).at(0); } -ttnn::Tensor 
ExecutePrefixScan::invoke( - const Tensor& a, - const Tensor& bx, - const Tensor& h_prev, - const std::optional& memory_config, - const std::optional dtype, - const std::optional math_fidelity) { - return invoke(DefaultQueueId, a, bx, h_prev, memory_config, dtype, math_fidelity); -} - } // namespace ttnn::operations::experimental::ssm diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.hpp index 7191853626d..4ffd1c31886 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.hpp @@ -18,14 +18,6 @@ struct ExecutePrefixScan { const std::optional& memory_config = std::nullopt, const std::optional dtype = std::nullopt, const std::optional math_fidelity = std::nullopt); - - static ttnn::Tensor invoke( - const Tensor& a, - const Tensor& bx, - const Tensor& h_prev, - const std::optional& memory_config = std::nullopt, - const std::optional dtype = std::nullopt, - const std::optional math_fidelity = std::nullopt); }; } // namespace ttnn::operations::experimental::ssm diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp index 52fabc138df..f260164b021 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp @@ -25,13 +25,4 @@ ttnn::Tensor ExecuteRepeatAndInterleaveEltwiseMul::invoke( return operation::run(program, {a, b}, {}, {}, queue_id).at(0); } -ttnn::Tensor ExecuteRepeatAndInterleaveEltwiseMul::invoke( - const Tensor& a, - const Tensor& b, - const std::optional& memory_config, - const std::optional dtype, - const std::optional math_fidelity) { - return invoke(DefaultQueueId, a, b, memory_config, dtype, math_fidelity); -} - } // namespace ttnn::operations::experimental::ssm diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.hpp index 446b568947f..dd76d40961d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.hpp @@ -18,13 +18,6 @@ struct ExecuteRepeatAndInterleaveEltwiseMul { const std::optional& memory_config = std::nullopt, const std::optional dtype = std::nullopt, const std::optional math_fidelity = std::nullopt); - - static ttnn::Tensor invoke( - const Tensor& a, - const Tensor& b, - const std::optional& memory_config = std::nullopt, - const std::optional dtype = std::nullopt, - const std::optional math_fidelity = std::nullopt); }; } // namespace ttnn::operations::experimental::ssm diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp index d43c7df809a..b94832bcb55 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp @@ -33,13 +33,4 @@ ttnn::Tensor ArgMaxOperation::invoke( .at(0); } -ttnn::Tensor 
ArgMaxOperation::invoke( - const Tensor& input_tensor, - const std::optional dim, - const bool use_muticore, - const std::optional& memory_config, - std::optional optional_output_tensor) { - return invoke(DefaultQueueId, input_tensor, dim, use_muticore, memory_config, std::move(optional_output_tensor)); -} - } // namespace ttnn::operations::reduction diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.hpp b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.hpp index a708b177af9..74dc3834473 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.hpp @@ -19,13 +19,6 @@ struct ArgMaxOperation { const bool use_muticore = false, const std::optional& memory_config = std::nullopt, std::optional optional_output_tensor = std::nullopt); - - static ttnn::Tensor invoke( - const Tensor& input_tensor, - const std::optional dim = std::nullopt, - const bool use_muticore = false, - const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt); }; } // namespace operations::reduction diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp index 2d9e1d84a4a..0d24fd959a5 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp @@ -132,8 +132,7 @@ Tensor ProdOperation::invoke( const auto& input_shape = input_tensor_4d.get_logical_shape(); ttnn::SmallVector start_index = {0, 0, 0, 0}; ttnn::SmallVector end_index = {input_shape[0], input_shape[1], 1, input_shape[3]}; - result = ttnn::squeeze_from_4D( - ttnn::slice(DefaultQueueId, required, start_index, end_index, step, std::nullopt), old_rank); + result = ttnn::squeeze_from_4D(ttnn::slice(required, start_index, end_index, step, std::nullopt), old_rank); } else { // dim 3 // permute ttnn::SmallVector after_permute_dims = {1, 2, 0, 3}; @@ -142,7 +141,7 @@ Tensor ProdOperation::invoke( const auto& input_shape = input_tensor_4d.get_logical_shape(); ttnn::SmallVector start_index = {0, 0, 0, 0}; ttnn::SmallVector end_index = {input_shape[0], input_shape[1], 1, input_shape[2]}; - Tensor new_unpad_tensor = ttnn::slice(DefaultQueueId, required, start_index, end_index, step, std::nullopt); + Tensor new_unpad_tensor = ttnn::slice(required, start_index, end_index, step, std::nullopt); // permute back after_permute_dims = {0, 1, 3, 2}; Tensor res_host = ttnn::permute(new_unpad_tensor, after_permute_dims, output_mem_config); diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp index cf34ac215c2..30bb8f97010 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp @@ -13,6 +13,7 @@ #include #include #include + #include #include "ttnn/tensor/host_buffer/functions.hpp" From d22b9e530dafbcc77e14c4b27f2413a1aadc3473 Mon Sep 17 00:00:00 2001 From: Denys Makoviichuk Date: Fri, 7 Feb 2025 17:35:31 -0800 Subject: [PATCH 028/316] [TT-Train] Updated cmake for tt_stl (#17753) ### Problem description #include <> doesn't find. ### What's changed Added tt_stl deps and path. 
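
For context, the pattern being extended here is the list of tt-metal include directories that the ttml component consumes when building against a source checkout of tt-metal. A minimal sketch of the idea is shown below, assuming a hypothetical TTML_INCLUDE_DIRS variable and that TT_METAL_HOME points at the tt-metal checkout; only the tt_metal/tt_stl path itself is taken from this change.

    # Illustrative sketch, not the actual CMakeLists.txt hunk; the variable name is an assumption.
    list(APPEND TTML_INCLUDE_DIRS
        "$ENV{TT_METAL_HOME}/tt_metal/include"
        "$ENV{TT_METAL_HOME}/tt_metal/tt_stl"   # added so the tt_stl headers behind the failing #include <> resolve
    )

Once the tt_stl directory is on the include path, the headers are found without any changes to the sources themselves.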
### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes --- tt-train/sources/ttml/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tt-train/sources/ttml/CMakeLists.txt b/tt-train/sources/ttml/CMakeLists.txt index ed7344f8ff2..14c315e6e20 100644 --- a/tt-train/sources/ttml/CMakeLists.txt +++ b/tt-train/sources/ttml/CMakeLists.txt @@ -27,6 +27,7 @@ if(NOT TARGET Metalium::Metal) "$ENV{TT_METAL_HOME}/tt_metal/third_party/umd/device/api" "$ENV{TT_METAL_HOME}/tt_metal/hostdevcommon/api" "$ENV{TT_METAL_HOME}/tt_metal/include" + "$ENV{TT_METAL_HOME}/tt_metal/tt_stl" # TTNN "$ENV{TT_METAL_HOME}/ttnn" "$ENV{TT_METAL_HOME}/ttnn/cpp" From 99a6252fd559ce74522add5012ee2bfb2c384fc6 Mon Sep 17 00:00:00 2001 From: asaigal Date: Fri, 7 Feb 2025 15:56:22 -0600 Subject: [PATCH 029/316] Update get_dispatch_core() for unused TG MMIO dispatch cores - Additionally ensure that no runtime traffic is sent to MMIO chip dispatch cores, since they are idle and this is unnecessary --- tt_metal/impl/device/device_pool.cpp | 5 +++++ tt_metal/impl/dispatch/dispatch_query_manager.cpp | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index 657be4dc5c3..753631cc992 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -536,8 +536,13 @@ void DevicePool::close_devices(const std::vector& devices) { // before closing any device + modifying routing info. // If this is not done, non-blocking CCLs followed by a close will hang, since // the main thread will modify device state while the CCL is running on device. + // On TG - this should not be done on MMIO mapped devices, since we don't run + // any workloads on them for (const auto& dev_id : devices_to_close) { auto dev = tt::DevicePool::instance().get_active_device(dev_id); + if (tt::Cluster::instance().is_galaxy_cluster() and dev->is_mmio_capable()) { + continue; + } dev->synchronize(); // Synchronize worker queue Synchronize(dev); // Synchronize device } diff --git a/tt_metal/impl/dispatch/dispatch_query_manager.cpp b/tt_metal/impl/dispatch/dispatch_query_manager.cpp index 9eef6cbc72a..4ffa7597b31 100644 --- a/tt_metal/impl/dispatch/dispatch_query_manager.cpp +++ b/tt_metal/impl/dispatch/dispatch_query_manager.cpp @@ -34,8 +34,13 @@ tt_cxy_pair dispatch_core(uint8_t cq_id) { for (chip_id_t device_id = 0; device_id < tt::Cluster::instance().number_of_devices(); device_id++) { uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); if (tt::Cluster::instance().get_associated_mmio_device(device_id) == device_id) { - // Dispatch core is not allocated on this MMIO device, skip it - if (not dispatch_core_mgr::instance().is_dispatcher_core_allocated(device_id, channel, cq_id)) { + // Dispatch core is not allocated on this MMIO device or this is a TG system, skip it + // On TG, local dispatch cores are allocated on MMIO devices, but are not used + // since programs are not run on these devices. The placement of these cores is + // irrelevant for the runtime layer, since these are not used. Hence, these are + // skipped. 
+ if (not dispatch_core_mgr::instance().is_dispatcher_core_allocated(device_id, channel, cq_id) or + tt::Cluster::instance().is_galaxy_cluster()) { continue; } dispatch_core = dispatch_core_mgr::instance().dispatcher_core(device_id, channel, cq_id); From 15ffcc8cf4ebb70c72fc0f5830e42491901e2fb5 Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Sat, 8 Feb 2025 10:01:53 +0000 Subject: [PATCH 030/316] #15496 Change Tensor serialization to serialize TensorSpec with flatbuffer (#17748) ### Ticket https://github.com/tenstorrent/tt-metal/issues/15496 #16067 ### Problem description Currently TensorSpec isn't being serialized properly, which causes issues in some cases. In particular, it causes bugs in `as_tensor` with transposed tiles. ### What's changed Introduce flatbuffer schema for TensorSpec serialization. Added conversion code to/from TensorSpec to flatbuffer struct. Heavily modified serialization code to preserve compatibility with the old format, but serialize TensorSpec with flatbuffer in newer versions. Changed fstream io into fread/fwrite to improve performance. ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13213319656) CI passes - [x] [Model regression](https://github.com/tenstorrent/tt-metal/actions/runs/13209781898) - [x] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/runs/13209784841) - [x] New/Existing tests provide coverage for changes --- ttnn/CMakeLists.txt | 4 + ttnn/cpp/ttnn/tensor/CMakeLists.txt | 2 + .../ttnn/tensor/flatbuffer/tensor_types.fbs | 103 ++++ .../tensor_types_from_flatbuffer.cpp | 132 +++++ .../tensor_types_from_flatbuffer.hpp | 26 + .../flatbuffer/tensor_types_to_flatbuffer.cpp | 148 +++++ .../flatbuffer/tensor_types_to_flatbuffer.hpp | 37 ++ ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp | 5 + ttnn/cpp/ttnn/tensor/layout/tensor_layout.hpp | 3 + ttnn/cpp/ttnn/tensor/serialization.cpp | 533 ++++++++++-------- ttnn/cpp/ttnn/tensor/serialization.hpp | 4 +- ttnn/cpp/ttnn/tensor/types.hpp | 2 +- 12 files changed, 764 insertions(+), 235 deletions(-) create mode 100644 ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types.fbs create mode 100644 ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.cpp create mode 100644 ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.hpp create mode 100644 ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.cpp create mode 100644 ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.hpp diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 74f3ef87d4f..9d750c67593 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -681,6 +681,7 @@ set(TTNN_PUBLIC_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} # ${PROJECT_SOURCE_DIR}/ttnn ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/deprecated # symlink to tt_eager; should become native folder once merge complete ${CMAKE_CURRENT_SOURCE_DIR}/cpp + ${CMAKE_CURRENT_BINARY_DIR}/flatbuffers ) set(TTNN_PUBLIC_LINK_LIBRARIES metal_common_libs @@ -689,6 +690,7 @@ set(TTNN_PUBLIC_LINK_LIBRARIES xtensor xtensor-blas xtl + FlatBuffers::FlatBuffers ) set(TTNN_PUBLIC_LINK_DIRS "") @@ -803,6 +805,8 @@ endforeach( ${TTNN_SUBLIBRARIES} ) +GENERATE_FBS_HEADER(${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/tensor/flatbuffer/tensor_types.fbs) +list(APPEND TENSOR_SRCS ${FBS_GENERATED_HEADER_FILE}) add_ttnn_sublibrary(ttnn_tensor ${TENSOR_SRCS}) add_ttnn_sublibrary(ttnn_ccl ${CCL_TTNN_SRCS}) add_ttnn_sublibrary(ttnn_ccl_exp ${CCL_EXPERIMENTAL_TTNN_SRCS}) diff --git a/ttnn/cpp/ttnn/tensor/CMakeLists.txt 
b/ttnn/cpp/ttnn/tensor/CMakeLists.txt index 417c64b8580..6d9371fa738 100644 --- a/ttnn/cpp/ttnn/tensor/CMakeLists.txt +++ b/ttnn/cpp/ttnn/tensor/CMakeLists.txt @@ -12,6 +12,8 @@ set(TENSOR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/layout/page_config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/tensor_layout.cpp ${CMAKE_CURRENT_SOURCE_DIR}/xtensor/partition.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/flatbuffer/tensor_types_to_flatbuffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/flatbuffer/tensor_types_from_flatbuffer.cpp CACHE INTERNAL "Tensor sources to reuse in ttnn build" ) diff --git a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types.fbs b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types.fbs new file mode 100644 index 00000000000..d0b2c84f950 --- /dev/null +++ b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types.fbs @@ -0,0 +1,103 @@ +namespace ttnn.flatbuffer; + +table CoreCoord { + x: int; + y: int; +} + +table CoreRange { + start: CoreCoord; + end: CoreCoord; +} + +table CoreRangeSet { + ranges: [CoreRange]; +} + +table Tile { + tile_shape_h: uint32; + tile_shape_w: uint32; + transpose_tile: bool; +} + +enum TensorMemoryLayout: ushort { + Interleaved = 0, + SingleBank = 1, + HeightSharded = 2, + WidthSharded = 3, + BlockSharded = 4, +} + +enum BufferType: ushort { + DRAM = 0, + L1 = 1, + SystemMemory = 2, + L1Small = 3, + Trace = 4, +} + +enum ShardOrientation : ubyte { + RowMajor = 0, + ColMajor = 1, +} + +enum ShardMode : ubyte { + Physical, + Logical, +} + +table ShardShape { + height: uint32; + width: uint32; +} + +table ShardSpec { + grid: CoreRangeSet; + shape_h: uint32; + shape_w: uint32; + orientation: ShardOrientation; + shard_mode: ShardMode; + physical_shard_shape: ShardShape; +} + +enum DataType : ubyte { + BFloat16 = 0, + Float32 = 1, + UInt32 = 2, + BFloat8B = 3, + BFloat4B = 4, + UInt8 = 5, + UInt16 = 6, + Int32 = 7, + Invalid = 8 +} + +table RowMajorPageConfig {} +table TilePageConfig { + tile: Tile; +} + +union PageConfig { + row_major: RowMajorPageConfig, + tile: TilePageConfig, +} + +table MemoryConfig { + memory_layout: TensorMemoryLayout; + buffer_type: BufferType; + shard_spec: ShardSpec; +} + +table TensorLayout { + data_type: DataType; + page_config: PageConfig; + memory_config: MemoryConfig; + alignment: [uint32]; +} + +table TensorSpec { + shape: [uint32]; + tensor_layout: TensorLayout; +} + +root_type TensorSpec; diff --git a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.cpp b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.cpp new file mode 100644 index 00000000000..9c187e5d418 --- /dev/null +++ b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.cpp @@ -0,0 +1,132 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "tensor_types_from_flatbuffer.hpp" + +namespace ttnn { + +BufferType from_flatbuffer(flatbuffer::BufferType type) { + switch (type) { + case flatbuffer::BufferType::DRAM: return BufferType::DRAM; + case flatbuffer::BufferType::L1: return BufferType::L1; + case flatbuffer::BufferType::SystemMemory: return BufferType::SYSTEM_MEMORY; + case flatbuffer::BufferType::L1Small: return BufferType::L1_SMALL; + case flatbuffer::BufferType::Trace: return BufferType::TRACE; + } + TT_THROW("Unsupported BufferType from flatbuffer."); +} + +TensorMemoryLayout from_flatbuffer(flatbuffer::TensorMemoryLayout layout) { + switch (layout) { + case flatbuffer::TensorMemoryLayout::Interleaved: return TensorMemoryLayout::INTERLEAVED; + case flatbuffer::TensorMemoryLayout::SingleBank: return TensorMemoryLayout::SINGLE_BANK; + case flatbuffer::TensorMemoryLayout::HeightSharded: return TensorMemoryLayout::HEIGHT_SHARDED; + case flatbuffer::TensorMemoryLayout::WidthSharded: return TensorMemoryLayout::WIDTH_SHARDED; + case flatbuffer::TensorMemoryLayout::BlockSharded: return TensorMemoryLayout::BLOCK_SHARDED; + } + TT_THROW("Unsupported TensorMemoryLayout from flatbuffer."); +} + +DataType from_flatbuffer(flatbuffer::DataType type) { + switch (type) { + case flatbuffer::DataType::BFloat16: return DataType::BFLOAT16; + case flatbuffer::DataType::Float32: return DataType::FLOAT32; + case flatbuffer::DataType::UInt32: return DataType::UINT32; + case flatbuffer::DataType::BFloat8B: return DataType::BFLOAT8_B; + case flatbuffer::DataType::BFloat4B: return DataType::BFLOAT4_B; + case flatbuffer::DataType::UInt8: return DataType::UINT8; + case flatbuffer::DataType::UInt16: return DataType::UINT16; + case flatbuffer::DataType::Int32: return DataType::INT32; + case flatbuffer::DataType::Invalid: return DataType::INVALID; + } + TT_THROW("Unsupported DataType from flatbuffer."); +} + +MemoryConfig from_flatbuffer(const flatbuffer::MemoryConfig* config) { + std::optional shard_spec; + if (config->shard_spec()) { + shard_spec = from_flatbuffer(config->shard_spec()); + } + return MemoryConfig{ + from_flatbuffer(config->memory_layout()), + from_flatbuffer(config->buffer_type()), + shard_spec, + }; +} + +ShardOrientation from_flatbuffer(flatbuffer::ShardOrientation orientation) { + switch (orientation) { + case flatbuffer::ShardOrientation::RowMajor: return ShardOrientation::ROW_MAJOR; + case flatbuffer::ShardOrientation::ColMajor: return ShardOrientation::COL_MAJOR; + } + TT_THROW("Unsupported ShardOrientation from flatbuffer."); +} + +ShardMode from_flatbuffer(flatbuffer::ShardMode mode) { + switch (mode) { + case flatbuffer::ShardMode::Physical: return ShardMode::PHYSICAL; + case flatbuffer::ShardMode::Logical: return ShardMode::LOGICAL; + } + TT_THROW("Unsupported ShardMode from flatbuffer."); +} + +ShardSpec from_flatbuffer(const flatbuffer::ShardSpec* spec) { + CoreRangeSet grid = from_flatbuffer(spec->grid()); + std::array shape = {spec->shape_h(), spec->shape_w()}; + ShardOrientation orientation = from_flatbuffer(spec->orientation()); + ShardMode mode = from_flatbuffer(spec->shard_mode()); + if (const auto* fb_shard_shape = spec->physical_shard_shape()) { + std::array physical_shard_shape = {fb_shard_shape->height(), fb_shard_shape->width()}; + return ShardSpec(grid, shape, physical_shard_shape, orientation); + } + return ShardSpec(grid, shape, orientation, mode); +} + +CoreCoord from_flatbuffer(const flatbuffer::CoreCoord* core_coord) { + return 
CoreCoord{core_coord->x(), core_coord->y()}; +} + +CoreRange from_flatbuffer(const flatbuffer::CoreRange* core_range) { + return CoreRange{ + {core_range->start()->x(), core_range->start()->y()}, {core_range->end()->x(), core_range->end()->y()}}; +} + +CoreRangeSet from_flatbuffer(const flatbuffer::CoreRangeSet* core_range_set) { + std::vector ranges; + for (const auto* range : *core_range_set->ranges()) { + ranges.emplace_back( + CoreCoord{range->start()->x(), range->start()->y()}, CoreCoord{range->end()->x(), range->end()->y()}); + } + return CoreRangeSet{ranges}; +} + +TensorLayout from_flatbuffer(const flatbuffer::TensorLayout* layout) { + PageConfig page_config = [&] { + switch (layout->page_config_type()) { + case flatbuffer::PageConfig::row_major: return PageConfig(Layout::ROW_MAJOR); + case flatbuffer::PageConfig::tile: { + const auto* tile_page_config = layout->page_config_as_tile(); + const auto* flat_tile = tile_page_config->tile(); + Tile tile( + std::array{flat_tile->tile_shape_h(), flat_tile->tile_shape_w()}, flat_tile->transpose_tile()); + return PageConfig(Layout::TILE, tile); + } + default: TT_THROW("Unsupported PageConfig type from flatbuffer."); + } + }(); + + return TensorLayout::restore_from_serialized( + from_flatbuffer(layout->data_type()), + page_config, + from_flatbuffer(layout->memory_config()), + Alignment(SmallVector(layout->alignment()->cbegin(), layout->alignment()->cend()))); +} + +TensorSpec from_flatbuffer(const flatbuffer::TensorSpec* spec) { + return TensorSpec( + Shape(SmallVector(spec->shape()->cbegin(), spec->shape()->cend())), + from_flatbuffer(spec->tensor_layout())); +} + +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.hpp b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.hpp new file mode 100644 index 00000000000..906b0d8940e --- /dev/null +++ b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "tensor_types_generated.h" +#include "ttnn/tensor/types.hpp" +#include "ttnn/tensor/tensor_spec.hpp" + +namespace ttnn { + +BufferType from_flatbuffer(flatbuffer::BufferType type); +TensorMemoryLayout from_flatbuffer(flatbuffer::TensorMemoryLayout layout); +DataType from_flatbuffer(flatbuffer::DataType type); +ShardOrientation from_flatbuffer(flatbuffer::ShardOrientation orientation); +ShardMode from_flatbuffer(flatbuffer::ShardMode mode); +CoreCoord from_flatbuffer(const flatbuffer::CoreCoord* fb_coord); +CoreRange from_flatbuffer(const flatbuffer::CoreRange* fb_coord); +CoreRangeSet from_flatbuffer(const flatbuffer::CoreRangeSet* fb_coord); +ShardSpec from_flatbuffer(const flatbuffer::ShardSpec* spec); +MemoryConfig from_flatbuffer(const flatbuffer::MemoryConfig* config); +TensorLayout from_flatbuffer(const flatbuffer::TensorLayout* layout); +TensorSpec from_flatbuffer(const flatbuffer::TensorSpec* spec); + +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.cpp b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.cpp new file mode 100644 index 00000000000..dce51ca4177 --- /dev/null +++ b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.cpp @@ -0,0 +1,148 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "tensor_types_to_flatbuffer.hpp" + +namespace ttnn { + +flatbuffer::ShardOrientation to_flatbuffer(ShardOrientation orientation) { + switch (orientation) { + case ShardOrientation::ROW_MAJOR: return flatbuffer::ShardOrientation::RowMajor; + case ShardOrientation::COL_MAJOR: return flatbuffer::ShardOrientation::ColMajor; + } + TT_THROW("Unsupported ShardOrientation to flatbuffer."); +} + +flatbuffer::ShardMode to_flatbuffer(ShardMode shard_mode) { + switch (shard_mode) { + case ShardMode::LOGICAL: return flatbuffer::ShardMode::Logical; + case ShardMode::PHYSICAL: return flatbuffer::ShardMode::Physical; + } + TT_THROW("Unsupported ShardMode to flatbuffer."); +} + +flatbuffers::Offset to_flatbuffer( + const ShardSpec& spec, flatbuffers::FlatBufferBuilder& builder) { + flatbuffers::Offset physical_shard_shape = 0; + if (spec.physical_shard_shape.has_value()) { + const auto& phys_shape = *spec.physical_shard_shape; + physical_shard_shape = flatbuffer::CreateShardShape(builder, phys_shape[0], phys_shape[1]); + } + return flatbuffer::CreateShardSpec( + builder, + to_flatbuffer(builder, spec.grid), + spec.shape[0], + spec.shape[1], + to_flatbuffer(spec.orientation), + to_flatbuffer(spec.mode), + physical_shard_shape); +} + +flatbuffers::Offset to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const CoreCoord& core_coord) { + return flatbuffer::CreateCoreCoord(builder, core_coord.x, core_coord.y); +} + +flatbuffers::Offset to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const CoreRange& core_range) { + auto start = flatbuffer::CreateCoreCoord(builder, core_range.start_coord.x, core_range.start_coord.y); + auto end = flatbuffer::CreateCoreCoord(builder, core_range.end_coord.x, core_range.end_coord.y); + return flatbuffer::CreateCoreRange(builder, start, end); +} + +flatbuffers::Offset to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const CoreRangeSet& core_range_set) { + std::vector> range_offsets; + for (const auto& range : core_range_set.ranges()) { + auto start = flatbuffer::CreateCoreCoord(builder, range.start_coord.x, range.start_coord.y); + auto end = flatbuffer::CreateCoreCoord(builder, range.end_coord.x, range.end_coord.y); + range_offsets.push_back(flatbuffer::CreateCoreRange(builder, start, end)); + } + auto ranges_vector = builder.CreateVector(range_offsets); + return flatbuffer::CreateCoreRangeSet(builder, ranges_vector); +} + +flatbuffer::TensorMemoryLayout to_flatbuffer(TensorMemoryLayout layout) { + switch (layout) { + case TensorMemoryLayout::INTERLEAVED: return flatbuffer::TensorMemoryLayout::Interleaved; + case TensorMemoryLayout::SINGLE_BANK: return flatbuffer::TensorMemoryLayout::SingleBank; + case TensorMemoryLayout::HEIGHT_SHARDED: return flatbuffer::TensorMemoryLayout::HeightSharded; + case TensorMemoryLayout::WIDTH_SHARDED: return flatbuffer::TensorMemoryLayout::WidthSharded; + case TensorMemoryLayout::BLOCK_SHARDED: return flatbuffer::TensorMemoryLayout::BlockSharded; + } + TT_THROW("Unsupported TensorMemoryLayout to flatbuffer."); +} + +flatbuffer::BufferType to_flatbuffer(BufferType type) { + switch (type) { + case BufferType::DRAM: return flatbuffer::BufferType::DRAM; + case BufferType::L1: return flatbuffer::BufferType::L1; + case BufferType::SYSTEM_MEMORY: return flatbuffer::BufferType::SystemMemory; + case BufferType::L1_SMALL: return flatbuffer::BufferType::L1Small; + case BufferType::TRACE: return flatbuffer::BufferType::Trace; + } + TT_THROW("Unsupported BufferType to flatbuffer."); 
+} + +flatbuffer::DataType to_flatbuffer(DataType type) { + switch (type) { + case DataType::BFLOAT16: return flatbuffer::DataType::BFloat16; + case DataType::FLOAT32: return flatbuffer::DataType::Float32; + case DataType::UINT32: return flatbuffer::DataType::UInt32; + case DataType::BFLOAT8_B: return flatbuffer::DataType::BFloat8B; + case DataType::BFLOAT4_B: return flatbuffer::DataType::BFloat4B; + case DataType::UINT8: return flatbuffer::DataType::UInt8; + case DataType::UINT16: return flatbuffer::DataType::UInt16; + case DataType::INT32: return flatbuffer::DataType::Int32; + case DataType::INVALID: return flatbuffer::DataType::Invalid; + } + TT_THROW("Unsupported DataType to flatbuffer."); +} + +flatbuffers::Offset to_flatbuffer( + const MemoryConfig& config, flatbuffers::FlatBufferBuilder& builder) { + flatbuffers::Offset shard_spec = 0; + if (config.shard_spec.has_value()) { + shard_spec = to_flatbuffer(*config.shard_spec, builder); + } + return flatbuffer::CreateMemoryConfig( + builder, to_flatbuffer(config.memory_layout), to_flatbuffer(config.buffer_type), shard_spec); +} + +flatbuffers::Offset to_flatbuffer( + const TensorLayout& layout, flatbuffers::FlatBufferBuilder& builder) { + const auto& alignment = layout.get_alignment(); + auto flat_alignment = builder.CreateVector(alignment.view().data(), alignment.size()); + auto page_config = layout.get_page_config(); + if (page_config.get_layout() == Layout::TILE) { + auto tile = page_config.get_tile(); + auto flat_tile = + flatbuffer::CreateTile(builder, tile.get_height(), tile.get_width(), tile.get_transpose_of_faces()); + return flatbuffer::CreateTensorLayout( + builder, + to_flatbuffer(layout.get_data_type()), + flatbuffer::PageConfig::tile, + flatbuffer::CreateTilePageConfig(builder, flat_tile).Union(), + to_flatbuffer(layout.get_memory_config(), builder), + flat_alignment); + } else if (page_config.get_layout() == Layout::ROW_MAJOR) { + return flatbuffer::CreateTensorLayout( + builder, + to_flatbuffer(layout.get_data_type()), + flatbuffer::PageConfig::row_major, + flatbuffer::CreateRowMajorPageConfig(builder).Union(), + to_flatbuffer(layout.get_memory_config(), builder), + flat_alignment); + } + TT_THROW("Unsupported PageConfig type to flatbuffer."); +} + +flatbuffers::Offset to_flatbuffer( + const TensorSpec& spec, flatbuffers::FlatBufferBuilder& builder) { + const auto& shape = spec.logical_shape(); + auto flat_shape = builder.CreateVector(shape.view().data(), shape.rank()); + return flatbuffer::CreateTensorSpec(builder, flat_shape, to_flatbuffer(spec.tensor_layout(), builder)); +} + +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.hpp b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.hpp new file mode 100644 index 00000000000..ab7e3a2533e --- /dev/null +++ b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.hpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "tensor_types_generated.h" + +#include "ttnn/tensor/types.hpp" +#include "ttnn/tensor/tensor_spec.hpp" + +namespace ttnn { + +flatbuffers::Offset to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const CoreCoord& core_coord); +flatbuffers::Offset to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const CoreRange& core_range); +flatbuffers::Offset to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const CoreRangeSet& core_range_set); + +flatbuffer::ShardOrientation to_flatbuffer(ShardOrientation orientation); +flatbuffer::ShardMode to_flatbuffer(ShardMode shard_mode); +flatbuffers::Offset to_flatbuffer( + const ShardSpec& spec, flatbuffers::FlatBufferBuilder& builder); + +flatbuffer::TensorMemoryLayout to_flatbuffer(TensorMemoryLayout layout); +flatbuffer::BufferType to_flatbuffer(BufferType type); +flatbuffer::DataType to_flatbuffer(DataType type); + +flatbuffers::Offset to_flatbuffer( + const MemoryConfig& config, flatbuffers::FlatBufferBuilder& builder); +flatbuffers::Offset to_flatbuffer( + const TensorLayout& layout, flatbuffers::FlatBufferBuilder& builder); +flatbuffers::Offset to_flatbuffer( + const TensorSpec& spec, flatbuffers::FlatBufferBuilder& builder); + +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp index 4f882a44e18..8bd564e511c 100644 --- a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp +++ b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp @@ -138,6 +138,11 @@ TensorLayout TensorLayout::fromPaddedShape( CMAKE_UNIQUE_NAMESPACE::legacyShapeToAlignment(logical_shape, padded_shape, page_config, memory_config)); } +TensorLayout TensorLayout::restore_from_serialized( + DataType dtype, const PageConfig& page_config, const MemoryConfig& memory_config, const Alignment& alignment) { + return TensorLayout(dtype, page_config, memory_config, alignment); +} + void TensorLayout::initialize_alignment() { auto default_alignment = page_config_.create_default_alignment(dtype_, memory_config_); if (alignment_.empty()) { diff --git a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.hpp b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.hpp index 04c44758f67..e7c12dbaf17 100644 --- a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.hpp +++ b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.hpp @@ -82,6 +82,9 @@ class TensorLayout { return std::forward_as_tuple(dtype_, page_config_, memory_config_, alignment_); } + static TensorLayout restore_from_serialized( + DataType dtype, const PageConfig& page_config, const MemoryConfig& memory_config, const Alignment& alignment); + private: // Private to not expose alignment parameter to the public API TensorLayout( diff --git a/ttnn/cpp/ttnn/tensor/serialization.cpp b/ttnn/cpp/ttnn/tensor/serialization.cpp index 455c0b90126..ee5209a0aa2 100644 --- a/ttnn/cpp/ttnn/tensor/serialization.cpp +++ b/ttnn/cpp/ttnn/tensor/serialization.cpp @@ -5,15 +5,18 @@ #include "ttnn/tensor/serialization.hpp" #include -#include -#include +#include #include #include +#include + #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/tensor_utils.hpp" #include "ttnn/tensor/types.hpp" #include "ttnn/distributed/types.hpp" +#include "ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.hpp" +#include "ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.hpp" namespace tt::tt_metal { @@ -69,99 +72,136 @@ struct LegacyShape { } }; -static constexpr std::size_t SENTINEL_VALUE = std::numeric_limits::max(); +static constexpr 
uint64_t SENTINEL_VALUE = std::numeric_limits::max(); + +void safe_fread(void* buffer, size_t size, size_t count, FILE* file) { + if (fread(buffer, size, count, file) != count) { + TT_THROW("Failed to read tensor data, file must be corrupted"); + } +} + +void safe_fwrite(const void* buffer, size_t size, size_t count, FILE* file) { + if (fwrite(buffer, size, count, file) != count) { + TT_THROW("Failed to write tensor data: file write failed"); + } +} + +void dump_tensor_spec(const TensorSpec& tensor_spec, FILE* output_file) { + flatbuffers::FlatBufferBuilder builder; + auto flat_spec = ttnn::to_flatbuffer(tensor_spec, builder); + builder.Finish(flat_spec); + uint64_t buffer_size = builder.GetSize(); + safe_fwrite(&buffer_size, sizeof(buffer_size), 1, output_file); + safe_fwrite(builder.GetBufferPointer(), buffer_size, 1, output_file); +} -void dump_owned_storage(std::ofstream& output_stream, const OwnedStorage& storage) { +TensorSpec load_tensor_spec(FILE* input_file) { + uint64_t bin_size = 0; + safe_fread(&bin_size, sizeof(bin_size), 1, input_file); + std::vector bin(bin_size); + safe_fread(bin.data(), bin_size, 1, input_file); + flatbuffers::Verifier verifier(bin.data(), bin_size); + if (!ttnn::flatbuffer::VerifyTensorSpecBuffer(verifier)) { + TT_THROW("TensorSpec deserialization failed: invalid buffer"); + } + auto spec = ttnn::flatbuffer::GetTensorSpec(bin.data()); + return ttnn::from_flatbuffer(spec); +} + +void dump_owned_storage(FILE* output_file, const OwnedStorage& storage) { std::visit( - [&output_stream](const owned_buffer::Buffer& generic_buffer) { + [output_file](const owned_buffer::Buffer& generic_buffer) { const auto buffer = owned_buffer::get_as(generic_buffer); - auto size = buffer.size(); - output_stream.write(reinterpret_cast(&size), sizeof(size)); - output_stream.write(reinterpret_cast(buffer.begin()), sizeof(T) * size); + uint64_t size = buffer.size(); + safe_fwrite(&size, sizeof(size), 1, output_file); + safe_fwrite(buffer.data(), sizeof(T) * size, 1, output_file); }, storage.buffer); } -void dump_borrowed_storage(std::ofstream& output_stream, const BorrowedStorage& storage) { +void dump_borrowed_storage(FILE* output_file, const BorrowedStorage& storage) { std::visit( - [&output_stream](const borrowed_buffer::Buffer& generic_buffer) { + [output_file](const borrowed_buffer::Buffer& generic_buffer) { const auto buffer = borrowed_buffer::get_as(generic_buffer); - auto size = buffer.size(); - output_stream.write(reinterpret_cast(&size), sizeof(size)); - output_stream.write(reinterpret_cast(buffer.begin()), sizeof(T) * size); + uint64_t size = buffer.size(); + safe_fwrite(&size, sizeof(size), 1, output_file); + safe_fwrite(buffer.data(), sizeof(T) * size, 1, output_file); }, storage.buffer); } void dump_multi_device_host_storage( - std::ofstream& output_stream, const MultiDeviceHostStorage& storage, const DistributedTensorConfig& strategy) { - std::size_t num_buffers = storage.num_buffers(); - output_stream.write(reinterpret_cast(&num_buffers), sizeof(std::size_t)); + FILE* output_file, const MultiDeviceHostStorage& storage, const DistributedTensorConfig& strategy) { + uint64_t num_buffers = storage.num_buffers(); + safe_fwrite(&num_buffers, sizeof(num_buffers), 1, output_file); // Use the user-specified strategy which defines how it gets distributed when mapped onto multi-device - output_stream.write(reinterpret_cast(&strategy), sizeof(DistributedTensorConfig)); + safe_fwrite(&strategy, sizeof(strategy), 1, output_file); if (std::holds_alternative(strategy)) { std::visit( 
- [&output_stream](const owned_buffer::Buffer& generic_buffer) { + [output_file](const owned_buffer::Buffer& generic_buffer) { const auto buffer = owned_buffer::get_as(generic_buffer); - auto size = buffer.size(); - output_stream.write(reinterpret_cast(&size), sizeof(size)); - output_stream.write(reinterpret_cast(buffer.begin()), sizeof(T) * size); + uint64_t size = buffer.size(); + safe_fwrite(&size, sizeof(size), 1, output_file); + safe_fwrite(buffer.begin(), sizeof(T) * size, 1, output_file); }, storage.get_buffer(0)); auto spec = storage.specs.at(0); - LegacyShape shape(spec.logical_shape(), spec.padded_shape()); - output_stream.write(reinterpret_cast(&shape), sizeof(LegacyShape)); + dump_tensor_spec(spec, output_file); } else { for (int i = 0; i < num_buffers; i++) { std::visit( - [&output_stream](const owned_buffer::Buffer& generic_buffer) { + [output_file](const owned_buffer::Buffer& generic_buffer) { const auto buffer = owned_buffer::get_as(generic_buffer); - auto size = buffer.size(); - output_stream.write(reinterpret_cast(&size), sizeof(size)); - output_stream.write(reinterpret_cast(buffer.begin()), sizeof(T) * size); + uint64_t size = buffer.size(); + safe_fwrite(&size, sizeof(size), 1, output_file); + safe_fwrite(buffer.begin(), sizeof(T) * size, 1, output_file); }, storage.get_buffer(i)); } for (const auto& spec : storage.specs) { - LegacyShape shape(spec.logical_shape(), spec.padded_shape()); - output_stream.write(reinterpret_cast(&shape), sizeof(LegacyShape)); + dump_tensor_spec(spec, output_file); } } } template -OwnedStorage load_owned_storage(std::ifstream& input_stream) { - std::size_t size = 0; - input_stream.read(reinterpret_cast(&size), sizeof(std::size_t)); +OwnedStorage load_owned_storage(FILE* input_file) { + uint64_t size = 0; + safe_fread(&size, sizeof(size), 1, input_file); auto buffer = owned_buffer::create(size); - input_stream.read(reinterpret_cast(buffer.begin()), sizeof(T) * size); + safe_fread(buffer.begin(), sizeof(T) * size, 1, input_file); return {buffer}; } template MultiDeviceHostStorage load_multi_device_host_storage( - std::ifstream& input_stream, DataType data_type, Layout layout, MeshDevice* mesh_device) { - std::size_t num_buffers = 0; + FILE* input_file, DataType data_type, Layout layout, MeshDevice* mesh_device, uint8_t version_id) { + uint64_t num_buffers = 0; DistributedTensorConfig strategy; - input_stream.read(reinterpret_cast(&num_buffers), sizeof(std::size_t)); - input_stream.read(reinterpret_cast(&strategy), sizeof(DistributedTensorConfig)); + safe_fread(&num_buffers, sizeof(num_buffers), 1, input_file); + safe_fread(&strategy, sizeof(strategy), 1, input_file); std::vector buffers; std::vector specs; if (std::holds_alternative(strategy)) { - std::size_t size = 0; - input_stream.read(reinterpret_cast(&size), sizeof(std::size_t)); + uint64_t size = 0; + safe_fread(&size, sizeof(size), 1, input_file); auto buffer = owned_buffer::create(size); - auto shape = LegacyShape{}; - input_stream.read(reinterpret_cast(buffer.begin()), sizeof(T) * size); - input_stream.read(reinterpret_cast(&shape), sizeof(LegacyShape)); + safe_fread(buffer.begin(), sizeof(T) * size, 1, input_file); buffers.push_back(buffer); - TensorSpec spec( - shape.logical_shape(), - TensorLayout::fromPaddedShape( - data_type, PageConfig(layout), MemoryConfig{}, shape.logical_shape(), shape.padded_shape())); + auto spec = [&] { + if (version_id >= 5) { + return load_tensor_spec(input_file); + } + auto shape = LegacyShape{}; + safe_fread(&shape, sizeof(shape), 1, input_file); + 
return TensorSpec( + shape.logical_shape(), + TensorLayout::fromPaddedShape( + data_type, PageConfig(layout), MemoryConfig{}, shape.logical_shape(), shape.padded_shape())); + }(); specs.push_back(spec); for (std::size_t i = 1; i < mesh_device->num_devices(); ++i) { @@ -171,66 +211,68 @@ MultiDeviceHostStorage load_multi_device_host_storage( } else { for (std::size_t i = 0; i < num_buffers; ++i) { - std::size_t size = 0; - input_stream.read(reinterpret_cast(&size), sizeof(std::size_t)); - + uint64_t size = 0; + safe_fread(&size, sizeof(size), 1, input_file); auto buffer = owned_buffer::create(size); - input_stream.read(reinterpret_cast(buffer.begin()), sizeof(T) * size); - + safe_fread(buffer.begin(), sizeof(T) * size, 1, input_file); buffers.push_back(std::move(buffer)); } for (std::size_t i = 0; i < num_buffers; ++i) { - auto shape = LegacyShape{}; - input_stream.read(reinterpret_cast(&shape), sizeof(LegacyShape)); - TensorSpec spec( - shape.logical_shape(), - TensorLayout::fromPaddedShape( - data_type, PageConfig(layout), MemoryConfig{}, shape.logical_shape(), shape.padded_shape())); - specs.push_back(spec); + if (version_id >= 5) { + specs.push_back(load_tensor_spec(input_file)); + } else { + auto shape = LegacyShape{}; + safe_fread(&shape, sizeof(shape), 1, input_file); + TensorSpec spec( + shape.logical_shape(), + TensorLayout::fromPaddedShape( + data_type, PageConfig(layout), MemoryConfig{}, shape.logical_shape(), shape.padded_shape())); + specs.push_back(spec); + } } } return {strategy, buffers, specs}; } -OwnedStorage load_owned_storage(std::ifstream& input_stream, DataType data_type) { +OwnedStorage load_owned_storage(FILE* input_file, DataType data_type) { if (data_type == DataType::UINT32 or data_type == DataType::BFLOAT8_B or data_type == DataType::BFLOAT4_B) { using T = std::uint32_t; - return load_owned_storage(input_stream); + return load_owned_storage(input_file); } else if (data_type == DataType::INT32) { using T = std::int32_t; - return load_owned_storage(input_stream); + return load_owned_storage(input_file); } else if (data_type == DataType::UINT8) { using T = std::uint8_t; - return load_owned_storage(input_stream); + return load_owned_storage(input_file); } else if (data_type == DataType::UINT16) { using T = std::uint16_t; - return load_owned_storage(input_stream); + return load_owned_storage(input_file); } else if (data_type == DataType::FLOAT32) { using T = float; - return load_owned_storage(input_stream); + return load_owned_storage(input_file); } else if (data_type == DataType::BFLOAT16) { using T = bfloat16; - return load_owned_storage(input_stream); + return load_owned_storage(input_file); } else { TT_THROW("Unsupported DataType"); } } MultiDeviceHostStorage load_multi_device_host_storage( - std::ifstream& input_stream, DataType data_type, Layout layout, MeshDevice* mesh_device) { + FILE* input_file, DataType data_type, Layout layout, MeshDevice* mesh_device, uint8_t version_id) { if (data_type == DataType::UINT32 or data_type == DataType::BFLOAT8_B or data_type == DataType::BFLOAT4_B) { using T = std::uint32_t; - return load_multi_device_host_storage(input_stream, data_type, layout, mesh_device); + return load_multi_device_host_storage(input_file, data_type, layout, mesh_device, version_id); } else if (data_type == DataType::UINT16) { using T = std::uint16_t; - return load_multi_device_host_storage(input_stream, data_type, layout, mesh_device); + return load_multi_device_host_storage(input_file, data_type, layout, mesh_device, version_id); } else if (data_type 
== DataType::FLOAT32) { using T = float; - return load_multi_device_host_storage(input_stream, data_type, layout, mesh_device); + return load_multi_device_host_storage(input_file, data_type, layout, mesh_device, version_id); } else if (data_type == DataType::BFLOAT16) { using T = bfloat16; - return load_multi_device_host_storage(input_stream, data_type, layout, mesh_device); + return load_multi_device_host_storage(input_file, data_type, layout, mesh_device, version_id); } else { TT_THROW("Unsupported DataType"); } @@ -238,67 +280,191 @@ MultiDeviceHostStorage load_multi_device_host_storage( template Storage load_storage( - std::ifstream& input_stream, DataType data_type, Layout layout, StorageType storage_type, T device) { + FILE* input_file, DataType data_type, Layout layout, StorageType storage_type, T device, uint8_t version_id) { if (storage_type == StorageType::MULTI_DEVICE_HOST or storage_type == StorageType::MULTI_DEVICE) { if constexpr (std::is_same_v) { - return load_multi_device_host_storage(input_stream, data_type, layout, device); + return load_multi_device_host_storage(input_file, data_type, layout, device, version_id); } else { TT_THROW("MeshDevice is required for MULTI_DEVICE_HOST storage"); } } else { - return load_owned_storage(input_stream, data_type); + return load_owned_storage(input_file, data_type); + } +} + +template +Tensor load_tensor_helper_legacy_impl(FILE* input_file, T device, uint8_t version_id) { + auto shape = LegacyShape{}; + DataType data_type; + Layout layout; + StorageType storage_type; + safe_fread(&shape, sizeof(shape), 1, input_file); + safe_fread(&data_type, sizeof(data_type), 1, input_file); + safe_fread(&layout, sizeof(layout), 1, input_file); + safe_fread(&storage_type, sizeof(storage_type), 1, input_file); + + bool has_memory_config = false; + MemoryConfig memory_config = + MemoryConfig{.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::DRAM}; + + if (version_id >= 2) { + safe_fread(&has_memory_config, sizeof(has_memory_config), 1, input_file); + if (has_memory_config) { + memory_config = tt::tt_metal::load_memory_config(input_file); + } + } + + auto storage = load_storage(input_file, data_type, layout, storage_type, device, version_id); + + auto tensor = Tensor( + std::move(storage), + TensorSpec( + shape.logical_shape(), + TensorLayout::fromPaddedShape( + data_type, layout, MemoryConfig{}, shape.logical_shape(), shape.padded_shape()))); + if (device != nullptr) { + tensor = tensor.to_device(device, memory_config); + } else if (has_memory_config) { + tt::log_warning("Memory config is ignored when loading the tensor because device is not provided"); + } + return tensor; +} + +// Used before VERSION_ID was introduced +template +Tensor load_tensor_helper_very_legacy_impl(FILE* input_file, T device) { + auto shape = LegacyShape{}; + DataType data_type; + Layout layout; + safe_fread(&shape, sizeof(shape), 1, input_file); + safe_fread(&data_type, sizeof(data_type), 1, input_file); + safe_fread(&layout, sizeof(layout), 1, input_file); + + auto storage = load_owned_storage(input_file, data_type); + auto tensor = Tensor( + std::move(storage), + TensorSpec( + shape.logical_shape(), + TensorLayout::fromPaddedShape( + data_type, layout, MemoryConfig{}, shape.logical_shape(), shape.padded_shape()))); + if (device != nullptr) { + tensor = tensor.to_device(device); + } + return tensor; +} + +// Used before flatbuffer serialization, aka VERSION_ID < 5 +MemoryConfig load_memory_config_legacy_impl(FILE* input_file, uint8_t 
version_id) { + TensorMemoryLayout memory_layout; + BufferType buffer_type; + bool has_shard_spec; + safe_fread(&memory_layout, sizeof(memory_layout), 1, input_file); + safe_fread(&buffer_type, sizeof(buffer_type), 1, input_file); + safe_fread(&has_shard_spec, sizeof(has_shard_spec), 1, input_file); + + std::optional shard_spec = std::nullopt; + if (has_shard_spec) { + uint64_t num_core_ranges; + std::set core_ranges; + std::array shape; + ShardOrientation orientation; + + safe_fread(&num_core_ranges, sizeof(num_core_ranges), 1, input_file); + for (auto index = 0; index < num_core_ranges; index++) { + CoreRange core_range{{}, {}}; + safe_fread(&core_range, sizeof(core_range), 1, input_file); + core_ranges.insert(core_range); + } + safe_fread(&shape, sizeof(shape), 1, input_file); + safe_fread(&orientation, sizeof(orientation), 1, input_file); + if (version_id <= 3) { + // Read halo for backward compatibility. + bool halo; + safe_fread(&halo, sizeof(halo), 1, input_file); + } + shard_spec = {CoreRangeSet{core_ranges}, shape, orientation}; } + return MemoryConfig{memory_layout, buffer_type, shard_spec}; +} + +template +Tensor load_tensor_helper(const std::string& file_name, T device) { + FILE* input_file = fopen(file_name.c_str(), "rb"); + if (not input_file) { + TT_THROW("Cannot open \"{}\"", file_name); + } + std::unique_ptr file_guard(input_file, &fclose); + + std::size_t read_sentinel; + safe_fread(&read_sentinel, sizeof(read_sentinel), 1, input_file); + if (read_sentinel != SENTINEL_VALUE) { + fseek(input_file, 0, SEEK_SET); + return load_tensor_helper_very_legacy_impl(input_file, device); + } + + std::uint8_t version_id = 0; + safe_fread(&version_id, sizeof(version_id), 1, input_file); + if (version_id > VERSION_ID) { + TT_THROW( + "Version mismatch: the serialized tensor was created with version {} but is being loaded by a loader with " + "version {}. 
Please update your saved data or your loader so that both versions match.", + version_id, + VERSION_ID); + } + + if (version_id < 5) { + return load_tensor_helper_legacy_impl(input_file, device, version_id); + } + + auto spec = load_tensor_spec(input_file); + StorageType storage_type = StorageType::OWNED; + safe_fread(&storage_type, sizeof(storage_type), 1, input_file); + auto storage = load_storage(input_file, spec.data_type(), spec.layout(), storage_type, device, version_id); + Tensor tensor(std::move(storage), spec); + if (device != nullptr) { + tensor = tensor.to_device(device, spec.memory_config()); + } + return tensor; } } // namespace void dump_tensor( const std::string& file_name, const Tensor& tensor, const std::unordered_map& strategy) { - std::ofstream output_stream(file_name, std::ios::out | std::ios::binary); - if (not output_stream) { - throw std::runtime_error(fmt::format("Cannot open \"{}\"", file_name)); + FILE* output_file = fopen(file_name.c_str(), "wb"); + if (not output_file) { + TT_THROW("Cannot open \"{}\"", file_name); } + std::unique_ptr file_guard(output_file, &fclose); - LegacyShape shape(tensor.get_logical_shape(), tensor.get_padded_shape()); - auto data_type = tensor.get_dtype(); - auto layout = tensor.get_layout(); - auto storage_type = tensor.storage_type(); + safe_fwrite(&SENTINEL_VALUE, sizeof(SENTINEL_VALUE), 1, output_file); + safe_fwrite(&VERSION_ID, sizeof(VERSION_ID), 1, output_file); - output_stream.write(reinterpret_cast(&SENTINEL_VALUE), sizeof(std::size_t)); - output_stream.write(reinterpret_cast(&VERSION_ID), sizeof(std::uint8_t)); - output_stream.write(reinterpret_cast(&shape), sizeof(LegacyShape)); - output_stream.write(reinterpret_cast(&data_type), sizeof(DataType)); - output_stream.write(reinterpret_cast(&layout), sizeof(Layout)); - output_stream.write(reinterpret_cast(&storage_type), sizeof(StorageType)); + dump_tensor_spec(tensor.get_tensor_spec(), output_file); - bool is_on_device = is_tensor_on_device_or_multidevice(tensor); - bool has_memory_config = is_on_device; - if (VERSION_ID >= 2) { - output_stream.write(reinterpret_cast(&has_memory_config), sizeof(bool)); - if (has_memory_config) { - tt::tt_metal::dump_memory_config(output_stream, tensor.memory_config()); - } - } + auto storage_type = tensor.storage_type(); + safe_fwrite(&storage_type, sizeof(storage_type), 1, output_file); + bool is_on_device = is_tensor_on_device_or_multidevice(tensor); Tensor tensor_to_dump = tensor; if (is_on_device) { tensor_to_dump = tensor_to_dump.cpu(); } std::visit( - [&output_stream, &strategy](const auto& storage) { + [output_file, &strategy](const auto& storage) { using StorageType = std::decay_t; if constexpr (std::is_same_v) { - dump_owned_storage(output_stream, storage); + dump_owned_storage(output_file, storage); } else if constexpr (std::is_same_v) { - dump_borrowed_storage(output_stream, storage); + dump_borrowed_storage(output_file, storage); } else if constexpr (std::is_same_v) { TT_THROW("Device storage isn't supported"); } else if constexpr (std::is_same_v) { TT_THROW("Device storage isn't supported"); } else if constexpr (std::is_same_v) { auto distribute_config = get_distributed_tensor_config(strategy); - dump_multi_device_host_storage(output_stream, storage, distribute_config); + dump_multi_device_host_storage(output_file, storage, distribute_config); } else { raise_unsupported_storage(); } @@ -306,83 +472,6 @@ void dump_tensor( tensor_to_dump.get_storage()); } -template -Tensor load_tensor_helper(const std::string& file_name, T device) { - 
std::ifstream input_stream(file_name, std::ios::in | std::ios::binary); - if (not input_stream) { - throw std::runtime_error(fmt::format("Cannot open \"{}\"", file_name)); - } - - std::size_t read_sentinel; - input_stream.read(reinterpret_cast(&read_sentinel), sizeof(read_sentinel)); - if (read_sentinel == SENTINEL_VALUE) { - std::uint8_t version_id; - input_stream.read(reinterpret_cast(&version_id), sizeof(version_id)); - - // Allow only backward compatible versions - if (version_id > VERSION_ID) { - throw std::runtime_error( - fmt::format("Serialized tensor with version_id: {}. Loader version: {}", version_id, VERSION_ID)); - } - auto shape = LegacyShape{}; - DataType data_type; - Layout layout; - StorageType storage_type; - input_stream.read(reinterpret_cast(&shape), sizeof(LegacyShape)); - input_stream.read(reinterpret_cast(&data_type), sizeof(DataType)); - input_stream.read(reinterpret_cast(&layout), sizeof(Layout)); - input_stream.read(reinterpret_cast(&storage_type), sizeof(StorageType)); - - bool has_memory_config = false; - MemoryConfig memory_config = MemoryConfig{ - .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::DRAM}; - - if (version_id >= 2) { - input_stream.read(reinterpret_cast(&has_memory_config), sizeof(bool)); - if (has_memory_config) { - memory_config = tt::tt_metal::load_memory_config(input_stream); - } - } - - auto storage = load_storage(input_stream, data_type, layout, storage_type, device); - - auto tensor = Tensor( - std::move(storage), - TensorSpec( - shape.logical_shape(), - TensorLayout::fromPaddedShape( - data_type, layout, MemoryConfig{}, shape.logical_shape(), shape.padded_shape()))); - if (device != nullptr) { - tensor = tensor.to_device(device, memory_config); - } else if (has_memory_config) { - tt::log_warning("Memory config is ignored when loading the tensor because device is not provided"); - } - return tensor; - - } else { - input_stream.seekg(0, std::ios::beg); // No sentinel found, assume it's an older format and rewind - - auto shape = LegacyShape{}; - DataType data_type; - Layout layout; - input_stream.read(reinterpret_cast(&shape), sizeof(LegacyShape)); - input_stream.read(reinterpret_cast(&data_type), sizeof(DataType)); - input_stream.read(reinterpret_cast(&layout), sizeof(Layout)); - - auto storage = load_owned_storage(input_stream, data_type); - auto tensor = Tensor( - std::move(storage), - TensorSpec( - shape.logical_shape(), - TensorLayout::fromPaddedShape( - data_type, layout, MemoryConfig{}, shape.logical_shape(), shape.padded_shape()))); - if (device != nullptr) { - tensor = tensor.to_device(device); - } - return tensor; - } -} - // Explicit instantiations Tensor load_tensor(const std::string& file_name, IDevice* device) { return load_tensor_helper(file_name, device); @@ -391,81 +480,61 @@ Tensor load_tensor(const std::string& file_name, MeshDevice* device) { return load_tensor_helper(file_name, device); } -void dump_memory_config(std::ostream& output_stream, const MemoryConfig& memory_config) { - output_stream.write(reinterpret_cast(&VERSION_ID), sizeof(std::uint8_t)); - output_stream.write(reinterpret_cast(&memory_config.memory_layout), sizeof(TensorMemoryLayout)); - output_stream.write(reinterpret_cast(&memory_config.buffer_type), sizeof(BufferType)); - - bool has_shard_spec = memory_config.shard_spec.has_value(); - output_stream.write(reinterpret_cast(&has_shard_spec), sizeof(bool)); - if (has_shard_spec) { - const auto& shard_spec = memory_config.shard_spec.value(); - const auto& core_ranges = 
shard_spec.grid.ranges(); - std::size_t num_core_ranges = core_ranges.size(); - output_stream.write(reinterpret_cast(&num_core_ranges), sizeof(std::size_t)); - for (const auto& core_range : core_ranges) { - output_stream.write(reinterpret_cast(&core_range), sizeof(CoreRange)); - } - output_stream.write(reinterpret_cast(&shard_spec.shape), sizeof(std::array)); - output_stream.write(reinterpret_cast(&shard_spec.orientation), sizeof(ShardOrientation)); - } +void dump_memory_config(FILE* output_file, const MemoryConfig& memory_config) { + safe_fwrite(&VERSION_ID, sizeof(VERSION_ID), 1, output_file); + flatbuffers::FlatBufferBuilder builder; + auto flat_config = ttnn::to_flatbuffer(memory_config, builder); + builder.Finish(flat_config); + uint64_t buf_size = builder.GetSize(); + safe_fwrite(&buf_size, sizeof(buf_size), 1, output_file); + safe_fwrite(builder.GetBufferPointer(), buf_size, 1, output_file); } void dump_memory_config(const std::string& file_name, const MemoryConfig& memory_config) { - std::ofstream output_stream(file_name, std::ios::out | std::ios::binary); - if (not output_stream) { - throw std::runtime_error(fmt::format("Cannot open \"{}\"", file_name)); + FILE* output_file = fopen(file_name.c_str(), "wb"); + if (not output_file) { + TT_THROW("Cannot open \"{}\"", file_name); } - dump_memory_config(output_stream, memory_config); + std::unique_ptr file_guard(output_file, &fclose); + dump_memory_config(output_file, memory_config); } -MemoryConfig load_memory_config(std::ifstream& input_stream) { +MemoryConfig load_memory_config(FILE* input_file) { std::uint8_t version_id; - TensorMemoryLayout memory_layout; - BufferType buffer_type; - bool has_shard_spec; - input_stream.read(reinterpret_cast(&version_id), sizeof(std::uint8_t)); + safe_fread(&version_id, sizeof(version_id), 1, input_file); // Allow only backward compatible versions if (version_id > VERSION_ID) { - throw std::runtime_error( - fmt::format("Serialized tensor with version_id: {}. Loader version: {}", version_id, VERSION_ID)); + TT_THROW( + "Version mismatch: the serialized memory config was created with version {} but is being loaded by a " + "loader with version {}. Please update your saved data or your loader so that both versions match.", + version_id, + VERSION_ID); } - input_stream.read(reinterpret_cast(&memory_layout), sizeof(TensorMemoryLayout)); - input_stream.read(reinterpret_cast(&buffer_type), sizeof(BufferType)); - input_stream.read(reinterpret_cast(&has_shard_spec), sizeof(bool)); - std::optional shard_spec = std::nullopt; - if (has_shard_spec) { - std::size_t num_core_ranges; - std::set core_ranges; - std::array shape; - ShardOrientation orientation; + if (version_id < 5) { + return load_memory_config_legacy_impl(input_file, version_id); + } - input_stream.read(reinterpret_cast(&num_core_ranges), sizeof(std::size_t)); - for (auto index = 0; index < num_core_ranges; index++) { - CoreRange core_range{{}, {}}; - input_stream.read(reinterpret_cast(&core_range), sizeof(CoreRange)); - core_ranges.insert(core_range); - } - input_stream.read(reinterpret_cast(&shape), sizeof(std::array)); - input_stream.read(reinterpret_cast(&orientation), sizeof(ShardOrientation)); - if (version_id <= 3) { - // Read halo for backward compatibility. 
- bool halo; - input_stream.read(reinterpret_cast(&halo), sizeof(bool)); - } - shard_spec = {CoreRangeSet{core_ranges}, shape, orientation}; + uint64_t bin_size = 0; + safe_fread(&bin_size, sizeof(bin_size), 1, input_file); + std::vector bin(bin_size); + safe_fread(bin.data(), bin_size, 1, input_file); + flatbuffers::Verifier verifier(bin.data(), bin_size); + if (!verifier.VerifyBuffer()) { + TT_THROW("MemoryConfig deserialization failed: invalid buffer"); } - return MemoryConfig{memory_layout, buffer_type, shard_spec}; + auto mem_config = flatbuffers::GetRoot(bin.data()); + return ttnn::from_flatbuffer(mem_config); } MemoryConfig load_memory_config(const std::string& file_name) { - std::ifstream input_stream(file_name, std::ios::in | std::ios::binary); - if (not input_stream) { - throw std::runtime_error(fmt::format("Cannot open \"{}\"", file_name)); + FILE* input_file = fopen(file_name.c_str(), "rb"); + if (not input_file) { + TT_THROW("Cannot open \"{}\"", file_name); } - return load_memory_config(input_stream); + std::unique_ptr file_guard(input_file, &fclose); + return load_memory_config(input_file); } } // namespace tt::tt_metal diff --git a/ttnn/cpp/ttnn/tensor/serialization.hpp b/ttnn/cpp/ttnn/tensor/serialization.hpp index e22d69119c4..1c2c347e60d 100644 --- a/ttnn/cpp/ttnn/tensor/serialization.hpp +++ b/ttnn/cpp/ttnn/tensor/serialization.hpp @@ -17,10 +17,10 @@ void dump_tensor( Tensor load_tensor(const std::string& file_name, IDevice* device = nullptr); Tensor load_tensor(const std::string& file_name, distributed::MeshDevice* device = nullptr); -void dump_memory_config(std::ostream& output_stream, const MemoryConfig& memory_config); +void dump_memory_config(FILE* output_file, const MemoryConfig& memory_config); void dump_memory_config(const std::string& file_name, const MemoryConfig& memory_config); -MemoryConfig load_memory_config(std::ifstream& input_stream); +MemoryConfig load_memory_config(FILE* input_file); MemoryConfig load_memory_config(const std::string& file_name); } // namespace tt::tt_metal diff --git a/ttnn/cpp/ttnn/tensor/types.hpp b/ttnn/cpp/ttnn/tensor/types.hpp index e65599131eb..09a2aeecf19 100644 --- a/ttnn/cpp/ttnn/tensor/types.hpp +++ b/ttnn/cpp/ttnn/tensor/types.hpp @@ -29,7 +29,7 @@ namespace tt { namespace tt_metal { -static constexpr std::uint8_t VERSION_ID = 4; +static constexpr std::uint8_t VERSION_ID = 5; enum class DataType { BFLOAT16 = 0, From e4ecf87d4160268a812bd33f754ee7c5be59c1cd Mon Sep 17 00:00:00 2001 From: Borys Bradel <164946524+bbradelTT@users.noreply.github.com> Date: Sat, 8 Feb 2025 09:56:09 -0500 Subject: [PATCH 031/316] =?UTF-8?q?#17737:=20move=20matmul=20sd=20tests=20?= =?UTF-8?q?to=20nightly=20and=20adjust=20matmul=20test=20dimens=E2=80=A6?= =?UTF-8?q?=20(#17743)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ions ### Ticket Link to Github Issue #17737 ### Problem description - we need to reduce the amount of time in all post commit per op family ### What's changed - move sd tests to nightly directory - will reduce runtime and initial move to allow new nightly flow to pick it up - adjust parameters to reduce existing execution time - remove large (2048) dimensions from linear test_linear_by_passing_in_1D_systolic_array_program_config and test_wide_linear_with_argument_for_core_grid_set_to_device_grid tests - skip some max tests for GS that fail when run in suite due to a known issue ### Checklist - [x] [All post 
commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13216232079 - [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/13209455228 - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) N/A - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) N/A - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) N/A - [x] New/Existing tests provide coverage for changes --- .../operations/matmul/test_matmul.py | 106 ++++++++++++++++ .../ttnn/unit_tests/operations/test_linear.py | 8 +- .../ttnn/unit_tests/operations/test_matmul.py | 115 +----------------- tests/ttnn/unit_tests/operations/test_max.py | 9 +- 4 files changed, 120 insertions(+), 118 deletions(-) create mode 100644 tests/ttnn/nightly/unit_tests/operations/matmul/test_matmul.py diff --git a/tests/ttnn/nightly/unit_tests/operations/matmul/test_matmul.py b/tests/ttnn/nightly/unit_tests/operations/matmul/test_matmul.py new file mode 100644 index 00000000000..e2b40f4f49d --- /dev/null +++ b/tests/ttnn/nightly/unit_tests/operations/matmul/test_matmul.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from loguru import logger +import pytest +import torch +import math +import ttnn + +from tests.ttnn.utils_for_testing import assert_with_pcc + + +@pytest.mark.parametrize( + "batch_size, channel_a, channel_b, m_size, k_size, n_size, has_bias", + [ + (1, 2, 1, 1024, 640, 2560, False), + (2, 8, 8, 64, 96, 160, False), + (1, 2, 1, 4096, 320, 1280, False), + (1, 2, 1, 64, 1280, 5120, False), + (2, 8, 8, 64, 64, 160, False), + (1, 2, 1, 1024, 640, 768, False), + (2, 8, 8, 96, 160, 96, False), + (2, 8, 8, 1024, 1024, 96, False), + (1, 2, 1, 96, 768, 1024, False), + (1, 1, 1, 32, 1280, 1280, True), + (2, 8, 8, 4096, 96, 64, False), + (1, 2, 1, 64, 5120, 1280, True), + (2, 8, 8, 4096, 64, 96, False), + (1, 2, 1, 1024, 768, 640, True), + (1, 2, 1, 256, 1280, 1280, True), + (2, 8, 8, 1024, 96, 96, False), + (1, 2, 1, 1024, 640, 2304, False), + (1, 1, 1, 32, 1280, 320, True), + (1, 2, 1, 96, 768, 2560, False), + (1, 2, 1, 4096, 1280, 320, True), + (1, 2, 1, 1024, 2560, 640, True), + (1, 2, 1, 256, 1280, 3840, False), + (1, 1, 1, 32, 320, 1280, True), + (1, 2, 1, 4096, 512, 320, True), + (1, 2, 1, 64, 1280, 1280, True), + (1, 2, 1, 256, 5120, 1280, True), + (1, 2, 1, 256, 1280, 1280, False), + (2, 8, 8, 256, 160, 96, False), + (2, 8, 8, 256, 256, 160, False), + (1, 2, 1, 96, 768, 1536, False), + (1, 2, 1, 64, 1280, 3840, False), + (2, 8, 8, 1024, 96, 1024, False), + (2, 8, 8, 256, 96, 160, False), + (1, 2, 1, 64, 1280, 1280, False), + (2, 8, 8, 4096, 64, 4096, False), + (1, 1, 1, 32, 1280, 640, True), + (2, 8, 8, 64, 160, 64, False), + (1, 2, 1, 4096, 320, 1536, False), + (1, 2, 1, 256, 1280, 5120, False), + (2, 8, 8, 4096, 4096, 64, False), + (2, 8, 8, 256, 160, 256, False), + (1, 2, 1, 4096, 320, 512, False), + ], +) +@pytest.mark.parametrize("dtype", [ttnn.bfloat8_b]) +def test_sd_matmul(device, batch_size, channel_a, channel_b, m_size, k_size, n_size, has_bias, dtype): + 
torch.manual_seed(0) + if device.core_grid.y == 7: + pytest.skip("Issue #6984: Compute Grid size too small") + core_grid = ttnn.CoreGrid(x=8, y=8) + TILE_HEIGHT = 32 + + if batch_size == 2: + if (m_size == 1024 and k_size == 96 and n_size == 1024) or (m_size == 4096 and k_size == 64 and n_size == 4096): + # NOTE: matmul errors out with OOM otherwise + core_grid = None + + torch_input_tensor_a = torch.randn((batch_size, channel_a, m_size, k_size), dtype=torch.bfloat16) + torch_input_tensor_b = torch.randn((batch_size, channel_b, k_size, n_size), dtype=torch.bfloat16) + torch_output_tensor = torch_input_tensor_a @ torch_input_tensor_b + if has_bias: + torch_input_tensor_c = torch.randn((1, 1, TILE_HEIGHT, n_size), dtype=torch.bfloat16) + _torch_input_tensor_c = torch.repeat_interleave( + torch_input_tensor_c, torch_output_tensor.shape[2] // TILE_HEIGHT, dim=2 + ) + torch_output_tensor = torch_output_tensor + _torch_input_tensor_c + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device, dtype=dtype) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device, dtype=dtype) + input_tensor_c = ( + ttnn.from_torch(torch_input_tensor_c, layout=ttnn.TILE_LAYOUT, device=device, dtype=dtype) if has_bias else None + ) + pcc = 0.94 if dtype == ttnn.bfloat8_b else 0.98 + + if has_bias: + output_tensor = ttnn.linear( + input_tensor_a, + input_tensor_b, + bias=input_tensor_c, + core_grid=core_grid, + ) + else: + output_tensor = ttnn.matmul( + input_tensor_a, + input_tensor_b, + core_grid=core_grid, + ) + + output_tensor = ttnn.to_torch(output_tensor) + assert_with_pcc(torch_output_tensor, output_tensor, pcc=pcc) diff --git a/tests/ttnn/unit_tests/operations/test_linear.py b/tests/ttnn/unit_tests/operations/test_linear.py index 9f77989e3ec..daa0ce9e85a 100644 --- a/tests/ttnn/unit_tests/operations/test_linear.py +++ b/tests/ttnn/unit_tests/operations/test_linear.py @@ -136,8 +136,8 @@ def test_linear_with_core_grid( @pytest.mark.parametrize("batch_size", [1, 8]) @pytest.mark.parametrize("m_size", [32, 64]) -@pytest.mark.parametrize("k_size", [1024, 2048]) -@pytest.mark.parametrize("n_size", [1024, 2048]) +@pytest.mark.parametrize("k_size", [1024]) +@pytest.mark.parametrize("n_size", [1024]) @pytest.mark.parametrize("activation", [None, "relu", "silu"]) def test_wide_linear_with_argument_for_core_grid_set_to_device_grid( device, batch_size, m_size, k_size, n_size, activation @@ -163,8 +163,8 @@ def test_wide_linear_with_argument_for_core_grid_set_to_device_grid( @pytest.mark.parametrize("batch_size", [1, 8]) @pytest.mark.parametrize("m_size", [32, 64]) -@pytest.mark.parametrize("k_size", [1024, 2048]) -@pytest.mark.parametrize("n_size", [1024, 2048]) +@pytest.mark.parametrize("k_size", [1024]) +@pytest.mark.parametrize("n_size", [1024]) @pytest.mark.parametrize("activation", [None, "relu"]) def test_linear_by_passing_in_1D_systolic_array_program_config(device, batch_size, m_size, k_size, n_size, activation): torch.manual_seed(0) diff --git a/tests/ttnn/unit_tests/operations/test_matmul.py b/tests/ttnn/unit_tests/operations/test_matmul.py index 8879c9b8000..d108d8f0aa2 100644 --- a/tests/ttnn/unit_tests/operations/test_matmul.py +++ b/tests/ttnn/unit_tests/operations/test_matmul.py @@ -576,8 +576,8 @@ def run_matmul_2d_multiple_output_blocks_per_core( @run_for_wormhole_b0() @pytest.mark.parametrize("b", [1, 2]) -@pytest.mark.parametrize("m", [1024]) -@pytest.mark.parametrize("k", [1024]) +@pytest.mark.parametrize("m", [512]) 
+@pytest.mark.parametrize("k", [512]) @pytest.mark.parametrize("n", [1024]) @pytest.mark.parametrize("has_bias", [True, False]) @pytest.mark.parametrize("grid_size", [(8, 4)]) @@ -752,8 +752,8 @@ def run_matmul_2d_tiny_tile( @run_for_wormhole_b0() -@pytest.mark.parametrize("m", [768]) -@pytest.mark.parametrize("k", [1024]) +@pytest.mark.parametrize("m", [512]) +@pytest.mark.parametrize("k", [512]) @pytest.mark.parametrize("n", [768]) @pytest.mark.parametrize("has_bias", [False, True]) @pytest.mark.parametrize("grid_size", [(8, 4)]) @@ -1718,113 +1718,6 @@ def test_falcon_query_key_value_matmul(device, batch_size, m_size, k_size, n_siz assert_with_pcc(torch_output_tensor, output_tensor, pcc=0.996) -# @skip_for_grayskull() -@pytest.mark.parametrize( - "batch_size, channel_a, channel_b, m_size, k_size, n_size, has_bias", - [ - (1, 2, 1, 1024, 640, 2560, False), - (2, 8, 8, 64, 96, 160, False), - (1, 2, 1, 4096, 320, 1280, False), - (1, 2, 1, 64, 1280, 5120, False), - (2, 8, 8, 64, 64, 160, False), - (1, 2, 1, 1024, 640, 768, False), - (2, 8, 8, 96, 160, 96, False), - (2, 8, 8, 1024, 1024, 96, False), - (1, 2, 1, 96, 768, 1024, False), - (1, 1, 1, 32, 1280, 1280, True), - (2, 8, 8, 4096, 96, 64, False), - (1, 2, 1, 64, 5120, 1280, True), - (2, 8, 8, 4096, 64, 96, False), - (1, 2, 1, 1024, 768, 640, True), - (1, 2, 1, 256, 1280, 1280, True), - (2, 8, 8, 1024, 96, 96, False), - (1, 2, 1, 1024, 640, 2304, False), - (1, 1, 1, 32, 1280, 320, True), - (1, 2, 1, 96, 768, 2560, False), - (1, 2, 1, 4096, 1280, 320, True), - (1, 2, 1, 1024, 2560, 640, True), - (1, 2, 1, 256, 1280, 3840, False), - (1, 1, 1, 32, 320, 1280, True), - (1, 2, 1, 4096, 512, 320, True), - (1, 2, 1, 64, 1280, 1280, True), - (1, 2, 1, 256, 5120, 1280, True), - (1, 2, 1, 256, 1280, 1280, False), - (2, 8, 8, 256, 160, 96, False), - (2, 8, 8, 256, 256, 160, False), - (1, 2, 1, 96, 768, 1536, False), - (1, 2, 1, 64, 1280, 3840, False), - (2, 8, 8, 1024, 96, 1024, False), - (2, 8, 8, 256, 96, 160, False), - (1, 2, 1, 64, 1280, 1280, False), - (2, 8, 8, 4096, 64, 4096, False), - (1, 1, 1, 32, 1280, 640, True), - (2, 8, 8, 64, 160, 64, False), - (1, 2, 1, 4096, 320, 1536, False), - (1, 2, 1, 256, 1280, 5120, False), - (2, 8, 8, 4096, 4096, 64, False), - (2, 8, 8, 256, 160, 256, False), - (1, 2, 1, 4096, 320, 512, False), - ], -) -@pytest.mark.parametrize("dtype", [ttnn.bfloat8_b]) -def test_sd_matmul(device, batch_size, channel_a, channel_b, m_size, k_size, n_size, has_bias, dtype): - torch.manual_seed(0) - if device.core_grid.y == 7: - pytest.skip("Issue #6984: Compute Grid size too small") - core_grid = ttnn.CoreGrid(x=8, y=8) - TILE_HEIGHT = 32 - - if batch_size == 2: - if (m_size == 1024 and k_size == 96 and n_size == 1024) or (m_size == 4096 and k_size == 64 and n_size == 4096): - # NOTE: matmul errors out with OOM otherwise - core_grid = None - - # if batch_size == 2: - # if m_size == 1024 and k_size == 96 and n_size == 1024 and (dtype == ttnn.bfloat16 or is_grayskull()): - # pytest.skip("skip: Raises OOM") - # if m_size == 4096 and k_size == 64 and n_size == 4096: - # pytest.skip("skip: Raises OOM without decomposition") - # if is_grayskull(): - # if m_size == 4096 and ( - # (k_size == 96 and n_size == 64) or (k_size == 64 and n_size == 96) or (k_size == 4096 and n_size == 64) - # ): - # pytest.skip("skip: Raises OOM on GS") - - torch_input_tensor_a = torch.randn((batch_size, channel_a, m_size, k_size), dtype=torch.bfloat16) - torch_input_tensor_b = torch.randn((batch_size, channel_b, k_size, n_size), dtype=torch.bfloat16) - 
torch_output_tensor = torch_input_tensor_a @ torch_input_tensor_b - if has_bias: - torch_input_tensor_c = torch.randn((1, 1, TILE_HEIGHT, n_size), dtype=torch.bfloat16) - _torch_input_tensor_c = torch.repeat_interleave( - torch_input_tensor_c, torch_output_tensor.shape[2] // TILE_HEIGHT, dim=2 - ) - torch_output_tensor = torch_output_tensor + _torch_input_tensor_c - - input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device, dtype=dtype) - input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device, dtype=dtype) - input_tensor_c = ( - ttnn.from_torch(torch_input_tensor_c, layout=ttnn.TILE_LAYOUT, device=device, dtype=dtype) if has_bias else None - ) - pcc = 0.94 if dtype == ttnn.bfloat8_b else 0.98 - - if has_bias: - output_tensor = ttnn.linear( - input_tensor_a, - input_tensor_b, - bias=input_tensor_c, - core_grid=core_grid, - ) - else: - output_tensor = ttnn.matmul( - input_tensor_a, - input_tensor_b, - core_grid=core_grid, - ) - - output_tensor = ttnn.to_torch(output_tensor) - assert_with_pcc(torch_output_tensor, output_tensor, pcc=pcc) - - @run_for_wormhole_b0() @pytest.mark.parametrize( "in0_dtype, in1_dtype, num_activation_cores, num_compute_cores, has_bias, config, M, K, N", diff --git a/tests/ttnn/unit_tests/operations/test_max.py b/tests/ttnn/unit_tests/operations/test_max.py index f6536f16f4e..d5af92b4f28 100644 --- a/tests/ttnn/unit_tests/operations/test_max.py +++ b/tests/ttnn/unit_tests/operations/test_max.py @@ -8,7 +8,7 @@ import ttnn from tests.ttnn.utils_for_testing import assert_with_pcc -from models.utility_functions import torch_random, is_grayskull +from models.utility_functions import torch_random, is_grayskull, skip_for_grayskull @pytest.mark.parametrize("batch_size", [1, 16, 1, 16]) @@ -32,6 +32,7 @@ def test_max(device, batch_size, h, w, dim): assert_with_pcc(torch_output_tensor, output_tensor) +@skip_for_grayskull("May fail on GS if run all the tests in this file. #17084") @pytest.mark.parametrize("batch_size1", [2]) @pytest.mark.parametrize("batch_size2", [32]) @pytest.mark.parametrize("h", [64]) @@ -115,8 +116,10 @@ def test_max_global(device, batch_size, h, w): @pytest.mark.parametrize("keepdim", [True, False]) def test_max_dim(device, input_shape_and_dim, keepdim): input_shape, max_dim = input_shape_and_dim - if is_grayskull() and (input_shape[-1] % 32 != 0 or input_shape[-2] % 32 != 0 or input_shape[max_dim] % 32 != 0): - pytest.skip("If not a tile size multiple, may fail on GS if run all the tests in this file. #17084") + if is_grayskull() and ( + input_shape[-1] % 32 != 0 or input_shape[-2] % 32 != 0 or input_shape[max_dim] % 32 != 0 or max_dim <= -2 + ): + pytest.skip("May fail on GS if run all the tests in this file. #17084") torch_input_tensor = torch_random(input_shape, -100, 100, dtype=torch.bfloat16) torch_output_tensor, _ = torch.max(torch_input_tensor, dim=max_dim, keepdim=keepdim) From a4b0687632df69422ad45cd1e78dfd19e60e290a Mon Sep 17 00:00:00 2001 From: Nour Ardo Date: Sat, 8 Feb 2025 10:04:51 -0500 Subject: [PATCH 032/316] Quick fix for single card device perf (#17752) ### Ticket Link to Github Issue NA ### Problem description Using parallelization over the width for untilize with unpadding caused perf regression for some models. This PR fixes it ### What's changed Limiting the use of the function only when the height parallelization does not work. 
Block parallelization with a better threshold will be added in the future ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13212247605 - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [x] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/13207121716 - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../device/untilize_with_unpadding_op.cpp | 2 +- .../device/untilize_with_unpadding_op.hpp | 2 + ...ntilize_with_unpadding_program_factory.cpp | 9 ++-- ...ntilize_with_unpadding_program_factory.hpp | 4 +- .../untilize_with_unpadding.cpp | 45 ++++++++++++++++++- 5 files changed, 54 insertions(+), 8 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.cpp index b39492d8fd7..212e2100e67 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.cpp @@ -109,7 +109,7 @@ operation::ProgramWithCallbacks UntilizeWithUnpadding::create_program( auto& output_tensor = output_tensors.at(0); if (input_tensors.at(0).memory_config().is_sharded() || this->use_multicore) { return detail::untilize_with_unpadding_multi_core( - input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en); + input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en, this->enough_space_height); } else { return detail::untilize_with_unpadding_single_core( input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en); diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.hpp index f845e792479..0ca24f4985d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.hpp @@ -17,6 +17,8 @@ struct UntilizeWithUnpadding { const bool use_multicore; const bool use_pack_untilize; const bool fp32_dest_acc_en; + const bool enough_space_width; + const bool enough_space_height; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp index fb9e98524df..46ad820c73e 100644 --- 
a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp @@ -365,7 +365,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_col_interleav } operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved( - const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en) { + const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height) { tt::tt_metal::Program program{}; tt::DataFormat input_cb_data_format = datatype_to_dataformat_converter(a.get_dtype()); @@ -383,7 +383,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved( uint32_t num_tiles_per_row = a.get_padded_shape()[-1] / TILE_WIDTH; uint32_t num_tiles_per_col = a.get_padded_shape()[-2] / TILE_HEIGHT; - if (num_tiles_per_row > num_tiles_per_col) { + if (!enough_space_height) { return untilize_with_unpadding_multi_core_col_interleaved(a, output, use_pack_untilize, fp32_dest_acc_en); } @@ -839,11 +839,12 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded( } operation::ProgramWithCallbacks untilize_with_unpadding_multi_core( - const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en) { + const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height) { if (a.memory_config().is_sharded()) { return untilize_with_unpadding_multi_core_sharded(a, output, use_pack_untilize, fp32_dest_acc_en); } else { - return untilize_with_unpadding_multi_core_interleaved(a, output, use_pack_untilize, fp32_dest_acc_en); + return untilize_with_unpadding_multi_core_interleaved( + a, output, use_pack_untilize, fp32_dest_acc_en, enough_space_height); } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.hpp index 3e232b151fd..b56c683bb10 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.hpp @@ -12,13 +12,13 @@ tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_single_cor const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en); tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved( - const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en); + const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height); // This purely supports input block shard -> output interleaved for now tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded( const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en); tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_multi_core( - const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en); + const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height); } // namespace ttnn::operations::data_movement::detail diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp index 24dea61f3bb..fbf116dfc54 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp @@ -32,6 +32,35 @@ ttnn::Shape squeeze_vector_shape(ttnn::Shape output_shape) { namespace ttnn::operations::data_movement { +inline uint32_t get_estimated_size_of_cbs( + const Tensor& input_tensor_a, + const uint32_t input_single_tile_size, + const uint32_t output_single_tile_size, + const uint32_t num_tiles_per_row) { + uint32_t cb_src0_size = input_single_tile_size * num_tiles_per_row; + uint32_t cb_output_size = output_single_tile_size * num_tiles_per_row; + return cb_src0_size + cb_output_size; +} + +inline uint32_t get_max_l1_space(const Tensor& input_tensor_a) { + auto device = input_tensor_a.device(); + auto lowest_address = device->lowest_occupied_compute_l1_address(); + uint32_t max_l1_space = lowest_address.has_value() ? lowest_address.value() : device->l1_size_per_core(); + max_l1_space = max_l1_space - device->allocator()->get_base_allocator_addr(HalMemType::L1); + return max_l1_space; +} + +inline bool enough_available_space( + const Tensor& input_tensor_a, + const uint32_t input_single_tile_size, + const uint32_t output_single_tile_size, + const uint32_t num_tiles_per_row) { + uint32_t max_l1_space = get_max_l1_space(input_tensor_a); + uint32_t estimated_size_of_cbs = + get_estimated_size_of_cbs(input_tensor_a, input_single_tile_size, output_single_tile_size, num_tiles_per_row); + return max_l1_space > estimated_size_of_cbs; +} + using OwnedUntilizeValArgs = std::tuple; using BaseUntilizeValType = std::function; @@ -82,6 +111,18 @@ ttnn::Tensor ExecuteUntilizeWithUnpadding::invoke( output_end = ttnn::Shape(std::move(output_end_vector)); } + auto input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype()); + uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); + + uint32_t num_tiles_per_row = input_tensor.get_padded_shape()[-1] / tt::constants::TILE_WIDTH; + uint32_t num_tiles_per_col = input_tensor.get_padded_shape()[-2] / tt::constants::TILE_HEIGHT; + + uint32_t output_single_tile_size = input_single_tile_size; + bool enough_space_width = + enough_available_space(input_tensor, input_single_tile_size, output_single_tile_size, num_tiles_per_col); + bool enough_space_height = + enough_available_space(input_tensor, input_single_tile_size, output_single_tile_size, num_tiles_per_row); + auto base_untilize = [=](const ttnn::Tensor& input_tensor) { return operation::run( UntilizeWithUnpadding{// output_end, @@ -89,7 +130,9 @@ ttnn::Tensor ExecuteUntilizeWithUnpadding::invoke( memory_config.value_or(input_tensor.memory_config()), use_multicore, use_pack_untilize, - fp32_dest_acc_en}, + fp32_dest_acc_en, + enough_space_width, + enough_space_height}, {input_tensor}, {}, {}, From e1a028f72b8dab4291585afebdcc781d14487c61 Mon Sep 17 00:00:00 2001 From: Virdhatchani Narayanamoorthy <138196495+VirdhatchaniKN@users.noreply.github.com> Date: Sun, 9 Feb 2025 08:08:56 +0530 Subject: [PATCH 033/316] #17768: Float32 support for Inference mode in Batch Norm (#17587) ### Ticket https://github.com/tenstorrent/tt-metal/issues/17768 ### Problem description To Provide Fp32 support for Inference mode of BN ### What's 
changed Support provided for fp32 data type for inference mode of BN ### Checklist - [x] [All post-commit tests](https://github.com/tenstorrent/tt-metal/actions/runs/13217558701) - [x] [Blackhole post-commit tests](https://github.com/tenstorrent/tt-metal/actions/runs/13157671059) - [x] [(Single-card) Tests for new models](https://github.com/tenstorrent/tt-metal/actions/runs/13217560775) - Passed as in main - [x] [(Single-card) Demo tests](https://github.com/tenstorrent/tt-metal/actions/runs/13217560090) - Passed as in main - [x] [(Single-card) Device perf regressions](https://github.com/tenstorrent/tt-metal/actions/runs/13217559606) - [x] [(Single-card) Model perf tests](https://github.com/tenstorrent/tt-metal/actions/runs/13217559245) - Passed as in main --- .../eltwise/backward/utility_funcs.py | 7 +- .../unit_tests/operations/test_batch_norm.py | 123 +++++++++ .../device/batch_norm_device_operation.cpp | 49 ++-- .../device/batch_norm_program_factory.cpp | 76 +++++- .../compute/batch_norm_sfpu_kernel.cpp | 243 ++++++++++++++++++ .../kernels/dataflow/reader_batch_norm.cpp | 12 +- .../kernels/dataflow/writer_batch_norm.cpp | 8 +- 7 files changed, 476 insertions(+), 42 deletions(-) create mode 100644 ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp diff --git a/tests/ttnn/unit_tests/operations/eltwise/backward/utility_funcs.py b/tests/ttnn/unit_tests/operations/eltwise/backward/utility_funcs.py index 5499c0dc7de..02058d8f739 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/backward/utility_funcs.py +++ b/tests/ttnn/unit_tests/operations/eltwise/backward/utility_funcs.py @@ -18,12 +18,15 @@ def data_gen_with_range_batch_norm( device, is_input=False, required_grad=False, + testing_dtype="bfloat16", ): assert high > low, "Incorrect range provided" torch.manual_seed(213919) channels = input_shapes[1] size = input_shapes if is_input else channels - pt_tensor = torch.rand(size, requires_grad=required_grad).bfloat16() * (high - low) + low + torch_dtype = getattr(torch, testing_dtype) + ttnn_dtype = getattr(ttnn, testing_dtype) + pt_tensor = torch.rand(size, requires_grad=required_grad, dtype=torch_dtype) * (high - low) + low reshaped_tensor = pt_tensor if not is_input: reshaped_tensor = pt_tensor.view(1, channels, 1, 1) @@ -31,7 +34,7 @@ def data_gen_with_range_batch_norm( reshaped_tensor, device=device, layout=ttnn.TILE_LAYOUT, - dtype=ttnn.bfloat16, + dtype=ttnn_dtype, memory_config=ttnn.DRAM_MEMORY_CONFIG, ) return pt_tensor, tt_tensor diff --git a/tests/ttnn/unit_tests/operations/test_batch_norm.py b/tests/ttnn/unit_tests/operations/test_batch_norm.py index 66d5d432d01..56922409d00 100644 --- a/tests/ttnn/unit_tests/operations/test_batch_norm.py +++ b/tests/ttnn/unit_tests/operations/test_batch_norm.py @@ -10,6 +10,129 @@ compare_results_batch_norm, ) from itertools import product +from models.utility_functions import skip_for_grayskull + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize("eps", [1.0, 0.0, 2.34, 1e-05]) +@pytest.mark.parametrize("channel_size", [1, 2, 3, 4]) +@pytest.mark.parametrize("weight", [True, False]) +@pytest.mark.parametrize("bias", [True, False]) +def test_BN_fp32_full_value(device, channel_size, eps, weight, bias): + input_tensor_torch = torch.full(torch.Size([3, channel_size, 64, 120]), 1, dtype=torch.float32) + batch_mean_torch = torch.full(torch.Size([channel_size]), 0.00030171126, dtype=torch.float32) + batch_var_torch = torch.full(torch.Size([channel_size]), 0.1262342343, 
dtype=torch.float32) + weight_torch = torch.full(torch.Size([channel_size]), 0.246943565369, dtype=torch.float32) if weight else None + bias_torch = torch.full(torch.Size([channel_size]), 0.59, dtype=torch.float32) if bias else None + + result_torch = torch.nn.functional.batch_norm( + input=input_tensor_torch, + running_mean=batch_mean_torch, + running_var=batch_var_torch, + weight=weight_torch, + bias=bias_torch, + eps=eps, + ) + + batch_mean_torch = batch_mean_torch.view(1, channel_size, 1, 1) + batch_var_torch = batch_var_torch.view(1, channel_size, 1, 1) + weight_torch = weight_torch.view(1, channel_size, 1, 1) if weight else None + bias_torch = bias_torch.view(1, channel_size, 1, 1) if bias else None + + input_tensor_tt = ttnn.from_torch(input_tensor_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + batch_mean_tt = ttnn.from_torch(batch_mean_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + batch_var_tt = ttnn.from_torch(batch_var_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + weight_tt = ( + ttnn.from_torch(weight_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) if weight else None + ) + bias_tt = ttnn.from_torch(bias_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) if bias else None + + result_tt = ttnn.batch_norm( + input_tensor_tt, running_mean=batch_mean_tt, running_var=batch_var_tt, eps=eps, weight=weight_tt, bias=bias_tt + ) + tt_out = ttnn.to_torch(result_tt) + + status_1 = torch.allclose(result_torch, tt_out, atol=1e-10, rtol=1e-5) + status_2 = compare_results_batch_norm([result_torch], [tt_out]) + assert status_2 and status_1 + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "input_shapes", + [ + *(torch.Size([n, c, 32, 32]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), + *(torch.Size([n, c, 23, 23]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), + *(torch.Size([n, c, 64, 120]) for n, c in product([1, 2], [1, 2, 3])), + torch.Size([3, 1, 64, 120]), + torch.Size([3, 2, 64, 120]), + ], +) +@pytest.mark.parametrize( + "check_mean, check_var", + [ + (False, False), # xfail case + (True, False), # xfail case + (False, True), # xfail case + (True, True), + ], +) +@pytest.mark.parametrize("weight", [True, False]) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("eps", [1.0, 0.0, 2.34, 1e-05]) +def test_batch_norm_fp32( + input_shapes, check_mean, check_var, weight, bias, eps, device, training=False, testing_dtype="float32" +): + in_data, input_tensor = data_gen_with_range_batch_norm( + input_shapes, 5, 10, device, is_input=True, testing_dtype=testing_dtype + ) + mean_data, mean_tensor = ( + data_gen_with_range_batch_norm(input_shapes, 4, 10, device, testing_dtype=testing_dtype) + if (check_mean) + else (None, None) + ) + var_data, var_tensor = ( + data_gen_with_range_batch_norm(input_shapes, 4, 20, device, testing_dtype=testing_dtype) + if (check_var) + else (None, None) + ) + weight_data, weight_tensor = ( + data_gen_with_range_batch_norm(input_shapes, 4, 10, device, testing_dtype=testing_dtype) + if weight + else (None, None) + ) + bias_data, bias_tensor = ( + data_gen_with_range_batch_norm(input_shapes, 4, 10, device, testing_dtype=testing_dtype) + if bias + else (None, None) + ) + + if (not training) and ((not check_mean) or (not check_var)): + pytest.xfail("running_mean and running_var must be defined in evaluation mode") + + tt_output_tensor_on_device = ttnn.batch_norm( + input_tensor, + 
running_mean=mean_tensor, + running_var=var_tensor, + training=training, + eps=eps, + weight=weight_tensor, + bias=bias_tensor, + ) + tt_output = ttnn.to_torch(tt_output_tensor_on_device) + torch_result = torch.nn.functional.batch_norm( + input=in_data, + running_mean=mean_data, + running_var=var_data, + weight=weight_data, + bias=bias_data, + training=training, + eps=eps, + ) + comp_pass = compare_results_batch_norm([tt_output], [torch_result]) and torch.allclose( + torch_result, tt_output, atol=1e-6, rtol=1e-3 + ) + assert comp_pass @pytest.mark.parametrize( diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_device_operation.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_device_operation.cpp index 0ec70f7c7a2..4131612e660 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_device_operation.cpp @@ -8,42 +8,49 @@ #include "ttnn/tensor/tensor.hpp" namespace ttnn::operations::normalization { + +namespace { +inline void check_tensor_BN(const Tensor& tensor, std::string_view name, std::uint32_t input_c_dim) { + TT_FATAL( + tensor.get_layout() == Layout::TILE, "batch_norm only supports tiled layout. Got: {}", tensor.get_layout()); + TT_FATAL( + tensor.get_dtype() == DataType::BFLOAT16 || tensor.get_dtype() == DataType::FLOAT32, + "batch_norm only supports bfloat16, float32. Got: {}", + tensor.get_dtype()); + TT_FATAL( + tensor.storage_type() == StorageType::DEVICE, + "Operands to batch_norm need to be on device! Got: {}", + tensor.storage_type()); + TT_FATAL(tensor.buffer() != nullptr, "Operands to batch_norm need to be allocated in buffers on device!"); + TT_FATAL(tensor.get_logical_shape().rank() == 4, "batch_norm supports tensors of rank 4"); + TT_FATAL(tensor.get_logical_shape()[1] == input_c_dim, "{}[1] must be the same as input's channel size.", name); +} +} // namespace + void BatchNormOperation::validate_tensors( const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { const auto& [input, batch_mean, batch_var, weight, bias, output] = tensor_args; - check_tensor(input, "batch_norm", "input"); - check_tensor(batch_mean, "batch_norm", "batch_mean"); - check_tensor(batch_var, "batch_norm", "batch_var"); - check_tensor(weight, "batch_norm", "weight"); - check_tensor(bias, "batch_norm", "bias"); - check_tensor(output, "batch_norm", "output"); - // input (N, C, H, W) auto C = input.get_logical_shape()[1]; + + check_tensor_BN(input, "input_shape", C); + check_tensor_BN(batch_mean, "batch_mean_shape", C); + check_tensor_BN(batch_var, "batch_mean_shape", C); + // output (N, C, H, W) if (output.has_value()) { - auto check_C = output.value().get_logical_shape()[1]; - TT_FATAL(C == check_C, "output_shape[1] must be the same as input's channel size."); + check_tensor_BN(output.value(), "output_shape", C); } - // mean (1, C, 1, 1) - TT_FATAL(batch_mean.get_logical_shape()[1] == C, "batch_mean_shape[1] must be the same as input's channel size."); - // var (1, C, 1, 1) - TT_FATAL(batch_var.get_logical_shape()[1] == C, "batch_var_shape[1] must be the same as input's channel size."); - // weight (1, C, 1, 1) if (weight.has_value()) { - TT_FATAL( - weight.value().get_logical_shape()[1] == C, "weight_shape[1] must be the same as input's channel size."); - TT_FATAL( - weight.value().get_logical_shape()[1] == C, "weight_shape[1] must be the same as input's channel size."); + 
check_tensor_BN(weight.value(), "weight_shape", C); } // bias (1, C, 1, 1) if (bias.has_value()) { - TT_FATAL(bias.value().get_logical_shape()[1] == C, "bias_shape[1] must be the same as input's channel size."); - TT_FATAL(bias.value().get_logical_shape()[1] == C, "bias_shape[1] must be the same as input's channel size."); + check_tensor_BN(bias.value(), "bias_shape", C); } } @@ -127,7 +134,7 @@ std::tuple bias, std::optional output, const std::optional& memory_config) { - operation_attributes_t operation_attributes{eps, memory_config.value_or(input.memory_config())}; + operation_attributes_t operation_attributes{eps, memory_config.value_or(input.memory_config()), input.get_dtype()}; tensor_args_t tensor_args{input, batch_mean, batch_var, std::move(weight), std::move(bias), std::move(output)}; return {operation_attributes, tensor_args}; } diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp index c640a45e00d..a0f062da2f8 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp @@ -73,8 +73,11 @@ void set_or_update_runtime_arguments( } uint32_t cHtWt = cHt * cWt; - class bfloat16 bfloat_scalar_eps(eps); - uint32_t packed_scalar_eps = pack_two_bfloat16_into_uint32({bfloat_scalar_eps, bfloat_scalar_eps}); + const auto scalar = eps; + const auto packed_scalar_eps = input_tensor.get_dtype() == DataType::FLOAT32 + ? std::bit_cast(scalar) + : pack_two_bfloat16_into_uint32({scalar, scalar}); + std::array reader_runtime_args = { packed_scalar_eps, input_tensor.buffer()->address(), @@ -218,38 +221,83 @@ BatchNormOperation::BatchNormFactory::cached_program_t BatchNormOperation::Batch const auto e_is_dram = weight_has_value and weight_tensor->buffer()->buffer_type() == tt_metal::BufferType::DRAM; const auto f_is_dram = bias_has_value and bias_tensor->buffer()->buffer_type() == tt_metal::BufferType::DRAM; + std::map dataflow_defines; // Currently support only for fp32, bf16 + if (input_tensor.get_dtype() == DataType::FLOAT32) { + dataflow_defines["FILL_TILE_WITH_FIRST_ELEMENT"] = "fill_tile_with_first_element"; + dataflow_defines["FILL_WITH_VALUE_FLOAT"] = "fill_with_val<1024, float>"; + } else { + dataflow_defines["FILL_TILE_WITH_FIRST_ELEMENT"] = "fill_tile_with_first_element_bfloat16"; + dataflow_defines["FILL_WITH_VALUE"] = "fill_with_val_bfloat16"; + } + // READER KERNEL + auto reader_defines = dataflow_defines; auto reader_kernel_id = tt_metal::CreateKernel( program, "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp", all_device_cores, - tt_metal::ReaderDataMovementConfig({a_is_dram})); + tt_metal::ReaderDataMovementConfig({a_is_dram}, std::move(reader_defines))); // WRITER KERNEL + auto writer_defines = dataflow_defines; auto writer_kernel_id = tt_metal::CreateKernel( program, "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_batch_norm.cpp", all_device_cores, - tt_metal::WriterDataMovementConfig({ - b_is_dram, - c_is_dram, - d_is_dram, - e_is_dram, - f_is_dram, - static_cast(weight_has_value), - static_cast(bias_has_value), - })); + tt_metal::WriterDataMovementConfig( + { + b_is_dram, + c_is_dram, + d_is_dram, + e_is_dram, + f_is_dram, + static_cast(weight_has_value), + static_cast(bias_has_value), + }, + std::move(writer_defines))); // COMPUTE 
KERNEL bool fp32_dest_acc_en = c_data_format == tt::DataFormat::UInt32 || c_data_format == tt::DataFormat::Int32 || c_data_format == tt::DataFormat::Float32; + + uint32_t src_input_cb_index = tt::CBIndex::c_0; + uint32_t src_batch_mean_cb_index = tt::CBIndex::c_1; + uint32_t src_batch_var_cb_index = tt::CBIndex::c_3; + uint32_t src_eps_cb_index = tt::CBIndex::c_4; + uint32_t src_temp_den_cb_index = tt::CBIndex::c_5; + uint32_t src_temp_num_cb_index = tt::CBIndex::c_6; + uint32_t src_weight_cb_index = tt::CBIndex::c_16; + uint32_t src_temp_1_cb_index = tt::CBIndex::c_17; + uint32_t src_bias_cb_index = tt::CBIndex::c_18; + + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + if (fp32_dest_acc_en) { + for (const auto cb_index : + {src_input_cb_index, + src_batch_mean_cb_index, + src_batch_var_cb_index, + src_temp_num_cb_index, + src_temp_den_cb_index, + src_eps_cb_index, + src_weight_cb_index, + src_temp_1_cb_index, + src_bias_cb_index}) { + unpack_to_dest_mode[cb_index] = UnpackToDestMode::UnpackToDestFp32; + } + } + std::vector compute_kernel_args = { static_cast(weight_has_value), static_cast(bias_has_value)}; auto compute_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_kernel.cpp", + fmt::format( + "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_{}.cpp", + fp32_dest_acc_en ? "sfpu_kernel" : "kernel"), all_device_cores, - tt_metal::ComputeConfig{.fp32_dest_acc_en = fp32_dest_acc_en, .compile_args = compute_kernel_args}); + tt_metal::ComputeConfig{ + .fp32_dest_acc_en = fp32_dest_acc_en, + .unpack_to_dest_mode = std::move(unpack_to_dest_mode), + .compile_args = compute_kernel_args}); auto set_runtime_args = [](Program& program, KernelHandle kernel_id, CoreCoord core, auto&& args) { tt_metal::SetRuntimeArgs(program, kernel_id, core, args); diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp new file mode 100644 index 00000000000..52942da1f55 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp @@ -0,0 +1,243 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "compute_kernel_api/eltwise_binary_sfpu.h" +#include "cpp/ttnn/deprecated/tt_dnn/kernels/compute/moreh_common.hpp" +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" + +#include + +namespace NAMESPACE { + +ALWI void batchnorm_bcast_tiles( + uint32_t cb_bcast, + uint32_t cb_other, + uint32_t freq, + uint32_t tile_start, + uint32_t cb_batch_var, + uint32_t cb_eps, + uint32_t cb_den, + uint32_t cb_num, + uint32_t cb_weight, + uint32_t cb_bias, + uint32_t cb_tmp_1, + uint32_t cb_output_0, + uint32_t weight_has, + uint32_t bias_has) { + constexpr uint32_t onetile = 1; + constexpr int dst0 = 0; + uint32_t weight_has_value = weight_has; + uint32_t bias_has_value = bias_has; + auto cb_affine_or_out = (weight_has_value || bias_has_value) ? cb_tmp_1 : cb_output_0; + auto cb_scaled_output = (bias_has_value) ? 
cb_tmp_1 : cb_output_0; + + // input - batch_mean + cb_wait_front(cb_bcast, onetile); + for (uint32_t j = tile_start; j < freq; ++j) { + cb_wait_front(cb_other, onetile); + + cb_reserve_back(cb_num, onetile); + + sub_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_bcast, cb_other); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_other, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_other, cb_bcast); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_bcast, i, i * 2 + 1); + sub_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_num); + } + tile_regs_release(); + cb_push_back(cb_num, onetile); + cb_pop_front(cb_other, onetile); + } + cb_pop_front(cb_bcast, onetile); + + // 1/(sqrt(batch_var + eps)) + cb_reserve_back(cb_den, onetile); + cb_wait_front(cb_batch_var, onetile); + cb_wait_front(cb_eps, onetile); + + add_binary_tile_init(); + rsqrt_tile_init(); + copy_tile_to_dst_init_short_with_dt(cb_eps, cb_batch_var); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_batch_var, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_batch_var, cb_eps); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_eps, i, i * 2 + 1); + + add_binary_tile(i * 2, i * 2 + 1); + rsqrt_tile(i * 2); + tile_regs_commit(); + + tile_regs_wait(); + pack_tile(i * 2, cb_den); + } + tile_regs_release(); + + cb_push_back(cb_den, onetile); + cb_pop_front(cb_batch_var, onetile); + cb_pop_front(cb_eps, onetile); + + // (input - batch_mean)/(sqrt(batch_var + eps)) = result + cb_wait_front(cb_den, onetile); + for (uint32_t j = tile_start; j < freq; ++j) { + cb_wait_front(cb_num, onetile); + + cb_reserve_back(cb_affine_or_out, onetile); + + mul_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_den, cb_num); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_num, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_num, cb_den); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_den, i, i * 2 + 1); + mul_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_affine_or_out); + } + tile_regs_release(); + cb_push_back(cb_affine_or_out, onetile); + cb_pop_front(cb_num, onetile); + } + cb_pop_front(cb_den, onetile); + + if (weight_has_value) { // result = result * weight + cb_wait_front(cb_weight, onetile); + for (uint32_t j = tile_start; j < freq; ++j) { + cb_wait_front(cb_affine_or_out, onetile); + + cb_reserve_back(cb_scaled_output, onetile); + + mul_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_weight, cb_affine_or_out); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_affine_or_out, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_affine_or_out, cb_weight); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_weight, i, i * 2 + 1); + mul_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_scaled_output); + } + tile_regs_release(); + cb_push_back(cb_scaled_output, onetile); + cb_pop_front(cb_affine_or_out, onetile); + } + cb_pop_front(cb_weight, onetile); + } + + if (bias_has_value) { // result = result + bias + cb_wait_front(cb_bias, onetile); + for (uint32_t j = tile_start; j < freq; ++j) { + cb_wait_front(cb_tmp_1, onetile); + + cb_reserve_back(cb_output_0, onetile); + + add_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_bias, cb_tmp_1); + for (uint32_t i = 0; i 
< onetile; ++i) { + copy_tile(cb_tmp_1, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_tmp_1, cb_bias); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_bias, i, i * 2 + 1); + add_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_output_0); + } + tile_regs_release(); + cb_push_back(cb_output_0, onetile); + cb_pop_front(cb_tmp_1, onetile); + } + cb_pop_front(cb_bias, onetile); + } +} + +void MAIN { + uint32_t num_tiles = get_arg_val(0); + uint32_t tile_freq = get_arg_val(1); + uint32_t tile_start = get_arg_val(2); + constexpr uint32_t weight_has_value = get_compile_time_arg_val(0) == 1; + constexpr uint32_t bias_has_value = get_compile_time_arg_val(1) == 1; + + if (num_tiles == 0) { + return; + } + + constexpr auto cb_input = tt::CBIndex::c_0; // input + constexpr auto cb_batch_mean = tt::CBIndex::c_1; // batch_mean + constexpr auto cb_output_0 = + tt::CBIndex::c_2; // output -- > [(input - batch_mean)/(sqrt(batch_var + eps))] * weight + constexpr auto cb_batch_var = tt::CBIndex::c_3; // batch_var + constexpr auto cb_eps = tt::CBIndex::c_4; // eps + constexpr auto cb_den = tt::CBIndex::c_5; // 1/(sqrt(batch_var + eps)) + constexpr auto cb_num = tt::CBIndex::c_6; // input - batch_mean + constexpr auto cb_weight = tt::CBIndex::c_16; // weight tensor + constexpr auto cb_tmp_1 = tt::CBIndex::c_17; // (input - batch_mean)/(sqrt(batch_var + eps)) + constexpr auto cb_bias = tt::CBIndex::c_18; // bias tensor + + auto cb_bcast = cb_batch_mean; + auto cb_other = cb_input; + + unary_op_init_common(cb_other, cb_output_0); + + uint32_t complete_iterations = (num_tiles + tile_start) / tile_freq; + uint32_t remaining_iterations = (num_tiles + tile_start) % tile_freq; + for (uint32_t i = 0; i < complete_iterations; ++i, tile_start = 0) { + batchnorm_bcast_tiles( + cb_bcast, + cb_other, + tile_freq, + tile_start, + cb_batch_var, + cb_eps, + cb_den, + cb_num, + cb_weight, + cb_bias, + cb_tmp_1, + cb_output_0, + weight_has_value, + bias_has_value); + } + if (remaining_iterations > 0) { + batchnorm_bcast_tiles( + cb_bcast, + cb_other, + remaining_iterations, + tile_start, + cb_batch_var, + cb_eps, + cb_den, + cb_num, + cb_weight, + cb_bias, + cb_tmp_1, + cb_output_0, + weight_has_value, + bias_has_value); + } + + constexpr uint32_t onetile = 1; + constexpr int dst0 = 0; +} +} // namespace NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp index a5f9c86787a..ebf287dce1f 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp @@ -37,8 +37,18 @@ void kernel_main() { constexpr auto cb_id_eps = tt::CBIndex::c_4; + union { + float f; + uint32_t u; + } scalar; + scalar.u = eps; cb_reserve_back(cb_id_eps, onetile); - fill_with_val_bfloat16(cb_id_eps, eps); +#ifdef FILL_WITH_VALUE_FLOAT + FILL_WITH_VALUE_FLOAT(cb_id_eps, scalar.f); +#endif +#ifdef FILL_WITH_VALUE + FILL_WITH_VALUE(cb_id_eps, eps); +#endif cb_push_back(cb_id_eps, onetile); // Input tile offset diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_batch_norm.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_batch_norm.cpp index 0143fbec042..0c80abbc870 100644 --- 
a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_batch_norm.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_batch_norm.cpp
@@ -89,7 +89,7 @@ void kernel_main() {
         uint32_t l1_write_addr = get_write_ptr(cb_id_src);
         noc_async_read_tile(tile_offset, src, l1_write_addr);
         noc_async_read_barrier();
-        fill_tile_with_first_element_bfloat16(cb_id_src);
+        FILL_TILE_WITH_FIRST_ELEMENT(cb_id_src);
         cb_push_back(cb_id_src, onetile);

         // read a tile from batch variance
@@ -97,7 +97,7 @@ void kernel_main() {
         uint32_t l1_batch_var_write_addr = get_write_ptr(cb_id_batch_var);
         noc_async_read_tile(tile_offset, batch_var, l1_batch_var_write_addr);
         noc_async_read_barrier();
-        fill_tile_with_first_element_bfloat16(cb_id_batch_var);
+        FILL_TILE_WITH_FIRST_ELEMENT(cb_id_batch_var);
         cb_push_back(cb_id_batch_var, onetile);

         if constexpr (weight_has_value) {  // read a tile from weight tensor
@@ -105,7 +105,7 @@ void kernel_main() {
             uint32_t l1_weight_write_addr = get_write_ptr(cb_id_weight);
             noc_async_read_tile(tile_offset, weight, l1_weight_write_addr);
             noc_async_read_barrier();
-            fill_tile_with_first_element_bfloat16(cb_id_weight);
+            FILL_TILE_WITH_FIRST_ELEMENT(cb_id_weight);
             cb_push_back(cb_id_weight, onetile);
         }

@@ -114,7 +114,7 @@ void kernel_main() {
             uint32_t l1_bias_write_addr = get_write_ptr(cb_id_bias);
             noc_async_read_tile(tile_offset, bias, l1_bias_write_addr);
             noc_async_read_barrier();
-            fill_tile_with_first_element_bfloat16(cb_id_bias);
+            FILL_TILE_WITH_FIRST_ELEMENT(cb_id_bias);
             cb_push_back(cb_id_bias, onetile);
         }

From 2911f2443ddc0de152eaa6563af804a6c88e1ba5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com>
Date: Sun, 9 Feb 2025 14:05:55 +0100
Subject: [PATCH 034/316] [UMD] Change logical to translated mapping to new API (#17674)

### Ticket
Related to https://github.com/tenstorrent/tt-metal/issues/17002

### Problem description
Some coordinate-mapping APIs need to be altered, and some of their usages removed, as part of moving the logical-to-translated mapping onto the new UMD coordinate API.

### What's changed
- Remove worker_logical_to_virtual_x_ and worker_logical_to_virtual_y_
- Change get_virtual_coordinate_from_logical_coordinates so that it uses the new API for Tensix and Ethernet cores, and keeps the same path for DRAM
- Implement get_worker_logical_to_virtual_x and get_worker_logical_to_virtual_y, which should eventually be removed, but that is out of scope for this PR. However, the usage of the old API through them is removed.

### Testing
I added the new code directly to generate_logical_to_virtual_coord_mapping and verified that the old and new mappings are the same before removing that code. I also verified that it matches what translate_coord_to returns when translating from LOGICAL to TRANSLATED coordinates.
I did this on wormhole only ### Checklist - [x] All post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13204500386 https://github.com/tenstorrent/tt-metal/actions/runs/13208231490 - [x] Blackhole post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197582366 - [ ] (Single-card) Model perf tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197585051 - [ ] (Single-card) Device perf regressions : https://github.com/tenstorrent/tt-metal/actions/runs/13197587167 - [ ] (T3K) T3000 unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197589587 - [ ] (T3K) T3000 demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197591287 - [ ] (TG) TG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197592965 - [ ] (TG) TG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197595178 - [x] (TGG) TGG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197597328 - [x] (TGG) TGG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197599629 --- tt_metal/api/tt-metalium/tt_cluster.hpp | 11 ++-- tt_metal/llrt/tt_cluster.cpp | 80 ++++++++++++------------- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/tt_metal/api/tt-metalium/tt_cluster.hpp b/tt_metal/api/tt-metalium/tt_cluster.hpp index ff71e87ca00..cecb702cda6 100644 --- a/tt_metal/api/tt-metalium/tt_cluster.hpp +++ b/tt_metal/api/tt-metalium/tt_cluster.hpp @@ -242,8 +242,11 @@ class Cluster { bool is_worker_core(const CoreCoord &core, chip_id_t chip_id) const; bool is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) const; CoreCoord get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const; - const std::unordered_map& get_worker_logical_to_virtual_x(chip_id_t chip_id) const { return this->worker_logical_to_virtual_x_.at(this->get_board_type(chip_id)); }; - const std::unordered_map& get_worker_logical_to_virtual_y(chip_id_t chip_id) const { return this->worker_logical_to_virtual_y_.at(this->get_board_type(chip_id)); }; + + // These two functions should be removed in favor of direct translation. + const std::unordered_map get_worker_logical_to_virtual_x(chip_id_t chip_id) const; + const std::unordered_map get_worker_logical_to_virtual_y(chip_id_t chip_id) const; + const std::unordered_map& get_virtual_routing_to_profiler_flat_id(chip_id_t chip_id) const; private: Cluster(); @@ -262,7 +265,6 @@ class Cluster { const std::unordered_map &input, const std::unordered_map &per_chip_id_harvesting_masks); void generate_virtual_to_umd_coord_mapping(); - void generate_logical_to_virtual_coord_mapping(); void generate_virtual_to_profiler_flat_id_mapping(); // Reserves ethernet cores in cluster for tunneling @@ -295,9 +297,6 @@ class Cluster { std::unordered_map virtual_to_umd_coord_mapping_; std::unordered_map> virtual_worker_cores_; std::unordered_map> virtual_eth_cores_; - std::unordered_map> worker_logical_to_virtual_x_; - std::unordered_map> worker_logical_to_virtual_y_; - std::unordered_map> eth_logical_to_virtual_; std::unordered_map> virtual_routing_to_profiler_flat_id_; // Flag to tell whether we are on a TG type of system. // If any device has to board type of GALAXY, we are on a TG cluster. 
diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index f699180ee89..807dca854fb 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -198,7 +198,6 @@ void Cluster::initialize_device_drivers() { tt_device_params default_params; this->start_driver(default_params); this->generate_virtual_to_umd_coord_mapping(); - this->generate_logical_to_virtual_coord_mapping(); this->generate_virtual_to_profiler_flat_id_mapping(); } @@ -347,39 +346,6 @@ void Cluster::generate_virtual_to_umd_coord_mapping() { } } -void Cluster::generate_logical_to_virtual_coord_mapping() { - for (auto chip_id : this->cluster_desc_->get_all_chips()) { - auto board_type = this->get_board_type(chip_id); - if (this->worker_logical_to_virtual_x_.find(board_type) != this->worker_logical_to_virtual_x_.end()) { - continue; - } - auto& soc_desc = this->get_soc_desc(chip_id); - this->worker_logical_to_virtual_x_.insert({board_type, {}}); - this->worker_logical_to_virtual_y_.insert({board_type, {}}); - this->eth_logical_to_virtual_.insert({board_type, {}}); - for (auto x_coords : soc_desc.worker_log_to_routing_x) { - CoreCoord phys_core = soc_desc.get_physical_tensix_core_from_logical(CoreCoord(x_coords.first, 0)); - CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, phys_core); - this->worker_logical_to_virtual_x_.at(board_type).insert({x_coords.first, virtual_coords.x}); - } - for (auto y_coords : soc_desc.worker_log_to_routing_y) { - CoreCoord phys_core = soc_desc.get_physical_tensix_core_from_logical(CoreCoord(0, y_coords.first)); - CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, phys_core); - this->worker_logical_to_virtual_y_.at(board_type).insert({y_coords.first, virtual_coords.y}); - } - for (std::size_t log_eth_core_y = 0; log_eth_core_y < soc_desc.get_cores(CoreType::ETH).size(); - log_eth_core_y++) { - CoreCoord logical_eth_core = {0, log_eth_core_y}; - tt::umd::CoreCoord phys_eth_core = - soc_desc.translate_coord_to(soc_desc.get_eth_core_for_channel(log_eth_core_y), CoordSystem::PHYSICAL); - CoreCoord virtual_coords = - this->get_virtual_coordinate_from_physical_coordinates(chip_id, {phys_eth_core.x, phys_eth_core.y}); - this->eth_logical_to_virtual_.at(board_type).insert({logical_eth_core, virtual_coords}); - } - } - -} - void Cluster::generate_virtual_to_profiler_flat_id_mapping() { #if defined(TRACY_ENABLE) for (auto chip_id : this->cluster_desc_->get_all_chips()) { @@ -417,15 +383,27 @@ const std::unordered_set& Cluster::get_virtual_eth_cores(chip_id_t ch return this->virtual_eth_cores_.at(chip_id); } -CoreCoord Cluster::get_virtual_coordinate_from_logical_coordinates(chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const { - auto board_type = this->get_board_type(chip_id); - if (core_type == CoreType::WORKER) { - return CoreCoord(this->worker_logical_to_virtual_x_.at(board_type).at(logical_coord.x), this->worker_logical_to_virtual_y_.at(board_type).at(logical_coord.y)); - } else if (core_type == CoreType::ETH) { - return this->eth_logical_to_virtual_.at(board_type).at(logical_coord); +CoreCoord Cluster::get_virtual_coordinate_from_logical_coordinates( + chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const { + // Keeping the old behavior, although UMD does define translation for other cores as well. 
+ if (core_type != CoreType::WORKER && core_type != CoreType::DRAM && core_type != CoreType::ETH) { + TT_THROW("Undefined conversion for core type."); } + auto& soc_desc = this->get_soc_desc(chip_id); - return soc_desc.get_physical_core_from_logical_core(logical_coord, core_type); + if (core_type == CoreType::DRAM) { + return soc_desc.get_physical_dram_core_from_logical(logical_coord); + } + + // TBD: Remove when all WORKER are rewritten to TENSIX + CoreType core_type_to_use = core_type; + if (core_type_to_use == CoreType::WORKER) { + core_type_to_use = CoreType::TENSIX; + } + + tt::umd::CoreCoord translated_coord = + soc_desc.translate_coord_to({logical_coord, core_type_to_use, CoordSystem::LOGICAL}, CoordSystem::TRANSLATED); + return {translated_coord.x, translated_coord.y}; } tt_cxy_pair Cluster::get_virtual_coordinate_from_logical_coordinates(tt_cxy_pair logical_coordinate, const CoreType& core_type) const { @@ -458,6 +436,26 @@ CoreCoord Cluster::get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCo return {logical_core.x, logical_core.y}; } +const std::unordered_map Cluster::get_worker_logical_to_virtual_x(chip_id_t chip_id) const { + std::unordered_map worker_logical_to_virtual_x; + const auto& soc_desc = tt::Cluster::instance().get_soc_desc(chip_id); + for (const tt::umd::CoreCoord& logical_core : soc_desc.get_cores(CoreType::TENSIX, CoordSystem::LOGICAL)) { + tt::umd::CoreCoord translated_core = soc_desc.translate_coord_to(logical_core, CoordSystem::TRANSLATED); + worker_logical_to_virtual_x[logical_core.x] = translated_core.x; + } + return worker_logical_to_virtual_x; +} + +const std::unordered_map Cluster::get_worker_logical_to_virtual_y(chip_id_t chip_id) const { + std::unordered_map worker_logical_to_virtual_y; + const auto& soc_desc = tt::Cluster::instance().get_soc_desc(chip_id); + for (const tt::umd::CoreCoord& logical_core : soc_desc.get_cores(CoreType::TENSIX, CoordSystem::LOGICAL)) { + tt::umd::CoreCoord translated_core = soc_desc.translate_coord_to(logical_core, CoordSystem::TRANSLATED); + worker_logical_to_virtual_y[logical_core.y] = translated_core.y; + } + return worker_logical_to_virtual_y; +} + uint32_t Cluster::get_harvested_rows(chip_id_t chip) const { if (this->target_type_ == TargetDevice::Simulator) { return 0; From 6a1cdca0569aba4686a85ce5deb20ba0963f5315 Mon Sep 17 00:00:00 2001 From: Mark O'Connor Date: Sun, 9 Feb 2025 17:42:43 +0100 Subject: [PATCH 035/316] [skip ci] Update README.md (#17716) --- models/demos/llama3/README.md | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/models/demos/llama3/README.md b/models/demos/llama3/README.md index 65d370e4a5b..5e8bd6f44de 100644 --- a/models/demos/llama3/README.md +++ b/models/demos/llama3/README.md @@ -1,6 +1,6 @@ -# Llama3 Models +# Llama-like Models -This codebase includes the Llama3 family of models. +This code can run Llama3 family of models and other similar models including Qwen2.5 and DeepSeek-R1-Distill variants. 
The current version supports the following Llama3 models: - Llama3.2-1B @@ -8,6 +8,8 @@ The current version supports the following Llama3 models: - Llama3.1-8B - Llama3.2-11B - Llama3.1-70B (T3000 and TG-only) +- Qwen2.5-7B +- Qwen2.5-72B - DeepSeek R1 Distill Llama 3.3 70B (T3000 and TG-only) All the above llama models (with the exception of 70B due to its large size) are compatible and tested on the following Tenstorrent hardware: @@ -16,6 +18,9 @@ All the above llama models (with the exception of 70B due to its large size) are - T3000 (8-chips) - TG (32-chips) +Qwen-7B requires N300 +Qwen-72B requires T3K + **Max Context Lengths (text-only)**: All of the compatible model/device combinations support a max prefill context-length of 128k, with the exception of Llama3.1-8B and Llama3.2-11B on N150 which have a max of 64k (due to a lack of memory). To support these large max context-lengths, chunked prefill is performed with different max chunk sizes as shown in the table below. Max Prefill Chunk Sizes (text-only): @@ -62,7 +67,7 @@ Llama3.2-11B multimodal requires extra python dependencies. Install them from: pip install -r models/demos/llama3/requirements.txt ``` -### HuggingFace models (e.g. DeepSeek R1 Distill Llama 3.3 70B) +### HuggingFace models (e.g. DeepSeek R1 Distill Llama 3.3 70B, Qwen 2.5 7B, ...) Download the weights from [HuggingFace](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B). Your model directory should have the following structure: @@ -74,6 +79,17 @@ DeepSeek-R1-Distill-Llama-70B/ ... ``` +#### Running llama-similar models other than DeepSeek R1 Distill and Qwen 2.5 + +If you are bringing up a new model that is similar to these but is not listed above, you will also need to set additional environment variables: +- `MAX_PREFILL_CHUNK_SIZE` - this determines how many thousands of tokens are prefilled in one go. For optimal performance pick 128. Depending on the model dimensions and hardware you're running on, there may not be enough L1 to prefill 128K tokens at once, in which case you can reduce this in powers of 2 down to 4. +- `PAD_MLP_CORES` - models with a hidden_dim that is not a nice power of 2 may not have a valid layout or may run with lower performance. You can set this to a multiple of 8 between 8 and 64; `16` and `32` commonly work well if this is required. + +You should also watch out for: +- RoPE encoding style. `llama3` and of course none are both supported. We have a [branch](https://github.com/tenstorrent/tt-metal/tree/llama-yarn) with `yarn` support in progress. +- Our [accuracy test](tests/test_llama_accuracy.py) will require you to [generate some reference logits](tests/generate_reference_hf.py) and perhaps update the test to use them. +- We parallelise attention over the number of heads. If this number is e.g. 14 then you will not be able to run it on more than 2 chips (because 14/2=7, a prime number). We do not support head-padding or similar mitigations at this time but a PR would be cool. + ### Setup TT environment 1. Set up environment variables: From 38578b33849c41ad70f5375856d736bc77239b8c Mon Sep 17 00:00:00 2001 From: Mouliraj Elamurugan Date: Mon, 10 Feb 2025 09:42:02 +0530 Subject: [PATCH 036/316] #17559: Update logit op (#17586) ### Ticket Link to Github Issue #17559 ### Problem description The current composite op implementation uses intermediate tensors created with ttnn::full_like to invoke tensor-tensor overloads of other binary ops ### What's changed Updated the logic to eliminate the use of full_like. 
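For clarity, a scalar sketch of the behaviour the composite preserves. This is an illustration only, not the ttnn implementation, and it assumes `eps` is non-negative (the scalar overloads in the diff below are the actual change).

```cpp
#include <algorithm>
#include <cmath>

// Reference: clamp x to [eps, 1 - eps] using plain scalars (no full_like tensors),
// then compute logit(x) = log(x / (1 - x)).
float logit_reference(float x, float eps) {
    const float hi = 1.0f - eps;
    const float clamped = std::min(std::max(x, eps), hi);
    return std::log(clamped / (1.0f - clamped));
}
```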
### Checklist - [ ] [All post commit CI] () --- .../eltwise/unary/device/unary_composite_op.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp index 7cee4b3445c..b148d1dad16 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp @@ -810,15 +810,11 @@ Tensor _softshrink(const Tensor& a, float param, const std::optional& output_mem_config) { - Tensor t_eps = ttnn::full_like(input_a, eps); - Tensor t1m_eps = ttnn::full_like(input_a, (1 - eps)); + float t1m_eps = 1 - eps; Tensor logit_input = ttnn::where( - ttnn::ltz(t_eps, output_mem_config), - input_a, - ttnn::where( - ttnn::lt(input_a, t_eps, std::nullopt, output_mem_config), - t_eps, - ttnn::where(ttnn::gt(input_a, t1m_eps, std::nullopt, output_mem_config), t1m_eps, input_a))); + ttnn::lt(input_a, eps, std::nullopt, output_mem_config), + eps, + ttnn::where(ttnn::gt(input_a, t1m_eps, std::nullopt, output_mem_config), t1m_eps, input_a)); Tensor linput_m1 = ttnn::rsub(logit_input, 1.0, output_mem_config); Tensor log_input = ttnn::multiply(logit_input, ttnn::reciprocal(linput_m1, output_mem_config), std::nullopt, output_mem_config); From 65b32c93b7e1165eca409f8fa56b3ff296b2d9e6 Mon Sep 17 00:00:00 2001 From: aagarwalTT Date: Sat, 8 Feb 2025 23:42:00 +0000 Subject: [PATCH 037/316] Support for routing planes --- .../kernels/tt_fabric_traffic_controller.cpp | 3 +- .../kernels/tt_fabric_traffic_gen_tx.cpp | 25 ++--- .../tt_fabric_traffic_gen_tx_socket.cpp | 29 ++---- .../routing/kernels/tt_fabric_tx_ubench.cpp | 41 ++------- .../routing/test_tt_fabric_sanity.cpp | 91 +++++++++++-------- .../routing/test_tt_fabric_socket_sanity.cpp | 9 +- tt_fabric/control_plane.hpp | 4 +- tt_fabric/hw/inc/tt_fabric_api.h | 33 ++++++- tt_fabric/hw/inc/tt_fabric_interface.h | 3 +- .../impl/kernels/tt_fabric_gatekeeper.cpp | 68 +++++++------- 10 files changed, 158 insertions(+), 148 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_controller.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_controller.cpp index 0b093070666..7d6ea107690 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_controller.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_controller.cpp @@ -29,5 +29,6 @@ void kernel_main() { // do a noc multicast to tx kernels uint64_t mcast_dest_addr = get_noc_addr_helper(mcast_encoding, tx_signal_addr); - noc_async_write_multicast_one_packet((uint32_t)mcast_sem, mcast_dest_addr, sizeof(uint32_t), num_mcast_dests); + noc_async_write_multicast_loopback_src((uint32_t)mcast_sem, mcast_dest_addr, sizeof(uint32_t), num_mcast_dests); + noc_async_writes_flushed(); } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index 152f52e5767..c13ac0ea9cf 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -70,9 +70,8 @@ uint32_t max_packet_size_mask; auto input_queue_state = select_input_queue(); volatile local_pull_request_t *local_pull_request = (volatile 
local_pull_request_t *)(data_buffer_start_addr - 1024); -volatile tt_l1_ptr fabric_router_l1_config_t* routing_table = - reinterpret_cast(routing_table_start_addr); -volatile fabric_client_interface_t* client_interface = (volatile fabric_client_interface_t*)client_interface_addr; +volatile tt_l1_ptr fabric_router_l1_config_t* routing_table; +volatile fabric_client_interface_t* client_interface; fvc_producer_state_t test_producer __attribute__((aligned(16))); fvcc_inbound_state_t fvcc_test_producer __attribute__((aligned(16))); @@ -385,15 +384,12 @@ bool test_buffer_handler() { } void kernel_main() { - tt_fabric_init(); - uint32_t rt_args_idx = 0; time_seed = get_arg_val(increment_arg_idx(rt_args_idx)); src_endpoint_id = get_arg_val(increment_arg_idx(rt_args_idx)); noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); controller_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t router_x = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t router_y = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t routing_plane = get_arg_val(increment_arg_idx(rt_args_idx)); dest_device = get_arg_val(increment_arg_idx(rt_args_idx)); uint32_t rx_buf_size = get_arg_val(increment_arg_idx(rt_args_idx)); gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); @@ -406,11 +402,6 @@ void kernel_main() { target_address = base_target_address; rx_addr_hi = base_target_address + rx_buf_size; - uint64_t router_config_addr = - NOC_XY_ADDR(NOC_X(router_x), NOC_Y(router_y), eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); - noc_async_read_one_packet(router_config_addr, routing_table_start_addr, sizeof(fabric_router_l1_config_t)); - noc_async_read_barrier(); - zero_l1_buf(test_results, test_results_size_bytes); test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; test_results[PQ_TEST_STATUS_INDEX+1] = (uint32_t) local_pull_request; @@ -421,10 +412,6 @@ void kernel_main() { zero_l1_buf(reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t)); zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t)); - zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); - client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l; - client_interface->gk_msg_buf_addr = - (((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf); if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM) { input_queue_state.init(src_endpoint_id, prng_seed); @@ -474,8 +461,10 @@ void kernel_main() { uint32_t curr_packet_words_sent = 0; uint32_t packet_count = 0; - // make sure fabric node gatekeeper is available. 
- fabric_endpoint_init(); + // initalize client + fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + routing_table = reinterpret_cast( + client_interface->routing_tables_l1_offset + sizeof(fabric_router_l1_config_t) * routing_plane); while (true) { iter++; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp index 39571c2a5e4..0fcb8ae7c38 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp @@ -66,9 +66,8 @@ uint32_t max_packet_size_mask; auto input_queue_state = select_input_queue(); volatile local_pull_request_t* local_pull_request = (volatile local_pull_request_t*)(data_buffer_start_addr - 1024); -volatile tt_l1_ptr fabric_router_l1_config_t* routing_table = - reinterpret_cast(routing_table_start_addr); -volatile fabric_client_interface_t* client_interface = (volatile fabric_client_interface_t*)client_interface_addr; +volatile tt_l1_ptr fabric_router_l1_config_t* routing_table; +volatile fabric_client_interface_t* client_interface; volatile tt_l1_ptr chan_req_buf* client_pull_req_buf = reinterpret_cast(client_pull_req_buf_addr); @@ -328,24 +327,16 @@ bool test_buffer_handler(socket_handle_t* socket_handle) { } void kernel_main() { - tt_fabric_init(); - // TODO: refactor src_endpoint_id = get_arg_val(0); noc_offset = get_arg_val(1); - uint32_t router_x = get_arg_val(2); - uint32_t router_y = get_arg_val(3); - dest_device = get_arg_val(4); + uint32_t routing_plane = get_arg_val(2); + dest_device = get_arg_val(3); if (ASYNC_WR == test_command) { target_address = get_arg_val(5); } - uint64_t router_config_addr = NOC_XY_ADDR(router_x, router_y, eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); - noc_async_read_one_packet( - router_config_addr, routing_table_start_addr, sizeof(tt::tt_fabric::fabric_router_l1_config_t)); - noc_async_read_barrier(); - zero_l1_buf(test_results, test_results_size_bytes); test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; test_results[PQ_TEST_STATUS_INDEX + 1] = (uint32_t)local_pull_request; @@ -357,15 +348,15 @@ void kernel_main() { reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t)); zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t)); - zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); + + // initalize client + fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + routing_table = reinterpret_cast( + client_interface->routing_tables_l1_offset + sizeof(fabric_router_l1_config_t) * routing_plane); + zero_l1_buf((uint32_t*)client_pull_req_buf, sizeof(chan_req_buf)); - client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l; - client_interface->gk_msg_buf_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, gk_msg_buf); client_interface->pull_req_buf_addr = xy_local_addr | client_pull_req_buf_addr; - // make sure fabric node gatekeeper is available. 
- fabric_endpoint_init(); - if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM) { input_queue_state.init(src_endpoint_id, prng_seed); } else if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp index d749c799ec8..0832c67a7c1 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp @@ -23,8 +23,6 @@ constexpr uint32_t dest_endpoint_start_id = get_compile_time_arg_val(2); constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(3); constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(4); -constexpr uint32_t routing_table_start_addr = get_compile_time_arg_val(5); - constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(6); constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(7); @@ -53,7 +51,7 @@ uint32_t base_target_address = get_compile_time_arg_val(17); // atomic increment for the ATOMIC_INC command constexpr uint32_t atomic_increment = get_compile_time_arg_val(18); -// constexpr uint32_t dest_device = get_compile_time_arg_val(21); + uint32_t dest_device; constexpr uint32_t signal_address = get_compile_time_arg_val(19); @@ -65,10 +63,7 @@ constexpr uint32_t w_depth = get_compile_time_arg_val(25); constexpr uint32_t n_depth = get_compile_time_arg_val(26); constexpr uint32_t s_depth = get_compile_time_arg_val(27); -volatile local_pull_request_t* local_pull_request = (volatile local_pull_request_t*)(data_buffer_start_addr - 1024); -volatile tt_l1_ptr fabric_router_l1_config_t* routing_table = - reinterpret_cast(routing_table_start_addr); -volatile fabric_client_interface_t* client_interface = (volatile fabric_client_interface_t*)client_interface_addr; +volatile fabric_client_interface_t* client_interface; uint64_t xy_local_addr; uint32_t target_address; @@ -94,15 +89,12 @@ inline void notify_traffic_controller() { } void kernel_main() { - tt_fabric_init(); - uint32_t rt_args_idx = 0; time_seed = get_arg_val(increment_arg_idx(rt_args_idx)); src_endpoint_id = get_arg_val(increment_arg_idx(rt_args_idx)); noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); controller_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t router_x = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t router_y = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t routing_plane = get_arg_val(increment_arg_idx(rt_args_idx)); dest_device = get_arg_val(increment_arg_idx(rt_args_idx)); uint32_t rx_buf_size = get_arg_val(increment_arg_idx(rt_args_idx)); gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); @@ -114,26 +106,13 @@ void kernel_main() { target_address = base_target_address; - // Read in the routing table - uint64_t router_config_addr = - NOC_XY_ADDR(NOC_X(router_x), NOC_Y(router_y), eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); - noc_async_read_one_packet(router_config_addr, routing_table_start_addr, sizeof(fabric_router_l1_config_t)); - noc_async_read_barrier(); - zero_l1_buf(test_results, test_results_size_bytes); test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; - test_results[PQ_TEST_STATUS_INDEX + 1] = (uint32_t)local_pull_request; - test_results[PQ_TEST_MISC_INDEX] = 0xff000000; test_results[PQ_TEST_MISC_INDEX + 1] = 
0xcc000000 | src_endpoint_id; zero_l1_buf( reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); - zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t)); - zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); - client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l; - client_interface->gk_msg_buf_addr = - (((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf); uint64_t data_words_sent = 0; uint32_t packet_count = 0; @@ -160,8 +139,8 @@ void kernel_main() { ); } - // make sure fabric node gatekeeper is available. - fabric_endpoint_init(); + // initalize client + fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); // notify the controller kernel that this worker is ready to proceed notify_traffic_controller(); @@ -171,17 +150,18 @@ void kernel_main() { // all tx workers are ready to send data while (*(volatile tt_l1_ptr uint32_t*)signal_address == 0); - uint64_t start_timestamp = get_timestamp(); fabric_setup_pull_request( data_buffer_start_addr, // source address in sender’s memory max_packet_size_words * 16 // number of bytes to write to remote destination ); + uint64_t start_timestamp = get_timestamp(); + while (true) { client_interface->local_pull_request.pull_request.words_read = 0; if constexpr (mcast_data) { fabric_async_write_multicast( - 0, // the network plane to use for this transaction + routing_plane, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory dest_device >> 16, dest_device & 0xFFFF, @@ -190,11 +170,10 @@ void kernel_main() { e_depth, w_depth, n_depth, - s_depth - ); + s_depth); } else { fabric_async_write( - 0, // the network plane to use for this transaction + routing_plane, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory dest_device >> 16, dest_device & 0xFFFF, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index 233f9530438..052f8b39ed8 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -450,6 +450,10 @@ typedef struct test_board { return control_plane->get_intra_chip_neighbors(src_mesh_id, src_chip_id, routing_direction); } + inline routing_plane_id_t get_routing_plane_from_chan(chan_id_t eth_chan) { + return control_plane->get_routing_plane_id(eth_chan); + } + inline void close_devices() { tt::tt_metal::detail::CloseDevices(device_handle_map); } } test_board_t; @@ -472,8 +476,8 @@ typedef struct test_device { uint32_t router_mask = 0; uint32_t gk_noc_offset; metal_SocDescriptor soc_desc; - std::unordered_map>> - router_worker_map; // router phys to worker logical cores + std::unordered_map>> + router_worker_map; // router chan to worker logical cores test_device(chip_id_t chip_id_, test_board_t* board_handle_) { physical_chip_id = chip_id_; @@ -646,8 +650,8 @@ typedef struct test_device { void get_available_router_cores( uint32_t num_hops, std::shared_ptr& rx_device, - std::vector& src_routers, - std::vector& dest_routers) { + std::vector& src_routers, + std::vector& dest_routers) { // shortest route possible with least number of internal noc hops uint32_t shortest_route_length = 2 * num_hops - 
1; bool select_router = false; @@ -656,16 +660,15 @@ typedef struct test_device { for (auto i = 0; i < router_logical_cores.size(); i++) { std::vector> route; std::set chips_in_route; - chan_id_t eth_chan = soc_desc.logical_eth_core_to_chan_map.at(router_logical_cores[i]); + chan_id_t src_eth_chan = soc_desc.logical_eth_core_to_chan_map.at(router_logical_cores[i]); chips_in_route.insert(physical_chip_id); try { - route = _get_route_to_chip(rx_device->mesh_id, rx_device->logical_chip_id, eth_chan); + route = _get_route_to_chip(rx_device->mesh_id, rx_device->logical_chip_id, src_eth_chan); } catch (const std::exception& e) { continue; } - auto dest_router = - tt::Cluster::instance().get_virtual_eth_core_from_channel(physical_chip_id, route.back().second); + auto dest_eth_chan = route.back().second; if (DEFAULT_NUM_HOPS == num_hops) { // no need to check for path length for default case, all routers can be used @@ -684,8 +687,8 @@ typedef struct test_device { } if (select_router) { - src_routers.push_back(router_virtual_cores[i]); - dest_routers.push_back(dest_router); + src_routers.push_back(src_eth_chan); + dest_routers.push_back(dest_eth_chan); } } @@ -695,16 +698,16 @@ typedef struct test_device { } } - std::vector> select_worker_cores( - const std::vector& router_cores, + std::vector> select_worker_cores( + const std::vector& router_cores, uint32_t num_links, uint32_t count, uint32_t skip_first_n_workers = 0) { - std::vector> result; + std::vector> result; uint32_t link_idx = 0; if (benchmark_mode) { // temp map to keep a track of indices to start lookup from - std::unordered_map router_worker_idx; + std::unordered_map router_worker_idx; for (auto i = 0; i < count; i++) { if (link_idx == num_links) { link_idx = 0; @@ -772,6 +775,7 @@ typedef struct test_device { uint32_t noc_dist, noc_index, noc0_dist, noc1_dist; for (auto i = 0; i < router_logical_cores.size(); i++) { router_phys_core = router_phys_cores[i]; + chan_id_t eth_chan = soc_desc.logical_eth_core_to_chan_map.at(router_logical_cores[i]); std::vector>> temp_map; for (auto j = 0; j < worker_logical_cores.size(); j++) { worker_phys_core = worker_phys_cores[j]; @@ -790,7 +794,7 @@ typedef struct test_device { std::sort(temp_map.begin(), temp_map.end()); for (auto& [noc_dist, pair] : temp_map) { - router_worker_map[router_virtual_cores[i]].push_back(pair); + router_worker_map[eth_chan].push_back(pair); } } } @@ -807,8 +811,8 @@ typedef struct test_traffic { uint32_t num_tx_workers; uint32_t num_rx_workers; uint32_t target_address; - std::vector> tx_workers; - std::vector> rx_workers; + std::vector> tx_workers; + std::vector> rx_workers; std::vector tx_virtual_cores; std::vector rx_virtual_cores; CoreCoord controller_logical_core; @@ -848,8 +852,8 @@ typedef struct test_traffic { throw std::runtime_error("Number of dest endpoints should be less than or equal to src endpoints"); } - std::vector src_routers; - std::vector dest_routers; + std::vector src_routers; + std::vector dest_routers; // For Unicast there is only one rx device // For mcast, this only supports line mcast, we pass the last device as the rx device tx_device->get_available_router_cores(num_hops, *rx_devices.rbegin(), src_routers, dest_routers); @@ -889,7 +893,7 @@ typedef struct test_traffic { CoreCoord tx_core, rx_core; tt_metal::NOC noc_id; std::vector zero_buf(2, 0); - CoreCoord router_virtual_core; + chan_id_t eth_chan; uint32_t mesh_chip_id = rx_devices[0]->mesh_chip_id; // update the test results address, which will be used later for polling, collecting results 
@@ -933,23 +937,24 @@ typedef struct test_traffic { // launch tx kernels for (auto i = 0; i < num_tx_workers; i++) { - router_virtual_core = std::get<0>(tx_workers[i]); + eth_chan = std::get<0>(tx_workers[i]); noc_id = (std::get<1>(tx_workers[i]) == 0) ? tt_metal::NOC::NOC_0 : tt_metal::NOC::NOC_1; tx_core = std::get<2>(tx_workers[i]); rx_core = std::get<2>(rx_workers[tx_to_rx_map[i]]); + auto routing_plane = tx_device->board_handle->get_routing_plane_from_chan(eth_chan); + // setup runtime args std::vector runtime_args = { time_seed, // 0: time based seed tx_device->get_endpoint_id(tx_core), // 1: src_endpoint_id - rx_devices[0]->get_noc_offset(rx_core), // 2: dest_noc_offset + rx_devices[0]->get_noc_offset(rx_core), // 2: dest_noc_offset tx_device->get_noc_offset(controller_logical_core), // 3: controller noc offset - router_virtual_core.x, // 4: router_x - router_virtual_core.y, // 5: router_y - mesh_chip_id, // 6: mesh and chip id - rx_buf_size, // 7: space in rx's L1 - gk_interface_addr, // 8: gk_message_addr_l - tx_device->gk_noc_offset, // 9: gk_message_addr_h + routing_plane, // 4: routing plane to use + mesh_chip_id, // 5: mesh and chip id + rx_buf_size, // 6: space in rx's L1 + gk_interface_addr, // 7: gk_message_addr_l + tx_device->gk_noc_offset, // 8: gk_message_addr_h }; if (ASYNC_WR & fabric_command) { @@ -962,8 +967,9 @@ typedef struct test_traffic { log_info( LogTest, - "Device: {}, TX kernel running on: logical: x={},y={}; virtual: x={},y={}", + "[Device: Phys: {}, Logical: {}] TX kernel running on: logical: x={},y={}; virtual: x={},y={}", tx_device->physical_chip_id, + (uint32_t)tx_device->logical_chip_id, tx_core.x, tx_core.y, tx_virtual_cores[i].x, @@ -1017,8 +1023,9 @@ typedef struct test_traffic { log_info( LogTest, - "Device: {}, RX kernel running on: logical: x={},y={}; virtual: x={},y={}", + "[Device: Phys: {}, Logical: {}] RX kernel running on: logical: x={},y={}; virtual: x={},y={}", rx_device->physical_chip_id, + (uint32_t)rx_device->logical_chip_id, rx_core.x, rx_core.y, rx_virtual_cores[i].x, @@ -1074,8 +1081,9 @@ typedef struct test_traffic { tx_device->physical_chip_id, tx_virtual_cores[i], test_results_address, 128)); log_info( LogTest, - "Device {} TX{} status = {}", + "[Device: Phys: {}, Logical: {}] TX{} status = {}", tx_device->physical_chip_id, + (uint32_t)tx_device->logical_chip_id, i, packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); @@ -1089,8 +1097,9 @@ typedef struct test_traffic { rx_devices[d]->physical_chip_id, rx_virtual_cores[i], test_results_address, 128)); log_info( LogTest, - "Device {} RX{} status = {}", + "[Device: Phys: {}, Logical: {}] RX{} status = {}", rx_devices[d]->physical_chip_id, + (uint32_t)rx_devices[d]->logical_chip_id, i, packet_queue_test_status_to_string(rx_results[d][i][PQ_TEST_STATUS_INDEX])); pass &= (rx_results[d][i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); @@ -1149,8 +1158,9 @@ typedef struct test_traffic { log_info( LogTest, - "Device: {}, TX {} words sent: {}, elapsed cycles: {} -> BW: {:.2f} B/cycle", + "[Device: Phys: {}, Logical: {}] TX {} words sent: {}, elapsed cycles: {} -> BW: {:.2f} B/cycle", tx_device->physical_chip_id, + tx_device->logical_chip_id, i, tx_words_sent, tx_elapsed_cycles, @@ -1176,8 +1186,9 @@ typedef struct test_traffic { uint32_t num_tx = rx_to_tx_map[i].size(); log_info( LogTest, - "Device: {}, RX {}, num producers = {}, words received = {}", + "[Device: Phys: {}, Logical: {}] RX {}, 
num producers = {}, words received = {}", rx_devices[d]->physical_chip_id, + (uint32_t)rx_devices[d]->logical_chip_id, i, num_tx, words_received); @@ -1495,8 +1506,6 @@ int main(int argc, char **argv) { } global_rng.seed(prng_seed); - log_info(LogTest, "PRNG seed = {}", prng_seed); - time_seed = std::chrono::system_clock::now().time_since_epoch().count(); try { @@ -1605,10 +1614,13 @@ int main(int argc, char **argv) { throw std::runtime_error("Test cannot run on specified device."); } */ + uint32_t worker_unreserved_base_addr = + hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + if (run_gk_on_idle_ethernet) { routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); } else { - routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + routing_table_addr = worker_unreserved_base_addr; } gk_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); @@ -1641,8 +1653,9 @@ int main(int argc, char **argv) { defines["CHECK_TIMEOUT"] = ""; } - uint32_t client_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; - uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(fabric_client_interface_t); + uint32_t client_interface_addr = worker_unreserved_base_addr; + uint32_t client_pull_req_buf_addr = + client_interface_addr + sizeof(fabric_client_interface_t) + sizeof(fabric_router_l1_config_t) * 4; std::vector tx_compile_args = { 0, //(device->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp index e166f43706d..14425045b9f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp @@ -296,6 +296,7 @@ int main(int argc, char** argv) { bool router_core_found = false; CoreCoord router_logical_core; CoreCoord router_phys_core; + routing_plane_id_t routing_plane; CoreCoord gk_phys_core; uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); uint32_t gk_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; @@ -318,6 +319,11 @@ int main(int argc, char** argv) { // sender device. 
router_logical_core = device.second->get_ethernet_sockets(neighbor)[0]; router_phys_core = device.second->ethernet_core_from_logical_core(router_logical_core); + auto eth_chan = tt::Cluster::instance() + .get_soc_desc(test_device_id_l) + .logical_eth_core_to_chan_map.at(router_logical_core); + routing_plane = control_plane->get_routing_plane_id(eth_chan); + router_core_found = true; } auto connected_logical_cores = device.second->get_ethernet_sockets(neighbor); @@ -442,8 +448,7 @@ int main(int argc, char** argv) { std::vector runtime_args = { (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id 0x410, // 1: dest_noc_offset - router_phys_core.x, - router_phys_core.y, + routing_plane, (dev_r_mesh_id << 16 | dev_r_chip_id)}; if (ASYNC_WR == fabric_command) { diff --git a/tt_fabric/control_plane.hpp b/tt_fabric/control_plane.hpp index 7c829b7ea3c..0ad16aca13a 100644 --- a/tt_fabric/control_plane.hpp +++ b/tt_fabric/control_plane.hpp @@ -46,6 +46,8 @@ class ControlPlane { std::vector get_intra_chip_neighbors( mesh_id_t src_mesh_id, chip_id_t src_chip_id, RoutingDirection routing_direction) const; + routing_plane_id_t get_routing_plane_id(chan_id_t eth_chan_id) const; + private: std::unique_ptr routing_table_generator_; std::vector> logical_mesh_chip_id_to_physical_chip_id_mapping_; @@ -71,8 +73,6 @@ class ControlPlane { std::tuple get_connected_mesh_chip_chan_ids( mesh_id_t mesh_id, chip_id_t chip_id, chan_id_t chan_id) const; - - routing_plane_id_t get_routing_plane_id(chan_id_t eth_chan_id) const; }; } // namespace tt::tt_fabric diff --git a/tt_fabric/hw/inc/tt_fabric_api.h b/tt_fabric/hw/inc/tt_fabric_api.h index 63fa69e4688..5b66fa860d1 100644 --- a/tt_fabric/hw/inc/tt_fabric_api.h +++ b/tt_fabric/hw/inc/tt_fabric_api.h @@ -22,6 +22,8 @@ extern volatile fabric_client_interface_t* client_interface; #define ASYNC_WR_ADD_HEADER 4 inline uint32_t get_next_hop_router_noc_xy(uint32_t routing_plane, uint32_t dst_mesh_id, uint32_t dst_dev_id) { + ASSERT(routing_plane < client_interface->num_routing_planes); + fabric_router_l1_config_t* routing_table = (fabric_router_l1_config_t*)client_interface->routing_tables_l1_offset; if (dst_mesh_id != routing_table[routing_plane].my_mesh_id) { uint32_t next_port = routing_table[routing_plane].inter_mesh_table.dest_entry[dst_mesh_id]; return eth_chan_to_noc_xy[noc_index][next_port]; @@ -243,7 +245,19 @@ inline void fabric_socket_connect(socket_handle_t* socket_handle) { while (((volatile socket_handle_t*)socket_handle)->socket_state != SocketState::ACTIVE); } -inline void fabric_endpoint_init() { +inline void fabric_endpoint_init(uint32_t base_address, uint32_t gk_interface_addr_l, uint32_t gk_interface_addr_h) { + tt_fabric_init(); + + client_interface = (volatile fabric_client_interface_t*)base_address; + uint32_t routing_tables_offset = base_address + sizeof(fabric_client_interface_t); + + zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); + client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l; + client_interface->gk_msg_buf_addr = + (((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf); + client_interface->routing_tables_l1_offset = routing_tables_offset; + + // make sure fabric node gatekeeper is available. 
uint64_t noc_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, ep_sync); client_interface->return_status[0] = 0; while (1) { @@ -253,4 +267,21 @@ inline void fabric_endpoint_init() { break; } } + + // read the gk info first at routing table addr and later override with routing tables + noc_async_read_one_packet( + client_interface->gk_interface_addr, client_interface->routing_tables_l1_offset, sizeof(gatekeeper_info_t)); + noc_async_read_barrier(); + + client_interface->num_routing_planes = ((gatekeeper_info_t*)routing_tables_offset)->routing_planes; + + // read routing tables + uint64_t gk_rt_noc_addr = client_interface->gk_interface_addr - sizeof(fabric_router_l1_config_t) * 4; + uint32_t table_offset; + for (uint32_t i = 0; i < client_interface->num_routing_planes; i++) { + table_offset = sizeof(fabric_router_l1_config_t) * i; + noc_async_read_one_packet( + gk_rt_noc_addr + table_offset, routing_tables_offset + table_offset, sizeof(fabric_router_l1_config_t)); + } + noc_async_read_barrier(); } diff --git a/tt_fabric/hw/inc/tt_fabric_interface.h b/tt_fabric/hw/inc/tt_fabric_interface.h index 1c4f69afe09..9f8c1daa949 100644 --- a/tt_fabric/hw/inc/tt_fabric_interface.h +++ b/tt_fabric/hw/inc/tt_fabric_interface.h @@ -331,7 +331,8 @@ typedef struct _fabric_client_interface { uint64_t gk_interface_addr; uint64_t gk_msg_buf_addr; uint64_t pull_req_buf_addr; - uint32_t padding[2]; + uint32_t num_routing_planes; + uint32_t routing_tables_l1_offset; uint32_t return_status[3]; uint32_t socket_count; chan_ptr wrptr; diff --git a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp b/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp index b90892d5e5b..31c75c4329b 100644 --- a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp +++ b/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp @@ -59,6 +59,39 @@ inline void notify_all_routers(uint32_t notification) { } } +inline void get_routing_tables() { + uint32_t temp_mask = router_mask; + uint32_t channel = 0; + uint32_t routing_plane = 0; + for (uint32_t i = 0; i < 4; i++) { + if (temp_mask & 0xF) { + temp_mask &= 0xF; + break; + } else { + temp_mask >>= 4; + } + channel += 4; + } + + if (temp_mask) { + for (uint32_t i = 0; i < 4; i++) { + if (temp_mask & 0x1) { + uint64_t router_config_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][channel] << 32) | + eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE; + noc_async_read_one_packet( + router_config_addr, + (uint32_t)&routing_table[routing_plane], + sizeof(tt::tt_fabric::fabric_router_l1_config_t)); + routing_plane++; + } + temp_mask >>= 1; + channel++; + } + } + gk_info->routing_planes = routing_plane; + noc_async_read_barrier(); +} + inline void sync_all_routers() { // wait for all device routers to have incremented the sync semaphore. // sync_val is equal to number of tt-fabric routers running on a device. @@ -68,6 +101,7 @@ inline void sync_all_routers() { // semaphore notifies all other routers that this router has completed // startup handshake with its ethernet peer. 
notify_all_routers(sync_val); + get_routing_tables(); gk_info->ep_sync.val = sync_val; } @@ -394,39 +428,6 @@ inline void process_pending_socket() { } } -inline void get_routing_tables() { - uint32_t temp_mask = router_mask; - uint32_t channel = 0; - uint32_t routing_plane = 0; - for (uint32_t i = 0; i < 4; i++) { - if (temp_mask & 0xF) { - temp_mask &= 0xF; - break; - } else { - temp_mask >>= 4; - } - channel += 4; - } - - if (temp_mask) { - for (uint32_t i = 0; i < 4; i++) { - if (temp_mask & 0x1) { - uint64_t router_config_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][channel] << 32) | - eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE; - noc_async_read_one_packet( - router_config_addr, - (uint32_t)&routing_table[routing_plane], - sizeof(tt::tt_fabric::fabric_router_l1_config_t)); - routing_plane++; - } - temp_mask >>= 1; - channel++; - } - } - gk_info->routing_planes = routing_plane; - noc_async_read_barrier(); -} - void kernel_main() { sync_val = get_arg_val(0); router_mask = get_arg_val(1); @@ -445,7 +446,6 @@ void kernel_main() { zero_l1_buf((tt_l1_ptr uint32_t*)socket_info, sizeof(socket_info_t)); sync_all_routers(); - get_routing_tables(); uint64_t start_timestamp = get_timestamp(); uint32_t loop_count = 0; From 359ff7995db206a52a1aca9876e00a99382dc7af Mon Sep 17 00:00:00 2001 From: Virdhatchani Narayanamoorthy <138196495+VirdhatchaniKN@users.noreply.github.com> Date: Mon, 10 Feb 2025 12:14:17 +0530 Subject: [PATCH 038/316] #17758: Update Batch Norm Training mode kernels (#17733) ### Ticket https://github.com/tenstorrent/tt-metal/issues/17758 ### Problem description [Comment Link](https://github.com/tenstorrent/tt-metal/pull/17587#discussion_r1945931451) ### What's changed Updated BN to use compile-time arguments for buffer indexing, replacing hardcoded values for better flexibility. 
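In short, the kernels now resolve their circular-buffer indices from compile-time arguments wired up by the program factory rather than fixed `tt::CBIndex` values. An illustrative fragment is shown below; the argument positions match the ones passed in this patch.

```cpp
// Inside the compute kernel: CB indices come from compile-time args.
constexpr auto cb_input      = get_compile_time_arg_val(2);  // previously tt::CBIndex::c_0
constexpr auto cb_batch_mean = get_compile_time_arg_val(3);  // previously tt::CBIndex::c_1
```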
### Checklist - [x] [All post-commit tests](https://github.com/tenstorrent/tt-metal/actions/runs/13227397570) - [x] [Blackhole post-commit tests](https://github.com/tenstorrent/tt-metal/actions/runs/13227398013) - [ ] [(Single-card) Tests for new models]() - [x] [(Single-card) Demo tests](https://github.com/tenstorrent/tt-metal/actions/runs/13227399196) - [x] [(Single-card) Device perf regressions](https://github.com/tenstorrent/tt-metal/actions/runs/13227399904) - [x] [(Single-card) Model perf tests](https://github.com/tenstorrent/tt-metal/actions/runs/13227400809) --- .../device/batch_norm_program_factory.cpp | 70 ++++++++++--------- .../kernels/compute/batch_norm_kernel.cpp | 20 +++--- .../compute/batch_norm_sfpu_kernel.cpp | 20 +++--- .../kernels/dataflow/reader_batch_norm.cpp | 4 +- .../kernels/dataflow/writer_batch_norm.cpp | 10 +-- 5 files changed, 65 insertions(+), 59 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp index a0f062da2f8..4c347a6cfed 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp @@ -171,18 +171,18 @@ BatchNormOperation::BatchNormFactory::cached_program_t BatchNormOperation::Batch uint32_t b_num_tiles_per_cb = num_tiles_per_cb; // Input buffers - auto [a_cb, a_cb_handle] = create_cb( + auto [input_tensor_cb, input_tensor_cb_handle] = create_cb( tt::CBIndex::c_0, program, all_device_cores, a_single_tile_size, num_tiles_per_cb, a_data_format); // input - auto [b_cb, b_cb_handle] = create_cb( + auto [batch_mean_tensor_cb, batch_mean_tensor_cb_handle] = create_cb( tt::CBIndex::c_1, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); // batch_mean - auto [c_cb, c_cb_handle] = create_cb( + auto [output_tensor_cb, output_tensor_cb_handle] = create_cb( tt::CBIndex::c_2, program, all_device_cores, c_single_tile_size, num_tiles_per_cb, c_data_format); // output - auto [d_cb, d_cb_handle] = create_cb( + auto [batch_var_tensor_cb, batch_var_tensor_cb_handle] = create_cb( tt::CBIndex::c_3, program, all_device_cores, @@ -191,28 +191,28 @@ BatchNormOperation::BatchNormFactory::cached_program_t BatchNormOperation::Batch d_data_format); // batch_var auto [eps_cb, eps_cb_handle] = create_cb( tt::CBIndex::c_4, program, all_device_cores, d_single_tile_size, b_num_tiles_per_cb, d_data_format); // eps - auto [e_cb, e_cb_handle] = create_cb( - tt::CBIndex::c_16, program, all_device_cores, e_single_tile_size, b_num_tiles_per_cb, e_data_format); // weight - auto [f_cb, f_cb_handle] = create_cb( - tt::CBIndex::c_18, program, all_device_cores, f_single_tile_size, b_num_tiles_per_cb, f_data_format); // bias + auto [weight_tensor_cb, weight_tensor_cb_handle] = create_cb( + tt::CBIndex::c_5, program, all_device_cores, e_single_tile_size, b_num_tiles_per_cb, e_data_format); // weight + auto [bias_tensor_cb, bias_tensor_cb_handle] = create_cb( + tt::CBIndex::c_6, program, all_device_cores, f_single_tile_size, b_num_tiles_per_cb, f_data_format); // bias // Temporary buffers to store intermediate results auto [den_cb, den_cb_handle] = create_cb( - tt::CBIndex::c_5, + tt::CBIndex::c_7, program, all_device_cores, a_single_tile_size, num_tiles_per_cb, a_data_format); // to store 1/(sqrt(batch_var + eps)) auto [num_cb, num_cb_handle] = create_cb( - tt::CBIndex::c_6, + 
tt::CBIndex::c_8, program, all_device_cores, a_single_tile_size, num_tiles_per_cb, a_data_format); // to store input - batch_mean auto [temp_1_cb, temp_1_cb_handle] = - create_cb(tt::CBIndex::c_17, program, all_device_cores, a_single_tile_size, num_tiles_per_cb, a_data_format); + create_cb(tt::CBIndex::c_9, program, all_device_cores, a_single_tile_size, num_tiles_per_cb, a_data_format); auto a_is_dram = static_cast(input_tensor.buffer()->buffer_type() == tt_metal::BufferType::DRAM); auto b_is_dram = static_cast(batch_mean_tensor.buffer()->buffer_type() == tt_metal::BufferType::DRAM); @@ -236,7 +236,7 @@ BatchNormOperation::BatchNormFactory::cached_program_t BatchNormOperation::Batch program, "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp", all_device_cores, - tt_metal::ReaderDataMovementConfig({a_is_dram}, std::move(reader_defines))); + tt_metal::ReaderDataMovementConfig({a_is_dram, input_tensor_cb, eps_cb}, std::move(reader_defines))); // WRITER KERNEL auto writer_defines = dataflow_defines; @@ -253,6 +253,11 @@ BatchNormOperation::BatchNormFactory::cached_program_t BatchNormOperation::Batch f_is_dram, static_cast(weight_has_value), static_cast(bias_has_value), + batch_mean_tensor_cb, + output_tensor_cb, + batch_var_tensor_cb, + weight_tensor_cb, + bias_tensor_cb, }, std::move(writer_defines))); @@ -260,34 +265,35 @@ BatchNormOperation::BatchNormFactory::cached_program_t BatchNormOperation::Batch bool fp32_dest_acc_en = c_data_format == tt::DataFormat::UInt32 || c_data_format == tt::DataFormat::Int32 || c_data_format == tt::DataFormat::Float32; - uint32_t src_input_cb_index = tt::CBIndex::c_0; - uint32_t src_batch_mean_cb_index = tt::CBIndex::c_1; - uint32_t src_batch_var_cb_index = tt::CBIndex::c_3; - uint32_t src_eps_cb_index = tt::CBIndex::c_4; - uint32_t src_temp_den_cb_index = tt::CBIndex::c_5; - uint32_t src_temp_num_cb_index = tt::CBIndex::c_6; - uint32_t src_weight_cb_index = tt::CBIndex::c_16; - uint32_t src_temp_1_cb_index = tt::CBIndex::c_17; - uint32_t src_bias_cb_index = tt::CBIndex::c_18; - std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (fp32_dest_acc_en) { for (const auto cb_index : - {src_input_cb_index, - src_batch_mean_cb_index, - src_batch_var_cb_index, - src_temp_num_cb_index, - src_temp_den_cb_index, - src_eps_cb_index, - src_weight_cb_index, - src_temp_1_cb_index, - src_bias_cb_index}) { + {input_tensor_cb, + batch_mean_tensor_cb, + batch_var_tensor_cb, + eps_cb, + den_cb, + num_cb, + weight_tensor_cb, + temp_1_cb, + bias_tensor_cb}) { unpack_to_dest_mode[cb_index] = UnpackToDestMode::UnpackToDestFp32; } } std::vector compute_kernel_args = { - static_cast(weight_has_value), static_cast(bias_has_value)}; + static_cast(weight_has_value), + static_cast(bias_has_value), + input_tensor_cb, + batch_mean_tensor_cb, + output_tensor_cb, + batch_var_tensor_cb, + eps_cb, + den_cb, + num_cb, + weight_tensor_cb, + temp_1_cb, + bias_tensor_cb}; auto compute_kernel_id = tt_metal::CreateKernel( program, fmt::format( diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_kernel.cpp index a58dedc3697..0de891f21cb 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_kernel.cpp @@ -144,17 +144,17 @@ void MAIN { return; } 
- constexpr auto cb_input = tt::CBIndex::c_0; // input - constexpr auto cb_batch_mean = tt::CBIndex::c_1; // batch_mean + constexpr auto cb_input = get_compile_time_arg_val(2); // input + constexpr auto cb_batch_mean = get_compile_time_arg_val(3); // batch_mean constexpr auto cb_output_0 = - tt::CBIndex::c_2; // output -- > [(input - batch_mean)/(sqrt(batch_var + eps))] * weight - constexpr auto cb_batch_var = tt::CBIndex::c_3; // batch_var - constexpr auto cb_eps = tt::CBIndex::c_4; // eps - constexpr auto cb_den = tt::CBIndex::c_5; // 1/(sqrt(batch_var + eps)) - constexpr auto cb_num = tt::CBIndex::c_6; // input - batch_mean - constexpr auto cb_weight = tt::CBIndex::c_16; // weight tensor - constexpr auto cb_tmp_1 = tt::CBIndex::c_17; // (input - batch_mean)/(sqrt(batch_var + eps)) - constexpr auto cb_bias = tt::CBIndex::c_18; // bias tensor + get_compile_time_arg_val(4); // output -- > [(input - batch_mean)/(sqrt(batch_var + eps))] * weight + constexpr auto cb_batch_var = get_compile_time_arg_val(5); // batch_var + constexpr auto cb_eps = get_compile_time_arg_val(6); // eps + constexpr auto cb_den = get_compile_time_arg_val(7); // 1/(sqrt(batch_var + eps)) + constexpr auto cb_num = get_compile_time_arg_val(8); // input - batch_mean + constexpr auto cb_weight = get_compile_time_arg_val(9); // weight tensor + constexpr auto cb_tmp_1 = get_compile_time_arg_val(10); // (input - batch_mean)/(sqrt(batch_var + eps)) + constexpr auto cb_bias = get_compile_time_arg_val(11); // bias tensor auto cb_bcast = cb_batch_mean; auto cb_other = cb_input; diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp index 52942da1f55..11ce1c3c086 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp @@ -183,17 +183,17 @@ void MAIN { return; } - constexpr auto cb_input = tt::CBIndex::c_0; // input - constexpr auto cb_batch_mean = tt::CBIndex::c_1; // batch_mean + constexpr auto cb_input = get_compile_time_arg_val(2); // input + constexpr auto cb_batch_mean = get_compile_time_arg_val(3); // batch_mean constexpr auto cb_output_0 = - tt::CBIndex::c_2; // output -- > [(input - batch_mean)/(sqrt(batch_var + eps))] * weight - constexpr auto cb_batch_var = tt::CBIndex::c_3; // batch_var - constexpr auto cb_eps = tt::CBIndex::c_4; // eps - constexpr auto cb_den = tt::CBIndex::c_5; // 1/(sqrt(batch_var + eps)) - constexpr auto cb_num = tt::CBIndex::c_6; // input - batch_mean - constexpr auto cb_weight = tt::CBIndex::c_16; // weight tensor - constexpr auto cb_tmp_1 = tt::CBIndex::c_17; // (input - batch_mean)/(sqrt(batch_var + eps)) - constexpr auto cb_bias = tt::CBIndex::c_18; // bias tensor + get_compile_time_arg_val(4); // output -- > [(input - batch_mean)/(sqrt(batch_var + eps))] * weight + constexpr auto cb_batch_var = get_compile_time_arg_val(5); // batch_var + constexpr auto cb_eps = get_compile_time_arg_val(6); // eps + constexpr auto cb_den = get_compile_time_arg_val(7); // 1/(sqrt(batch_var + eps)) + constexpr auto cb_num = get_compile_time_arg_val(8); // input - batch_mean + constexpr auto cb_weight = get_compile_time_arg_val(9); // weight tensor + constexpr auto cb_tmp_1 = get_compile_time_arg_val(10); // (input - batch_mean)/(sqrt(batch_var + eps)) + constexpr auto cb_bias = 
get_compile_time_arg_val(11); // bias tensor auto cb_bcast = cb_batch_mean; auto cb_other = cb_input; diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp index ebf287dce1f..e0c453eb786 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_batch_norm.cpp @@ -21,7 +21,7 @@ void kernel_main() { constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; - constexpr auto cb_id_src = tt::CBIndex::c_0; + constexpr auto cb_id_src = get_compile_time_arg_val(1); constexpr uint32_t onetile = 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); @@ -35,7 +35,7 @@ void kernel_main() { uint32_t start_c = start_remaining / HtWt; uint32_t start_t = start_remaining % HtWt; - constexpr auto cb_id_eps = tt::CBIndex::c_4; + constexpr auto cb_id_eps = get_compile_time_arg_val(2); union { float f; diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_batch_norm.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_batch_norm.cpp index 0c80abbc870..f95965ca242 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_batch_norm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_batch_norm.cpp @@ -24,7 +24,7 @@ void kernel_main() { constexpr uint32_t onetile = 1; // batch_mean - constexpr auto cb_id_src = tt::CBIndex::c_1; + constexpr auto cb_id_src = get_compile_time_arg_val(7); constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); const DataFormat src_data_format = get_dataformat(cb_id_src); @@ -33,7 +33,7 @@ void kernel_main() { .bank_base_address = src_addr, .page_size = src_tile_bytes, .data_format = src_data_format}; // output - constexpr auto cb_id_dst = tt::CBIndex::c_2; + constexpr auto cb_id_dst = get_compile_time_arg_val(8); constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; const uint32_t dst_tile_bytes = get_tile_size(cb_id_dst); const DataFormat dst_data_format = get_dataformat(cb_id_dst); @@ -42,7 +42,7 @@ void kernel_main() { .bank_base_address = dst_addr, .page_size = dst_tile_bytes, .data_format = dst_data_format}; // batch_var - constexpr auto cb_id_batch_var = tt::CBIndex::c_3; + constexpr auto cb_id_batch_var = get_compile_time_arg_val(9); constexpr bool batch_var_is_dram = get_compile_time_arg_val(2) == 1; const uint32_t batch_var_tile_bytes = get_tile_size(cb_id_batch_var); const DataFormat batch_var_data_format = get_dataformat(cb_id_batch_var); @@ -51,7 +51,7 @@ void kernel_main() { .bank_base_address = batch_var_addr, .page_size = batch_var_tile_bytes, .data_format = batch_var_data_format}; // weight - constexpr auto cb_id_weight = tt::CBIndex::c_16; + constexpr auto cb_id_weight = get_compile_time_arg_val(10); constexpr bool weight_is_dram = get_compile_time_arg_val(3) == 1; const uint32_t weight_tile_bytes = get_tile_size(cb_id_weight); const DataFormat weight_data_format = get_dataformat(cb_id_weight); @@ -60,7 +60,7 @@ void kernel_main() { .bank_base_address = weight_addr, .page_size = weight_tile_bytes, .data_format = weight_data_format}; // bias - constexpr auto cb_id_bias = tt::CBIndex::c_18; + constexpr auto cb_id_bias = get_compile_time_arg_val(11); 
     constexpr bool bias_is_dram = get_compile_time_arg_val(4) == 1;
     const uint32_t bias_tile_bytes = get_tile_size(cb_id_bias);
     const DataFormat bias_data_format = get_dataformat(cb_id_bias);

From 39ab8cbf119b87549f29b61b2d2e6fdf38215172 Mon Sep 17 00:00:00 2001
From: Jason Davies
Date: Mon, 10 Feb 2025 14:32:42 +0000
Subject: [PATCH 039/316] Fix incorrect tracer error when fast runtime mode is enabled. (#17776)

Fixes #17773.

### Ticket
#17773

### Problem description
The error message is wrong when fast runtime mode is enabled. It should say the opposite of what it says currently.

### What's changed
Flipped the error message raised by `enable_tracing` so it correctly states that tracing is not supported when fast runtime mode is enabled.

### Checklist
- [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes
- [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable)
- [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable)
- [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable)
- [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable)
- [ ] New/Existing tests provide coverage for changes
---
 ttnn/ttnn/tracer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ttnn/ttnn/tracer.py b/ttnn/ttnn/tracer.py
index 0c452da6f54..630d5e83af3 100644
--- a/ttnn/ttnn/tracer.py
+++ b/ttnn/ttnn/tracer.py
@@ -456,7 +456,7 @@ def enable_tracing():
     global ENABLE_TRACER
     global GRAPH_STACK
     if ttnn.CONFIG.enable_fast_runtime_mode:
-        raise ValueError("Tracing is only supported in fast runtime mode.")
+        raise ValueError("Tracing is not supported in fast runtime mode.")
     if ENABLE_TRACER:
         raise ValueError("Tracing is already enabled.")
     ENABLE_TRACER = True

From 63d65ca632d10b8e75dfba25e3d119be47b3b881 Mon Sep 17 00:00:00 2001
From: Sofija Jovic <148721049+s-jovic@users.noreply.github.com>
Date: Mon, 10 Feb 2025 15:54:42 +0100
Subject: [PATCH 040/316] #17134: Add remaining SD unit tests (#17736)

---
 .../tests/test_cross_attn_midblock_2d.py   | 118 ++++++++++++++++++
 .../tests/test_downsample_2d.py            |  96 ++++++++++++++
 .../test_cross_attn_midblock_2d.py         |   1 +
 .../stable_diffusion/test_downsample_2d.py |   1 +
 4 files changed, 216 insertions(+)
 create mode 100644 models/demos/wormhole/stable_diffusion/tests/test_cross_attn_midblock_2d.py
 create mode 100644 models/demos/wormhole/stable_diffusion/tests/test_downsample_2d.py
 create mode 120000 tests/nightly/single_card/stable_diffusion/test_cross_attn_midblock_2d.py
 create mode 120000 tests/nightly/single_card/stable_diffusion/test_downsample_2d.py

diff --git a/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_midblock_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_midblock_2d.py
new file mode 100644
index 00000000000..617fea615cd
--- /dev/null
+++ b/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_midblock_2d.py
@@ -0,0 +1,118 @@
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+ +# SPDX-License-Identifier: Apache-2.0 + + +from diffusers import StableDiffusionPipeline +import pytest +import torch +import ttnn + +from models.demos.wormhole.stable_diffusion.custom_preprocessing import custom_preprocessor +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_unet_mid_block_2d_cross_attn_new_conv import ( + unet_mid_block_2d_cross_attn, +) +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( + get_default_compute_config, + preprocess_and_push_input_to_device, + post_process_output_and_move_to_host, +) +from models.utility_functions import skip_for_grayskull, torch_random +from ttnn.model_preprocessing import preprocess_model_parameters +from tests.ttnn.utils_for_testing import assert_with_pcc + + +@skip_for_grayskull() +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True) +@pytest.mark.parametrize( + "hidden_states, shard_end_core, shard_shape", + [ + ([2, 1280, 8, 8], (7, 3), (32, 160)), + ], +) +@pytest.mark.parametrize("temb", [[1, 1, 2, 1280]]) +def test_cross_attention_midblock_512x512(reset_seeds, device, hidden_states, shard_end_core, shard_shape, temb): + # Initialize PyTorch component + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float32) + unet = pipe.unet + unet.eval() + torch_midblock = unet.mid_block + + # Initialize ttnn component + reader_patterns_cache = {} + parameters = preprocess_model_parameters( + initialize_model=lambda: unet, custom_preprocessor=custom_preprocessor, device=device + ) + parameters = parameters.mid_block + N, _, H, W = hidden_states + compute_kernel_config = get_default_compute_config(device) + + ttnn_midblock = unet_mid_block_2d_cross_attn( + device, parameters, reader_patterns_cache, N, H, W, compute_kernel_config + ) + + # Prepare inputs + in_channels = hidden_states[1] + out_channels = in_channels + temb_channels = 1280 + input_shape = hidden_states + hidden_states = torch_random(input_shape, -0.1, 0.1, dtype=torch.float32) + temb = torch_random(temb, -0.1, 0.1, dtype=torch.float32) + + encoder_hidden_states_shape = [1, 2, 77, 768] + encoder_hidden_states = torch.randn(encoder_hidden_states_shape) + + # Run PyTorch component + torch_output = torch_midblock(hidden_states, temb.squeeze(0).squeeze(0), encoder_hidden_states.squeeze(0)) + + # Prepare inputs for ttnn component + hidden_states = preprocess_and_push_input_to_device( + device, + hidden_states, + memory_config=ttnn.MemoryConfig( + ttnn.TensorMemoryLayout.BLOCK_SHARDED, + ttnn.BufferType.L1, + ttnn.ShardSpec( + ttnn.CoreRangeSet( + { + ttnn.CoreRange( + ttnn.CoreCoord(0, 0), + ttnn.CoreCoord(shard_end_core[0], shard_end_core[1]), + ), + } + ), + shard_shape, + ttnn.ShardOrientation.ROW_MAJOR, + ), + ), + ) + + temb = temb.permute(2, 0, 1, 3) + temb = ttnn.from_torch(temb, ttnn.bfloat16) + temb = ttnn.to_layout(temb, ttnn.TILE_LAYOUT, ttnn.bfloat8_b) + temb = ttnn.to_device(temb, device, memory_config=ttnn.L1_MEMORY_CONFIG) + + encoder_hidden_states = torch.nn.functional.pad(encoder_hidden_states, (0, 0, 0, 19)) + encoder_hidden_states = ttnn.from_torch( + encoder_hidden_states, dtype=ttnn.bfloat8_b, layout=ttnn.TILE_LAYOUT, device=device + ) + encoder_hidden_states = ttnn.to_device(encoder_hidden_states, device, memory_config=ttnn.L1_MEMORY_CONFIG) + + # Run ttnn component + output = ttnn_midblock( + hidden_states=hidden_states, + temb=temb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=None, + 
cross_attention_kwargs=None, + in_channels=in_channels, + temb_channels=temb_channels, + resnet_eps=1e-5, + resnet_act_fn="silu", + attn_num_head_channels=8, + config=unet.config, + ) + + # Compare outputs + output = post_process_output_and_move_to_host(output, N, H, W, out_channels) + assert_with_pcc(torch_output, output, 0.97) diff --git a/models/demos/wormhole/stable_diffusion/tests/test_downsample_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_downsample_2d.py new file mode 100644 index 00000000000..273358edf7e --- /dev/null +++ b/models/demos/wormhole/stable_diffusion/tests/test_downsample_2d.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +from diffusers import StableDiffusionPipeline +import os +import ttnn +import pytest + +from models.utility_functions import torch_random +from tests.ttnn.utils_for_testing import assert_with_pcc +from models.utility_functions import ( + skip_for_grayskull, +) + +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_downsample_2d_new_conv import downsample_2d +from models.demos.wormhole.stable_diffusion.custom_preprocessing import custom_preprocessor +from ttnn.model_preprocessing import preprocess_model_parameters +from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( + get_default_compute_config, + preprocess_and_push_input_to_device, + post_process_output_and_move_to_host, +) + + +@skip_for_grayskull() +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True) +@pytest.mark.parametrize( + "block_index, hidden_states, shard_end_core, shard_shape", + [ + (0, [2, 320, 64, 64], (4, 7), (1024, 64)), + (1, [2, 640, 32, 32], (4, 7), (256, 128)), + (2, [2, 1280, 16, 16], (7, 7), (64, 160)), + ], +) +def test_downblock_512x512(reset_seeds, device, block_index, hidden_states, shard_end_core, shard_shape): + # Initialize PyTorch component + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float32) + unet = pipe.unet + unet.eval() + torch_downsample = pipe.unet.down_blocks[block_index].downsamplers[0] + + # Initialize ttnn component + reader_patterns_cache = {} + parameters = preprocess_model_parameters( + initialize_model=lambda: unet, custom_preprocessor=custom_preprocessor, device=device + ) + parameters = parameters.down_blocks[block_index].downsamplers[0] + N, _, H, W = hidden_states + compute_kernel_config = get_default_compute_config(device) + + ttnn_downsample = downsample_2d(device, parameters, reader_patterns_cache, N, H, W, compute_kernel_config) + + # Prepare inputs + in_channels = hidden_states[1] + out_channels = in_channels + input_shape = hidden_states + hidden_states = torch_random(input_shape, -0.1, 0.1, dtype=torch.float32) + + # Run PyTorch component + torch_output = torch_downsample(hidden_states) + + # Prepare inputs for ttnn component + hidden_states = preprocess_and_push_input_to_device( + device, + hidden_states, + memory_config=ttnn.MemoryConfig( + ttnn.TensorMemoryLayout.BLOCK_SHARDED, + ttnn.BufferType.L1, + ttnn.ShardSpec( + ttnn.CoreRangeSet( + { + ttnn.CoreRange( + ttnn.CoreCoord(0, 0), + ttnn.CoreCoord(shard_end_core[0], shard_end_core[1]), + ), + } + ), + shard_shape, + ttnn.ShardOrientation.ROW_MAJOR, + ), + ), + ) + + # Run ttnn component + output = ttnn_downsample( + in_channels=out_channels, + out_channels=out_channels, + hidden_states=hidden_states, + use_conv=True, + ) + + # Compare outputs + output = 
post_process_output_and_move_to_host(output, N, H // 2, W // 2, out_channels) + assert_with_pcc(torch_output, output, 0.99) diff --git a/tests/nightly/single_card/stable_diffusion/test_cross_attn_midblock_2d.py b/tests/nightly/single_card/stable_diffusion/test_cross_attn_midblock_2d.py new file mode 120000 index 00000000000..9c6045ae160 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_cross_attn_midblock_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_cross_attn_midblock_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_downsample_2d.py b/tests/nightly/single_card/stable_diffusion/test_downsample_2d.py new file mode 120000 index 00000000000..30f3f798666 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_downsample_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_downsample_2d.py \ No newline at end of file From a7bf1016c46e3f60b183b8c769b754c54a499813 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Mon, 10 Feb 2025 07:48:16 -0800 Subject: [PATCH 041/316] [skip ci] Show All Post Commit Status Badge from main on README.md (#17783) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 749849664cf..ac4656e7e6e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![tt-metal CI](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml/badge.svg)](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) +

From 423715372e964ef2934b9d936856020b163c634f Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Mon, 10 Feb 2025 11:06:10 -0500 Subject: [PATCH 042/316] #14596: new sfpi release (#17602) ### Ticket https://github.com/tenstorrent/tt-metal/issues/14596 https://github.com/tenstorrent/tt-metal/issues/16603 ### Problem description * need per-cpu multilibs so that GS & WH standard libraries are built with cpu-specific silicon workarounds * Fix a bunch of compiler internal inconsistencies found by enabling checking ### What's changed New gcc toolchain Fix declaration mismatch discovered by fixed compiler ### Checklist - [yes ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [yes] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- tt_metal/hw/CMakeLists.txt | 4 ++-- tt_metal/include/compute_kernel_api/common.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tt_metal/hw/CMakeLists.txt b/tt_metal/hw/CMakeLists.txt index 9ba5bdbea1d..bd487cb2ab7 100644 --- a/tt_metal/hw/CMakeLists.txt +++ b/tt_metal/hw/CMakeLists.txt @@ -21,8 +21,8 @@ set(TYPES include(FetchContent) set(SFPI_x86_64_Linux_RELEASE - "v6.0.0/sfpi-release.tgz" - "d837d26a2312d27815179995fdea83bd" + "v6.1.0/sfpi-release.tgz" + "da98a135fe95a462c3b6b4e054dc159f" ) if(DEFINED SFPI_${CMAKE_HOST_SYSTEM_PROCESSOR}_${CMAKE_HOST_SYSTEM_NAME}_RELEASE) set(SFPI_RELEASE "${SFPI_${CMAKE_HOST_SYSTEM_PROCESSOR}_${CMAKE_HOST_SYSTEM_NAME}_RELEASE}") diff --git a/tt_metal/include/compute_kernel_api/common.h b/tt_metal/include/compute_kernel_api/common.h index c7e13ebea85..feaa953791c 100644 --- a/tt_metal/include/compute_kernel_api/common.h +++ b/tt_metal/include/compute_kernel_api/common.h @@ -10,8 +10,8 @@ #include "compute_kernel_api/reconfig_data_format.h" #include "compute_kernel_api/cb_api.h" -extern uint32_t* rta_l1_base; -extern uint32_t* crta_l1_base; +extern uint32_t tt_l1_ptr* rta_l1_base; +extern uint32_t tt_l1_ptr* crta_l1_base; // clang-format off /** From 555f03b7373179d6a241f540d8ebc988f5df2f38 Mon Sep 17 00:00:00 2001 From: Mark O'Connor Date: Mon, 10 Feb 2025 19:57:56 +0100 Subject: [PATCH 043/316] Add Mistral-Small-24B-Instruct-2501 support (#17794) --- models/demos/llama3/PERF.md | 8 ++--- .../Mistral-Small-24B-Instruct-2501.refpt | Bin 0 -> 50792 bytes models/demos/llama3/tt/llama_attention.py | 17 --------- models/demos/llama3/tt/model_config.py | 33 +++++++++++------- 4 files changed, 24 insertions(+), 34 deletions(-) create mode 100644 models/demos/llama3/tests/reference_outputs/Mistral-Small-24B-Instruct-2501.refpt diff --git a/models/demos/llama3/PERF.md b/models/demos/llama3/PERF.md index f0bb11616df..2aefa56be3c 100644 --- a/models/demos/llama3/PERF.md +++ b/models/demos/llama3/PERF.md @@ -20,15 +20,15 @@ This configuration uses bfp4 MLP FF1+FF3 for all models. 
| Llama3.2-3B | TG | | | 48.5 | | Llama3.1-8B | N150 | 87 | 99 | 27.9 | | Llama3.1-8B | N300 | 88 | 99 | 43.7 | -| Llama3.1-8B | T3K | 91 | 100 | 64.2 | +| Llama3.1-8B | T3K | 88 | 100 | 64.2 | | Llama3.1-8B | TG | | | 41.0 | | Llama3.2-11B | N300 | 89 | 99 | 43.5 | | Llama3.2-11B | T3K | 88 | 99 | 63.4 | | Llama3.2-11B | TG | | | 40.9 | | Llama3.1-70B | T3K | 96 | 100 | 16.1 | | Llama3.1-70B | TG | | | | -| Qwen2.5-7B | N300 | 81 | 96 | 37.9 | -| Qwen2.5-72B | T3K | 99 | 100 | 12.8 | +| Qwen2.5-7B | N300 | 80 | 96 | 37.9 | +| Qwen2.5-72B | T3K | 98 | 100 | 12.8 | ## Accuracy @@ -53,5 +53,5 @@ This configuration uses bfp4 MLP FF1+FF3 only for the Llama-3.1-70B model and th | Llama3.2-11B | TG | 88 | 100 | 29.5 | | Llama3.1-70B | T3K | 97 | 100 | 14.7 | | Llama3.1-70B | TG | 95 | 100 | 12.7 | -| Qwen2.5-7B | N300 | 81 | 96 | 33.4 | +| Qwen2.5-7B | N300 | 80 | 96 | 33.4 | | Qwen2.5-72B | T3K | 99 | 100 | 12.8 | diff --git a/models/demos/llama3/tests/reference_outputs/Mistral-Small-24B-Instruct-2501.refpt b/models/demos/llama3/tests/reference_outputs/Mistral-Small-24B-Instruct-2501.refpt new file mode 100644 index 0000000000000000000000000000000000000000..37c108faa05a71b4d3051e62aa267edb5c804834 GIT binary patch literal 50792 zcmcJY3!KkY{{IJ&$o&>08H`H|V;Bu0A!2CcPK9aQCetuu6xoy&*5;N<7q(U;m4t5E zY}-|8t45zxuniezQ}QyiCYKwT zJLcLcnOUPp<`oo9%pG|{Qn{RxO`8rqyP(D zN@7mwp`{Aaj7d{+%Je-^EidQnD^p|oo-UPIdQMK+)Y7RX=H#3+FC$}OR@Rt|yv&K& zlgp*$jvkXcW_0(=jJ$F=<(l=+DW8{fZvUJLLn{~1$sIGcpfEdQ4B}QCUm(r+Qsc$0 zbxx)6W%|aPm6ubwfK6(Ng8!Y9Q>Ea)g?y^c8(&DSK!)o5b81X!*)R3%velX`Us9q# z(87QHA`Q|krfIPTxprFKn2Cj=_vqT_j4YCsnU|L}rcj94oj$K!ra-yEf6+oTj%i%1 z5QSdRDzQh`24^5a`2z1N{QI_?(g#W_t?K7jIDehG69r;j+j-RWGX)19t# z`dy)#1%KZ={l?E*JAL12LPgcPpwR9Ge`WnV+dZb6+t=@@^7l3tI$TW5k3T36-D4Tb zSE{Xa0pKCgG ztT*y3Y^L&4mhYg`l3ny1dFr$k4r!*-%W26RJ%6o(>O=P{m;dC}1w~DqM}MBLtfF?4 zZ&7+(1EtAxisr$I_P<%B^4e9zf5V+h!+O6Rrh2yo{4;HL>esa#*o8d)_*P@lq2H9< zCVu$IWYf>Brg|rAFZ>CA!7uO^?DbpMht77>TUZ!s3;s^nF7yNJxzO!;#;accWTgu( zP>Nk?7vhKh582;7DI5_B{-WyRS7AAHnpy6F_NzLV3n%I$4{~g{R{Y+{Q@wXB2lS}7 z&vYv~r5t|LYhk>|QxE>oFK2sGPJQZam?C*X`qZx(*kfHFS4c14@r?GA|2bCrPoGs3 zC-Rh^Hos<>syEE>1AWQ=)%_bi(05R__?)nPD<4pP+i9ge2P%!dLiCThf4tgJ&(Rn8 z$#<~c!-KfeCGevK_S4^061|x>iO&bN1LfqYM?XcM`Sy=>=GW8lk8Y+@&<- zU8OI%eiO&%otB@t)qb+-Z84u8jXUMUwS;n#zoy3({N%?PYJZ98BG=1J^!)njO5uyV z)Wa{1IgXH0@17^so;V$EK6{)}4?56A?y$V1$i;MV`+imR;Ct)+aK7|B&|1%{8kh1% zl&`y9X${x^vZd-1N4~cop)dUxxu&_CxIn&g2hnLVP4sJirL??q$-0dCS67FL_RxfBuZK_5R?b_~n}p`P5RP^N4Zp<@&UzpEED! 
zIrCuRDfwp0#V50i(r+zS;u__d7w5U0`7v~vABXs%Kl5Vz@UIm__maz$4zv7u?l((3 zuLBp_Gmo6=dMDiO?IEHwvA$Apy8K+>#5y9xg}8to$WMQTewY5DyC_BU!0kw!o>PAO zsPYe5uA3ZxDX0A_miPR(R6i|ZK1Y9LyaESs1IK5YiO$=XE5$FtAOFMeSbyAM9La+N z>x>zTL@(d|$8**jUwYlR#PON-;7C6T<-cKo=q|Zm>0R@bo{U$z)$5t0ABFdNuV+(@ zXJz;AdLGBoJ4)X-gLwR|+n;oN@8-Du)_LN;&(F~ZzeOMPBYse?wsETC^a9VnpwDya z#k$`4PU||2b9$cB`cA0_9qJ+91wnoIK$rHkgKj;y+Z*5%>Wd#FxIW|6bkF-(LR9*#2u}cW`ArirnZ0e&B%~=mj3MqnvhUdA@VN@#r$^1@7?a z=I7{F`nQs^%vDP1hc(^r%Q(evZ+ARp{>XFu89ydIEuJj>qx_kA_%;5F-{RN!^B+A= z08jh{I_=$m@n`TvfA9iF@CB#vdDsqqvy2OJz>j%5&qMzBC2=_{r(Gydm`ASAuc=2l z^uv1SjlTF7?eQ<)=QRrVkjF(^0ub5ZhZ}dO>@msH3*pJ!YRqb!7rj&W+ z=2psYw;ew6e2o2=RL|GgkD>nbg=$yT{da_$PZF0zc%SJfsuOSHg1m<1f%5j~=1l;BS;uANrJs z^r=rh%F!3Sh=a&O98BLSyW^+$Wx@LbsgF3MkZSJCIE-ERguT~$r( z)7q=v{M}0L*stfz%gGP%K0WI@*3W|{s2%HJ*2_EpDLSo;OCRqKuvf}`SIR&iv2+OF0+*qKiau|&&7Ivq4TV(8qE{`8Xo76qh}x0Pjmfg(^T&Y?~|k7 zo10V*{gT%zU(4&rFMBG_JmJUP$}>MWWj){Y{=!A(x7+jQ8J@3{aonQ+J+fYOz@NB* zeZz9r3qz9Bj&;>O``6=*^!#`i=~>eDPjlSGU*Ly-hI}b!{eV2w!;i{W7N2Vp#rG2H zy)jq$8~r|jby#1^%{mQwr9Ty&$2?EY@w}{51JQZYe71WYG`O_tuQl$IuT(wWTh%q* z_&<8`o(g@43-r$myk75BS@f>)xI;bqYgB({JwrY6dp!OyKV+O-?DhRn+o_c2^Y{4q z`z<90?`Nuee#CpDW^Tv&u$BFbbyB-AH{XThby zy{7#=v=ieF`xl{oz_0l%$=TWa1P_}Y@vzs0;)6ZFA32d5dc>Kq96M2ueT==&iho$2 zak{qU0oNWLU)S}NTy@<~7;n~PiQeWuO4GbfUc91czK#3QpFh|0sl!BXi{JNlzEIDR z3%S{ELGR9vpYUT5x@6%+vCt!_ZQ|XO&X{j^099f$_akp!#slii@f*+^APqC z`x}?5J4heuvCbq-J!uh4JIC={9nlqF=E; z*unVqi4)yROee*14Lx6cGdvGtpSPaZ_wYwf@SW>*U7W{f>fPQ^d={HN&uNDr^InvB z|Cg4thTFa6{SWqe*)PEk$OF#U1AB$#`KDjN^w0PB4X&)yz!keehdgoc0qYIlXHEC< zDboKrm)BmP{Kt1H%`OHP_OrnUTzHQB-~(RBQQLfx7x}@Ba-M_V#78Axf6tGi_+kg} zgb%ndpN2l?n!w?v{^FBl{HM-V{$anb3H=2epvOE1`M?8wkSoLm{m~D4a~HbbS#O?$ z(?uRPzzKfvhadcD4=&&VK9qyYBKMau=HK0MEyZzs?_S}Px?FOWU#ffw*E{}-@>h5t zGzvfZ9sH3WeUKl0qVNL`^n)JH$tO6@fcMsmC1-Jbp-cKg6SV`!_eTkbHr@x?bVTJm z&%I3f&4X31pVJT*@Q8JPr2iBrUx*9-gMYQ{C;lJZtP~u;1suSOc?S5ypMHjYh=bq) z9{3UCPwz?M`&pdy|Ip(@)2sFT9qW7eIpx#69{ujP^6WRK&Q<<9&wsYL-KAa^B^90x zE%@uZSm|=VSC7J(ahU#$p6HF9#rYpNf&+MiD|mzN4%>rv@P!}z;R{{T5NGHx?~V0) zDEi~v_QGT54y6|vw|O&^@8dl0L2mK9GS1JV@WVet{Lq7W40@m!eo1@A$521~hv&pU z^krT`$~=bh37&ss50>9Bt};$udyk&qYrmu&{tI8uiCj8c^_eGj8?Joo+mwFQNa^6q zl|EzqF1DW^Z>;BkYOZvm_sgD5lO4kO2)N^?KhG0AZvap5CJxZg!56&IhxqWvu9Aas z1pV<(=AZciKm0X}2h_W9tmxvW@NMe%lB~z7{z~;~d4KBL6~dw3T&1I%sr*!5rB8BB z#{K_!&j+LCb@(OYGx{Nah#&HUBlx{ATKrkBfG7R@-~7_!Cb)tpe6bh&;fs7BZU>D& z=ZvXGKfd8J$@iskJpP*So9%tEfgblx)>XaSNh+T+Ncp+1E04d$+Hdj4X3m2rdLTdg z;CCT@jJL>*|JCz;Gv(k&oTPqpj|cF9f4*^{9`c0vQ4W6e&y@|N|Ej+$ozYI|?KOnQ zA+JO3`$qY5l9l$(P`#(@zcq@PCo-<%PsoqF=!HHZe&7Wz;0E42562ty24DEX5B~6_ zKQq5+<#Q1`9+$o|t#79Xgu{WFdVa`u_}=Mor#be!qqnMFHNQU@Y=4^S{qRh$zwtl( z4&3lN{4h!n@Cfl^{H2`!3$Bzi?*TXP1YhuiKX`#>)O?8f591m0ZstKtR|}udjmwF> z%Eu=tJ#9QM_P%d5*Q1|aWIIkKE{mAB6aU1Go{-L!7{oct<(-6~`AmEA^Lu zWxHRkw7#8^g~t!iC`E7d`gre2(WPCK zpE6z-r!W3UTmZKaXX;U&F+y@;7jOoD>3NablkZYr`H`z7 zM=#T*J@z18g#G~@-~$feL;uAd;7pvr4#Z!|vCH(ak`Ft8H}v7lbL7Deq~Kru2I+Zp zi&E@^JsMd5KY2fWhUZsbIUa=ecp**l)vY9YEwYuz9{WrWyKoMI^L5yv;xM%%4wN+i zs`X9J<-hklX!9IBr+kU`6GOX&@jk=}dx0ak5;w3L{Rg{YFX&;X?p`01yH4^^j$N@& z$QOS23pgW*mmCJ zeaVwuR4@5yrJUo!ehXc1=313+Kd5x%A*EHjDV^hbFItYrYKt%H@Ly^->;;b4jsAq) zLVHopJR5s~D|TD%aj}c>fDiV=UdVyncyAZlkMqLtndkV{Z-DS&JjnAnbdT4|_f%8) zM^6i{6PD+5KYwGg%A@dO+#r4-KXRiN_@M{*ffqQ^55Nz+z;V=Q$1-A!5@C$2|nQVzR%NxCvi5(aTI(e^b-9M-ald;&3Y!* z@r8L#&y&In{OG?!;`RIq_iw&S%b#UCnNB&UQ@md>&~}OOIJ5kK=-gag>EmV8{&|;| zj??p1!FwCVY2tl&jsV=i5!|p>==Y)Bz!&>rr@V2J2Rpvw^&j@cPVj{e@?bAg?Dd^- zN%Opic2Ra?zx*|igQ4As*QqZ_Ugj~Oy|5$xvUHsA=Q;hc(f6WrdV$KnI;uSOO?8}_ zZF^%^?2dgYpXYiH7(eV@TwMG4J*@eGGxi04`X6zPIL34EC9Yvt?D?7ZHCT6GU-)59 
z_`x5#p?$#@J0D#q{3dl#I`t-{WiJuF(_c}#?=huU8?RTruDIwgs^7}v_!}^y9>-=I43)eAiF)JnLiAnRKPy1S%gnRd0*4O@D(fzbVoStViSNfxINb~qK z&+jLS!zCQ2!2^8$x47WH-~%4`XMPLG4Il7jx;CMAJPURnX zTme6D14r-+-^_xEC2iwrIoHwS{#0%|3&EmF8CjL<}vh7;upAqfcI43k9_dMKFAB+_%-V~#tYVetmAfjyk(tG&*y<^PY})}M+s-f@hQfcaeR;8 z%iQn&ac`p9HTODf;e6%q?kzg(TdeW?r}s8Jf6exN`axwP!uO;z8xXFaAT}1&*0JRDR#zm4Y9*#d}@7 z$!U{jst11H298nq;XlZaUg7#T^!pG$a05r?)!>N#(M)nAN4->I%TKl1*`41ah{j9ae2I-a}E@~tk1xcJoFhisE-{cS*~`~ zrC<4yvh(j6DL;6f@{N3cq*fsD3#TvLAO7U`72r86UpRpS_%P1|CvZkC{1~~=19`!PahCSr1CC)ie#*HU;@Fw; z4RHV`_%pBPoDJ`Dwzd*Z+hTuJe zh%@+tH*%sc_<}b$;|Jgjp5V)QC+LAIc(bqebGiE?{NN8i>LEvnGjV}_2=2^(dUzed zd1K;4)cIm?;XMHH0=bX_dC?zv!390T{z-kx!}qm}{)eAN z=?lK-3(oi*xU#-veFVOp9XG)jzhYkpT;UHNaL4b#op{IjoD#l234c8MV&R=IM)-4n zxufUBPwvrk?oH!d^5^fU9qSHp zuL|C?vA*XV2mP9L=YX-o7dhzX*S#wI|Ne;5_Kr&jy$)m^A7u~bL&RTj03UDwC-BD} z;LdY!2Osj-W%^jj$$s6sMan}TzO+Xk^bYL-4%mTq4I7Fs?|E+Wd5jPC3daVglwyxs zt(C_v#N&xx-*5K!f!Jl5<23x&`h7--Gw)03?`iRp3%$_ym*PykCH{gd`=Y$(27hpY zFZ|$7{KXC--rH^zKG5OZ1?OPi_kMBpH0j5<&wJHYv-F&Mb+2(hIMq;euUn-w#1Gs$ z)K`1>qX+V%4}ORq_$klv*APE&gFg6yYj|D-yh1s`af@>+;D!G(Zhhx{D$a+e?H3N4 z{C+=Wn)1iJ?v49QINbBF()X$;P4GG(*0_Qr>n`Fg{!7Zd4L`>2%-<+a_Pm+r%=7qe z1%B9zagO-Hyf?%PyJ0Wv6lFK;!}(_H#C{-lj5;S1`U`xqC-IVfzHR<)c++0t{MZ$; zN2cc+$32gmI!@)|ynj^QaRXey1AM>(yMPOLASe58#o*^kcsa{h4pFpN5|BLr?TZAI^si z_Irm1-LEKLf4l6R^0dW1is)6uDsX2evIUVFa8IA{1RNjImD5E$a_cL_vCmS;eF3_S>ng| z%ln>{qx78l=J_8gU!u3-+R}>3H*#EywcjsoqUXG?I%R)PaXs$O2LCp`ce%g&L9Xfa zsvthpdt#aB9vZK7)_9c<^?uh3?Z0A1oe@re5|*cSz7?u~;U_bEV!{aX4j{eyUe zT=c7aue-zh6fQ3zKbq!w6Yby&zfd3ezi#}wp9lHj2mkHh+*kesAMQOIG(~dYSD_x^ zd@SGZr}~1mF_ZTjTc*yr&>fJ-$!>mDeHYAB79|&w~dk^uPx?FVq)K-~m2^ z0(|gOZ~+hS0hiEU-~<1^CW{~VfD7kTs8`nZq&<1gU$lEqbe4Mjq+Jv)jn=3>{enF0 zp7#45_$`?GpK+l+egjS+F5qyVzbDSOf8j6S2Oi)6fAa9-IXHk9`LG_ifK!OeERSQ* z<#!Ag_&ZbR{h^LieA9`}|PY->45h zQT+|S0+0N_uh1X)z@@b90xs~Q9yslHJ#ayP>VpgIqHqBZ@L7DLaOh~igWfClqdsee z_h9p%=l9;PIS!z90~U{hB$*S{GVZ%0qmrYzeu+QQFT=P7 z9`FYj^6u-BvD0v_N) zy{LE<_M^~WkT1$_zy1wMJD8_*gV%NN z2N&>QUJV}jOJ%Ps=}&7bi{3KJ`Lg-^^plwdMSpM}eHXg@RzLq;oZ5HvJbIef%PsHH zbL8WFEBW4&i}wGU97l6amwNCeexoNTen5TnM{o2+FZ9L!`5w2>7k=oCzVXHz{m{F) z*Ne2f%<`izdZHh4QolHTM;I6MML+Zo?Hr{q_WhIJtE1mHKHr&Wxm$FS-H`9Ee%`ri z(dX#FdmhUBJ3d}hPxa5qQF@8>SmphvFYG6*n@c39-YJhiJckeckNfz+*0Aa z_4$4mJ*W?E)C14r{AGdV&$qvTAASW+C;T1(zB~sXp0l4ooTWV3ae{gedHp!IiSz_N z@JKVB;6*#id5*r|x6gWHcpgLfY`@=oyo2~b@6%wuw7iANm-;#NxKE0G819u~-=%dq zwd4GLMdQlvt9~7?xR7u8M)>^~IJLIE@ZEBlc2kcCq_9x7raftl_<~{E? 
zKC<7zI*0i=^9}GLt}ioP@E+{ANj>zr+V4I1p5`j+`=sZO)c?kM69@mT+<0~K`%?5n z-=FEY9)3Z+wSj%%LwVgzvKzRB^r*LMzv^KJ?DC}XpdI!IpXVp5J@mlupnhMUV`cngzl86H|Kf9#gSyL)l+N+hK#( ztwa5t$BWMMz2xs57oVFhI)l0^9Wh*K>j6rW+#hqE5T7@FJ~h3Y$~Qfv^ketGIxCbP zu~_vto>0m;32@^yz$3(ocHnrq{g&s+_lqy%;i8X~Uuj$pHdTH}l2Y1r^7;&a z!oS%6#-FIq^9O_Y^l;D*GfbCu==bh8@u#0MU!@-Vebk2^DfQZ#PUweW`Dm~AsrRu>Zn*mU#)|67nD~^E~XsbL#PXf!MWXYsvLQZKbtuSK4p8QrfSx-_b94 zj-JpD`Lf?bIehnd+=b2?kBDENI!a?6Q#vuds2powQ~oXc+oP?NkCKZx0{`OVqknGw zT~WTn9B23Y-14`VsT^F;13u8nHm+yN_e9^Kd|Rd|4_@PaU((Z_uSfCi;(mhu*eQ(v z!~>pl-wt#?b3a5co?}P!M8B5y6Y34{{tophU*q`2eP;Zw5cTJ{e+~6}r#~JP4$m0R ziH`f+!*_6t%A@)@`t$y!yyM1szW1kA;E((sB=uQeLx=my$6JrA0H+sbN}g`+$E}`K zKJ35X#JUZf!IN^H)1Sbf{!70t=Xn9`X4)R1pVD8Td!~NMd0zaHetWay7X23f^xxi& z|MXkR)4dObJUmA(@Me7$;zB>AKKSq)dSSn%KZm$L2R!iGkS}~FryclEKhb{NZiV9h zxAuo=K6h5`em$q(=leY!_37v7ZpV3P`uB_7X-C0#$WO^@0MQBd3~yILJss{d}TbPzkKFtz%@5rGAeGUDgrk z&G~5LLT~m>&=Y^6UM1tQ#^*R!-XOlcJ+C}=Nc?wse~$a)md#PUvW`a`{XH@Mhkqj< zen&q*KIHnmv*;n;ul=5Al*gehw#)nqs@K)?l9Bd{*>_0JluK3KWTw)wFDYGZJ0V|) zAMzsC7O!uT4~Xu3&zF*nS1->~IsbK_nfSb9x%#=@#VL9YeQ-w(@BxP^e-S_Gon<;- z_sd^yjB+{8WB6~sgEWu3u$k?ZYe?8C9%;2a`xD2zXxpCAswAKW-s1ODK{ z^S?Kip4@Z(^%&{P`4Ie#cI=a(cV1hyKh#+5ZVT7@9{-T*3-71UexlFQ{du?A_q$Ii zd^mpwf7)S>&i1qG4yhgGJ=!V%^F6#Rik2^)t^DccN})H~>n-ST4{(OpF+!MJWF&hx7<(OCH|c+ zP|ChUlXF$?pA(cGPFKqQTVuzy_g(KT+j-YFs#nS5(bWT$?|z=>cX1p#`&&KV`G(R` zrnkiDcK63!b5%av?da$1V__%W7qQ>J-Fl64yPG^8*}PNq`frjvjDtV_UZL>;Kl%yt z41QmAvG+Gvk1!9~XMVItKKuiE#8LV|=UXHXah83WXMO*1uP-EbroZbS@n_{zbCibr z!1PznYp_3#KOh%!;zz{&vesjTSEv4t>y?k_!d zD9`__w z->cpqhbisk{)1nGH+BYh;t27kd|$O^o-phV<;Quxfd3K?i9f_!z603d_Z^hK|Ay#n z_B?E){bDdjX*PLgR-?Lh2ep98hFH~B-h3ZYJqC9%7kJWR=vCXxVKfFNo z?zf#ka)0SeePz}U%X!VMfUU0JU=bxd)%R$Z2Q6Yp}uNYGnmiN@4%69 zl=TmJ%6TtBKf;cWd0o@Jk>pH2OZ?Va&dNuW$Ikd0?OEqAK2`8O%0~ZwLh51By~^k0 zCv6eEzP9V@UeDa&dEU~ks(;jYj`Vo&&GmY|+IBfsU-@TVQA!*<=J9&F(W zMj!0Wd;otB&vBn#ruL=Ui{J5?N^kr@X=nS-OCERM?4jp#{QiUf3qNp-AFq08p4U=N zedO7HjrhmbSDNL1+0QufyA(Sd$Do_xeZtnw#fS6k=NX@Wdi_+!`HiM~!9}WH;RU4+ z+uy+t`N5s|z&Nzc@2Mt~Rr`xFu6RASZf&J9N ze}f-5?=ya!&jN4eMR!}Du>SS6#i!+b(a*VD`S&b$L)+&QzZa_R`s-~E<}tOtQakzs zzx%~L6aIt#l=C|*RUT0L=NBpEcf!zmK44IN?99dEXg7F6aKj z`KmddhuyM9^zbLXbLj1TsIj*9@bgtK$^EU0<%Mr!>qGxo@BW2<5NAR=<1dFjt`7wV zw|~%d?=pYlOLM;um}+~$7d$Bc+VQB4>7Xb5l77#Az;M??FZNaOZ`S{}B*<=y5{1jr ziOLgqh)0h!C|XY3LQeQ#H{`^hke59D75VA+VR_fC;!FR>|1vTq=M=9`nj7y)#wo%1 zn~gv38?h6=f7jjRI~`B5-H+m~72k^Xlk;u2eyL}d-S*5=7kuzwOibaweyODjIF)s8 zQ5*fYaZL4?zL}Hra?`UC23((>m6edxwrfJK?1Cp#GV&6Vk`r6E8l9e(-l}zvu9ahk z2Y=EoCgy$XvC`=yPTA*P;kfkPMzyHq{?0wZe8-GkF0;NLoRpM2U)W6fI^Lhgp3H0c z9nB4QiVt?E>vdua(}f@V?#$=0|3gL!z5~s7gYn*HI_!6qGkyBSI+x=o(`QK@>JQ4c zURUTj{Q|#V;J7v4a_niO`iytz{iEX!&yU#;nD@Y+b!y%9Md?g)e@5@Ej??JP^C
{p4`X7lCpDz8C;#ZekR+N8~|1G&+_13$8BJaD_2YX_F)-`eNN2GO~XFrnofj^() zahLk_UH>evQ%)N1n)c67p2f8!Cvwzrd+3C6QhuK6k-{IGySbcxc*6e8{ARPqzYv#D zj#$$vP7d%RJ>a-m`nN^x#d`q!DPe}nw|YKtsEhKes;Qse;rouBF@8Djf8*W%doEV} zR+f|a_GM2!f75Xge|yCJai8UbUoX=yUs-%vpYk35;L>_N(DkOeJ$k(So%k(w|0X`} z_(9Jz-EX=7fO`+PSK|LF7ySczneIQ$XNf<0rnOg|`&GC%qMh+e^ElV}Le=Z+=Tl9u z>2BcwznYF4VLt?yrQX-N*mRC`PZ2cC$xh3(2o0$xF0L3J@@Z~d^a8W?|iwxFw~QK zF4J49{Wi-L#h?2SL;m~@J-;(woc;J6ckIV{hTmD|cht|+Za0pV{Oli`>iggE2<=Dz z=N`3zo(D(a5ZaOU{LVDLBOTQ~v?IU6%EjlJS<` z$&I4Jy^Q!XzkfN`e#GyRGT-NSUZd!7-(bGuL2>km2mDTH6y53ekI-)!H#j%Nc|h(} z<##fp=yFdJ{fhZ8zaz=-1w^&y{yF5rkN6$MgHQi=KNvMy<={^|S-M)!vEz_<<@p`B zo4tOHl8gJjutOAG=Fh*BKKEROa!=|aIj^?=@H=T!y?*C@-6*-aM+-c;|Eh(@g{XGi zvxPh*M@fz|ZGZh3)dw%m(-7CA_;Qaa^D2I4iQiF*YRB*HATRfq#J~Y%{|;v z^uqCyd#t1ChvOpsi~9^&U*k92!yH8~92cR-J;l)Dp5Z7uQR5)@@N&;?RQqte!>-&z z%RRIBEBX~@_i!BKen##|jgluE|G0M~s(v`GaZhJTG2?91comKh+%p-)KOCRHFN~Mm zqj)M#{hWIcISu{hU{-IzD=o!~n6GoJGu-^6su*DNUKJG!tP->dO`8v1}6`k+7G zdGZ}6dT^eqq4nkasQ+K`BM;x{@f}`qdhneaa&tbL@q&3K-;wd1*iPe#e}~_1@f{Z5 zSg{()cfoeF*z zH7;_#0J->%gYPsrcgeZms`hKn(;zqBIq)6BaPL#`9RlMk-x2Vg0N(+i2j}>~8UJG* zNgUyvJaGoRI2S^F-Yek;oFgaCIdIN-6Yn{X#5ruvd2)`LbJCoH=A1L45r=UkB={#ky1%sErekp!}%M|*>R2z{+xqDZ_crCPL22ruAC#| zoEUO(&I`M6PK)yqoU=+Xo{T%l!MOt3FLPW#kHpu6AN=r3&i&92dC$c;9nRr!&StRV zFy~}A2ZJ1(W8s_%{=zvE=5d@8K|juUpdaTnunXrZI7b0p^n*X=7&xcEIfTM<29IvO zWKm500vrndMV~Wh64Sica|Snz$(@`zF}p|CCjWNOAUJnGwoSQ033fa%Zcxbr0Sf;` zOVBQ+PO%cS{OPE}1Ib}8-Jr?hXAlH4XS zB{4ZMF)`($b}8-Krz9p8JZqQICMl^<%7mp|4mB;1u<&2M)Dnfl{(K^%TC?R#3a?x! z{8w0B^zTg0XSAF&AuFUhyVK{j3j=oHKObN);lj_zilch%w7fABb4QQK4axrN!IH*B zB&}JXRB>c~LO3y``L9k?l;-${&X8v7kmkR-MUx_OzVk})H0_Kb(SNm~57C=+|5uAA z`k(!)Uuu~$9)M%&l}h+o)Iz%z{*hMv$!B6>zANBV_*c;Gr?^#Si}D3VEqGS?r|M-2 KS{2q;-TwjOPb&EU literal 0 HcmV?d00001 diff --git a/models/demos/llama3/tt/llama_attention.py b/models/demos/llama3/tt/llama_attention.py index a2c5490fef8..ac67c80f1c2 100644 --- a/models/demos/llama3/tt/llama_attention.py +++ b/models/demos/llama3/tt/llama_attention.py @@ -305,11 +305,6 @@ def forward_decode( # Use HiFi2 for DRAM-sharded matmuls as they are otherwise flop-bound. Loses 1 bit of activation precision. 
### - as_torch = lambda tensor: torch.Tensor( - ttnn.to_torch(tensor, mesh_composer=ttnn.ConcatMeshToTensor(self.mesh_device, dim=-1)) - ) - - # print(f"our x:", " ".join(f'{t:+3.1f}' for t in as_torch(x)[0, 0, 0].flatten())) xqkv_fused_sharded = ttnn.linear( x, self.wqkv, @@ -582,18 +577,6 @@ def forward_prefill( if seq_len > self.MAX_QKV_MM_SEQ_LEN: xqkv_fused = ttnn.reshape(xqkv_fused, [1, 1, seq_len, -1]) - def fix(xqkv): - torch_q = xqkv[: self.head_dim * self.n_local_heads] - torch_k = xqkv[ - self.head_dim * self.n_local_heads : self.head_dim * (self.n_local_heads + self.n_local_kv_heads) - ] - torch_v = xqkv[self.head_dim * (self.n_local_heads + self.n_local_kv_heads) :] - to_hf = lambda t: permute(t.unsqueeze(-1), t.shape[0] // self.head_dim, t.shape[0], 1).squeeze(-1) - torch_q = to_hf(torch_q) - torch_k = to_hf(torch_k) - torch_v = torch_v - return torch_k.flatten() - ttnn.deallocate(x_11SH) # split qkv into heads diff --git a/models/demos/llama3/tt/model_config.py b/models/demos/llama3/tt/model_config.py index 6c91825dbbc..db7b9e207c5 100644 --- a/models/demos/llama3/tt/model_config.py +++ b/models/demos/llama3/tt/model_config.py @@ -194,9 +194,13 @@ def __init__( try: max_prefill_chunk_size_div1024 = MAX_PREFILL_CHUNK_SIZES_DIV1024[self.base_model_name][self.device_name] except KeyError: - raise ValueError( - f"Unknown model {self.model_name} on device {self.device_name}, try setting MAX_PREFILL_CHUNK_SIZE between 4 (compatible) and 128 (faster)" + logger.warning( + f"Unknown model {self.model_name} on device {self.device_name}, setting MAX_PREFILL_CHUNK_SIZE to 4 for compatibility" + ) + logger.warning( + f"Try setting MAX_PREFILL_CHUNK_SIZE to larger powers of 2 up to e.g. 128 for faster performance (if you run out of L1 memory it was too high)" ) + max_prefill_chunk_size_div1024 = 4 assert ( max_prefill_chunk_size_div1024 is not None ), f"Unsupported model {self.model_name} on device {self.device_name}" @@ -309,23 +313,18 @@ def __init__( k_chunk_size=256 if seqlen >= 2048 else 64, ) - def find_largest_divisor(n, max_divisor=8): - for i in range(max_divisor, 0, -1): - if n % i == 0: - return i - return 1 # Fallback to 1 if no divisor found - # nlp_concat_heads_decode will shard the data across this number of cores assert ( self.n_heads % self.cluster_shape[1] == 0 ), f"n_heads must be divisible by num_devices: {self.n_heads} % {self.cluster_shape[1]}" + # Note: for some models (e.g. Mistral-Small) n_heads * head_dim != dim self.model_config["ATTN_OUTPUT_PROGCFG"] = ( None if self.is_galaxy else self.dram_matmul_config( m=self.tile_padded_batch_rows, - k=self.dim // self.num_devices, + k=(self.n_heads * self.head_dim) // self.num_devices, n=self.dim, num_cores=self.n_heads // self.num_devices, ) @@ -980,7 +979,7 @@ def _set_params_from_dict(self, params): self.norm_eps = params.get("norm_eps", params.get("rms_norm_eps")) self.vocab_size = params["vocab_size"] self.padded_vocab_size = 128 * 1024 - self.head_dim = self.dim // self.n_heads + self.head_dim = params.get("head_dim", self.dim // self.n_heads) # Handle different MLP dimension specifications if "intermediate_size" in params: @@ -1332,6 +1331,12 @@ def find_grid_k_n(self, K, N): f"Cannot find a grid configuration such that both {K} and {N} tiles evenly divide into cores of max size {max_rows}x{max_cols}." 
) + def find_largest_divisor(self, n, max_divisor=8): + for i in range(max_divisor, 0, -1): + if n % i == 0: + return i + return 1 # Fallback to 1 if no divisor found + def dram_matmul_config(self, m: int, k: int, n: int, num_cores=None): # in0_block_w must evenly divide k and be no larger than tile_size * num_cores if num_cores is None: @@ -1342,7 +1347,7 @@ def dram_matmul_config(self, m: int, k: int, n: int, num_cores=None): ), f"k must be divisible by tile_size * num_cores: {k} % {self.tile_size * num_cores} != 0" # assert n % (self.tile_size * num_cores) == 0, f"n must be divisible by tile_size * num_cores: {n} % {self.tile_size * num_cores} != 0" return ttnn.MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig( - in0_block_w=math.ceil(k / (self.tile_size * num_cores)), + in0_block_w=self.find_largest_divisor(k // (self.tile_size * num_cores)), per_core_M=math.ceil(m / self.tile_size), per_core_N=math.ceil(n / (self.tile_size * num_cores)), fused_activation=None, @@ -1371,7 +1376,7 @@ def matmul_1d_config( grid = ttnn.CoreGrid(x=grid.x, y=grid_y) per_core_m = m // tile_height - per_core_k = math.ceil(k / tile_width / grid.num_cores) + per_core_k = (self.find_largest_divisor(k // (self.tile_size * grid.num_cores)),) per_core_n = math.ceil(n / tile_width / grid.num_cores) if is_fp32_accumulate: @@ -1536,7 +1541,9 @@ def reference_transformer(self, wrap=True, load_checkpoint=False): else: from transformers import AutoConfig, AutoModelForCausalLM - if not load_checkpoint: + # HF is much faster at loading from a checkpoint than generating from config + # so use that by preference unless we don't have a checkpoint + if self.dummy_weights and not load_checkpoint: config = AutoConfig.from_pretrained(self.DEFAULT_CKPT_DIR) config.num_layers = self.n_layers model = AutoModelForCausalLM.from_config(config) From 524af078ed14c8a7f17954b204f29ae709dc1c8b Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Mon, 10 Feb 2025 12:13:35 -0700 Subject: [PATCH 044/316] Split `command_queue_interface.hpp` into header and implementation (#17789) --- .../tt-metalium/command_queue_interface.hpp | 245 +--------------- tt_metal/impl/CMakeLists.txt | 1 + .../impl/dispatch/command_queue_interface.cpp | 272 ++++++++++++++++++ 3 files changed, 281 insertions(+), 237 deletions(-) create mode 100644 tt_metal/impl/dispatch/command_queue_interface.cpp diff --git a/tt_metal/api/tt-metalium/command_queue_interface.hpp b/tt_metal/api/tt-metalium/command_queue_interface.hpp index 01e7fe43757..30de4f2e631 100644 --- a/tt_metal/api/tt-metalium/command_queue_interface.hpp +++ b/tt_metal/api/tt-metalium/command_queue_interface.hpp @@ -255,78 +255,16 @@ inline uint32_t get_absolute_cq_offset(uint16_t channel, uint8_t cq_id, uint32_t } template -inline uint32_t get_cq_issue_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size) { - uint32_t recv; - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); - uint32_t channel_offset = (channel >> 2) * tt::tt_metal::DispatchSettings::MAX_DEV_CHANNEL_SIZE; - CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(chip_id); - uint32_t issue_q_rd_ptr = - DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_RD); - tt::Cluster::instance().read_sysmem( - &recv, - sizeof(uint32_t), - issue_q_rd_ptr + channel_offset + get_relative_cq_offset(cq_id, cq_size), - mmio_device_id, - 
channel); - if (not addr_16B) { - return recv << 4; - } - return recv; -} +uint32_t get_cq_issue_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); template -inline uint32_t get_cq_issue_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size) { - uint32_t recv; - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); - CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(chip_id); - uint32_t issue_q_wr_ptr = - DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_WR); - tt::Cluster::instance().read_sysmem( - &recv, sizeof(uint32_t), issue_q_wr_ptr + get_relative_cq_offset(cq_id, cq_size), mmio_device_id, channel); - if (not addr_16B) { - return recv << 4; - } - return recv; -} +uint32_t get_cq_issue_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); template -inline uint32_t get_cq_completion_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size) { - uint32_t recv; - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); - uint32_t channel_offset = (channel >> 2) * tt::tt_metal::DispatchSettings::MAX_DEV_CHANNEL_SIZE; - CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(chip_id); - uint32_t completion_q_wr_ptr = - DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); - tt::Cluster::instance().read_sysmem( - &recv, - sizeof(uint32_t), - completion_q_wr_ptr + channel_offset + get_relative_cq_offset(cq_id, cq_size), - mmio_device_id, - channel); - if (not addr_16B) { - return recv << 4; - } - return recv; -} +uint32_t get_cq_completion_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); template -inline uint32_t get_cq_completion_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size) { - uint32_t recv; - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); - CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(chip_id); - uint32_t completion_q_rd_ptr = - DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_RD); - tt::Cluster::instance().read_sysmem( - &recv, sizeof(uint32_t), completion_q_rd_ptr + get_relative_cq_offset(cq_id, cq_size), mmio_device_id, channel); - if (not addr_16B) { - return recv << 4; - } - return recv; -} +uint32_t get_cq_completion_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); struct SystemMemoryCQInterface { // CQ is split into issue and completion regions @@ -412,94 +350,7 @@ class SystemMemoryManager { worker_launch_message_buffer_state; public: - SystemMemoryManager(chip_id_t device_id, uint8_t num_hw_cqs) : - device_id(device_id), - num_hw_cqs(num_hw_cqs), - fast_write_callable(tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device_id)), - bypass_enable(false), - bypass_buffer_write_offset(0) { - this->completion_byte_addrs.resize(num_hw_cqs); - this->prefetcher_cores.resize(num_hw_cqs); - this->prefetch_q_writers.reserve(num_hw_cqs); - this->prefetch_q_dev_ptrs.resize(num_hw_cqs); - this->prefetch_q_dev_fences.resize(num_hw_cqs); - - // Split hugepage into however many pieces as there are CQs - 
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - char* hugepage_start = (char*)tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel); - hugepage_start += (channel >> 2) * DispatchSettings::MAX_DEV_CHANNEL_SIZE; - this->cq_sysmem_start = hugepage_start; - - // TODO(abhullar): Remove env var and expose sizing at the API level - char* cq_size_override_env = std::getenv("TT_METAL_CQ_SIZE_OVERRIDE"); - if (cq_size_override_env != nullptr) { - uint32_t cq_size_override = std::stoi(string(cq_size_override_env)); - this->cq_size = cq_size_override; - } else { - this->cq_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / num_hw_cqs; - if (tt::Cluster::instance().is_galaxy_cluster()) { - // We put 4 galaxy devices per huge page since number of hugepages available is less than number of - // devices. - this->cq_size = this->cq_size / DispatchSettings::DEVICES_PER_UMD_CHANNEL; - } - } - this->channel_offset = DispatchSettings::MAX_HUGEPAGE_SIZE * get_umd_channel(channel) + (channel >> 2) * DispatchSettings::MAX_DEV_CHANNEL_SIZE; - - CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(device_id); - uint32_t completion_q_rd_ptr = - DispatchMemMap::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); - uint32_t prefetch_q_base = - DispatchMemMap::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED); - uint32_t cq_start = - DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED); - for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { - tt_cxy_pair prefetcher_core = - tt::tt_metal::dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); - auto prefetcher_virtual = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(prefetcher_core.chip, CoreCoord(prefetcher_core.x, prefetcher_core.y), core_type); - this->prefetcher_cores[cq_id] = tt_cxy_pair(prefetcher_core.chip, prefetcher_virtual.x, prefetcher_virtual.y); - this->prefetch_q_writers.emplace_back( - tt::Cluster::instance().get_static_tlb_writer(this->prefetcher_cores[cq_id])); - - tt_cxy_pair completion_queue_writer_core = - tt::tt_metal::dispatch_core_manager::instance().completion_queue_writer_core(device_id, channel, cq_id); - auto completion_queue_writer_virtual = - tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates( - completion_queue_writer_core.chip, - CoreCoord(completion_queue_writer_core.x, completion_queue_writer_core.y), - core_type); - - const std::tuple completion_interface_tlb_data = - tt::Cluster::instance() - .get_tlb_data(tt_cxy_pair( - completion_queue_writer_core.chip, - completion_queue_writer_virtual.x, - completion_queue_writer_virtual.y)) - .value(); - auto [completion_tlb_offset, completion_tlb_size] = completion_interface_tlb_data; - this->completion_byte_addrs[cq_id] = completion_tlb_offset + completion_q_rd_ptr % completion_tlb_size; - - this->cq_interfaces.push_back(SystemMemoryCQInterface(channel, cq_id, this->cq_size, cq_start)); - // Prefetch queue acts as the sync mechanism to ensure that issue queue has space to write, so issue queue - // must be as large as the max amount of space the prefetch queue can specify Plus 1 to handle wrapping Plus - // 1 to allow us to start writing to issue queue before we reserve space in the prefetch queue 
- TT_FATAL( - DispatchMemMap::get(core_type, num_hw_cqs).max_prefetch_command_size() * - (DispatchMemMap::get(core_type, num_hw_cqs).prefetch_q_entries() + 2) <= - this->get_issue_queue_size(cq_id), - "Issue queue for cq_id {} has size of {} which is too small", - cq_id, - this->get_issue_queue_size(cq_id)); - this->cq_to_event.push_back(0); - this->cq_to_last_completed_event.push_back(0); - this->prefetch_q_dev_ptrs[cq_id] = prefetch_q_base; - this->prefetch_q_dev_fences[cq_id] = - prefetch_q_base + DispatchMemMap::get(core_type, num_hw_cqs).prefetch_q_entries() * - sizeof(DispatchSettings::prefetch_q_entry_type); - } - std::vector temp_mutexes(num_hw_cqs); - cq_to_event_locks.swap(temp_mutexes); - } + SystemMemoryManager(chip_id_t device_id, uint8_t num_hw_cqs); uint32_t get_next_event(const uint8_t cq_id) { cq_to_event_locks[cq_id].lock(); @@ -652,37 +503,7 @@ class SystemMemoryManager { } // TODO: RENAME issue_queue_stride ? - void issue_queue_push_back(uint32_t push_size_B, const uint8_t cq_id) { - if (this->bypass_enable) { - this->bypass_buffer_write_offset += push_size_B; - return; - } - - // All data needs to be PCIE_ALIGNMENT aligned - uint32_t push_size_16B = align(push_size_B, tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST)) >> 4; - - SystemMemoryCQInterface& cq_interface = this->cq_interfaces[cq_id]; - CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(this->device_id); - uint32_t issue_q_wr_ptr = - DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_WR); - - if (cq_interface.issue_fifo_wr_ptr + push_size_16B >= cq_interface.issue_fifo_limit) { - cq_interface.issue_fifo_wr_ptr = (cq_interface.cq_start + cq_interface.offset) >> 4; // In 16B words - cq_interface.issue_fifo_wr_toggle = not cq_interface.issue_fifo_wr_toggle; // Flip the toggle - } else { - cq_interface.issue_fifo_wr_ptr += push_size_16B; - } - - // Also store this data in hugepages, so if a hang happens we can see what was written by host. - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_id); - tt::Cluster::instance().write_sysmem( - &cq_interface.issue_fifo_wr_ptr, - sizeof(uint32_t), - issue_q_wr_ptr + get_relative_cq_offset(cq_id, this->cq_size), - mmio_device_id, - channel); - } + void issue_queue_push_back(uint32_t push_size_B, const uint8_t cq_id); uint32_t completion_queue_wait_front(const uint8_t cq_id, volatile bool& exit_condition) const { uint32_t write_ptr_and_toggle; @@ -699,26 +520,7 @@ class SystemMemoryManager { return write_ptr_and_toggle; } - void send_completion_queue_read_ptr(const uint8_t cq_id) const { - const SystemMemoryCQInterface& cq_interface = this->cq_interfaces[cq_id]; - - uint32_t read_ptr_and_toggle = - cq_interface.completion_fifo_rd_ptr | (cq_interface.completion_fifo_rd_toggle << 31); - this->fast_write_callable(this->completion_byte_addrs[cq_id], 4, (uint8_t*)&read_ptr_and_toggle); - - // Also store this data in hugepages in case we hang and can't get it from the device. 
- chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_id); - CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(this->device_id); - uint32_t completion_q_rd_ptr = - DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_RD); - tt::Cluster::instance().write_sysmem( - &read_ptr_and_toggle, - sizeof(uint32_t), - completion_q_rd_ptr + get_relative_cq_offset(cq_id, this->cq_size), - mmio_device_id, - channel); - } + void send_completion_queue_read_ptr(const uint8_t cq_id) const; void wrap_issue_queue_wr_ptr(const uint8_t cq_id) { if (this->bypass_enable) { @@ -750,38 +552,7 @@ class SystemMemoryManager { this->send_completion_queue_read_ptr(cq_id); } - void fetch_queue_reserve_back(const uint8_t cq_id) { - if (this->bypass_enable) { - return; - } - - CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(device_id); - const uint32_t prefetch_q_rd_ptr = - DispatchMemMap::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::PREFETCH_Q_RD); - - // Helper to wait for fetch queue space, if needed - uint32_t fence; - auto wait_for_fetch_q_space = [&]() { - // Loop until space frees up - while (this->prefetch_q_dev_ptrs[cq_id] == this->prefetch_q_dev_fences[cq_id]) { - tt::Cluster::instance().read_core( - &fence, sizeof(uint32_t), this->prefetcher_cores[cq_id], prefetch_q_rd_ptr); - this->prefetch_q_dev_fences[cq_id] = fence; - } - }; - - wait_for_fetch_q_space(); - - // Wrap FetchQ if possible - uint32_t prefetch_q_base = - DispatchMemMap::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED); - uint32_t prefetch_q_limit = prefetch_q_base + DispatchMemMap::get(core_type, num_hw_cqs).prefetch_q_entries() * - sizeof(DispatchSettings::prefetch_q_entry_type); - if (this->prefetch_q_dev_ptrs[cq_id] == prefetch_q_limit) { - this->prefetch_q_dev_ptrs[cq_id] = prefetch_q_base; - wait_for_fetch_q_space(); - } - } + void fetch_queue_reserve_back(const uint8_t cq_id); void fetch_queue_write(uint32_t command_size_B, const uint8_t cq_id, bool stall_prefetcher = false) { CoreType dispatch_core_type = diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index 46a2578a2af..12515d909f8 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -26,6 +26,7 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/dispatch_query_manager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/dispatch_core_common.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/dispatch_core_manager.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/command_queue_interface.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/hardware_command_queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/launch_message_ring_buffer_state.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/worker_config_buffer.cpp diff --git a/tt_metal/impl/dispatch/command_queue_interface.cpp b/tt_metal/impl/dispatch/command_queue_interface.cpp new file mode 100644 index 00000000000..23df5c18457 --- /dev/null +++ b/tt_metal/impl/dispatch/command_queue_interface.cpp @@ -0,0 +1,272 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "command_queue_interface.hpp" + +#include "tt_cluster.hpp" + +namespace tt::tt_metal { + +template +uint32_t get_cq_issue_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size) { + uint32_t recv; + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); + uint32_t channel_offset = (channel >> 2) * tt::tt_metal::DispatchSettings::MAX_DEV_CHANNEL_SIZE; + CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(chip_id); + uint32_t issue_q_rd_ptr = + DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_RD); + tt::Cluster::instance().read_sysmem( + &recv, + sizeof(uint32_t), + issue_q_rd_ptr + channel_offset + get_relative_cq_offset(cq_id, cq_size), + mmio_device_id, + channel); + if constexpr (!addr_16B) { + return recv << 4; + } + return recv; +} + +template uint32_t get_cq_issue_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); +template uint32_t get_cq_issue_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); + +template +uint32_t get_cq_issue_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size) { + uint32_t recv; + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); + CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(chip_id); + uint32_t issue_q_wr_ptr = + DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_WR); + tt::Cluster::instance().read_sysmem( + &recv, sizeof(uint32_t), issue_q_wr_ptr + get_relative_cq_offset(cq_id, cq_size), mmio_device_id, channel); + if constexpr (!addr_16B) { + return recv << 4; + } + return recv; +} + +template uint32_t get_cq_issue_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); +template uint32_t get_cq_issue_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); + +template +uint32_t get_cq_completion_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size) { + uint32_t recv; + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); + uint32_t channel_offset = (channel >> 2) * tt::tt_metal::DispatchSettings::MAX_DEV_CHANNEL_SIZE; + CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(chip_id); + uint32_t completion_q_wr_ptr = + DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); + tt::Cluster::instance().read_sysmem( + &recv, + sizeof(uint32_t), + completion_q_wr_ptr + channel_offset + get_relative_cq_offset(cq_id, cq_size), + mmio_device_id, + channel); + if constexpr (!addr_16B) { + return recv << 4; + } + return recv; +} + +template uint32_t get_cq_completion_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); +template uint32_t get_cq_completion_wr_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); + +template +inline uint32_t get_cq_completion_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size) { + uint32_t recv; + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); + CoreType core_type = 
tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(chip_id); + uint32_t completion_q_rd_ptr = + DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_RD); + tt::Cluster::instance().read_sysmem( + &recv, sizeof(uint32_t), completion_q_rd_ptr + get_relative_cq_offset(cq_id, cq_size), mmio_device_id, channel); + if constexpr (!addr_16B) { + return recv << 4; + } + return recv; +} + +template uint32_t get_cq_completion_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); +template uint32_t get_cq_completion_rd_ptr(chip_id_t chip_id, uint8_t cq_id, uint32_t cq_size); + +SystemMemoryManager::SystemMemoryManager(chip_id_t device_id, uint8_t num_hw_cqs) : + device_id(device_id), + num_hw_cqs(num_hw_cqs), + fast_write_callable(tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device_id)), + bypass_enable(false), + bypass_buffer_write_offset(0) { + this->completion_byte_addrs.resize(num_hw_cqs); + this->prefetcher_cores.resize(num_hw_cqs); + this->prefetch_q_writers.reserve(num_hw_cqs); + this->prefetch_q_dev_ptrs.resize(num_hw_cqs); + this->prefetch_q_dev_fences.resize(num_hw_cqs); + + // Split hugepage into however many pieces as there are CQs + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + char* hugepage_start = (char*)tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel); + hugepage_start += (channel >> 2) * DispatchSettings::MAX_DEV_CHANNEL_SIZE; + this->cq_sysmem_start = hugepage_start; + + // TODO(abhullar): Remove env var and expose sizing at the API level + char* cq_size_override_env = std::getenv("TT_METAL_CQ_SIZE_OVERRIDE"); + if (cq_size_override_env != nullptr) { + uint32_t cq_size_override = std::stoi(string(cq_size_override_env)); + this->cq_size = cq_size_override; + } else { + this->cq_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / num_hw_cqs; + if (tt::Cluster::instance().is_galaxy_cluster()) { + // We put 4 galaxy devices per huge page since number of hugepages available is less than number of + // devices. 
+ this->cq_size = this->cq_size / DispatchSettings::DEVICES_PER_UMD_CHANNEL; + } + } + this->channel_offset = DispatchSettings::MAX_HUGEPAGE_SIZE * get_umd_channel(channel) + + (channel >> 2) * DispatchSettings::MAX_DEV_CHANNEL_SIZE; + + CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(device_id); + uint32_t completion_q_rd_ptr = + DispatchMemMap::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); + uint32_t prefetch_q_base = + DispatchMemMap::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED); + uint32_t cq_start = + DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED); + for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { + tt_cxy_pair prefetcher_core = + tt::tt_metal::dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); + auto prefetcher_virtual = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates( + prefetcher_core.chip, CoreCoord(prefetcher_core.x, prefetcher_core.y), core_type); + this->prefetcher_cores[cq_id] = tt_cxy_pair(prefetcher_core.chip, prefetcher_virtual.x, prefetcher_virtual.y); + this->prefetch_q_writers.emplace_back( + tt::Cluster::instance().get_static_tlb_writer(this->prefetcher_cores[cq_id])); + + tt_cxy_pair completion_queue_writer_core = + tt::tt_metal::dispatch_core_manager::instance().completion_queue_writer_core(device_id, channel, cq_id); + auto completion_queue_writer_virtual = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates( + completion_queue_writer_core.chip, + CoreCoord(completion_queue_writer_core.x, completion_queue_writer_core.y), + core_type); + + const std::tuple completion_interface_tlb_data = tt::Cluster::instance() + .get_tlb_data(tt_cxy_pair( + completion_queue_writer_core.chip, + completion_queue_writer_virtual.x, + completion_queue_writer_virtual.y)) + .value(); + auto [completion_tlb_offset, completion_tlb_size] = completion_interface_tlb_data; + this->completion_byte_addrs[cq_id] = completion_tlb_offset + completion_q_rd_ptr % completion_tlb_size; + + this->cq_interfaces.push_back(SystemMemoryCQInterface(channel, cq_id, this->cq_size, cq_start)); + // Prefetch queue acts as the sync mechanism to ensure that issue queue has space to write, so issue queue + // must be as large as the max amount of space the prefetch queue can specify Plus 1 to handle wrapping Plus + // 1 to allow us to start writing to issue queue before we reserve space in the prefetch queue + TT_FATAL( + DispatchMemMap::get(core_type, num_hw_cqs).max_prefetch_command_size() * + (DispatchMemMap::get(core_type, num_hw_cqs).prefetch_q_entries() + 2) <= + this->get_issue_queue_size(cq_id), + "Issue queue for cq_id {} has size of {} which is too small", + cq_id, + this->get_issue_queue_size(cq_id)); + this->cq_to_event.push_back(0); + this->cq_to_last_completed_event.push_back(0); + this->prefetch_q_dev_ptrs[cq_id] = prefetch_q_base; + this->prefetch_q_dev_fences[cq_id] = + prefetch_q_base + DispatchMemMap::get(core_type, num_hw_cqs).prefetch_q_entries() * + sizeof(DispatchSettings::prefetch_q_entry_type); + } + std::vector temp_mutexes(num_hw_cqs); + cq_to_event_locks.swap(temp_mutexes); +} + +// TODO: RENAME issue_queue_stride ? 
+void SystemMemoryManager::issue_queue_push_back(uint32_t push_size_B, const uint8_t cq_id) { + if (this->bypass_enable) { + this->bypass_buffer_write_offset += push_size_B; + return; + } + + // All data needs to be PCIE_ALIGNMENT aligned + uint32_t push_size_16B = align(push_size_B, tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST)) >> 4; + + SystemMemoryCQInterface& cq_interface = this->cq_interfaces[cq_id]; + CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(this->device_id); + uint32_t issue_q_wr_ptr = + DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_WR); + + if (cq_interface.issue_fifo_wr_ptr + push_size_16B >= cq_interface.issue_fifo_limit) { + cq_interface.issue_fifo_wr_ptr = (cq_interface.cq_start + cq_interface.offset) >> 4; // In 16B words + cq_interface.issue_fifo_wr_toggle = not cq_interface.issue_fifo_wr_toggle; // Flip the toggle + } else { + cq_interface.issue_fifo_wr_ptr += push_size_16B; + } + + // Also store this data in hugepages, so if a hang happens we can see what was written by host. + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_id); + tt::Cluster::instance().write_sysmem( + &cq_interface.issue_fifo_wr_ptr, + sizeof(uint32_t), + issue_q_wr_ptr + get_relative_cq_offset(cq_id, this->cq_size), + mmio_device_id, + channel); +} + +void SystemMemoryManager::send_completion_queue_read_ptr(const uint8_t cq_id) const { + const SystemMemoryCQInterface& cq_interface = this->cq_interfaces[cq_id]; + + uint32_t read_ptr_and_toggle = cq_interface.completion_fifo_rd_ptr | (cq_interface.completion_fifo_rd_toggle << 31); + this->fast_write_callable(this->completion_byte_addrs[cq_id], 4, (uint8_t*)&read_ptr_and_toggle); + + // Also store this data in hugepages in case we hang and can't get it from the device. 
+ chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_id); + CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(this->device_id); + uint32_t completion_q_rd_ptr = + DispatchMemMap::get(core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_RD); + tt::Cluster::instance().write_sysmem( + &read_ptr_and_toggle, + sizeof(uint32_t), + completion_q_rd_ptr + get_relative_cq_offset(cq_id, this->cq_size), + mmio_device_id, + channel); +} + +void SystemMemoryManager::fetch_queue_reserve_back(const uint8_t cq_id) { + if (this->bypass_enable) { + return; + } + + CoreType core_type = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_type(device_id); + const uint32_t prefetch_q_rd_ptr = + DispatchMemMap::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::PREFETCH_Q_RD); + + // Helper to wait for fetch queue space, if needed + uint32_t fence; + auto wait_for_fetch_q_space = [&]() { + // Loop until space frees up + while (this->prefetch_q_dev_ptrs[cq_id] == this->prefetch_q_dev_fences[cq_id]) { + tt::Cluster::instance().read_core( + &fence, sizeof(uint32_t), this->prefetcher_cores[cq_id], prefetch_q_rd_ptr); + this->prefetch_q_dev_fences[cq_id] = fence; + } + }; + + wait_for_fetch_q_space(); + + // Wrap FetchQ if possible + uint32_t prefetch_q_base = + DispatchMemMap::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED); + uint32_t prefetch_q_limit = prefetch_q_base + DispatchMemMap::get(core_type, num_hw_cqs).prefetch_q_entries() * + sizeof(DispatchSettings::prefetch_q_entry_type); + if (this->prefetch_q_dev_ptrs[cq_id] == prefetch_q_limit) { + this->prefetch_q_dev_ptrs[cq_id] = prefetch_q_base; + wait_for_fetch_q_space(); + } +} + +} // namespace tt::tt_metal From 16a73d456a943c10193bc4284f67555838502beb Mon Sep 17 00:00:00 2001 From: Evan Smal Date: Mon, 10 Feb 2025 18:25:48 +0000 Subject: [PATCH 045/316] Disable `ShardOrientation.COL_MAJOR` test cases for `ttnn.upsample` --- tests/ttnn/unit_tests/operations/test_upsample.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_upsample.py b/tests/ttnn/unit_tests/operations/test_upsample.py index 7109fed9283..3a2309afa4f 100644 --- a/tests/ttnn/unit_tests/operations/test_upsample.py +++ b/tests/ttnn/unit_tests/operations/test_upsample.py @@ -125,14 +125,8 @@ def test_upsample_single_core(device, input_shapes, scale_h, scale_w): @pytest.mark.parametrize("shard_strategy", [ttnn.ShardStrategy.HEIGHT, ttnn.ShardStrategy.BLOCK]) @pytest.mark.parametrize("shard_orientation", [ttnn.ShardOrientation.ROW_MAJOR, ttnn.ShardOrientation.COL_MAJOR]) def test_upsample_multi_core(device, input_shape, scale_h, scale_w, shard_strategy, shard_orientation): - if ( - (shard_strategy == ttnn.ShardStrategy.BLOCK) - and (shard_orientation == ttnn.ShardOrientation.ROW_MAJOR) - and (scale_h == 2) - and (scale_w == 2) - and (input_shape == [2, 1280, 4, 4]) - ): - pytest.skip("skipped to unblock P0 issue 16975 but needs to be fixed and removed for issue 17035") + if (shard_strategy == ttnn.ShardStrategy.BLOCK) and (shard_orientation == ttnn.ShardOrientation.COL_MAJOR): + pytest.skip("Disabled until illegal shard configs are fixed (#17795)") if is_grayskull() and (scale_h > 2 or scale_w > 2): pytest.skip("Skipping test because it won't fit in L1!") From 
e510a3dd08716ab2d31a0e3452cee2cd55d262bb Mon Sep 17 00:00:00 2001 From: Salar Hosseini <159165450+skhorasganiTT@users.noreply.github.com> Date: Mon, 10 Feb 2025 15:10:26 -0500 Subject: [PATCH 046/316] [skip ci] Update perf and latest features for llm models (Feb 10) (#17798) --- README.md | 21 ++++++++++----------- models/MODEL_UPDATES.md | 13 +++++++++++-- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index ac4656e7e6e..fc4e313237a 100644 --- a/README.md +++ b/README.md @@ -26,24 +26,23 @@ | Model | Batch | Hardware | ttft (ms) | t/s/u | Target
t/s/u | t/s | TT-Metalium Release | vLLM Tenstorrent Repo Release | |---------------------------------------------------------------|-------|----------------------------------------------------------|-----------|-------|-----------------|--------|---------------------------------------------------|---------------------------------------------------------------------------------------------------| -| [Falcon 7B (decode only)](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | 134.4 | | | -| [Falcon 7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 18.1 | 26 | 579.2 | [v0.55.0-rc18](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc18) | | +| [Falcon 7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 18.1 | 26 | 579.2 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | | | [Mistral 7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) | | | [Mamba 2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 48 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) | | -| [Llama 3.1 8B](./models/demos/llama3) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 161 | 23.4 | 23 | 748.8 | [v0.55.0-rc12](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc12) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | -| [Llama 3.2 1B](./models/demos/llama3) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 54 | 58.6 | 160 | 1875.2 | [v0.55.0-rc12](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc12) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | -| [Llama 3.2 3B](./models/demos/llama3) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 97 | 36.1 | 60 | 1155.2 | [v0.55.0-rc12](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc12) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | -| [Llama 3.2 11B Vision (TP=2)](./models/demos/llama3) | 16 | [n300](https://tenstorrent.com/hardware/wormhole) | 2800 | 15.8 | 17 | 252.8 | [v0.55.0-rc12](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc12) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | +| [Llama 3.1 8B](./models/demos/llama3) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 168 | 24.0 | 23 | 768.0 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | +| [Llama 3.2 1B](./models/demos/llama3) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 56 | 59.4 | 160 | 1900.8 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | +| [Llama 3.2 3B](./models/demos/llama3) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 97 | 36.5 | 60 | 1168.0 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | +| [Llama 3.2 11B Vision (TP=2)](./models/demos/llama3) | 16 | 
[n300](https://tenstorrent.com/hardware/wormhole) | 2550 | 15.8 | 17 | 252.8 | [v0.56.0-rc3](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc3) | [0fde628](https://github.com/tenstorrent/vllm/tree/0fde6285eb133f5c71522840a1beb6b57a2e3b70) | | [Falcon 7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 88 | 15.5 | 26 | 3968.0 | [v0.55.0-rc18](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc18) | | | [Llama 3.1 70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.54.0-rc2](https://github.com/tenstorrent/tt-metal/tree/v0.54.0-rc2) | [9531611](https://github.com/tenstorrent/vllm/tree/953161188c50f10da95a88ab305e23977ebd3750) | -| [Falcon 40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19) | | -| [Mixtral 8x7B (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 227 | 14.9 | 33 | 476.8 | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19) | | -| [Falcon 7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 223 | 4.8 | 26 | 4915.2 | [v0.55.0-rc18](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc18) | | +| [Falcon 40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.55.0-rc20](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc20) | | +| [Mixtral 8x7B (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 227 | 14.9 | 33 | 476.8 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | | +| [Falcon 7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 223 | 4.8 | 26 | 4915.2 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | | | [Llama 3.1 70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | | -| [Llama 3.1 70B (TP=32)](./models/demos/llama3) | 32 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 763 | 13.5 | 80 | 432.0 | [v0.55.0-rc12](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc12) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | +| [Llama 3.1 70B (TP=32)](./models/demos/llama3) | 32 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 763 | 13.5 | 80 | 432.0 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | | [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](https://github.com/tenstorrent/tt-metal/tree/main/models/demos/llama3) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 1113 | 16.4 | 33 |524.8 | [main](https://github.com/tenstorrent/tt-metal/) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | -> **Last Update:** February 5, 2025 +> **Last Update:** February 10, 2025 > > **Notes:** > diff --git a/models/MODEL_UPDATES.md b/models/MODEL_UPDATES.md index e30c8338829..d76b8df8387 100644 --- a/models/MODEL_UPDATES.md +++ 
b/models/MODEL_UPDATES.md @@ -4,6 +4,15 @@ > > Please refer to the front-page [README](../README.md) for the latest verified release for each model. +## February 10, 2025 + +### [Llama 3.1/3.2](demos/llama3) +> **Note:** This feature is available as of release [v0.56.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc16) +- Added support for loading HuggingFace model formats (previously loaded Meta checkpoint formats), which will also enable easier adoption of future derivative models. + +### [Llama 3.2-11B-Vision](demos/llama3) +- Added support for processing text-only prompts to the model and the [vLLM fork](https://github.com/tenstorrent/vllm/tree/dev/tt_metal). + ## January 13, 2025 ### [Llama 3.1/3.2](demos/llama3) @@ -20,7 +29,7 @@ ### [Llama 3.1/3.2](demos/llama3) - Improved the decode performance of the 1B/3B/8B/11B text models (for 8B, increased from ~23 t/s/u to ~28 t/s/u) by using BFP4 weights (instead of BFP8) for FF1 and FF3 in the MLP. -- Added the option to specify custom model configurations, with two defaults for performance and accuracy already provided. +- Added the option to specify custom model configurations, with two defaults for performance and accuracy already provided. ## November 18, 2024 @@ -76,7 +85,7 @@ ### [Mistral7B](demos/wormhole/mistral7b) - Updated the demo to support multiple batches of users -### [Mamba-2.8B](demos/wormhole/mamba) +### [Mamba-2.8B](demos/wormhole/mamba) - Updated the demo to use the full prefill graph instead of processing a single token of the prompt at a time using decode ### [Mixtral7Bx8](demos/t3000/mixtral8x7b) From 064cb1eedc8a8fe7399e22fe18e0b841154b11bc Mon Sep 17 00:00:00 2001 From: Paul Keller Date: Tue, 4 Feb 2025 15:30:49 +0000 Subject: [PATCH 047/316] New script to generate async dispatch perf results --- .../dispatch/sweep_pgm_dispatch_0.sh | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100755 tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_pgm_dispatch_0.sh diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_pgm_dispatch_0.sh b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_pgm_dispatch_0.sh new file mode 100755 index 00000000000..8c4bc59e2b3 --- /dev/null +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_pgm_dispatch_0.sh @@ -0,0 +1,77 @@ +#/bin/bash + +if [ "$ARCH_NAME" = "grayskull" ]; then + echo "Configured core range for grayskull" + max_x="11" + max_y="8" +elif [ "$ARCH_NAME" = "wormhole_b0" ]; then + echo "Configured core range for wormhole_b0" + max_x="7" + max_y="6" +elif [ "$ARCH_NAME" = "blackhole" ]; then + echo "Configured core range for blackhole" + max_x="12" + max_y="9" +else + echo "Unknown arch: $ARCH_NAME" + exit 1 +fi + +# Initialize the string variable +trace_option="" +eth_dispatch_option="" + +# Parse command line arguments +for arg in "$@"; do + case $arg in + --trace) + trace_option="-tr" + shift + ;; + --eth) + eth_dispatch_option="-de" + shift + ;; + *) + # Handle other arguments if necessary + ;; + esac +done + +set -x + +# skips ncrisc to reduce uncovered kernel init time on WH +function shadow_test() { + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -n -s 256 -x $max_x -y $max_y -rs 40000 $trace_option $eth_dispatch_option $@ + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -n -s 2048 -x $max_x -y $max_y -rs 40000 $trace_option $eth_dispatch_option $@ + 
build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -n -s 8192 -x $max_x -y $max_y -rs 40000 $trace_option $eth_dispatch_option $@ + + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -n -s 256 -x $max_x -y $max_y -rs 40000 -a 1 $trace_option $eth_dispatch_option $@ + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -n -s 2048 -x $max_x -y $max_y -rs 40000 -a 1 $trace_option $eth_dispatch_option $@ + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -n -s 8192 -x $max_x -y $max_y -rs 40000 -a 1 $trace_option $eth_dispatch_option $@ + + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -n -s 256 -x $max_x -y $max_y -kg $max_x -rs 40000 -a 1 $trace_option $eth_dispatch_option $@ + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -n -s 2048 -x $max_x -y $max_y -kg $max_x -rs 40000 -a 1 $trace_option $eth_dispatch_option $@ + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -n -s 8192 -x $max_x -y $max_y -kg $max_x -rs 40000 -a 1 $trace_option $eth_dispatch_option $@ +} + +# Test w/ n shadow kernels +echo "###" kernel groups w/ 4 shadow kernels + shadow_test -nf 0 +echo "###" kernel groups w/ 4 shadow kernels + shadow_test -nf 1 +echo "###" kernel groups w/ 4 shadow kernels + shadow_test -nf 2 +echo "###" kernel groups w/ 4 shadow kernels + shadow_test -nf 3 +echo "###" kernel groups w/ 4 shadow kernels + shadow_test -nf 4 +echo "###" kernel groups w/ 4 shadow kernels + shadow_test -nf 5 +echo "###" kernel groups w/ 4 shadow kernels + shadow_test -nf 6 +echo "###" kernel groups w/ 4 shadow kernels + shadow_test -nf 7 +echo "###" kernel groups w/ 4 shadow kernels + shadow_test -nf 8 +echo "###" done From c32b41b663cde4380a2d880d13954cfa6e38e218 Mon Sep 17 00:00:00 2001 From: Paul Keller Date: Tue, 4 Feb 2025 19:18:09 +0000 Subject: [PATCH 048/316] Add i$ test to bw_and_latency This test is shoved in here, probably not worth running this ever again so doesn't need to be maintained. 
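Usage: building with a non-zero NOP_COUNT swaps the kernel's NOC read/write body for a timed loop of nops (the 4096-wide unroll exposes i$ fill cost), while `-nop 0`, the default, keeps the existing latency paths. Assuming the binary follows the same `${ARCH_NAME}`-suffixed naming as the sibling dispatch microbenchmarks, an example i$ run is `build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency_${ARCH_NAME} -nop 2048`.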
--- .../dispatch/kernels/bw_and_latency.cpp | 9 +++++++++ .../perf_microbenchmark/dispatch/test_bw_and_latency.cpp | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp index fec10557331..6dc29010ac5 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp @@ -3,6 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 void kernel_main() { +#if NOP_COUNT + for (int i = 0; i < ITERATIONS; i++) { +#pragma GCC unroll 4096 + for (int j = 0; j < NOP_COUNT; j++) { + asm("nop"); + } + } +#else #ifdef PAGE_SIZE uint32_t page_size = PAGE_SIZE; #else @@ -60,4 +68,5 @@ void kernel_main() { noc_async_read_barrier(); #endif #endif +#endif } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 03e124f7c94..100534ab260 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -49,6 +49,7 @@ bool hammer_pcie_g = false; bool hammer_pcie_type_g = false; bool test_write = false; bool linked = false; +uint32_t nop_count_g = 0; void init(int argc, char** argv) { std::vector input_args(argv, argv + argc); @@ -88,6 +89,7 @@ void init(int argc, char** argv) { log_info(LogTest, " -hp: hammer hugepage PCIe memory while executing (for PCIe test)"); log_info(LogTest, " -hpt:hammer hugepage PCIe hammer type: 0:32bit writes 1:128bit non-temporal writes"); log_info(LogTest, " -psrta: pass page size as a runtime argument (default compile time define)"); + log_info(LogTest, " -nop: time loop of nops"); exit(0); } @@ -110,6 +112,8 @@ void init(int argc, char** argv) { page_size_g = test_args::get_command_option_uint32(input_args, "-p", DEFAULT_PAGE_SIZE); page_size_as_runtime_arg_g = test_args::has_command_option(input_args, "-psrta"); read_one_packet_g = test_args::has_command_option(input_args, "-o"); + nop_count_g = test_args::get_command_option_uint32(input_args, "-nop", 0); + if (read_one_packet_g && page_size_g > 8192) { log_info(LogTest, "Page size must be <= 8K for read_one_packet\n"); exit(-1); @@ -270,7 +274,9 @@ int main(int argc, char** argv) { {"LINKED", std::to_string(linked)}, {"NUM_MCAST_DESTS", std::to_string(num_mcast_dests)}, {"MCAST_NOC_END_ADDR_X", std::to_string(mcast_noc_addr_end_x)}, - {"MCAST_NOC_END_ADDR_Y", std::to_string(mcast_noc_addr_end_y)}}; + {"MCAST_NOC_END_ADDR_Y", std::to_string(mcast_noc_addr_end_y)}, + {"NOP_COUNT", std::to_string(nop_count_g)}, + }; if (!page_size_as_runtime_arg_g) { defines.insert(std::pair("PAGE_SIZE", std::to_string(page_size_g))); } From e487f2c6bb7252b94896bf2045ea21d1d0955b09 Mon Sep 17 00:00:00 2001 From: Paul Keller Date: Thu, 6 Feb 2025 20:13:40 +0000 Subject: [PATCH 049/316] Add some comments regarding future work --- tt_metal/impl/program/dispatch.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tt_metal/impl/program/dispatch.cpp b/tt_metal/impl/program/dispatch.cpp index 67e9a1a2740..2416aede1e0 100644 --- a/tt_metal/impl/program/dispatch.cpp +++ b/tt_metal/impl/program/dispatch.cpp @@ -1513,6 +1513,9 @@ void reserve_space_in_kernel_config_buffer( dispatch_md.stall_first = false; dispatch_md.stall_before_program = 
true; } + + // TODO: config_buffer_mgr is stateful so code below restores original reservation state + // pull state out of the config_buffer_mgr reservation = config_buffer_mgr.reserve(program_config_sizes); } @@ -1527,6 +1530,10 @@ void reserve_space_in_kernel_config_buffer( } config_buffer_mgr.alloc(expected_num_workers_completed + num_program_workers); + // TODO. This code is needlessly complex due to enqueue program and + // binary writing being intertwined. Separate out writing kernel + // binaries into program compile/finalize. The sync below is confusing + // and not needed (just need a barrier on DRAM write) if (program_binary_status != ProgramBinaryStatus::Committed) { // Insert a stall before writing any program configs when binaries are in flight dispatch_md.stall_first = true; From 5eeb4d4f41222999a80dbb4a5f3d5eb8f440e4d0 Mon Sep 17 00:00:00 2001 From: Paul Keller Date: Thu, 6 Feb 2025 20:11:54 +0000 Subject: [PATCH 050/316] Increase size of launch_msg buffer from 4 to 8 --- tests/tt_metal/tools/profiler/test_device_profiler.py | 4 ++-- .../tt_metal/debug_tools/watcher/test_assert.cpp | 2 +- .../tt_metal/debug_tools/watcher/test_noc_sanitize.cpp | 10 +++++----- .../tt_metal/debug_tools/watcher/test_waypoint.cpp | 3 ++- tt_metal/api/tt-metalium/dev_msgs.h | 2 +- tt_metal/hw/inc/blackhole/dev_mem_map.h | 4 ++-- tt_metal/hw/inc/blackhole/eth_l1_address_map.h | 2 +- tt_metal/hw/inc/grayskull/dev_mem_map.h | 2 +- tt_metal/hw/inc/wormhole/dev_mem_map.h | 4 ++-- tt_metal/hw/inc/wormhole/eth_l1_address_map.h | 2 +- tt_metal/impl/debug/watcher_device_reader.cpp | 9 ++++++++- 11 files changed, 26 insertions(+), 18 deletions(-) diff --git a/tests/tt_metal/tools/profiler/test_device_profiler.py b/tests/tt_metal/tools/profiler/test_device_profiler.py index f235f7a29b5..eb32531bae5 100644 --- a/tests/tt_metal/tools/profiler/test_device_profiler.py +++ b/tests/tt_metal/tools/profiler/test_device_profiler.py @@ -230,8 +230,8 @@ def test_dispatch_cores(): @skip_for_grayskull() def test_ethernet_dispatch_cores(): REF_COUNT_DICT = { - "Ethernet CQ Dispatch": [17, 12, 3902], - "Ethernet CQ Prefetch": [18, 1954], + "Ethernet CQ Dispatch": [17, 12, 3899], + "Ethernet CQ Prefetch": [18, 1951], } os.environ["TT_METAL_DEVICE_PROFILER_DISPATCH"] = "1" devicesData = run_device_profiler_test( diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp index 1c4f6d01e8b..13920ee1ac7 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp @@ -160,7 +160,7 @@ static void RunTest(WatcherFixture *fixture, IDevice* device, riscv_id_t riscv_t string expected = fmt::format( "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2}): {} tripped an assert on line {}. Current kernel: {}.", device->id(), - (riscv_type == DebugErisc) ? "ethnet" : "worker", + (riscv_type == DebugErisc) ? 
"active ethnet" : "worker", logical_core.x, logical_core.y, virtual_core.x, diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp index 5962ae29275..2ecd288f817 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp @@ -171,7 +171,7 @@ void RunTestOnCore(WatcherFixture* fixture, IDevice* device, CoreCoord &core, bo "bytes from local L1[{:#08x}] to Unknown core w/ virtual coords {} [addr=0x{:08x}] (NOC target " "address did not map to any known Tensix/Ethernet/DRAM/PCIE core).", device->id(), - (is_eth_core) ? "ethnet" : "worker", + (is_eth_core) ? "active ethnet" : "worker", core.x, core.y, virtual_core.x, @@ -188,7 +188,7 @@ void RunTestOnCore(WatcherFixture* fixture, IDevice* device, CoreCoord &core, bo "bytes from local L1[{:#08x}] to Tensix core w/ virtual coords {} L1[addr=0x{:08x}] (invalid address " "alignment in NOC transaction).", device->id(), - (is_eth_core) ? "ethnet" : "worker", + (is_eth_core) ? "active ethnet" : "worker", core.x, core.y, virtual_core.x, @@ -207,7 +207,7 @@ void RunTestOnCore(WatcherFixture* fixture, IDevice* device, CoreCoord &core, bo "bytes to local L1[{:#08x}] from Tensix core w/ virtual coords {} L1[addr=0x{:08x}] (invalid address " "alignment in NOC transaction).", device->id(), - (is_eth_core) ? "ethnet" : "worker", + (is_eth_core) ? "active ethnet" : "worker", core.x, core.y, virtual_core.x, @@ -225,7 +225,7 @@ void RunTestOnCore(WatcherFixture* fixture, IDevice* device, CoreCoord &core, bo "bytes from local L1[{:#08x}] to Tensix core w/ virtual coords {} L1[addr=0x{:08x}] (NOC target " "overwrites mailboxes).", device->id(), - (is_eth_core) ? "ethnet" : "worker", + (is_eth_core) ? "active ethnet" : "worker", core.x, core.y, virtual_core.x, @@ -243,7 +243,7 @@ void RunTestOnCore(WatcherFixture* fixture, IDevice* device, CoreCoord &core, bo "bytes to local L1[{:#08x}] from Tensix core w/ virtual coords {} L1[addr=0x{:08x}] (Local L1 " "overwrites mailboxes).", device->id(), - (is_eth_core) ? "ethnet" : "worker", + (is_eth_core) ? "active ethnet" : "worker", core.x, core.y, virtual_core.x, diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp index 4a32dcd2664..67398cbf569 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp @@ -159,8 +159,9 @@ static void RunTest(WatcherFixture* fixture, IDevice* device) { k_id_s = ""; } expected = fmt::format( - "Device {} ethnet core(x={:2},y={:2}) virtual(x={:2},y={:2}): {},{}, X, X, X ", + "Device {} {} ethnet core(x={:2},y={:2}) virtual(x={:2},y={:2}): {},{}, X, X, X ", device->id(), + is_active ? 
"active" : "idle", logical_core.x, logical_core.y, virtual_core.x, diff --git a/tt_metal/api/tt-metalium/dev_msgs.h b/tt_metal/api/tt-metalium/dev_msgs.h index 4fde76aff8a..92e1427e47d 100644 --- a/tt_metal/api/tt-metalium/dev_msgs.h +++ b/tt_metal/api/tt-metalium/dev_msgs.h @@ -329,7 +329,7 @@ struct core_info_msg_t { volatile uint8_t pad[25]; }; -constexpr uint32_t launch_msg_buffer_num_entries = 4; +constexpr uint32_t launch_msg_buffer_num_entries = 8; struct mailboxes_t { struct ncrisc_halt_msg_t ncrisc_halt; struct slave_sync_msg_t slave_sync; diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h b/tt_metal/hw/inc/blackhole/dev_mem_map.h index b97e3c5601b..7a6bdd3e585 100644 --- a/tt_metal/hw/inc/blackhole/dev_mem_map.h +++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h @@ -68,7 +68,7 @@ #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 // Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small -#define MEM_MAILBOX_SIZE 12256 +#define MEM_MAILBOX_SIZE 12640 #define MEM_MAILBOX_END (MEM_MAILBOX_BASE + MEM_MAILBOX_SIZE) #define MEM_ZEROS_BASE ((MEM_MAILBOX_END + 31) & ~31) @@ -125,7 +125,7 @@ // TODO: reduce this when mailbox sizes are core type aware for some members (eg watcher/dprint) // TODO: also, move into gap above in the reserved area #define MEM_IERISC_MAILBOX_BASE (MEM_IERISC_RESERVED1 + MEM_IERISC_RESERVED1_SIZE) -#define MEM_IERISC_MAILBOX_SIZE 3344 +#define MEM_IERISC_MAILBOX_SIZE 3728 #define MEM_IERISC_MAILBOX_END (MEM_IERISC_MAILBOX_BASE + MEM_IERISC_MAILBOX_SIZE) #define MEM_IERISC_FIRMWARE_BASE (MEM_IERISC_MAILBOX_END) #define MEM_SLAVE_IERISC_FIRMWARE_BASE (MEM_IERISC_FIRMWARE_BASE + MEM_IERISC_FIRMWARE_SIZE) diff --git a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h index 37dd8ea87c8..275bccce2e6 100644 --- a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h @@ -43,7 +43,7 @@ struct address_map { static constexpr uint32_t MEM_ERISC_RESERVED1_SIZE = 1024; static constexpr std::int32_t ERISC_MEM_MAILBOX_BASE = MEM_ERISC_RESERVED1 + MEM_ERISC_RESERVED1_SIZE; - static constexpr std::uint32_t ERISC_MEM_MAILBOX_SIZE = 3344; + static constexpr std::uint32_t ERISC_MEM_MAILBOX_SIZE = 3728; static constexpr std::uint32_t ERISC_MEM_MAILBOX_END = ERISC_MEM_MAILBOX_BASE + ERISC_MEM_MAILBOX_SIZE; static constexpr std::int32_t FIRMWARE_BASE = ERISC_MEM_MAILBOX_END; diff --git a/tt_metal/hw/inc/grayskull/dev_mem_map.h b/tt_metal/hw/inc/grayskull/dev_mem_map.h index df0fc64bb09..6aacb64c804 100644 --- a/tt_metal/hw/inc/grayskull/dev_mem_map.h +++ b/tt_metal/hw/inc/grayskull/dev_mem_map.h @@ -71,7 +71,7 @@ #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 // Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small -#define MEM_MAILBOX_SIZE 12256 +#define MEM_MAILBOX_SIZE 12640 // These are used in ncrisc-halt.S, asserted in ncrisc.cc to be valid #define MEM_NCRISC_HALT_STACK_MAILBOX_ADDRESS MEM_MAILBOX_BASE + 4 #define MEM_SLAVE_RUN_MAILBOX_ADDRESS MEM_MAILBOX_BASE + 8 diff --git a/tt_metal/hw/inc/wormhole/dev_mem_map.h b/tt_metal/hw/inc/wormhole/dev_mem_map.h index 0d9e1dd932c..c14c4dd57d1 100644 --- a/tt_metal/hw/inc/wormhole/dev_mem_map.h +++ b/tt_metal/hw/inc/wormhole/dev_mem_map.h @@ -72,7 +72,7 @@ #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 // Magic size must be big enough to hold dev_msgs_t. 
static_asserts will fire if this is too small -#define MEM_MAILBOX_SIZE 12256 +#define MEM_MAILBOX_SIZE 12640 // These are used in ncrisc-halt.S, asserted in ncrisc.cc to be valid #define MEM_NCRISC_HALT_STACK_MAILBOX_ADDRESS MEM_MAILBOX_BASE + 4 #define MEM_SLAVE_RUN_MAILBOX_ADDRESS MEM_MAILBOX_BASE + 8 @@ -136,7 +136,7 @@ // TODO: reduce this when mailbox sizes are core type aware for some members (eg watcher/dprint) // TODO: also, move into gap above in the reserved area #define MEM_IERISC_MAILBOX_BASE (MEM_IERISC_RESERVED2 + MEM_IERISC_RESERVED2_SIZE) -#define MEM_IERISC_MAILBOX_SIZE 3232 +#define MEM_IERISC_MAILBOX_SIZE 3616 #define MEM_IERISC_MAILBOX_END (MEM_IERISC_MAILBOX_BASE + MEM_IERISC_MAILBOX_SIZE) #define MEM_IERISC_FIRMWARE_BASE MEM_IERISC_MAILBOX_END #define MEM_IERISC_MAP_END (MEM_IERISC_FIRMWARE_BASE + MEM_IERISC_FIRMWARE_SIZE) diff --git a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h index e28c477a8a2..f8fb59c52e1 100644 --- a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h @@ -58,7 +58,7 @@ struct address_map { static constexpr std::int32_t ERISC_MEM_MAILBOX_BASE = ERISC_APP_SYNC_INFO_BASE + ERISC_APP_SYNC_INFO_SIZE; - static constexpr std::uint32_t ERISC_MEM_MAILBOX_SIZE = 3232; + static constexpr std::uint32_t ERISC_MEM_MAILBOX_SIZE = 3616; static constexpr std::uint32_t ERISC_MEM_MAILBOX_END = ERISC_MEM_MAILBOX_BASE + ERISC_MEM_MAILBOX_SIZE; static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_BASE = ERISC_MEM_MAILBOX_END; static constexpr std::int32_t FABRIC_ROUTER_CONFIG_BASE = diff --git a/tt_metal/impl/debug/watcher_device_reader.cpp b/tt_metal/impl/debug/watcher_device_reader.cpp index bf3af601b2f..f3074aa1733 100644 --- a/tt_metal/impl/debug/watcher_device_reader.cpp +++ b/tt_metal/impl/debug/watcher_device_reader.cpp @@ -311,7 +311,7 @@ void WatcherDeviceReader::DumpCore(CoreDescriptor& logical_core, bool is_active_ virtual_core.type = logical_core.type; // Print device id, core coords (logical) - string core_type = is_eth_core ? "ethnet" : "worker"; + string core_type = is_eth_core ? (is_active_eth_core ? 
"active ethnet" : "idle ethnet") : "worker"; string core_coord_str = fmt::format( "core(x={:2},y={:2}) virtual(x={:2},y={:2})", logical_core.coord.x, @@ -343,6 +343,13 @@ void WatcherDeviceReader::DumpCore(CoreDescriptor& logical_core, bool is_active_ // For more accurate reporting of launch messages and running kernel ids, dump data from the previous valid // program (one entry before), if the current program is invalid (enables == 0) uint32_t launch_msg_read_ptr = mbox_data->launch_msg_rd_ptr; + if (launch_msg_read_ptr > launch_msg_buffer_num_entries) { + TT_THROW( + "Watcher read invalid launch_msg_read_ptr on {}: read {}, max valid {}!", + core_str, + launch_msg_read_ptr, + launch_msg_buffer_num_entries); + } if (mbox_data->launch[launch_msg_read_ptr].kernel_config.enables == 0) { launch_msg_read_ptr = (launch_msg_read_ptr - 1 + launch_msg_buffer_num_entries) % launch_msg_buffer_num_entries; } From 42bfa50ca79c3b40d8f37fd61c460765fc795c34 Mon Sep 17 00:00:00 2001 From: Paul Keller Date: Mon, 10 Feb 2025 17:45:16 +0000 Subject: [PATCH 051/316] new pgm_dispatch sweep tests Show "uncovered" dispatch cost on workers (CBs, ncrisc) --- .../dispatch/sweep_pgm_dispatch.sh | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_pgm_dispatch.sh b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_pgm_dispatch.sh index f3f91cba376..a12f86e26bc 100755 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_pgm_dispatch.sh +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_pgm_dispatch.sh @@ -208,7 +208,34 @@ build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 4096 -x $max_x -y $max_y -kg 8 $trace_option $eth_dispatch_option build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 8192 -x $max_x -y $max_y -kg 8 $trace_option $eth_dispatch_option - # Same as above, but w/ 1 slow kernel and 4 fast "shadow kernels" (test worker RB queuing) + # Run kernels w/ a fixed runtime. 
Diff between expected time and actual time is unhidden dispatch cost + echo "###" kernel groups w/ 4 shadow kernels + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 256 -rs 10000 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 512 -rs 10000 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 1024 -rs 10000 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 2048 -rs 10000 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 4096 -rs 10000 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 8192 -rs 10000 $trace_option $eth_dispatch_option + + # Same as above but w/o ncrisc to measure ncrisc init cost + echo "###" kernel groups w/ 4 shadow kernels + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 256 -rs 10000 -n $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 512 -rs 10000 -n $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 1024 -rs 10000 -n $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 2048 -rs 10000 -n $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 4096 -rs 10000 -n $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 8192 -rs 10000 -n $trace_option $eth_dispatch_option + + # Same as above but with 32 CBs to measure CB init cost + echo "###" kernel groups w/ 4 shadow kernels + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 256 -rs 10000 -n -c 32 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 512 -rs 10000 -n -c 32 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 1024 -rs 10000 -n -c 32 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 2048 -rs 10000 -n -c 32 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 4096 -rs 10000 -n -c 32 $trace_option $eth_dispatch_option + build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 8192 -rs 10000 -n -c 32 $trace_option $eth_dispatch_option + + # Like earlier tests w/ kernel groups, but w/ 1 slow kernel and 4 fast "shadow kernels" (test worker RB queuing) echo "###" kernel groups w/ 4 shadow kernels build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 256 -x $max_x -y $max_y -kg $max_x -rs 40000 -nf 4 $trace_option $eth_dispatch_option 
build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_${ARCH_NAME} --custom -w 5000 -s 512 -x $max_x -y $max_y -kg $max_x -rs 40000 -nf 4 $trace_option $eth_dispatch_option From 9b459f01ad6af1ef57a0ef0c4e089921b733fe87 Mon Sep 17 00:00:00 2001 From: Brian Liu Date: Mon, 27 Jan 2025 17:37:39 +0000 Subject: [PATCH 052/316] #17060: Flip TT_ASSERT to TT_FATAL for sharding validation #17806: Skip incorrect sharded tests for ShardVectorConversionTests --- .../gtests/tensor/test_tensor_sharding.cpp | 2 -- .../gtests/tensor/test_vector_conversion.cpp | 11 +++++------ ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp | 2 +- ttnn/cpp/ttnn/tensor/tensor_spec.cpp | 18 +++++++++--------- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_tensor_sharding.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_tensor_sharding.cpp index 5678c31e4df..0c90a2efca7 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_tensor_sharding.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_tensor_sharding.cpp @@ -967,7 +967,6 @@ struct IllegalShardSpecParams { class IllegalTensorLayoutCreationTests : public ::testing::TestWithParam {}; TEST_P(IllegalTensorLayoutCreationTests, ExpectFailAndCheckErrMsg) { - GTEST_SKIP() << "Enable tests after flipping asserts to TT_FATAL (issue #17060)"; const auto& params = GetParam(); EXPECT_THAT( @@ -1042,7 +1041,6 @@ INSTANTIATE_TEST_SUITE_P( class IllegalTensorSpecCreationTests : public ::testing::TestWithParam {}; TEST_P(IllegalTensorSpecCreationTests, ExpectFailAndCheckErrMsg) { - GTEST_SKIP() << "Enable tests after flipping asserts to TT_FATAL (issue #17060)"; const auto& params = GetParam(); auto tensor_layout = TensorLayout(DataType::BFLOAT16, params.page_config, params.memory_config); diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_vector_conversion.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_vector_conversion.cpp index c6b960946f3..a5b970ab635 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_vector_conversion.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_vector_conversion.cpp @@ -371,12 +371,11 @@ TEST_P(ShardVectorConversionTest, BlockfloatRoundtripTilizedShardMapping) { INSTANTIATE_TEST_SUITE_P( ShardVectorConversionTests, ShardVectorConversionTest, - ::testing::Values( - TensorMemoryLayout::INTERLEAVED, - TensorMemoryLayout::SINGLE_BANK, - TensorMemoryLayout::HEIGHT_SHARDED, - TensorMemoryLayout::WIDTH_SHARDED, - TensorMemoryLayout::BLOCK_SHARDED)); + ::testing::Values(TensorMemoryLayout::INTERLEAVED, TensorMemoryLayout::SINGLE_BANK)); +// #17806: Fix illegal shard spec and re-enable! 
+// TensorMemoryLayout::HEIGHT_SHARDED, +// TensorMemoryLayout::WIDTH_SHARDED, +// TensorMemoryLayout::BLOCK_SHARDED)); } // namespace diff --git a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp index 8bd564e511c..f119c7bc621 100644 --- a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp +++ b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp @@ -102,7 +102,7 @@ void validate_shard_spec(const TensorLayout& tensor_layout) { const auto& physical_shard_shape = tensor_layout.get_physical_shard_shape(); const auto& tile_shape = tensor_layout.get_tile().get_tile_shape(); // TODO (issue #17060): Flip to TT_FATAL - TT_ASSERT( + TT_FATAL( (physical_shard_shape.height() % tile_shape[0] == 0 && physical_shard_shape.width() % tile_shape[1] == 0), "Physical shard shape {} must be tile {} sized!", physical_shard_shape, diff --git a/ttnn/cpp/ttnn/tensor/tensor_spec.cpp b/ttnn/cpp/ttnn/tensor/tensor_spec.cpp index 683f4814e2a..d80cc71ecb6 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_spec.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_spec.cpp @@ -29,31 +29,31 @@ void validate_shard_spec_with_tensor_shape(const TensorSpec& tensor_spec) { // TODO (issue #17060): Flip to TT_FATAL if (memory_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) { - TT_ASSERT( + TT_FATAL( physical_width == physical_shard_width, "Shard width {} must match physical width {} for height sharded", physical_shard_width, physical_width); uint32_t num_shards = div_up(physical_height, physical_shard_height); - TT_ASSERT( + TT_FATAL( num_shards <= num_cores, "Number of shards along height {} must not exceed number of cores {}", num_shards, num_cores); } else if (memory_config.memory_layout == TensorMemoryLayout::WIDTH_SHARDED) { - TT_ASSERT( + TT_FATAL( physical_height == physical_shard_height, "Shard height {} must match physical height {} for width sharded", physical_shard_height, physical_height); uint32_t num_shards = div_up(physical_width, physical_shard_width); - TT_ASSERT( + TT_FATAL( num_shards <= num_cores, "Number of shards along width {} must not exceed number of cores {}", num_shards, num_cores); } else if (memory_config.memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { - TT_ASSERT( + TT_FATAL( shard_spec.grid.ranges().size() == 1, "Shard grid must be one full rectangular grid for block sharded!"); uint32_t num_shards_along_height = div_up(physical_height, physical_shard_height); uint32_t num_shards_along_width = div_up(physical_width, physical_shard_width); @@ -61,24 +61,24 @@ void validate_shard_spec_with_tensor_shape(const TensorSpec& tensor_spec) { // Additionally check that number of cores along height and width matches shard grid const CoreCoord shard_grid = shard_spec.grid.bounding_box().grid_size(); if (shard_spec.orientation == ShardOrientation::ROW_MAJOR) { - TT_ASSERT( + TT_FATAL( num_shards_along_height <= shard_grid.y, "Number of shards along height {} must not exceed number of rows {} for row major orientation!", num_shards_along_height, shard_grid.y); - TT_ASSERT( + TT_FATAL( num_shards_along_width <= shard_grid.x, "Number of shards along width {} must not exceed number of columns {} for row major orientation!", num_shards_along_width, shard_grid.x); } else { - TT_ASSERT( + TT_FATAL( num_shards_along_height <= shard_grid.x, "Number of shards along height {} must not exceed number of columns {} for column major " "orientation!", num_shards_along_height, shard_grid.x); - TT_ASSERT( + TT_FATAL( num_shards_along_width <= shard_grid.y, "Number of shards along width {} must not 
exceed number of rows {} for column major orientation!", num_shards_along_width, From 0d5c997b61738cd05cbf6af1b227b2cc9377ae1d Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Mon, 10 Feb 2025 15:57:08 -0700 Subject: [PATCH 053/316] [skip ci] Fix L2 workflow and add matmul nightly tests (#17802) --- .github/workflows/tt-metal-l2-nightly.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/tt-metal-l2-nightly.yaml b/.github/workflows/tt-metal-l2-nightly.yaml index bbbbb618607..35c08c107dd 100644 --- a/.github/workflows/tt-metal-l2-nightly.yaml +++ b/.github/workflows/tt-metal-l2-nightly.yaml @@ -50,13 +50,13 @@ jobs: matrix: os: ["ubuntu-20.04"] test-group: - - name: ttnn example tests - cmd: ./tests/scripts/run_ttnn_examples.sh - name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }} + - name: ttnn nightly tests + cmd: pytest tests/ttnn/nightly/unit_tests -xv -m "not disable_fast_runtime_mode" + name: ${{ matrix.test-group.name }} env: LOGURU_LEVEL: INFO runs-on: - - ${{ inputs.runner-label }} + - ${{ inputs.runner-label || 'N150' }} - "in-service" steps: - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main @@ -64,13 +64,13 @@ jobs: with: name: eager-dist-${{ matrix.os }}-any - name: ${{ matrix.test-group.name }} tests - timeout-minutes: ${{ inputs.timeout }} + timeout-minutes: ${{ inputs.timeout || '45' }} uses: ./.github/actions/docker-run with: docker_username: ${{ github.actor }} docker_password: ${{ secrets.GITHUB_TOKEN }} docker_opts: | - -e ARCH_NAME=${{ inputs.arch }} + -e ARCH_NAME=${{ inputs.arch || 'wormhole_b0' }} run_args: | WHEEL_FILENAME=$(ls -1 *.whl) pip3 install --user $WHEEL_FILENAME From 2d6c93d647ca8c7993833b48720677823151efa5 Mon Sep 17 00:00:00 2001 From: "Jack (Xun) Cai" Date: Mon, 10 Feb 2025 17:02:49 -0600 Subject: [PATCH 054/316] All gather async llama ci (#17746) ### What's changed Added Llama shape ccl async test to CI and added (e2e) perf measurement ### Checklist - [x] All post commit: https://github.com/tenstorrent/tt-metal/actions/runs/13246317576 --- .../tg/ccl/test_ccl_async_TG_llama_nightly.py | 1 + .../ccl/test_all_gather_TG_post_commit.py | 9 ++++++ .../operations/ccl/test_ccl_async_TG_llama.py | 30 ++++++++++++++----- 3 files changed, 33 insertions(+), 7 deletions(-) create mode 120000 tests/nightly/tg/ccl/test_ccl_async_TG_llama_nightly.py diff --git a/tests/nightly/tg/ccl/test_ccl_async_TG_llama_nightly.py b/tests/nightly/tg/ccl/test_ccl_async_TG_llama_nightly.py new file mode 120000 index 00000000000..18ed2ca2998 --- /dev/null +++ b/tests/nightly/tg/ccl/test_ccl_async_TG_llama_nightly.py @@ -0,0 +1 @@ +../../../ttnn/unit_tests/operations/ccl/test_ccl_async_TG_llama.py \ No newline at end of file diff --git a/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py index a476163c8d5..7f37600028a 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py @@ -14,6 +14,7 @@ teardown_fabric_interface, create_global_semaphore_with_same_address, ) +from models.perf.benchmarking_utils import BenchmarkProfiler def report_mismatches(golden, actual, max_printable=None): @@ -64,6 +65,7 @@ def run_with_trace( n_buffer=None, num_iter=20, use_all_gather_async=False, + profiler=BenchmarkProfiler(), ): # Compile Run logger.info("Compiling model") @@ -131,10 +133,15 
@@ def run_with_trace( # Run the op logger.info("Starting Trace perf test...") + profiler.start("all-gather-async-trace") ttnn.execute_trace(mesh_device, trace_id, blocking=False) ttnn.release_trace(mesh_device, trace_id) for d in mesh_device.get_devices(): ttnn.synchronize_device(d) + profiler.end("all-gather-async-trace") + logger.info(f"Time taken: {profiler.get_duration('all-gather-async-trace')} s") + logger.info(f"Time per iter: {(profiler.get_duration('all-gather-async-trace')) / num_iter} s") + logger.info(f"Time per iter: {(profiler.get_duration('all-gather-async-trace')) / num_iter * 1e6} us") return tt_out_tensor @@ -160,6 +167,7 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows( tile=(32, 32), trace_mode=False, debug=False, + profiler=BenchmarkProfiler(), # New all-gather-async and persistent fabric params use_all_gather_async=False, enable_persistent_fabric=False, @@ -270,6 +278,7 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows( all_gather_topology=ttnn.Topology.Linear, num_iter=num_iters, use_all_gather_async=use_all_gather_async, + profiler=profiler, ) else: diff --git a/tests/ttnn/unit_tests/operations/ccl/test_ccl_async_TG_llama.py b/tests/ttnn/unit_tests/operations/ccl/test_ccl_async_TG_llama.py index c1673280601..fe967467e14 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_ccl_async_TG_llama.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_ccl_async_TG_llama.py @@ -23,6 +23,7 @@ from tests.ttnn.unit_tests.operations.ccl.test_all_reduce_async import ( run_all_reduce_with_mesh_tensor_along_row, ) +from models.perf.benchmarking_utils import BenchmarkProfiler PREFETCHER_NOC1_RING = [ @@ -79,22 +80,25 @@ def get_core_range_set(output_core_grid): "num_devices, num_links", [ (4, 3), - (4, 2), - (4, 1), ], ) @pytest.mark.parametrize( "input_dtype", [ - ttnn.bfloat16, ttnn.bfloat8_b, ], ) +@pytest.mark.parametrize( + "num_iters", + [ + 5000, + ], +) @pytest.mark.parametrize("shard_grid_orientation", [ttnn.ShardOrientation.ROW_MAJOR]) @pytest.mark.parametrize( - "tensor_mem_layout, output_shape, dim, input_shard_shape,input_shard_grid,output_shard_shape, output_shard_grid, layout", + "tensor_mem_layout, output_shape, dim, input_shard_shape,input_shard_grid,output_shard_shape, output_shard_grid, layout, perf_target_us", ( - ( # AllGather after SDPA (~160 us) + ( # AllGather after SDPA ttnn.TensorMemoryLayout.HEIGHT_SHARDED, (1, 32, 32, 128), 1, @@ -108,8 +112,9 @@ def get_core_range_set(output_core_grid): } ), ttnn.TILE_LAYOUT, + 32, ), - ( # AllGather after Binary Mult+Silu (~160 us) + ( # AllGather after Binary Mult+Silu ttnn.TensorMemoryLayout.WIDTH_SHARDED, (1, 1, 32, 3840), 3, @@ -118,6 +123,7 @@ def get_core_range_set(output_core_grid): (32, 160), get_core_range_set(PREFETCHER_NOC1_RING), ttnn.TILE_LAYOUT, + 25, ), ), ) @@ -143,7 +149,8 @@ def test_line_all_gather_sharded_on_TG_rows_llama( function_level_defaults, enable_async, replication_factor, - num_iters=100, + num_iters, + perf_target_us, ): if len(mesh_device.get_devices()) != 32: pytest.skip("Not TG!") @@ -162,6 +169,8 @@ def test_line_all_gather_sharded_on_TG_rows_llama( else: output_shard_spec = None + profiler = BenchmarkProfiler() + run_line_all_gather_on_TG_with_mesh_tensor_along_rows( mesh_device, num_devices, @@ -180,6 +189,7 @@ def test_line_all_gather_sharded_on_TG_rows_llama( output_shard_spec=output_shard_spec, num_all_gather_instances=replication_factor, cluster_axis=1, + profiler=profiler, trace_mode=True, use_all_gather_async=True, enable_persistent_fabric=True, @@ -187,6 
+197,12 @@ def test_line_all_gather_sharded_on_TG_rows_llama( teardown_persistent_fabric=True, ) + latency_us = profiler.get_duration("all-gather-async-trace") / num_iters * 1e6 + if perf_target_us is not None: + assert ( + latency_us < perf_target_us + ), f"Measured latency {latency_us} us is greater than target {perf_target_us} us" + @skip_for_grayskull("Requires eth connected devices to run") @pytest.mark.parametrize( From 66f0c03ad291371f4420e58c65c056d18f0e60cc Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Mon, 10 Feb 2025 18:03:25 -0500 Subject: [PATCH 055/316] Additional EDM fabric optimizations (mix of low level and experimental flow control protocol trimming) (#17749) High level changes: 1) Optimize size information in packet header. - simplifies packet processing and setup 2) Optimize routing information storage in packet header - simplifies packet processing 3) Added missing inline write command type which is required after these changes 4) Migrate to more optimized eth APIs - eth_write_reg and eth_send_packet that omit bit shifts and omit context switch calls 5) Trimming flow control protocols further + various force inlines for tiny getter functions ## Packet Header Size Field Optimization - Simplify packet size storage and access - promote to "top-level" of packet to remove conditionality previously needed to get size info from packet - NOTE: packet size now specifies PAYLOAD SIZE ONLY!!! The header size must be implicitly added by fabric. - net this is still fine because we had to previous subtract header size when writing out to noc. ## Packet Header Routing Info Optimization Merged the mcast and unicast representation to match so I can uniformly process the packet to decide the following: - Does packet get sent to local device noc? - Does packet get forwarded through the fabric? The previous implementation was required to first check the fabric send type before being able to do further inspection to answer the above questions. Now the code is much simpler - no fabric type info checked - single code path to check both. Additionally the check logic is also streamlined. ## New packet command type Extra functionality: Added `NOC_UNICAST_INLINE_WRITE` eth packet command type to address a regression as a result of the above change (if the command type wasn't added) ## Optimized Eth send APIs - Migrate `eth_send_packet` calls to new version that takes size in bytes. - This version avoids a number of shift operations that were present in the previously used version. - Add and using new eth write remote reg (`eth_write_remote_reg_no_txq_check`) that doesn't have conditional context switch in body of function ## Flow Control Protocol Trimming - Enabled (by default) a less granular syncing mode between sender and receiver channels. Overall, in a theoretical sense, this is suboptimal. However, in a severely SW bound implementation like present, this will save on instruction count. We disable the following: - first level ack (i.e. when receiver gets the packet and notifies sender of packet received) - separate pointer management for write flush ptr and completion pointer send on receiver channel - Flush ptr now merged with completion pointer so we cut down on processing. 
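## Updated sender call pattern (sketch)

A minimal sketch of a sender-side call site after this change, assembled from the call sites updated in this PR; `packet_addr`, `num_hops`, `dest_noc_addr`, and `payload_size_bytes` are placeholder names, not new API:

```cpp
// Hypothetical unicast send after this PR: routing takes a hop count directly,
// and the size passed to the header is the payload only (header bytes excluded).
auto* header = reinterpret_cast<tt::fabric::PacketHeader*>(packet_addr);
header->to_chip_unicast(num_hops)
    ->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{dest_noc_addr}, payload_size_bytes);
sender.wait_for_empty_write_slot();
// The wire transfer still carries header + payload; the header knows the total size.
sender.send_payload_blocking_from_address(packet_addr, header->get_payload_size_including_header());
```

The mcast path has the same shape via `to_chip_multicast(MulticastRoutingCommandHeader{...})`.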
--- .../gtests/ccl/kernels/edm_fabric_writer.cpp | 12 +- ...c_erisc_datamover_sender_worker_sender.cpp | 19 +- .../fabric_worker_sender_multi_input.cpp | 10 +- .../ccl/kernels/test_kernels.common.hpp | 7 +- ...erisc_data_mover_loopback_with_workers.cpp | 14 +- .../operations/ccl/test_new_all_gather.py | 11 + tt_metal/hw/inc/ethernet/tunneling.h | 6 + ttnn/cpp/pybind11/global_semaphore.cpp | 1 + .../kernel_common/kernel_writers.hpp | 5 +- .../kernels/ccl_send_reader_two_input.cpp | 20 +- .../ccl/common/kernels/ccl_send_utils.hpp | 8 +- .../edm_fabric/fabric_edm_packet_header.hpp | 131 +++++---- .../fabric_edm_packet_transmission.hpp | 95 ++---- .../edm_fabric/fabric_erisc_datamover.cpp | 278 +++++++++--------- .../fabric_erisc_datamover_channels.hpp | 113 ++----- .../device/kernels/minimal_ccl_common.hpp | 7 +- 16 files changed, 327 insertions(+), 410 deletions(-) diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp index cd142bef8fd..952a4963104 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp @@ -128,7 +128,7 @@ void kernel_main() { mcast_fwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_fwd_hops)}); mcast_bwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_bwd_hops)}); - unicast_packet_header->to_chip_unicast(UnicastRoutingCommandHeader{static_cast(unicast_hops)}); + unicast_packet_header->to_chip_unicast(static_cast(unicast_hops)); { DeviceZoneScopedN("MAIN-WRITE-ZONE"); @@ -140,8 +140,8 @@ void kernel_main() { noc_async_write(source_l1_buffer_address, dest_addr, packet_payload_size_bytes); if (fabric_connection.has_forward_connection()) { DeviceZoneScopedN("WR-FWD"); - mcast_fwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{ - noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)}); + mcast_fwd_packet_header->to_noc_unicast_write( + NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); { DeviceZoneScopedN("WR-FWD-WAIT"); fabric_connection.get_forward_connection().wait_for_empty_write_slot(); @@ -155,8 +155,8 @@ void kernel_main() { if (fabric_connection.has_backward_connection()) { DeviceZoneScopedN("WR-BWD"); - mcast_bwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{ - noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)}); + mcast_bwd_packet_header->to_noc_unicast_write( + NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); { DeviceZoneScopedN("WR-BWD-WAIT"); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); @@ -179,7 +179,7 @@ void kernel_main() { DeviceZoneScopedN("UNICAST-WRITE"); auto& fabric_conn = unicast_is_fwd ? 
fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); - unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr, packet_payload_size_bytes}); + unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index d0b384fc55f..b210f32efb5 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -124,18 +124,17 @@ void kernel_main() { const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); auto packet_addr = get_read_ptr(cb_id_in0); - auto& packet_header = *reinterpret_cast(packet_addr); + auto* packet_header = reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header - .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); - packet_header.reserved2 = 0x1111; // debug only + ->to_chip_multicast( + tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) + ->to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } else { - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); - packet_header.reserved2 = 0x1111; // debug only + packet_header->to_chip_unicast(config.unicast.distance) + ->to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } sender.send_payload_blocking_from_address(packet_addr, packet_size); @@ -150,7 +149,7 @@ void kernel_main() { ASSERT(*last_message_semaphore_address == 0); uint64_t last_message_semaphore_noc0_addr = safe_get_noc_addr(my_x[0], my_y[0], (uint32_t)last_message_semaphore_address, 0); - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{2}); + packet_header.to_chip_unicast(2); packet_header.to_noc_unicast_atomic_inc( tt::fabric::NocUnicastAtomicIncCommandHeader(last_message_semaphore_noc0_addr, 1, 32)); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp index 98a60766922..eaa14a0e40f 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp @@ -59,12 +59,10 @@ auto forward_to_fabric_from_cb( if constexpr (mcast_mode) { packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_address, (pages_to_send * page_size) + 
sizeof(tt::fabric::PacketHeader)}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size)); } else { - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); + packet_header.to_chip_unicast(config.unicast.distance) + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size)); } uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t))); @@ -189,7 +187,7 @@ void kernel_main() { packet_header.reserved = 0xE; packet_header.reserved2 = 0xFFFF; uint64_t last_message_sem_noc_addr = get_noc_addr(my_x[0], my_y[0], last_message_semaphore_address); - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{kLoopbackNumHopsToMyChip}); + packet_header.to_chip_unicast(kLoopbackNumHopsToMyChip); packet_header.to_noc_unicast_atomic_inc( tt::fabric::NocUnicastAtomicIncCommandHeader(last_message_sem_noc_addr, 1, 32)); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp index cae2798e893..ae5e9135a2b 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp @@ -32,9 +32,10 @@ bool terminate_fabric_endpoints_farthest_to_nearest ( auto &packet_header = *reinterpret_cast(a_packet_header_addr); reinterpret_cast(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; sender.wait_for_empty_write_slot(); - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{static_cast(distance)}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - termination_sig_noc_addr, sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)}); + packet_header.to_chip_unicast(static_cast(distance)) + .to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{termination_sig_noc_addr}, + sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)); sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header()); noc_async_writes_flushed(); } diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index ee3a644e06e..4f9eadf730c 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -3266,7 +3266,6 @@ TEST(EdmFabric, DISABLED_BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWra RunWriteThroughputStabilityTestWithPersistentFabric( num_mcasts, num_unicasts, num_links, num_op_invocations, params); } -// hangs with DPRINT TEST(EdmFabric, BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWrap_2Device) { const size_t num_mcasts = 9; const size_t num_unicasts = 0; @@ -3294,7 +3293,6 @@ TEST(EdmFabric, DISABLED_BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWra RunWriteThroughputStabilityTestWithPersistentFabric( num_mcasts, num_unicasts, num_links, num_op_invocations, params); } -// First to hang - maybe somethign to do with merging traffic TEST(EdmFabric, 
DISABLED_BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWrap_TwoWorkers_4Device) { const size_t num_mcasts = 9; const size_t num_unicasts = 0; @@ -3603,6 +3601,18 @@ TEST(EdmFabric, BasicMcastThroughputTest_3) { RunWriteThroughputStabilityTestWithPersistentFabric( num_mcasts, num_unicasts, num_links, num_op_invocations, params); } +TEST(EdmFabric, BasicMcastThroughputTest_3_onehop) { + const size_t num_mcasts = 200000; + const size_t num_unicasts = 2; + const size_t num_links = 1; + const size_t num_op_invocations = 1; + const bool line_sync = true; + WriteThroughputStabilityTestWithPersistentFabricParams params; + params.line_sync = line_sync; + params.line_size = 2; + RunWriteThroughputStabilityTestWithPersistentFabric( + num_mcasts, num_unicasts, num_links, num_op_invocations, params); +} TEST(EdmFabric, BasicMcastThroughputTest_4) { const size_t num_mcasts = 800000; const size_t num_unicasts = 2; diff --git a/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py b/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py index 08d359325c2..41f1076a2af 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py @@ -464,6 +464,17 @@ def test_all_gather( None, ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ), + ( + 4, + [1, 4, 32, 1280], + 3, + ttnn.TILE_LAYOUT, + (32, 320), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(1, 4))}), + None, + None, + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + ), ], ) @pytest.mark.parametrize("num_links", [1]) diff --git a/tt_metal/hw/inc/ethernet/tunneling.h b/tt_metal/hw/inc/ethernet/tunneling.h index 37d1422d2f6..a4070cbb24b 100644 --- a/tt_metal/hw/inc/ethernet/tunneling.h +++ b/tt_metal/hw/inc/ethernet/tunneling.h @@ -96,6 +96,12 @@ void eth_write_remote_reg(uint32_t q_num, uint32_t reg_addr, uint32_t val) { eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val); eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG); } +FORCE_INLINE +void eth_write_remote_reg_no_txq_check(uint32_t q_num, uint32_t reg_addr, uint32_t val) { + eth_txq_reg_write(q_num, ETH_TXQ_DEST_ADDR, reg_addr); + eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val); + eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG); +} void check_and_context_switch() { uint32_t start_time = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); diff --git a/ttnn/cpp/pybind11/global_semaphore.cpp b/ttnn/cpp/pybind11/global_semaphore.cpp index bf9f82673c7..bdc7a2d977b 100644 --- a/ttnn/cpp/pybind11/global_semaphore.cpp +++ b/ttnn/cpp/pybind11/global_semaphore.cpp @@ -7,6 +7,7 @@ #include #include "cpp/ttnn/global_semaphore.hpp" #include "pybind11/pybind11.h" +#include "pybind11/stl.h" namespace ttnn::global_semaphore { diff --git a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp index b69b5caaad2..fd6bae7f5ee 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp @@ -33,8 +33,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( pkt_hdr->reserved2 = my_chip_id; #endif - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); + 
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { @@ -42,7 +41,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( auto& fabric_conn = unicast_args.is_forward_direction ? fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); - pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{unicast_args.distance_in_hops}); + pkt_hdr->to_chip_unicast(unicast_args.distance_in_hops); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp index 4225247db41..731ed70359e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp @@ -438,22 +438,19 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) ASSERT(cmd_ctx.packet_header_buffer_addr != 0); auto* pkt_hdr = reinterpret_cast(cmd_ctx.packet_header_buffer_addr); -#ifdef DEBUG_PRINT_ENABLED - pkt_hdr->reserved2 = my_chip_id; -#endif + uint64_t dest_noc_addr_for_pkt = safe_get_noc_addr(dest_noc0_x, dest_noc0_y, dest_bank_addr, 0); if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) { pkt_hdr->to_noc_unicast_atomic_inc( tt::fabric::NocUnicastAtomicIncCommandHeader{dest_noc_addr_for_pkt, static_cast(value), 32}); } else { - pkt_hdr->to_noc_unicast_write( - tt::fabric::NocUnicastCommandHeader{dest_noc_addr_for_pkt, static_cast(value)}); + pkt_hdr->to_noc_unicast_inline_write( + tt::fabric::NocUnicastInlineWriteCommandHeader{dest_noc_addr_for_pkt, static_cast(value)}); } switch (cmd_ctx.current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { - pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{ - cmd_ctx.current_cmd_header.get_unicast_dest_args().distance_in_hops}); + pkt_hdr->to_chip_unicast(cmd_ctx.current_cmd_header.get_unicast_dest_args().distance_in_hops); auto& fabric_connection = cmd_ctx.current_cmd_header.get_unicast_dest_args().is_forward_direction ? cmd_ctx.fabric_connection.get_forward_connection() @@ -563,13 +560,8 @@ void write_and_advance_local_read_address_for_fabric_write( const size_t payload_l1_address = l1_read_addr; auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); -#ifdef DEBUG_PRINT_ENABLED - pkt_hdr->reserved2 = my_chip_id; -#endif - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_noc_addr, packet_send_size_bytes}); + pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { @@ -577,7 +569,7 @@ void write_and_advance_local_read_address_for_fabric_write( auto& fabric_conn = unicast_args.is_forward_direction ? 
fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); - pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{unicast_args.distance_in_hops}); + pkt_hdr->to_chip_unicast(unicast_args.distance_in_hops); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp index 0f662c4bfd4..904cd775a9a 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp @@ -118,9 +118,7 @@ void mcast_contig_pages_to_noc_address( pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(forward_direction_num_hops)}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_addr, - packet_send_size_bytes}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes); forward_fabric_sender.wait_for_empty_write_slot(); forward_fabric_sender.send_payload_flush_blocking_from_address(l1_read_addr, packet_send_size_bytes); } @@ -131,9 +129,7 @@ void mcast_contig_pages_to_noc_address( pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(backward_direction_num_hops)}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_addr, - packet_send_size_bytes}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes); backward_fabric_sender.wait_for_empty_write_slot(); backward_fabric_sender.send_payload_non_blocking_from_address(l1_read_addr, packet_send_size_bytes); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index be4f8c42ce4..9a5cfcb40f9 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -6,6 +6,7 @@ #include #include +#include namespace tt::fabric { @@ -19,13 +20,13 @@ enum TerminationSignal : uint32_t { IMMEDIATELY_TERMINATE = 2 }; - -// 2 bits +// 3 bits enum NocSendType : uint8_t { NOC_UNICAST_WRITE = 0, - NOC_MULTICAST_WRITE = 1, - NOC_UNICAST_ATOMIC_INC = 2, - NOC_MULTICAST_ATOMIC_INC = 3 + NOC_UNICAST_INLINE_WRITE = 1, + NOC_MULTICAST_WRITE = 2, + NOC_UNICAST_ATOMIC_INC = 3, + NOC_MULTICAST_ATOMIC_INC = 4 }; // How to send the payload across the cluster // 1 bit @@ -34,29 +35,33 @@ enum ChipSendType : uint8_t { CHIP_MULTICAST = 1, }; +struct RoutingFields { + static constexpr uint8_t START_DISTANCE_FIELD_BIT_WIDTH = 4; + static constexpr uint8_t RANGE_HOPS_FIELD_BIT_WIDTH = 4; + static constexpr uint8_t LAST_HOP_DISTANCE_VAL = 1; + static constexpr uint8_t LAST_CHIP_IN_MCAST_VAL = 1 << tt::fabric::RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH; + static constexpr uint8_t HOP_DISTANCE_MASK = (1 << tt::fabric::RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH) - 1; + static constexpr uint8_t RANGE_MASK = ((1 << tt::fabric::RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH) - 1) + << tt::fabric::RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH; + static constexpr uint8_t LAST_MCAST_VAL = LAST_CHIP_IN_MCAST_VAL | LAST_HOP_DISTANCE_VAL; -struct UnicastRoutingCommandHeader { - uint8_t distance_in_hops; + uint8_t value; }; 
-static_assert(sizeof(UnicastRoutingCommandHeader) == 1, "UnicastRoutingCommandHeader size is not 1 byte"); +static_assert(sizeof(RoutingFields) == sizeof(uint8_t), "RoutingFields size is not 1 bytes"); +static_assert((RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH + RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH) <= sizeof(RoutingFields) * 8, "START_DISTANCE_FIELD_BIT_WIDTH + RANGE_HOPS_FIELD_BIT_WIDTH must equal 8"); + struct MulticastRoutingCommandHeader { - uint8_t start_distance_in_hops: 4; - uint8_t range_hops: 4; // 0 implies unicast + uint8_t start_distance_in_hops: RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH; + uint8_t range_hops: RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH; // 0 implies unicast }; -static_assert(sizeof(MulticastRoutingCommandHeader) == 1, "MulticastRoutingCommandHeader size is not 1 byte"); -union RoutingFields { - UnicastRoutingCommandHeader chip_unicast; - MulticastRoutingCommandHeader chip_mcast; -}; -static_assert(sizeof(RoutingFields) == sizeof(UnicastRoutingCommandHeader), "RoutingFields size is not 1 bytes"); +static_assert(sizeof(MulticastRoutingCommandHeader) <= sizeof(RoutingFields), "MulticastRoutingCommandHeader size is not 1 byte"); struct NocUnicastCommandHeader { uint64_t noc_address; - uint32_t size; - // ignores header size - inline uint32_t get_payload_only_size() const { - return size; - } +}; +struct NocUnicastInlineWriteCommandHeader { + uint64_t noc_address; + uint32_t value; }; struct NocUnicastAtomicIncCommandHeader { NocUnicastAtomicIncCommandHeader(uint64_t noc_address, uint16_t val, uint16_t wrap) @@ -68,16 +73,10 @@ struct NocUnicastAtomicIncCommandHeader { }; struct NocMulticastCommandHeader { uint32_t address; - uint32_t size; uint8_t noc_x_start; uint8_t noc_y_start; uint8_t mcast_rect_size_x; uint8_t mcast_rect_size_y; - - // ignores header size - inline uint32_t get_payload_only_size() const { - return size; - } }; struct NocMulticastAtomicIncCommandHeader { uint32_t address; @@ -88,12 +87,14 @@ struct NocMulticastAtomicIncCommandHeader { uint8_t size_x; uint8_t size_y; }; -static_assert(sizeof(NocUnicastCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocMulticastCommandHeader) == 12, "NocMulticastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastCommandHeader) == 8, "NocUnicastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocMulticastCommandHeader) == 8, "NocMulticastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastInlineWriteCommandHeader) == 16, "NocMulticastCommandHeader size is not 1 byte"); static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 1 byte"); union NocCommandFields{ NocUnicastCommandHeader unicast_write; + NocUnicastInlineWriteCommandHeader unicast_inline_write; NocMulticastCommandHeader mcast_write; NocUnicastAtomicIncCommandHeader unicast_seminc; NocMulticastAtomicIncCommandHeader mcast_seminc; @@ -106,16 +107,16 @@ struct PacketHeader { // -> unicast_write, mcast_write, unicast_seminc, mcast_seminc // For now, kept it separate so I could do reads which would be handled differently // but for our purposes we shouldn't need read so we should be able to omit the support - NocSendType noc_send_type : 2; + NocSendType noc_send_type : 3; ChipSendType chip_send_type : 1; - uint8_t reserved : 1; + // Used only by the EDM sender and receiver channels. 
Populated by EDM sender channel to // indicate to the receiver channel what channel was the source of this packet. Reserved // otherwise. uint8_t src_ch_id : 4; RoutingFields routing_fields; - uint16_t reserved2; // can be tagged with src device for debug + uint16_t payload_size_bytes; // excludes header size NocCommandFields command_fields; // size = 16B due to uint64_t alignment // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned @@ -134,87 +135,89 @@ struct PacketHeader { inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } + // Returns size of payload in bytes - TODO: convert to words (4B) size_t get_payload_size_excluding_header() volatile const { - switch(this->noc_send_type) { - case NOC_UNICAST_WRITE: { - return this->command_fields.unicast_write.size - sizeof(PacketHeader); - } break; - case NOC_MULTICAST_WRITE: { - return this->command_fields.mcast_write.size - sizeof(PacketHeader); - } break; - case NOC_UNICAST_ATOMIC_INC: - case NOC_MULTICAST_ATOMIC_INC: - return 0; - default: - #if defined(KERNEL_BUILD) || defined(FW_BUILD) - ASSERT(false); - #endif - return 0; - }; + return this->payload_size_bytes; } inline size_t get_payload_size_including_header() volatile const { return get_payload_size_excluding_header() + sizeof(PacketHeader); } - inline PacketHeader &to_chip_unicast(UnicastRoutingCommandHeader const &chip_unicast_command_header) { + inline PacketHeader &to_chip_unicast(uint8_t distance_in_hops) { this->chip_send_type = CHIP_UNICAST; - this->routing_fields.chip_unicast = chip_unicast_command_header; + this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; return *this; } inline PacketHeader &to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) { this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.chip_mcast = chip_multicast_command_header; + this->routing_fields.value = ((static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH)) | static_cast(chip_multicast_command_header.start_distance_in_hops); return *this; } - inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header) { + inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write = noc_unicast_command_header; + this->payload_size_bytes = payload_size_bytes; + return *this; + } + inline PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { + this->noc_send_type = NOC_UNICAST_INLINE_WRITE; + this->command_fields.unicast_inline_write = noc_unicast_command_header; + this->payload_size_bytes = 0; return *this; } - inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header) { + inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write = noc_multicast_command_header; + this->payload_size_bytes = payload_size_bytes; return *this; } inline PacketHeader &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { this->noc_send_type = NOC_UNICAST_ATOMIC_INC; 
this->command_fields.unicast_seminc = noc_unicast_atomic_inc_command_header; + this->payload_size_bytes = 0; return *this; } - inline PacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header) { + inline PacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { #if defined(KERNEL_BUILD) || defined(FW_BUILD) ASSERT(false); while (1) {}; #endif + this->payload_size_bytes = payload_size_bytes; return *this; } - inline volatile PacketHeader *to_chip_unicast(UnicastRoutingCommandHeader const &chip_unicast_command_header) volatile { + inline volatile PacketHeader *to_chip_unicast(uint8_t distance_in_hops) volatile { this->chip_send_type = CHIP_UNICAST; - this->routing_fields.chip_unicast.distance_in_hops = chip_unicast_command_header.distance_in_hops; + this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; return this; } inline volatile PacketHeader *to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) volatile { this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.chip_mcast.range_hops = chip_multicast_command_header.range_hops; - this->routing_fields.chip_mcast.start_distance_in_hops = chip_multicast_command_header.start_distance_in_hops; + this->routing_fields.value = (static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH) | chip_multicast_command_header.start_distance_in_hops; return this; } - inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header) volatile { + inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write.noc_address = noc_unicast_command_header.noc_address; - this->command_fields.unicast_write.size = noc_unicast_command_header.size; + this->payload_size_bytes = payload_size_bytes; return this; } - inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header) volatile { + inline volatile PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { + this->noc_send_type = NOC_UNICAST_INLINE_WRITE; + this->command_fields.unicast_inline_write.noc_address = noc_unicast_command_header.noc_address; + this->command_fields.unicast_inline_write.value = noc_unicast_command_header.value; + this->payload_size_bytes = 0; + return *this; + } + inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write.mcast_rect_size_x = noc_multicast_command_header.mcast_rect_size_x; this->command_fields.mcast_write.mcast_rect_size_y = noc_multicast_command_header.mcast_rect_size_y; this->command_fields.mcast_write.noc_x_start = noc_multicast_command_header.noc_x_start; this->command_fields.mcast_write.noc_y_start = noc_multicast_command_header.noc_y_start; - this->command_fields.mcast_write.size = noc_multicast_command_header.size; + this->payload_size_bytes = payload_size_bytes; this->command_fields.mcast_write.address = noc_multicast_command_header.address; return this; @@ -225,11 +228,12 @@ struct PacketHeader { this->command_fields.unicast_seminc.noc_address 
= noc_unicast_atomic_inc_command_header.noc_address; this->command_fields.unicast_seminc.val = noc_unicast_atomic_inc_command_header.val; this->command_fields.unicast_seminc.wrap = noc_unicast_atomic_inc_command_header.wrap; + this->payload_size_bytes = 0; return this; } inline volatile PacketHeader *to_noc_multicast_atomic_inc( - NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header) volatile { + NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; this->command_fields.mcast_seminc.address = noc_multicast_atomic_inc_command_header.address; this->command_fields.mcast_seminc.noc_x_start = noc_multicast_atomic_inc_command_header.noc_x_start; @@ -238,6 +242,7 @@ struct PacketHeader { this->command_fields.mcast_seminc.size_y = noc_multicast_atomic_inc_command_header.size_y; this->command_fields.mcast_seminc.val = noc_multicast_atomic_inc_command_header.val; this->command_fields.mcast_seminc.wrap = noc_multicast_atomic_inc_command_header.wrap; + this->payload_size_bytes = payload_size_bytes; return this; } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp index 16d003b1c71..35533d4d26e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp @@ -17,24 +17,26 @@ static constexpr size_t DESTINATION_HOP_COUNT = 1; static constexpr size_t LAST_MCAST_DESTINATION = 1; void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader *const packet_start) { +#ifdef DEBUG_PRINT_ENABLED switch (packet_start->chip_send_type) { case tt::fabric::CHIP_UNICAST: { - DPRINT << "C_UNI: dist:" << (uint32_t) packet_start->routing_fields.chip_unicast.distance_in_hops << "\n"; + DPRINT << "C_UNI: dist:" << (uint32_t) (packet_start->routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) << "\n"; break; } case tt::fabric::CHIP_MULTICAST: { - DPRINT << "C_MCST: dist:" << (uint32_t) packet_start->routing_fields.chip_mcast.start_distance_in_hops << - ", rng:" << (uint32_t) packet_start->routing_fields.chip_mcast.range_hops << "\n"; + DPRINT << "C_MCST: dist:" << (uint32_t) (packet_start->routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) << + ", rng:" << (uint32_t)((packet_start->routing_fields.value & tt::fabric::RoutingFields::RANGE_MASK) >> tt::fabric::RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH) << "\n"; break; } }; +#endif } void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet_start) { +#ifdef DEBUG_PRINT_ENABLED switch (packet_start->noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { - DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_write.noc_address << - ", size:" << (uint32_t) packet_start->command_fields.unicast_write.size << "\n"; + DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_write.noc_address << "\n"; } break; case tt::fabric::NocSendType::NOC_UNICAST_ATOMIC_INC: { DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_seminc.noc_address << @@ -45,30 +47,33 @@ void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet ASSERT(false); // unimplemented break; }; +#endif } void print_pkt_header(volatile tt::fabric::PacketHeader *const packet_start) { 
+#ifdef DEBUG_PRINT_ENABLED auto const& header = *packet_start; DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type << ", csnd_t:" << (uint32_t) packet_start->chip_send_type << - ", src_chip:" << (uint32_t) packet_start->reserved2 << "\n"; + ", src_chip:" << (uint32_t) packet_start->src_ch_id << + ", payload_size_bytes:" << (uint32_t) packet_start->payload_size_bytes << "\n"; print_pkt_hdr_routing_fields(packet_start); print_pkt_header_noc_fields(packet_start); +#endif } // Since we unicast to local, we must omit the packet header -void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const packet_start, uint32_t transaction_id) { +FORCE_INLINE void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const packet_start, uint32_t transaction_id) { auto const& header = *packet_start; uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(tt::fabric::PacketHeader); tt::fabric::NocSendType noc_send_type = packet_start->noc_send_type; + auto const payload_size_bytes = header.payload_size_bytes; switch (noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { auto const dest_address = header.command_fields.unicast_write.noc_address; - auto const size = header.command_fields.unicast_write.size - sizeof(tt::fabric::PacketHeader); - noc_async_write_one_packet_with_trid(payload_start_address, dest_address, size, transaction_id); - + noc_async_write_one_packet_with_trid(payload_start_address, dest_address, payload_size_bytes, transaction_id); } break; case tt::fabric::NocSendType::NOC_MULTICAST_WRITE: { @@ -80,9 +85,7 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const header.command_fields.mcast_write.noc_y_start + header.command_fields.mcast_write.mcast_rect_size_y, header.command_fields.mcast_write.address); auto const num_dests = header.command_fields.mcast_write.mcast_rect_size_x * header.command_fields.mcast_write.mcast_rect_size_y; - auto const size = header.command_fields.mcast_write.size - sizeof(tt::fabric::PacketHeader); - noc_async_write_one_packet_with_trid(payload_start_address, mcast_dest_address, size, num_dests, transaction_id); - + noc_async_write_one_packet_with_trid(payload_start_address, mcast_dest_address, payload_size_bytes, num_dests, transaction_id); } break; case tt::fabric::NocSendType::NOC_UNICAST_ATOMIC_INC: { @@ -92,6 +95,12 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const } break; + case tt::fabric::NocSendType::NOC_UNICAST_INLINE_WRITE: { + auto const dest_address = header.command_fields.unicast_inline_write.noc_address; + auto const value = header.command_fields.unicast_inline_write.value; + noc_inline_dw_write(dest_address, value); + } break; + case tt::fabric::NocSendType::NOC_MULTICAST_ATOMIC_INC: default: { ASSERT(false); @@ -99,24 +108,12 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const }; } - - -void update_packet_header_for_next_hop(volatile tt::fabric::PacketHeader * packet_header) { - switch (packet_header->chip_send_type) { - case tt::fabric::CHIP_UNICAST: { - ASSERT(packet_header->routing_fields.chip_unicast.distance_in_hops > 0); - packet_header->routing_fields.chip_unicast.distance_in_hops--; - } break; - case tt::fabric::CHIP_MULTICAST: { - if (packet_header->routing_fields.chip_mcast.start_distance_in_hops == DESTINATION_HOP_COUNT) { - ASSERT(packet_header->routing_fields.chip_mcast.range_hops > 0); - packet_header->routing_fields.chip_mcast.range_hops--; - } else { - 
ASSERT(packet_header->routing_fields.chip_mcast.start_distance_in_hops > 0); - packet_header->routing_fields.chip_mcast.start_distance_in_hops--; - } - } break; - } +FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::PacketHeader * packet_header, tt::fabric::RoutingFields cached_routing_fields) { + // if the distance field is one, it means the range field decrements, else the start distance field decrements + // TODO [optimization]: If we can make the terminal value 0, then we can save an instruction on the eq insn + bool decrement_range = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; + uint8_t decrement_val = static_cast(1) << (decrement_range * tt::fabric::RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH); + packet_header->routing_fields.value = cached_routing_fields.value - decrement_val; } // This function forwards a packet to the downstream EDM channel for eventual sending @@ -128,8 +125,9 @@ void update_packet_header_for_next_hop(volatile tt::fabric::PacketHeader * packe // !!!WARNING!!! * do NOT call before determining if the packet should be consumed locally or forwarded // !!!WARNING!!! * ENSURE DOWNSTREAM EDM HAS SPACE FOR PACKET BEFORE CALLING // !!!WARNING!!! -void forward_payload_to_downstream_edm( +FORCE_INLINE void forward_payload_to_downstream_edm( volatile tt::fabric::PacketHeader *packet_header, + tt::fabric::RoutingFields cached_routing_fields, tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, uint8_t transaction_id ) { @@ -139,40 +137,9 @@ void forward_payload_to_downstream_edm( // This is a good place to print the packet header for debug if you are trying to inspect packets // because it is before we start manipulating the header for forwarding - update_packet_header_for_next_hop(packet_header); + update_packet_header_for_next_hop(packet_header, cached_routing_fields); downstream_edm_interface.send_payload_non_blocking_from_address_with_trid( reinterpret_cast(packet_header), packet_header->get_payload_size_including_header(), transaction_id); } - - -bool packet_must_be_consumed_locally(volatile tt::fabric::PacketHeader const& packet_header) { - switch (packet_header.chip_send_type) { - case tt::fabric::ChipSendType::CHIP_UNICAST: { - return packet_header.routing_fields.chip_unicast.distance_in_hops == DESTINATION_HOP_COUNT; - } - case tt::fabric::ChipSendType::CHIP_MULTICAST: { - return packet_header.routing_fields.chip_mcast.start_distance_in_hops == DESTINATION_HOP_COUNT; - } - default: { - ASSERT(false); - return false; - } - } -} - - -bool packet_must_be_forwarded_to_next_chip(volatile tt::fabric::PacketHeader const& packet_header) { - switch (packet_header.chip_send_type) { - case tt::fabric::ChipSendType::CHIP_UNICAST: - return packet_header.routing_fields.chip_unicast.distance_in_hops != DESTINATION_HOP_COUNT; - - case tt::fabric::ChipSendType::CHIP_MULTICAST: - return packet_header.routing_fields.chip_mcast.range_hops != LAST_MCAST_DESTINATION; - - default: - ASSERT(false); - return false; - } -} diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index e913c18f7aa..b0c732ee00b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -4,7 +4,7 @@ #include "dataflow_api.h" -#include "tt_metal/hw/inc/ethernet/dataflow_api.h" 
+#include "tt_metal/hw/inc/ethernet/tunneling.h" #include "cpp/ttnn/operations/ccl/kernels/edm/edm_handshake.hpp" #include "cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp" #include "cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp" @@ -23,6 +23,9 @@ using ttnn::ccl::WorkerXY; +static constexpr bool enable_first_level_ack = true; +static constexpr bool fuse_receiver_flush_and_completion_ptr = true; + /* The fabric Erisc Data Mover (EDM) is a component that can be used to build *very* simple linear topology fabrics. @@ -247,11 +250,11 @@ constexpr uint8_t NUM_TRANSACTION_IDS = 4; template struct TransactionIdCounter { - void increment() { + FORCE_INLINE void increment() { this->next_trid = tt::fabric::wrap_increment(this->next_trid); } - uint8_t get() const { + FORCE_INLINE uint8_t get() const { return this->next_trid; } @@ -298,6 +301,7 @@ struct WriteTransactionIdTracker { TransactionIdCounter trid_counter; }; +static constexpr uint32_t DEFAULT_ETH_TXQ = 0; // senders update this stream constexpr uint32_t to_receiver_pkts_sent_id = 0; @@ -313,15 +317,11 @@ constexpr uint32_t to_sender_1_pkts_completed_id = 4; // This will be an atomic register read to the register template -int32_t get_ptr_val() { +FORCE_INLINE int32_t get_ptr_val() { return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX); - constexpr uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX); - return *reinterpret_cast(addr); } -int32_t get_ptr_val(uint8_t stream_id) { +FORCE_INLINE int32_t get_ptr_val(uint8_t stream_id) { return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX); - const uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX); - return *reinterpret_cast(addr); } // Writing to this register will leverage the built-in stream hardware which will automatically perform an atomic increment @@ -329,25 +329,25 @@ int32_t get_ptr_val(uint8_t stream_id) { // Additionally, these registers are accessible via eth_reg_write calls which can be used to write a value, // inline the eth command (without requiring source L1) template -void increment_local_update_ptr_val(int32_t val) { +FORCE_INLINE void increment_local_update_ptr_val(int32_t val) { NOC_STREAM_WRITE_REG_FIELD(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX, REMOTE_DEST_BUF_WORDS_FREE_INC, val); } -void increment_local_update_ptr_val(uint8_t stream_id, int32_t val) { +FORCE_INLINE void increment_local_update_ptr_val(uint8_t stream_id, int32_t val) { NOC_STREAM_WRITE_REG_FIELD(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX, REMOTE_DEST_BUF_WORDS_FREE_INC, val); } template -void remote_update_ptr_val(int32_t val) { +FORCE_INLINE void remote_update_ptr_val(int32_t val) { constexpr uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); - eth_write_remote_reg(addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); + internal_::eth_write_remote_reg_no_txq_check(DEFAULT_ETH_TXQ, addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); } -void remote_update_ptr_val(uint32_t stream_id, int32_t val) { +FORCE_INLINE void remote_update_ptr_val(uint32_t stream_id, int32_t val) { const uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); - eth_write_remote_reg(addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); + internal_::eth_write_remote_reg_no_txq_check(DEFAULT_ETH_TXQ, addr, val << 
REMOTE_DEST_BUF_WORDS_FREE_INC); } template -void init_ptr_val(int32_t val) { +FORCE_INLINE void init_ptr_val(int32_t val) { NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX, val); } @@ -370,19 +370,19 @@ struct OutboundReceiverChannelPointers { tt::fabric::ChannelBufferPointer ack_ptr; tt::fabric::ChannelBufferPointer completion_ptr; - bool has_space_for_packet() const { + FORCE_INLINE bool has_space_for_packet() const { return completion_ptr.distance_behind(wrptr) < RECEIVER_NUM_BUFFERS; } - bool has_unacknowledged_eth_packets() const { + FORCE_INLINE bool has_unacknowledged_eth_packets() const { return ack_ptr.get_ptr() != wrptr.get_ptr(); } - bool has_incomplete_eth_packets() const { + FORCE_INLINE bool has_incomplete_eth_packets() const { return completion_ptr.get_ptr() != wrptr.get_ptr(); } - bool has_unacknowledged_or_incomplete_eth_packets() const { + FORCE_INLINE bool has_unacknowledged_or_incomplete_eth_packets() const { return has_incomplete_eth_packets() || has_unacknowledged_eth_packets(); } }; @@ -485,25 +485,9 @@ static constexpr size_t worker_info_offset_past_connection_semaphore = 32; // SENDER SIDE HELPERS ///////////////////////////////////////////// -template -void send_channel_sync( - tt::fabric::EthChannelBuffer &sender_buffer_channel, - tt::fabric::ChannelBufferPointer &sender_wrptr, - tt::fabric::EthChannelBuffer &receiver_buffer_channel, - tt::fabric::ChannelBufferPointer &remote_receiver_wrptr - ) { - auto src_addr = sender_buffer_channel.get_bytes_sent_address(sender_wrptr.get_buffer_index()); - auto dest_addr = receiver_buffer_channel.get_bytes_sent_address(remote_receiver_wrptr.get_buffer_index()); - eth_send_bytes_over_channel_payload_only_unsafe( - reinterpret_cast(src_addr), - reinterpret_cast(dest_addr), - sizeof(eth_channel_sync_t), - sizeof(eth_channel_sync_t), - sizeof(eth_channel_sync_t) >> ETH_BYTES_TO_WORDS_SHIFT); -} template -void send_next_data( +FORCE_INLINE void send_next_data( tt::fabric::EthChannelBuffer &sender_buffer_channel, tt::fabric::EdmChannelWorkerInterface &sender_worker_interface, OutboundReceiverChannelPointers &outbound_to_receiver_channel_pointers, @@ -514,7 +498,7 @@ void send_next_data( auto &local_sender_wrptr = sender_worker_interface.local_wrptr; auto local_sender_wrptr_buffer_index = local_sender_wrptr.get_buffer_index(); - ASSERT(!eth_txq_is_busy()); + ASSERT(!internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)); // TODO: TUNING - experiment with only conditionally breaking the transfer up into multiple packets if we are // a certain threshold less than full packet @@ -525,25 +509,19 @@ void send_next_data( auto volatile *pkt_header = reinterpret_cast(sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index)); ASSERT(tt::fabric::is_valid(*const_cast(pkt_header))); - size_t payload_size = 0; - payload_size = pkt_header->get_payload_size_including_header(); + size_t payload_size_bytes = pkt_header->get_payload_size_including_header(); pkt_header->src_ch_id = sender_channel_index; auto src_addr = sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index); auto dest_addr = receiver_buffer_channel.get_buffer_address(remote_receiver_wrptr.get_buffer_index()); - eth_send_bytes_over_channel_payload_only_unsafe( - src_addr, - dest_addr, - payload_size, - payload_size, - payload_size >> ETH_BYTES_TO_WORDS_SHIFT); - + internal_::eth_send_packet_bytes_unsafe(DEFAULT_ETH_TXQ, src_addr, dest_addr, payload_size_bytes); // Note: We can only advance to the next buffer index if we have fully 
completed the send (both the payload and sync // messages) local_sender_wrptr.increment(); // update the remote reg static constexpr uint32_t words_to_forward = 1; + while (internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) {}; remote_update_ptr_val(words_to_forward); remote_receiver_wrptr.increment(); } @@ -560,7 +538,7 @@ void send_next_data( * MUST CHECK !is_eth_txq_busy() before calling */ template -void receiver_send_received_ack( +FORCE_INLINE void receiver_send_received_ack( std::array, NUM_SENDER_CHANNELS> &remote_eth_sender_ackptrs, std::array, NUM_SENDER_CHANNELS> &remote_sender_channels, // currently the pointer is working multiple jobs (ack, completion, read) because we haven't implemented the @@ -594,54 +572,32 @@ FORCE_INLINE void receiver_send_completion_ack( } -PacketLocalForwardType get_packet_local_forward_type(const volatile tt::fabric::PacketHeader &packet_header) { - const bool local_chip_is_packet_destination = packet_must_be_consumed_locally(packet_header); - const bool packet_needs_forwarding = packet_must_be_forwarded_to_next_chip(packet_header); - PacketLocalForwardType forward_type = - static_cast(packet_needs_forwarding << 1 | local_chip_is_packet_destination); - return forward_type; -} - FORCE_INLINE bool can_forward_packet_completely( - const volatile tt::fabric::PacketHeader *packet_header, tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface) { - auto forward_status = get_packet_local_forward_type(*packet_header); - - switch (forward_status) { - case PACKET_FORWARD_INVALID: return false; - case PACKET_FORWARD_LOCAL_ONLY: return true; - - case PACKET_FORWARD_REMOTE_ONLY: - case PACKET_FORWARD_LOCAL_AND_REMOTE: return downstream_edm_interface.edm_has_space_for_packet(); - default: ASSERT(false); return false; - }; + const volatile tt::fabric::PacketHeader* packet_header, + tt::fabric::RoutingFields cached_routing_fields, + tt::fabric::WorkerToFabricEdmSender& downstream_edm_interface) { + // We always check if it is the terminal mcast packet value. We can do this because all unicast packets have the + // mcast terminal value masked in to the routing field. This simplifies the check here to a single compare. + bool deliver_locally_only = cached_routing_fields.value == tt::fabric::RoutingFields::LAST_MCAST_VAL; + return deliver_locally_only || downstream_edm_interface.edm_has_space_for_packet(); } // !!!WARNING!!! - MAKE SURE CONSUMER HAS SPACE BEFORE CALLING -void receiver_forward_packet( - volatile tt::fabric::PacketHeader *packet_start, tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, uint8_t transaction_id) { - // Just cache the packet_header - we don't really expect (or care) if contents change during this function. 
- volatile tt::fabric::PacketHeader const &packet_header = *packet_start; - ASSERT(tt::fabric::is_valid(const_cast(packet_header))); - auto forward_status = get_packet_local_forward_type(packet_header); - switch (forward_status) { - case PACKET_FORWARD_LOCAL_ONLY: { - execute_chip_unicast_to_local_chip(packet_start, transaction_id); - } break; - - case PACKET_FORWARD_REMOTE_ONLY: { - forward_payload_to_downstream_edm(packet_start, downstream_edm_interface, transaction_id); - } break; - - case PACKET_FORWARD_LOCAL_AND_REMOTE: { - ASSERT(packet_header.chip_send_type == tt::fabric::ChipSendType::CHIP_MULTICAST); - // TODO: make local chip write non-blocking - execute_chip_unicast_to_local_chip(packet_start, transaction_id); - forward_payload_to_downstream_edm(packet_start, downstream_edm_interface, transaction_id); - } break; - - case PACKET_FORWARD_INVALID: - default: ASSERT(false); - }; +FORCE_INLINE void receiver_forward_packet( + // TODO: have a separate cached copy of the packet header to save some additional L1 loads + volatile tt::fabric::PacketHeader *packet_start, + tt::fabric::RoutingFields cached_routing_fields, + tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, + uint8_t transaction_id) { + + bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; + if (start_distance_is_terminal_value) { + execute_chip_unicast_to_local_chip(packet_start, transaction_id); + } + bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; + if (not_last_destination_device) { + forward_payload_to_downstream_edm(packet_start, cached_routing_fields, downstream_edm_interface, transaction_id); + } } //////////////////////////////////// @@ -650,7 +606,7 @@ void receiver_forward_packet( //////////////////////////////////// //////////////////////////////////// template -bool run_sender_channel_step( +FORCE_INLINE bool run_sender_channel_step( tt::fabric::EthChannelBuffer &local_sender_channel, tt::fabric::EdmChannelWorkerInterface &local_sender_channel_worker_interface, OutboundReceiverChannelPointers &outbound_to_receiver_channel_pointers, @@ -666,7 +622,7 @@ bool run_sender_channel_step( // when moving to stream regs to manage rd/wr ptrs // TODO: update to be stream reg based. 
Initialize to space available and simply check for non-zero bool receiver_has_space_for_packet = outbound_to_receiver_channel_pointers.has_space_for_packet(); - if (receiver_has_space_for_packet && !eth_txq_is_busy()) { + if (receiver_has_space_for_packet && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) { bool has_unsent_packet = local_sender_channel_worker_interface.has_unsent_payload(); if (has_unsent_packet) { bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS); @@ -695,22 +651,30 @@ bool run_sender_channel_step( outbound_to_receiver_channel_pointers.completion_ptr.increment_n(completions_since_last_check); sender_rdptr.increment_n(completions_since_last_check); increment_local_update_ptr_val(to_sender_packets_completed_streams[sender_channel_index], -completions_since_last_check); + if constexpr (!enable_first_level_ack) { + if (channel_connection_established) { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(sender_rdptr.get_ptr()); + } + } } // Process ACKs from receiver // ACKs are processed second to avoid any sort of races. If we process acks second, // we are guaranteed to see equal to or greater the number of acks than completions - auto acks_since_last_check = get_ptr_val(to_sender_packets_acked_streams[sender_channel_index]); - - auto& sender_ackptr = local_sender_channel_worker_interface.local_ackptr; - if (acks_since_last_check > 0) { - sender_ackptr.increment_n(acks_since_last_check); - if (channel_connection_established) { - local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(); + if constexpr (enable_first_level_ack) { + auto acks_since_last_check = get_ptr_val(to_sender_packets_acked_streams[sender_channel_index]); + auto& sender_ackptr = local_sender_channel_worker_interface.local_ackptr; + if (acks_since_last_check > 0) { + sender_ackptr.increment_n(acks_since_last_check); + if (channel_connection_established) { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(sender_ackptr.get_ptr()); + } + increment_local_update_ptr_val(to_sender_packets_acked_streams[sender_channel_index], -acks_since_last_check); } - increment_local_update_ptr_val(to_sender_packets_acked_streams[sender_channel_index], -acks_since_last_check); + did_something = did_something || (completions_since_last_check + acks_since_last_check) > 0; + } else { + did_something = did_something || (completions_since_last_check > 0); } - did_something = did_something || (completions_since_last_check + acks_since_last_check) > 0; if (!channel_connection_established) { @@ -730,7 +694,11 @@ bool run_sender_channel_step( } did_something = true; channel_connection_established = true; - local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(); + if constexpr (enable_first_level_ack) { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_ackptr.get_ptr()); + } else { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_rdptr.get_ptr()); + } } } else if (local_sender_channel_worker_interface.has_worker_teardown_request()) { did_something = true; @@ -743,7 +711,7 @@ bool run_sender_channel_step( }; template -void run_receiver_channel_step( +FORCE_INLINE void run_receiver_channel_step( tt::fabric::EthChannelBuffer &local_receiver_channel, std::array, NUM_SENDER_CHANNELS> &remote_sender_channnels, 
tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, @@ -757,17 +725,22 @@ void run_receiver_channel_step( auto &ack_ptr = receiver_channel_pointers.ack_ptr; auto pkts_received_since_last_check = get_ptr_val(); bool pkts_received = pkts_received_since_last_check > 0; - bool can_send_over_eth = !eth_txq_is_busy(); - ASSERT(receiver_channel_pointers.completion_ptr.distance_behind(ack_ptr) < RECEIVER_NUM_BUFFERS); - if (pkts_received && can_send_over_eth) { - // currently only support processing one packet at a time, so we only decrement by 1 - increment_local_update_ptr_val(-1); - receiver_send_received_ack( - remote_eth_sender_wrptrs, - remote_sender_channnels, - ack_ptr, - local_receiver_channel); - ack_ptr.increment(); + if constexpr (enable_first_level_ack) { + bool can_send_over_eth = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); + ASSERT(receiver_channel_pointers.completion_ptr.distance_behind(ack_ptr) < RECEIVER_NUM_BUFFERS); + if (pkts_received && can_send_over_eth) { + // currently only support processing one packet at a time, so we only decrement by 1 + increment_local_update_ptr_val(-1); + receiver_send_received_ack( + remote_eth_sender_wrptrs, + remote_sender_channnels, + ack_ptr, + local_receiver_channel); + ack_ptr.increment(); + } + } else { + increment_local_update_ptr_val(-pkts_received_since_last_check); + ack_ptr.increment_n(pkts_received_since_last_check); } auto &wr_sent_ptr = receiver_channel_pointers.wr_sent_ptr; @@ -775,43 +748,64 @@ void run_receiver_channel_step( if (unwritten_packets) { auto receiver_buffer_index = wr_sent_ptr.get_buffer_index(); volatile auto packet_header = local_receiver_channel.get_packet_header(receiver_buffer_index); + + tt::fabric::RoutingFields cached_routing_fields = const_cast(packet_header)->routing_fields; print_pkt_header(packet_header); bool can_send_to_all_local_chip_receivers = - can_forward_packet_completely(packet_header, downstream_edm_interface); + can_forward_packet_completely(packet_header, cached_routing_fields, downstream_edm_interface); bool trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); if (can_send_to_all_local_chip_receivers && trid_flushed) { + // DeviceZoneScopedN("EDMR-Send-Impl"); uint8_t trid = receiver_channel_trid_tracker.update_buffer_slot_to_next_trid_and_advance_trid_counter(receiver_buffer_index); - receiver_forward_packet(packet_header, downstream_edm_interface, trid); + receiver_forward_packet(packet_header, cached_routing_fields, downstream_edm_interface, trid); wr_sent_ptr.increment(); } } - auto &wr_flush_ptr = receiver_channel_pointers.wr_flush_ptr; - bool unflushed_writes = !wr_flush_ptr.is_caught_up_to(wr_sent_ptr); - if (unflushed_writes) { - auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); - // Temporary patch for instability. Issue was not caught due to what appears to be a bug in CI - // not running all tests. 
Issue tracked here: https://github.com/tenstorrent/tt-metal/issues/17702 - bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); - if (next_trid_flushed) { - local_receiver_channel.eth_clear_sender_channel_ack(receiver_buffer_index); - wr_flush_ptr.increment(); - receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index); + if constexpr (!fuse_receiver_flush_and_completion_ptr) { + auto &wr_flush_ptr = receiver_channel_pointers.wr_flush_ptr; + bool unflushed_writes = !wr_flush_ptr.is_caught_up_to(wr_sent_ptr); + if (unflushed_writes) { + auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); + bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); + if (next_trid_flushed) { + wr_flush_ptr.increment(); + receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index); + } } - } - auto &completion_ptr = receiver_channel_pointers.completion_ptr; - bool unsent_completions = !completion_ptr.is_caught_up_to(wr_flush_ptr); - if (unsent_completions) { - bool can_send_without_blocking = !eth_txq_is_busy(); - if (can_send_without_blocking) { - // completion ptr incremented in callee - receiver_send_completion_ack( - remote_eth_sender_wrptrs, - remote_sender_channnels, - completion_ptr, - local_receiver_channel); + auto &completion_ptr = receiver_channel_pointers.completion_ptr; + bool unsent_completions = !completion_ptr.is_caught_up_to(wr_flush_ptr); + if (unsent_completions) { + bool can_send_without_blocking = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); + if (can_send_without_blocking) { + // completion ptr incremented in callee + receiver_send_completion_ack( + remote_eth_sender_wrptrs, + remote_sender_channnels, + completion_ptr, + local_receiver_channel); + } } + } else { + auto &wr_flush_ptr = receiver_channel_pointers.wr_flush_ptr; + // Currently unclear if it's better to loop here or not... Also unclear if merging these + // two pointers is better or not... 
Seems to be maybe 5-10% better merged but need more data + if (!wr_flush_ptr.is_caught_up_to(wr_sent_ptr) && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) { + auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); + bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); + if (next_trid_flushed) { + auto &completion_ptr = receiver_channel_pointers.completion_ptr; + wr_flush_ptr.increment(); + receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index); + receiver_send_completion_ack( + remote_eth_sender_wrptrs, + remote_sender_channnels, + completion_ptr, + local_receiver_channel); + } + } + } }; @@ -1006,7 +1000,7 @@ void kernel_main() { static constexpr size_t sender_channel_0_counters_address = get_compile_time_arg_val(18); static constexpr size_t sender_channel_1_counters_address = get_compile_time_arg_val(19); - static constexpr bool enable_packet_header_recording = get_compile_time_arg_val(20) != 0; + static constexpr bool enable_packet_header_recording = false; //get_compile_time_arg_val(20) != 0; static constexpr size_t receiver_completed_packet_header_cb_address = get_compile_time_arg_val(21); static constexpr size_t receiver_completed_packet_header_cb_size_headers = get_compile_time_arg_val(22); static constexpr size_t sender_0_completed_packet_header_cb_address = get_compile_time_arg_val(23); diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp index a5d8298bbff..2285a6c42cb 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp @@ -24,13 +24,13 @@ template class NamedType { public: - explicit NamedType(T const& value) : value_(value) {} - explicit NamedType(T&& value) : value_(std::move(value)) {} - NamedType &operator=(NamedType const& rhs) = default; - T& get() { return value_; } - T const& get() const {return value_; } - operator T() const { return value_; } - operator T&() { return value_; } + FORCE_INLINE explicit NamedType(T const& value) : value_(value) {} + FORCE_INLINE explicit NamedType(T&& value) : value_(std::move(value)) {} + FORCE_INLINE NamedType &operator=(NamedType const& rhs) = default; + FORCE_INLINE T& get() { return value_; } + FORCE_INLINE T const& get() const {return value_; } + FORCE_INLINE operator T() const { return value_; } + FORCE_INLINE operator T&() { return value_; } private: T value_; }; @@ -41,6 +41,7 @@ using BufferPtr = NamedType; // Increments val and wraps to 0 if it reaches limit template +FORCE_INLINE auto wrap_increment(T val) -> T { static_assert(LIMIT != 0, "wrap_increment called with limit of 0; it must be greater than 0"); constexpr bool is_pow2 = is_power_of_2(LIMIT); @@ -55,6 +56,7 @@ auto wrap_increment(T val) -> T { } } template +FORCE_INLINE auto wrap_increment_n(T val, uint8_t increment) -> T { static_assert(LIMIT != 0, "wrap_increment called with limit of 0; it must be greater than 0"); constexpr bool is_pow2 = is_power_of_2(LIMIT); @@ -72,6 +74,7 @@ auto wrap_increment_n(T val, uint8_t increment) -> T { } template +FORCE_INLINE auto normalize_ptr(BufferPtr ptr) -> BufferIndex { static_assert(NUM_BUFFERS != 0, "normalize_ptr called with NUM_BUFFERS of 0; it must be greater than 0"); constexpr bool is_size_pow2 = (NUM_BUFFERS & (NUM_BUFFERS - 1)) == 0; @@ -112,38 +115,38 @@ class ChannelBufferPointer { 
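The `wrap_increment`/`wrap_increment_n` helpers marked `FORCE_INLINE` above special-case power-of-two limits so the wrap-around becomes a bitwise AND rather than a compare-and-reset. A standalone sketch of that idea (illustrative, not the header's exact code):

```cpp
// Power-of-two wrap trick used by the channel buffer-pointer helpers.
#include <cstdint>

template <uint32_t LIMIT>
constexpr uint32_t wrap_increment_sketch(uint32_t val) {
    static_assert(LIMIT != 0, "limit must be greater than 0");
    if constexpr ((LIMIT & (LIMIT - 1)) == 0) {
        return (val + 1) & (LIMIT - 1);           // power of two: mask does the wrap
    } else {
        return (val == LIMIT - 1) ? 0 : val + 1;  // general case: explicit compare
    }
}

static_assert(wrap_increment_sketch<8>(7) == 0);
static_assert(wrap_increment_sketch<8>(3) == 4);
static_assert(wrap_increment_sketch<6>(5) == 0);
```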
/* * Returns the "raw" pointer - not usable to index the buffer channel */ - BufferPtr get_ptr() const { + FORCE_INLINE BufferPtr get_ptr() const { return this->ptr; } - bool is_caught_up_to(ChannelBufferPointer const& leading_ptr) const { + FORCE_INLINE bool is_caught_up_to(ChannelBufferPointer const& leading_ptr) const { return this->is_caught_up_to(leading_ptr.get_ptr()); } - uint8_t distance_behind(ChannelBufferPointer const& leading_ptr) const { + FORCE_INLINE uint8_t distance_behind(ChannelBufferPointer const& leading_ptr) const { return this->distance_behind(leading_ptr.get_ptr()); } /* * Returns the buffer index pointer which is usable to index into the buffer memory */ - BufferIndex get_buffer_index() const { + FORCE_INLINE BufferIndex get_buffer_index() const { return BufferIndex{normalize_ptr(this->ptr)}; } - void increment_n(uint8_t n) { + FORCE_INLINE void increment_n(uint8_t n) { this->ptr = BufferPtr{wrap_increment_n<2*NUM_BUFFERS>(this->ptr.get(), n)}; } - void increment() { + FORCE_INLINE void increment() { this->ptr = wrap_increment<2*NUM_BUFFERS>(this->ptr); } private: // Make these private to make sure caller doesn't accidentally mix two pointers pointing to // different sized channels - bool is_caught_up_to(BufferPtr const& leading_ptr) const { + FORCE_INLINE bool is_caught_up_to(BufferPtr const& leading_ptr) const { return this->get_ptr() == leading_ptr; } - uint8_t distance_behind(BufferPtr const& leading_ptr) const { + FORCE_INLINE uint8_t distance_behind(BufferPtr const& leading_ptr) const { bool leading_gte_trailing_ptr = leading_ptr >= this->ptr; if constexpr (is_size_pow2) { return (leading_ptr - this->ptr) & ptr_wrap_mask; @@ -175,7 +178,7 @@ class EthChannelBuffer final { // &channel_sync-> |----------------| // | channel_sync | // ------------------ - EthChannelBuffer() : buffer_size_in_bytes(0), eth_transaction_ack_word_addr(0), max_eth_payload_size_in_bytes(0) {} + EthChannelBuffer() : buffer_size_in_bytes(0), max_eth_payload_size_in_bytes(0) {} /* * Expected that *buffer_index_ptr is initialized outside of this object @@ -188,30 +191,11 @@ class EthChannelBuffer final { // that can fit 2 eth_channel_syncs cfor ack uint8_t channel_id) : buffer_size_in_bytes(buffer_size_bytes), - eth_transaction_ack_word_addr(eth_transaction_ack_word_addr), max_eth_payload_size_in_bytes(buffer_size_in_bytes + sizeof(eth_channel_sync_t)), channel_id(channel_id) { for (uint8_t i = 0; i < NUM_BUFFERS; i++) { this->buffer_addresses[i] = channel_base_address + i * this->max_eth_payload_size_in_bytes; - - uint32_t channel_sync_addr = this->buffer_addresses[i] + buffer_size_in_bytes; - auto channel_sync_ptr = reinterpret_cast(channel_sync_addr); - - channel_bytes_sent_addresses[i] = - reinterpret_cast(&(channel_sync_ptr->bytes_sent)); - channel_bytes_acked_addresses[i] = - reinterpret_cast(&(channel_sync_ptr->receiver_ack)); - channel_src_id_addresses[i] = reinterpret_cast(&(channel_sync_ptr->src_id)); - - ASSERT((uint32_t)channel_bytes_acked_addresses[i] != (uint32_t)(channel_bytes_sent_addresses[i])); - *(channel_bytes_sent_addresses[i]) = 0; - *(channel_bytes_acked_addresses[i]) = 0; - *(channel_src_id_addresses[i]) = 0x1c0ffee1; - (channel_src_id_addresses[i])[1] = 0x1c0ffee2; - - // Note we don't need to overwrite the `channel_src_id_addresses` except for perhapse - // debug purposes where we may wish to tag this with a special value } } @@ -226,22 +210,6 @@ class EthChannelBuffer final { [[nodiscard]] FORCE_INLINE size_t get_payload_size(BufferIndex const& buffer_index) const 
{ return get_packet_header(buffer_index)->get_payload_size_including_header(); } - [[nodiscard]] FORCE_INLINE size_t get_payload_plus_channel_sync_size(BufferIndex const& buffer_index) const { - return get_packet_header(buffer_index)->get_payload_size_including_header() + sizeof(eth_channel_sync_t); - } - - [[nodiscard]] FORCE_INLINE volatile tt_l1_ptr size_t *get_bytes_sent_address(BufferIndex const& buffer_index) const { - return this->channel_bytes_sent_addresses[buffer_index]; - } - - [[nodiscard]] FORCE_INLINE volatile tt_l1_ptr size_t *get_bytes_acked_address(BufferIndex const& buffer_index) const { - return this->channel_bytes_acked_addresses[buffer_index]; - } - - [[nodiscard]] FORCE_INLINE volatile tt_l1_ptr size_t *get_src_id_address(BufferIndex const& buffer_index) const { - return this->channel_src_id_addresses[buffer_index]; - } - [[nodiscard]] FORCE_INLINE size_t get_channel_buffer_max_size_in_bytes(BufferIndex const& buffer_index) const { return this->buffer_size_in_bytes; } @@ -253,57 +221,30 @@ class EthChannelBuffer final { [[nodiscard]] FORCE_INLINE size_t get_id() const { return this->channel_id; } - [[nodiscard]] FORCE_INLINE bool eth_is_receiver_channel_send_done(BufferIndex const& buffer_index) const { - return *(this->get_bytes_sent_address(buffer_index)) == 0; - } - [[nodiscard]] FORCE_INLINE bool eth_bytes_are_available_on_channel(BufferIndex const& buffer_index) const { - return *(this->get_bytes_sent_address(buffer_index)) != 0; - } - [[nodiscard]] FORCE_INLINE bool eth_is_receiver_channel_send_acked(BufferIndex const& buffer_index) const { - return *(this->get_bytes_acked_address(buffer_index)) != 0; - } - FORCE_INLINE void eth_clear_sender_channel_ack(BufferIndex const& buffer_index) const { - *(this->channel_bytes_acked_addresses[buffer_index]) = 0; - } [[nodiscard]] FORCE_INLINE bool eth_is_acked_or_completed(BufferIndex const& buffer_index) const { return eth_is_receiver_channel_send_acked(buffer_index) || eth_is_receiver_channel_send_done(buffer_index); } - [[nodiscard]] FORCE_INLINE size_t get_eth_transaction_ack_word_addr() const { - return this->eth_transaction_ack_word_addr; - } - - [[nodiscard]] FORCE_INLINE bool all_buffers_drained() const { - bool drained = true; - for (size_t i = 0; i < NUM_BUFFERS && drained; i++) { - drained &= *(channel_bytes_sent_addresses[i]) == 0; - } - return drained; - } - bool needs_to_send_channel_sync() const { + FORCE_INLINE bool needs_to_send_channel_sync() const { return this->need_to_send_channel_sync; } - void set_need_to_send_channel_sync(bool need_to_send_channel_sync) { + FORCE_INLINE void set_need_to_send_channel_sync(bool need_to_send_channel_sync) { this->need_to_send_channel_sync = need_to_send_channel_sync; } - void clear_need_to_send_channel_sync() { + FORCE_INLINE void clear_need_to_send_channel_sync() { this->need_to_send_channel_sync = false; } private: std::array buffer_addresses; - std::array channel_bytes_sent_addresses; - std::array channel_bytes_acked_addresses; - std::array channel_src_id_addresses; // header + payload regions only const std::size_t buffer_size_in_bytes; // Includes header + payload + channel_sync - const std::size_t eth_transaction_ack_word_addr; const std::size_t max_eth_payload_size_in_bytes; uint8_t channel_id; }; @@ -354,11 +295,11 @@ struct EdmChannelWorkerInterface { return worker_location_info_ptr->worker_semaphore_address; } - FORCE_INLINE void update_worker_copy_of_read_ptr() { + FORCE_INLINE void update_worker_copy_of_read_ptr(BufferPtr new_ptr_val) { auto const 
&worker_info = *worker_location_info_ptr; uint64_t worker_semaphore_address = get_noc_addr( (uint32_t)worker_info.worker_xy.x, (uint32_t)worker_info.worker_xy.y, worker_info.worker_semaphore_address); - noc_inline_dw_write(worker_semaphore_address, local_ackptr.get_ptr()); + noc_inline_dw_write(worker_semaphore_address, new_ptr_val); } // Connection management methods @@ -376,15 +317,15 @@ struct EdmChannelWorkerInterface { noc_semaphore_inc(worker_semaphore_address, 1); } - bool all_eth_packets_acked() const { + FORCE_INLINE bool all_eth_packets_acked() const { return this->local_ackptr.is_caught_up_to(this->local_wrptr); } - bool all_eth_packets_completed() const { + FORCE_INLINE bool all_eth_packets_completed() const { return this->local_rdptr.is_caught_up_to(this->local_wrptr); } // Call to keep the connection flow control info fresh with worker. - void propagate_ackptr_to_connection_info() { + FORCE_INLINE void propagate_ackptr_to_connection_info() { worker_location_info_ptr->edm_rdptr = local_ackptr.get_ptr(); } diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp index a281806cafc..641e6cee244 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp @@ -20,11 +20,8 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_noc_addr); const size_t payload_l1_address = l1_read_addr; - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr_forward->to_noc_unicast_write( - tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); - pkt_hdr_backward->to_noc_unicast_write( - tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); + pkt_hdr_forward->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); + pkt_hdr_backward->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); noc_async_write(payload_l1_address, safe_get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr), payload_size_bytes); if (fabric_connection.has_forward_connection()) { From 88bd40253bb76a09e110dd5af499110e257ea735 Mon Sep 17 00:00:00 2001 From: Kyle Mabee Date: Sat, 8 Feb 2025 20:44:17 +0000 Subject: [PATCH 056/316] LightMetal - Store Program obj by id instead of ptr at capture time (Issue #17761) - Solves std::move() on Program in ttnn path create_or_get_program_from_cache() from invalidating addr already captured. Just use the unique ID instead. 
- Only impacts lightmetal cpp unit tests so far, but will help with upcoming python ttnn tests --- .../impl/lightmetal/lightmetal_capture.cpp | 18 +++++++++--------- .../impl/lightmetal/lightmetal_capture.hpp | 5 +++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tt_metal/impl/lightmetal/lightmetal_capture.cpp b/tt_metal/impl/lightmetal/lightmetal_capture.cpp index c1c7d4e4dee..8ac29b15e33 100644 --- a/tt_metal/impl/lightmetal/lightmetal_capture.cpp +++ b/tt_metal/impl/lightmetal/lightmetal_capture.cpp @@ -62,7 +62,7 @@ void LightMetalCaptureContext::reset() { cmds_vec_.clear(); trace_descs_vec_.clear(); buffer_to_global_id_map_.clear(); - program_to_global_id_map_.clear(); + program_id_to_global_id_map_.clear(); kernel_to_global_id_map_.clear(); cb_handle_to_global_id_map_.clear(); } @@ -101,31 +101,31 @@ uint32_t LightMetalCaptureContext::get_global_id(const Buffer* obj) { } bool LightMetalCaptureContext::is_in_map(const Program* obj) { - return program_to_global_id_map_.find(obj) != program_to_global_id_map_.end(); + return program_id_to_global_id_map_.find(obj->get_id()) != program_id_to_global_id_map_.end(); } uint32_t LightMetalCaptureContext::add_to_map(const Program* obj) { if (is_in_map(obj)) { - log_warning(tt::LogMetalTrace, "Program already exists in global_id map."); + log_warning(tt::LogMetalTrace, "Program id: {} already exists in global_id map.", obj->get_id()); } uint32_t global_id = next_global_id_++; - program_to_global_id_map_[obj] = global_id; + program_id_to_global_id_map_[obj->get_id()] = global_id; return global_id; } void LightMetalCaptureContext::remove_from_map(const Program* obj) { if (!is_in_map(obj)) { - log_warning(tt::LogMetalTrace, "Program not found in global_id map."); + log_warning(tt::LogMetalTrace, "Program id: {} not found in global_id map.", obj->get_id()); } - program_to_global_id_map_.erase(obj); + program_id_to_global_id_map_.erase(obj->get_id()); } uint32_t LightMetalCaptureContext::get_global_id(const Program* obj) { - auto it = program_to_global_id_map_.find(obj); - if (it != program_to_global_id_map_.end()) { + auto it = program_id_to_global_id_map_.find(obj->get_id()); + if (it != program_id_to_global_id_map_.end()) { return it->second; } else { - TT_THROW("Program not found in global_id map."); + TT_THROW("Program id: {} not found in global_id map.", obj->get_id()); } } diff --git a/tt_metal/impl/lightmetal/lightmetal_capture.hpp b/tt_metal/impl/lightmetal/lightmetal_capture.hpp index 3712e666108..78c22a0e268 100644 --- a/tt_metal/impl/lightmetal/lightmetal_capture.hpp +++ b/tt_metal/impl/lightmetal/lightmetal_capture.hpp @@ -73,10 +73,11 @@ class LightMetalCaptureContext { std::vector> cmds_vec_; std::vector trace_descs_vec_; - // Object maps for associating each object with a global_id + // Object maps for associating each object (or identifier) with a global_id + // TODO (kmabee) - upgrade all global_id to be uint64_t for capture + replay. uint32_t next_global_id_ = 0; // Shared across all object types. std::unordered_map buffer_to_global_id_map_; - std::unordered_map program_to_global_id_map_; + std::unordered_map program_id_to_global_id_map_; std::unordered_map kernel_to_global_id_map_; std::unordered_map cb_handle_to_global_id_map_; // TODO (kmabee) - consider adding map for CommandQueue object. 
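The capture-context change above keys the map by the program's unique id instead of by `Program*`, because the pointer recorded at capture time is invalidated when the `Program` is moved in the ttnn path, while the id stays stable. A minimal sketch of why the id key survives a move (`ProgramSketch` is a stand-in, not the tt-metal class):

```cpp
// Stand-in type only; illustrates keying by a stable id rather than by address.
#include <cstdint>
#include <unordered_map>
#include <utility>

struct ProgramSketch {
    uint64_t id;
    uint64_t get_id() const { return id; }
};

int main() {
    std::unordered_map<uint64_t, uint32_t> program_id_to_global_id;

    ProgramSketch p{42};
    program_id_to_global_id[p.get_id()] = 7;  // recorded at capture time

    // A map keyed by &p would now hold a stale address; the id still resolves.
    ProgramSketch moved = std::move(p);
    return program_id_to_global_id.count(moved.get_id()) == 1 ? 0 : 1;
}
```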
From 02fb2125f3fe1f05afd38bcea7993ccb7df87313 Mon Sep 17 00:00:00 2001 From: Daiki Aminaka Date: Mon, 10 Feb 2025 21:34:36 -0800 Subject: [PATCH 057/316] Refactoring same definitions (#17747) ### Ticket N/A ### Problem description There are same definitions spreading to multiple files. The name is overwrapping with other file's one, so refactoring to make it really unique ### What's changed Fix name - PACKET_QUEUE_TEST to TT_FABRIC_STATUS - PQ_TEST to TT_FABRIC - move common test utilities to test_common.hpp ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../dispatch/test_prefetcher.cpp | 1 - .../routing/kernels/traffic_gen.hpp | 2 +- .../routing/kernels/traffic_gen_test.hpp | 41 ---------- .../routing/kernels/traffic_gen_tx.cpp | 1 + .../routing/kernels/tt_fabric_traffic_gen.hpp | 2 +- .../kernels/tt_fabric_traffic_gen_rx.cpp | 20 ++--- .../kernels/tt_fabric_traffic_gen_test.hpp | 78 ------------------- .../routing/kernels/tt_fabric_tx_ubench.cpp | 14 ++-- .../routing/test_common.hpp | 9 +++ .../routing/test_mux_demux.cpp | 3 +- .../routing/test_mux_demux_2level.cpp | 2 +- .../test_tt_fabric_multi_hop_sanity.cpp | 26 +++---- .../routing/test_tt_fabric_sanity.cpp | 35 +++++---- .../routing/test_tt_fabric_socket_sanity.cpp | 26 +++---- .../routing/test_tx_rx.cpp | 2 +- .../routing/test_vc_bi_tunnel_2ep.cpp | 3 +- .../routing/test_vc_bi_tunnel_4ep.cpp | 3 +- .../routing/test_vc_loopback_tunnel.cpp | 3 +- .../routing/test_vc_mux_demux.cpp | 3 +- .../routing/test_vc_uni_tunnel.cpp | 3 +- tt_fabric/hw/inc/tt_fabric_status.h | 45 +++++++++++ .../impl/kernels/tt_fabric_gatekeeper.cpp | 35 +++------ tt_fabric/impl/kernels/tt_fabric_router.cpp | 49 +++++------- .../dispatch/kernels/packet_queue_ctrl.hpp | 11 +++ 24 files changed, 162 insertions(+), 255 deletions(-) delete mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp delete mode 100644 tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_test.hpp create mode 100644 tt_fabric/hw/inc/tt_fabric_status.h diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index ab2483709e2..0b1dc88bec3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -17,7 +17,6 @@ #include "common.h" #include "tt_cluster.hpp" #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp" #include #include "llrt.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp 
b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp index a255f46c798..01b9dedaae2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp @@ -5,7 +5,7 @@ #pragma once #include "debug/dprint.h" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp" +#include "tt_fabric/hw/inc/tt_fabric_status.h" inline uint32_t prng_next(uint32_t n) { uint32_t x = n; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp deleted file mode 100644 index 6e28268ef98..00000000000 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include -#include - -inline const char* packet_queue_test_status_to_string(uint32_t status) { - switch (status) { - case PACKET_QUEUE_TEST_STARTED: return "STARTED"; - case PACKET_QUEUE_TEST_PASS: return "DONE/OK"; - case PACKET_QUEUE_TEST_TIMEOUT: return "TIMEOUT"; - case PACKET_QUEUE_TEST_DATA_MISMATCH: return "DATA_MISMATCH"; - default: return "UNKNOWN"; - } -} - -inline uint64_t get_64b_result(uint32_t* buf, uint32_t index) { - return (((uint64_t)buf[index]) << 32) | buf[index + 1]; -} - -inline uint64_t get_64b_result(const std::vector& vec, uint32_t index) { - return (((uint64_t)vec[index]) << 32) | vec[index + 1]; -} - -#define TX_TEST_IDX_TOT_DATA_WORDS PQ_TEST_MISC_INDEX + 1 -#define TX_TEST_IDX_NPKT PQ_TEST_MISC_INDEX + 3 -#define TX_TEST_IDX_WORDS_FLUSHED PQ_TEST_MISC_INDEX + 5 -#define TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER PQ_TEST_MISC_INDEX + 7 -#define TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER PQ_TEST_MISC_INDEX + 9 -#define TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER PQ_TEST_MISC_INDEX + 11 -// #define TX_TEST_IDX_ PQ_TEST_MISC_INDEX + -// #define TX_TEST_IDX_ PQ_TEST_MISC_INDEX + - -enum class pkt_dest_size_choices_t { - RANDOM = 0, - SAME_START_RNDROBIN_FIX_SIZE = 1 // max packet size used -}; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp index 24a7decd1bd..57812ccde36 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp @@ -5,6 +5,7 @@ #include "dataflow_api.h" #include "debug/dprint.h" #include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" +#include "tt_fabric/hw/inc/tt_fabric_status.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp" constexpr uint32_t src_endpoint_id = get_compile_time_arg_val(0); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp index 23a32149192..19fcdc79dbd 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp @@ -5,7 +5,7 @@ #pragma once #include "debug/dprint.h" -#include 
"tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_test.hpp" +#include "tt_fabric/hw/inc/tt_fabric_status.h" #define is_power_of_2(x) (((x) > 0) && (((x) & ((x) - 1)) == 0)) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx.cpp index efdb7aa794c..4c29d8b4ef9 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx.cpp @@ -7,7 +7,7 @@ #include "dataflow_api.h" #include "tt_fabric/hw/inc/tt_fabric.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_test.hpp" +#include "tt_fabric/hw/inc/tt_fabric_status.h" #include "tt_fabric/hw/inc/tt_fabric_interface.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" // clang-format on @@ -61,8 +61,8 @@ void kernel_main() { rx_buf_size = get_arg_val(increment_arg_idx(rt_args_idx)); zero_l1_buf(test_results, test_results_size_bytes); - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; - test_results[PQ_TEST_MISC_INDEX] = 0xff000000; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_STARTED; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000000; if constexpr (ASYNC_WR & test_command) { uint32_t packet_rnd_seed; @@ -174,9 +174,9 @@ void kernel_main() { read_addr, curr_payload_words, start_val, mismatch_addr, mismatch_val, expected_val); if (!match) { async_wr_check_failed = true; - test_results[PQ_TEST_MISC_INDEX + 12] = mismatch_addr; - test_results[PQ_TEST_MISC_INDEX + 13] = mismatch_val; - test_results[PQ_TEST_MISC_INDEX + 14] = expected_val; + test_results[TT_FABRIC_MISC_INDEX + 12] = mismatch_addr; + test_results[TT_FABRIC_MISC_INDEX + 13] = mismatch_val; + test_results[TT_FABRIC_MISC_INDEX + 14] = expected_val; break; } } @@ -200,13 +200,13 @@ void kernel_main() { } // write out results - set_64b_result(test_results, processed_packet_words, PQ_TEST_WORD_CNT_INDEX); + set_64b_result(test_results, processed_packet_words, TT_FABRIC_WORD_CNT_INDEX); set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT); if (async_wr_check_failed) { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_DATA_MISMATCH; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_DATA_MISMATCH; } else { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; - test_results[PQ_TEST_MISC_INDEX] = 0xff000005; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_PASS; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000005; } } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_test.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_test.hpp deleted file mode 100644 index ac4ebaee8e3..00000000000 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_test.hpp +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -//#include "tt_metal/impl/dispatch/kernels/tt_fabric.hpp" -#include -#include - -constexpr uint32_t PACKET_QUEUE_STAUS_MASK = 0xabc00000; -constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STAUS_MASK | 0x0; -constexpr uint32_t PACKET_QUEUE_TEST_PASS = PACKET_QUEUE_STAUS_MASK | 0x1; -constexpr uint32_t PACKET_QUEUE_TEST_TIMEOUT = PACKET_QUEUE_STAUS_MASK | 0xdead0; -constexpr uint32_t PACKET_QUEUE_TEST_BAD_HEADER = PACKET_QUEUE_STAUS_MASK | 0xdead1; -constexpr uint32_t PACKET_QUEUE_TEST_DATA_MISMATCH = PACKET_QUEUE_STAUS_MASK | 0x3; - -// indexes of return values in test results buffer -constexpr uint32_t PQ_TEST_STATUS_INDEX = 0; -constexpr uint32_t PQ_TEST_WORD_CNT_INDEX = 2; -constexpr uint32_t PQ_TEST_CYCLES_INDEX = 4; -constexpr uint32_t PQ_TEST_ITER_INDEX = 6; -constexpr uint32_t PQ_TEST_MISC_INDEX = 16; - -/* -inline const char *packet_queue_test_status_to_string(uint32_t status) { - switch (status) { - case TT_FABRIC_TEST_STARTED: - return "STARTED"; - case TT_FABRIC_TEST_PASS: - return "DONE/OK"; - case TT_FABRIC_TEST_TIMEOUT: - return "TIMEOUT"; - case TT_FABRIC_TEST_DATA_MISMATCH: - return "DATA_MISMATCH"; - default: - return "UNKNOWN"; - } -} -*/ - -inline const char *packet_queue_test_status_to_string(uint32_t status) { - switch (status) { - case PACKET_QUEUE_TEST_STARTED: - return "STARTED"; - case PACKET_QUEUE_TEST_PASS: - return "DONE/OK"; - case PACKET_QUEUE_TEST_TIMEOUT: - return "TIMEOUT"; - case PACKET_QUEUE_TEST_BAD_HEADER: return "BAD_PACKET_HEADER"; - case PACKET_QUEUE_TEST_DATA_MISMATCH: - return "DATA_MISMATCH"; - default: - return "UNKNOWN"; - } -} - -inline uint64_t get_64b_result(uint32_t* buf, uint32_t index) { - return (((uint64_t)buf[index]) << 32) | buf[index+1]; -} - -inline uint64_t get_64b_result(const std::vector& vec, uint32_t index) { - return (((uint64_t)vec[index]) << 32) | vec[index+1]; -} - -#define TX_TEST_IDX_TOT_DATA_WORDS PQ_TEST_MISC_INDEX + 1 -#define TX_TEST_IDX_NPKT PQ_TEST_MISC_INDEX + 3 -#define TX_TEST_IDX_WORDS_FLUSHED PQ_TEST_MISC_INDEX + 5 -#define TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER PQ_TEST_MISC_INDEX + 7 -#define TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER PQ_TEST_MISC_INDEX + 9 -#define TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER PQ_TEST_MISC_INDEX + 11 -// #define TX_TEST_IDX_ PQ_TEST_MISC_INDEX + -// #define TX_TEST_IDX_ PQ_TEST_MISC_INDEX + - -enum class pkt_dest_size_choices_t { - RANDOM=0, - SAME_START_RNDROBIN_FIX_SIZE=1 // max packet size used -}; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp index 0832c67a7c1..d9991ed8b67 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp @@ -107,9 +107,9 @@ void kernel_main() { target_address = base_target_address; zero_l1_buf(test_results, test_results_size_bytes); - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; - test_results[PQ_TEST_MISC_INDEX] = 0xff000000; - test_results[PQ_TEST_MISC_INDEX + 1] = 0xcc000000 | src_endpoint_id; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_STARTED; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000000; + test_results[TT_FABRIC_MISC_INDEX + 1] = 0xcc000000 | src_endpoint_id; zero_l1_buf( reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); @@ -199,11 +199,11 @@ 
void kernel_main() { uint64_t cycles_elapsed = get_timestamp() - start_timestamp; uint64_t num_packets = packet_count; - set_64b_result(test_results, data_words_sent, PQ_TEST_WORD_CNT_INDEX); - set_64b_result(test_results, cycles_elapsed, PQ_TEST_CYCLES_INDEX); + set_64b_result(test_results, data_words_sent, TT_FABRIC_WORD_CNT_INDEX); + set_64b_result(test_results, cycles_elapsed, TT_FABRIC_CYCLES_INDEX); set_64b_result(test_results, total_data_words, TX_TEST_IDX_TOT_DATA_WORDS); set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT); - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; - test_results[PQ_TEST_MISC_INDEX] = packet_count; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_PASS; + test_results[TT_FABRIC_MISC_INDEX] = packet_count; } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp index fa061868bca..f055d0a9833 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp @@ -6,6 +6,7 @@ #include #include +#include "hw/inc/tt_fabric_status.h" #include "llrt.hpp" static inline std::string to_string(pkt_dest_size_choices_t choice) { @@ -25,3 +26,11 @@ static inline void log_phys_coord_to_json(nlohmann::json& config, const std::vec static inline void log_phys_coord_to_json(nlohmann::json& config, const CoreCoord& phys_core, const std::string& name) { config[name] = fmt::format("({}, {})", phys_core.x, phys_core.y); } + +inline uint64_t get_64b_result(uint32_t* buf, uint32_t index) { + return (((uint64_t)buf[index]) << 32) | buf[index+1]; +} + +inline uint64_t get_64b_result(const std::vector& vec, uint32_t index) { + return (((uint64_t)vec[index]) << 32) | vec[index+1]; +} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp index 03f804ce55f..05a35add66a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp @@ -8,8 +8,7 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "test_common.hpp" #include "llrt.hpp" using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp index 63105c881cc..dc4a8f132fd 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp @@ -7,7 +7,7 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" +#include "test_common.hpp" #include "llrt.hpp" using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index 096370e0c1b..8ac6dbd69b3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -9,8 +9,8 @@ #include "tt_fabric/control_plane.hpp" // #include // #include 
"tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/tt_fabric_traffic_gen_test.hpp" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "test_common.hpp" #include "eth_l1_address_map.h" #include "tt_fabric/hw/inc/tt_fabric_interface.h" @@ -542,12 +542,8 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_src_endpoints; i++) { tx_results.push_back(tt::llrt::read_hex_vec_from_core( device_map[test_device_id_l]->id(), tx_phys_core[i], test_results_addr, 128)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + log_info(LogTest, "TX{} status = {}", i, tt_fabric_status_to_string(tx_results[i][TT_FABRIC_STATUS_INDEX])); + pass &= (tx_results[i][TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); } /* TODO: Need to add these once control plane api is available to @@ -556,15 +552,15 @@ int main(int argc, char** argv) { tt::llrt::read_hex_vec_from_core( device_map[test_device_id_l]->id(), tunneler_phys_core, tunneler_test_results_addr, 128); log_info(LogTest, "L Router status = {}", - packet_queue_test_status_to_string(router_results[PQ_TEST_STATUS_INDEX])); pass &= - (router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + tt_fabric_status_to_string(router_results[TT_FABRIC_STATUS_INDEX])); pass &= + (router_results[TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); vector r_router_results = tt::llrt::read_hex_vec_from_core( device_map[test_device_id_r]->id(), r_tunneler_phys_core, tunneler_test_results_addr, 128); log_info(LogTest, "R Router status = {}", - packet_queue_test_status_to_string(r_router_results[PQ_TEST_STATUS_INDEX])); pass &= - (r_router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + tt_fabric_status_to_string(r_router_results[TT_FABRIC_STATUS_INDEX])); pass &= + (r_router_results[TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); */ for (auto active_device : device_map) { pass &= tt_metal::CloseDevice(active_device.second); @@ -575,12 +571,12 @@ int main(int argc, char** argv) { uint64_t total_tx_words_sent = 0; uint64_t total_rx_words_checked = 0; for (uint32_t i = 0; i < num_src_endpoints; i++) { - uint64_t tx_words_sent = get_64b_result(tx_results[i], PQ_TEST_WORD_CNT_INDEX); + uint64_t tx_words_sent = get_64b_result(tx_results[i], TT_FABRIC_WORD_CNT_INDEX); total_tx_words_sent += tx_words_sent; - uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], PQ_TEST_CYCLES_INDEX); + uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], TT_FABRIC_CYCLES_INDEX); double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; total_tx_bw += tx_bw; - uint64_t iter = get_64b_result(tx_results[i], PQ_TEST_ITER_INDEX); + uint64_t iter = get_64b_result(tx_results[i], TT_FABRIC_ITER_INDEX); // uint64_t zero_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); // uint64_t few_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER); // uint64_t many_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index 052f8b39ed8..a0e91bd4dc2 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -10,8 +10,8 @@ #include "tt_fabric/mesh_graph.hpp" //#include //#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/tt_fabric_traffic_gen_test.hpp" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "test_common.hpp" #include "eth_l1_address_map.h" #include "tt_fabric/hw/inc/tt_fabric_interface.h" #include @@ -869,7 +869,8 @@ typedef struct test_traffic { num_cores_to_skip = (num_rx_workers + num_links_to_use - 1) / num_links_to_use; } // Assumes uniform worker grid across receiver chips - rx_workers = rx_devices[0]->select_worker_cores(dest_routers, num_links_to_use, num_rx_workers, num_cores_to_skip); + rx_workers = + rx_devices[0]->select_worker_cores(dest_routers, num_links_to_use, num_rx_workers, num_cores_to_skip); // TODO: not the most optimum selection, might impact somewhat in bidirectional mode controller_logical_core = tx_device->select_random_worker_cores(1)[0]; @@ -1085,8 +1086,8 @@ typedef struct test_traffic { tx_device->physical_chip_id, (uint32_t)tx_device->logical_chip_id, i, - packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + tt_fabric_status_to_string(tx_results[i][TT_FABRIC_STATUS_INDEX])); + pass &= (tx_results[i][TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); } // collect rx results @@ -1101,8 +1102,8 @@ typedef struct test_traffic { rx_devices[d]->physical_chip_id, (uint32_t)rx_devices[d]->logical_chip_id, i, - packet_queue_test_status_to_string(rx_results[d][i][PQ_TEST_STATUS_INDEX])); - pass &= (rx_results[d][i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + tt_fabric_status_to_string(rx_results[d][i][TT_FABRIC_STATUS_INDEX])); + pass &= (rx_results[d][i][TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); } } @@ -1120,10 +1121,10 @@ typedef struct test_traffic { num_tx_packets = 0; for (auto j : rx_to_tx_map[i]) { - num_tx_words += get_64b_result(tx_results[j], PQ_TEST_WORD_CNT_INDEX); + num_tx_words += get_64b_result(tx_results[j], TT_FABRIC_WORD_CNT_INDEX); num_tx_packets += get_64b_result(tx_results[j], TX_TEST_IDX_NPKT); } - pass &= (get_64b_result(rx_results[d][i], PQ_TEST_WORD_CNT_INDEX) == num_tx_words); + pass &= (get_64b_result(rx_results[d][i], TT_FABRIC_WORD_CNT_INDEX) == num_tx_words); pass &= (get_64b_result(rx_results[d][i], TX_TEST_IDX_NPKT) == num_tx_packets); if (!pass) { @@ -1142,12 +1143,12 @@ typedef struct test_traffic { uint64_t total_rx_words_checked = 0; uint64_t max_tx_elapsed_cycles = 0; for (uint32_t i = 0; i < num_tx_workers; i++) { - uint64_t tx_words_sent = get_64b_result(tx_results[i], PQ_TEST_WORD_CNT_INDEX); + uint64_t tx_words_sent = get_64b_result(tx_results[i], TT_FABRIC_WORD_CNT_INDEX); total_tx_words_sent += tx_words_sent; - uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], PQ_TEST_CYCLES_INDEX); + uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], TT_FABRIC_CYCLES_INDEX); double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; total_tx_bw += tx_bw; - uint64_t iter = get_64b_result(tx_results[i], PQ_TEST_ITER_INDEX); + uint64_t iter = get_64b_result(tx_results[i], TT_FABRIC_ITER_INDEX); max_tx_elapsed_cycles = std::max(max_tx_elapsed_cycles, tx_elapsed_cycles); // uint64_t zero_data_sent_iter = 
get_64b_result(tx_results[i], TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); // uint64_t few_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER); @@ -1182,7 +1183,7 @@ typedef struct test_traffic { total_tx_bw_2 = ((double)total_tx_words_sent) * PACKET_WORD_SIZE_BYTES / max_tx_elapsed_cycles; for (uint32_t d = 0; d < rx_devices.size(); d++) { for (uint32_t i = 0; i < num_rx_workers; i++) { - uint64_t words_received = get_64b_result(rx_results[d][i], PQ_TEST_WORD_CNT_INDEX); + uint64_t words_received = get_64b_result(rx_results[d][i], TT_FABRIC_WORD_CNT_INDEX); uint32_t num_tx = rx_to_tx_map[i].size(); log_info( LogTest, @@ -1761,15 +1762,15 @@ int main(int argc, char **argv) { tt::llrt::read_hex_vec_from_core( device->id(), tunneler_phys_core, tunneler_test_results_addr, 128); log_info(LogTest, "L Router status = {}", - packet_queue_test_status_to_string(router_results[PQ_TEST_STATUS_INDEX])); pass &= - (router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + tt_fabric_status_to_string(router_results[TT_FABRIC_STATUS_INDEX])); pass &= + (router_results[TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); vector r_router_results = tt::llrt::read_hex_vec_from_core( device_r->id(), r_tunneler_phys_core, tunneler_test_results_addr, 128); log_info(LogTest, "R Router status = {}", - packet_queue_test_status_to_string(r_router_results[PQ_TEST_STATUS_INDEX])); pass &= - (r_router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + tt_fabric_status_to_string(r_router_results[TT_FABRIC_STATUS_INDEX])); pass &= + (r_router_results[TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); */ // close devices diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp index 14425045b9f..cf140eeaf80 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp @@ -9,8 +9,8 @@ #include "tt_fabric/control_plane.hpp" // #include "tt_metal/impl/dispatch/cq_commands.hpp" // #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/tt_fabric_traffic_gen_test.hpp" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "test_common.hpp" #include "eth_l1_address_map.h" #include "tt_fabric/hw/inc/tt_fabric_interface.h" @@ -577,12 +577,8 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i < num_src_endpoints; i++) { tx_results.push_back(tt::llrt::read_hex_vec_from_core( device_map[test_device_id_l]->id(), tx_phys_core[i], test_results_addr, 128)); - log_info( - LogTest, - "TX{} status = {}", - i, - packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); - pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + log_info(LogTest, "TX{} status = {}", i, tt_fabric_status_to_string(tx_results[i][TT_FABRIC_STATUS_INDEX])); + pass &= (tx_results[i][TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); } /* TODO: Need to add these once control plane api is available to @@ -591,15 +587,15 @@ int main(int argc, char** argv) { tt::llrt::read_hex_vec_from_core( device_map[test_device_id_l]->id(), tunneler_phys_core, tunneler_test_results_addr, 128); log_info(LogTest, "L Router status = {}", - packet_queue_test_status_to_string(router_results[PQ_TEST_STATUS_INDEX])); pass &= - 
(router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + tt_fabric_status_to_string(router_results[TT_FABRIC_STATUS_INDEX])); pass &= + (router_results[TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); vector r_router_results = tt::llrt::read_hex_vec_from_core( device_map[test_device_id_r]->id(), r_tunneler_phys_core, tunneler_test_results_addr, 128); log_info(LogTest, "R Router status = {}", - packet_queue_test_status_to_string(r_router_results[PQ_TEST_STATUS_INDEX])); pass &= - (r_router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + tt_fabric_status_to_string(r_router_results[TT_FABRIC_STATUS_INDEX])); pass &= + (r_router_results[TT_FABRIC_STATUS_INDEX] == TT_FABRIC_STATUS_PASS); */ for (auto active_device : device_map) { pass &= tt_metal::CloseDevice(active_device.second); @@ -610,12 +606,12 @@ int main(int argc, char** argv) { uint64_t total_tx_words_sent = 0; uint64_t total_rx_words_checked = 0; for (uint32_t i = 0; i < num_src_endpoints; i++) { - uint64_t tx_words_sent = get_64b_result(tx_results[i], PQ_TEST_WORD_CNT_INDEX); + uint64_t tx_words_sent = get_64b_result(tx_results[i], TT_FABRIC_WORD_CNT_INDEX); total_tx_words_sent += tx_words_sent; - uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], PQ_TEST_CYCLES_INDEX); + uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], TT_FABRIC_CYCLES_INDEX); double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; total_tx_bw += tx_bw; - uint64_t iter = get_64b_result(tx_results[i], PQ_TEST_ITER_INDEX); + uint64_t iter = get_64b_result(tx_results[i], TT_FABRIC_ITER_INDEX); // uint64_t zero_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); // uint64_t few_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER); // uint64_t many_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp index e0e200af967..a645b972fa6 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp @@ -7,7 +7,7 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" +#include "test_common.hpp" #include "utils.hpp" #include "llrt.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp index f96ca0c8528..99d271f3ce0 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp @@ -7,9 +7,8 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" #include -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp index c1945c1b5aa..8c70290d9c3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp @@ -7,9 +7,8 @@ #include #include #include 
"tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" #include -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp index 9348333bd56..0b9cf4ae5b4 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp @@ -7,9 +7,8 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" #include -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp index cf6fb4609e6..11eda9992de 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp @@ -8,8 +8,7 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp index a837a0be959..32d69fb8586 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp @@ -8,8 +8,7 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "kernels/traffic_gen_test.hpp" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "test_common.hpp" using std::vector; using namespace tt; diff --git a/tt_fabric/hw/inc/tt_fabric_status.h b/tt_fabric/hw/inc/tt_fabric_status.h new file mode 100644 index 00000000000..5f415112755 --- /dev/null +++ b/tt_fabric/hw/inc/tt_fabric_status.h @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include + +constexpr uint32_t TT_FABRIC_STAUS_MASK = 0xabc00000; +constexpr uint32_t TT_FABRIC_STATUS_STARTED = TT_FABRIC_STAUS_MASK | 0x0; +constexpr uint32_t TT_FABRIC_STATUS_PASS = TT_FABRIC_STAUS_MASK | 0x1; +constexpr uint32_t TT_FABRIC_STATUS_TIMEOUT = TT_FABRIC_STAUS_MASK | 0xdead0; +constexpr uint32_t TT_FABRIC_STATUS_BAD_HEADER = TT_FABRIC_STAUS_MASK | 0xdead1; +constexpr uint32_t TT_FABRIC_STATUS_DATA_MISMATCH = TT_FABRIC_STAUS_MASK | 0x3; + +// indexes of return values in test results buffer +constexpr uint32_t TT_FABRIC_STATUS_INDEX = 0; +constexpr uint32_t TT_FABRIC_WORD_CNT_INDEX = 2; +constexpr uint32_t TT_FABRIC_CYCLES_INDEX = 4; +constexpr uint32_t TT_FABRIC_ITER_INDEX = 6; +constexpr uint32_t TT_FABRIC_MISC_INDEX = 16; + +inline std::string_view tt_fabric_status_to_string(uint32_t status) { + switch (status) { + case TT_FABRIC_STATUS_STARTED: return "STARTED"; + case TT_FABRIC_STATUS_PASS: return "DONE/OK"; + case TT_FABRIC_STATUS_TIMEOUT: return "TIMEOUT"; + case TT_FABRIC_STATUS_BAD_HEADER: return "BAD_PACKET_HEADER"; + case TT_FABRIC_STATUS_DATA_MISMATCH: return "DATA_MISMATCH"; + default: return "UNKNOWN"; + } +} + +constexpr uint32_t TX_TEST_IDX_TOT_DATA_WORDS = TT_FABRIC_MISC_INDEX + 1; +constexpr uint32_t TX_TEST_IDX_NPKT = TT_FABRIC_MISC_INDEX + 3; +constexpr uint32_t TX_TEST_IDX_WORDS_FLUSHED = TT_FABRIC_MISC_INDEX + 5; +constexpr uint32_t TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER = TT_FABRIC_MISC_INDEX + 7; +constexpr uint32_t TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER = TT_FABRIC_MISC_INDEX + 9; +constexpr uint32_t TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER = TT_FABRIC_MISC_INDEX + 11; +// constexpr uint32_t TX_TEST_IDX_ = TT_FABRIC_MISC_INDEX + ; +// constexpr uint32_t TX_TEST_IDX_ = TT_FABRIC_MISC_INDEX + ; + +enum class pkt_dest_size_choices_t { + RANDOM = 0, + SAME_START_RNDROBIN_FIX_SIZE = 1 // max packet size used +}; diff --git a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp b/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp index 31c75c4329b..c211c6f0133 100644 --- a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp +++ b/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp @@ -5,6 +5,7 @@ // clang-format off #include "dataflow_api.h" #include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_fabric/hw/inc/tt_fabric_status.h" #include "debug/dprint.h" // clang-format on @@ -19,20 +20,6 @@ constexpr uint32_t timeout_cycles = get_compile_time_arg_val(5); uint32_t sync_val; uint32_t router_mask; -constexpr uint32_t PACKET_QUEUE_STAUS_MASK = 0xabc00000; -constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STAUS_MASK | 0x0; -constexpr uint32_t PACKET_QUEUE_TEST_PASS = PACKET_QUEUE_STAUS_MASK | 0x1; -constexpr uint32_t PACKET_QUEUE_TEST_TIMEOUT = PACKET_QUEUE_STAUS_MASK | 0xdead0; -constexpr uint32_t PACKET_QUEUE_TEST_BAD_HEADER = PACKET_QUEUE_STAUS_MASK | 0xdead1; -constexpr uint32_t PACKET_QUEUE_TEST_DATA_MISMATCH = PACKET_QUEUE_STAUS_MASK | 0x3; - -// indexes of return values in test results buffer -constexpr uint32_t PQ_TEST_STATUS_INDEX = 0; -constexpr uint32_t PQ_TEST_WORD_CNT_INDEX = 2; -constexpr uint32_t PQ_TEST_CYCLES_INDEX = 4; -constexpr uint32_t PQ_TEST_ITER_INDEX = 6; -constexpr uint32_t PQ_TEST_MISC_INDEX = 16; - // careful, may be null tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast(kernel_status_buf_addr); volatile tt_l1_ptr fabric_router_l1_config_t* routing_table = @@ -436,11 +423,11 @@ void kernel_main() { tt_fabric_init(); - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, 
PACKET_QUEUE_TEST_STARTED); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000000); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 1, 0xbb000000); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 2, 0xAABBCCDD); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 3, 0xDDCCBBAA); + write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX, TT_FABRIC_STATUS_STARTED); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff000000); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX + 1, 0xbb000000); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX + 2, 0xAABBCCDD); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX + 3, 0xDDCCBBAA); zero_l1_buf((tt_l1_ptr uint32_t*)&gk_info->gk_msg_buf, FVCC_BUF_SIZE_BYTES); zero_l1_buf((tt_l1_ptr uint32_t*)socket_info, sizeof(socket_info_t)); @@ -477,7 +464,7 @@ void kernel_main() { gk_msg_buf_advance_rdptr((ctrl_chan_msg_buf*)msg_buf); loop_count = 0; } else { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_BAD_HEADER); + write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX, TT_FABRIC_STATUS_BAD_HEADER); return; } } @@ -498,11 +485,11 @@ void kernel_main() { DPRINT << "Gatekeeper messages processed " << total_messages_procesed << ENDL(); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff000002); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000003); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff000003); - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); + write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX, TT_FABRIC_STATUS_PASS); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff00005); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff00005); } diff --git a/tt_fabric/impl/kernels/tt_fabric_router.cpp b/tt_fabric/impl/kernels/tt_fabric_router.cpp index 5453c5f6ca3..0eeb7879f9d 100644 --- a/tt_fabric/impl/kernels/tt_fabric_router.cpp +++ b/tt_fabric/impl/kernels/tt_fabric_router.cpp @@ -5,6 +5,7 @@ // clang-format off #include "dataflow_api.h" #include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_fabric/hw/inc/tt_fabric_status.h" // clang-format on using namespace tt::tt_fabric; @@ -28,20 +29,6 @@ uint32_t router_mask; uint32_t gk_message_addr_l; uint32_t gk_message_addr_h; -constexpr uint32_t PACKET_QUEUE_STAUS_MASK = 0xabc00000; -constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STAUS_MASK | 0x0; -constexpr uint32_t PACKET_QUEUE_TEST_PASS = PACKET_QUEUE_STAUS_MASK | 0x1; -constexpr uint32_t PACKET_QUEUE_TEST_TIMEOUT = PACKET_QUEUE_STAUS_MASK | 0xdead0; -constexpr uint32_t PACKET_QUEUE_TEST_BAD_HEADER = PACKET_QUEUE_STAUS_MASK | 0xdead1; -constexpr uint32_t PACKET_QUEUE_TEST_DATA_MISMATCH = PACKET_QUEUE_STAUS_MASK | 0x3; - -// indexes of return values in test results buffer -constexpr uint32_t PQ_TEST_STATUS_INDEX = 0; -constexpr uint32_t PQ_TEST_WORD_CNT_INDEX = 2; -constexpr uint32_t PQ_TEST_CYCLES_INDEX = 4; -constexpr uint32_t PQ_TEST_ITER_INDEX = 6; -constexpr uint32_t PQ_TEST_MISC_INDEX = 16; - // careful, may be null tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast(kernel_status_buf_addr_arg); tt_l1_ptr volatile chan_req_buf* fvc_consumer_req_buf = @@ -90,11 +77,11 @@ void kernel_main() { tt_fabric_init(); - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 
0xff000000); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 1, 0xbb000000); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 2, 0xAABBCCDD); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 3, 0xDDCCBBAA); + write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX, TT_FABRIC_STATUS_STARTED); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff000000); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX + 1, 0xbb000000); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX + 2, 0xAABBCCDD); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX + 3, 0xDDCCBBAA); router_state.sync_in = 0; router_state.sync_out = 0; @@ -102,9 +89,9 @@ void kernel_main() { zero_l1_buf((tt_l1_ptr uint32_t*)fvc_consumer_req_buf, sizeof(chan_req_buf)); zero_l1_buf((tt_l1_ptr uint32_t*)FVCC_IN_BUF_START, FVCC_IN_BUF_SIZE); zero_l1_buf((tt_l1_ptr uint32_t*)FVCC_OUT_BUF_START, FVCC_OUT_BUF_SIZE); - write_kernel_status(kernel_status, PQ_TEST_WORD_CNT_INDEX, (uint32_t)&router_state); - write_kernel_status(kernel_status, PQ_TEST_WORD_CNT_INDEX + 1, (uint32_t)&fvc_consumer_state); - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX + 1, (uint32_t)&fvc_producer_state); + write_kernel_status(kernel_status, TT_FABRIC_WORD_CNT_INDEX, (uint32_t)&router_state); + write_kernel_status(kernel_status, TT_FABRIC_WORD_CNT_INDEX + 1, (uint32_t)&fvc_consumer_state); + write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX + 1, (uint32_t)&fvc_producer_state); fvc_consumer_state.init(FABRIC_ROUTER_DATA_BUF_START, fvc_data_buf_size_words / 2); fvc_producer_state.init( @@ -121,14 +108,14 @@ void kernel_main() { #endif if (!wait_all_src_dest_ready(&router_state, timeout_cycles)) { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); + write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX, TT_FABRIC_STATUS_TIMEOUT); return; } notify_gatekeeper(); uint64_t start_timestamp = get_timestamp(); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000001); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff000001); uint32_t loop_count = 0; uint32_t launch_msg_rd_ptr = *GET_MAILBOX_ADDRESS_DEV(launch_msg_rd_ptr); @@ -172,7 +159,7 @@ void kernel_main() { fvc_producer_state.process_inbound_packet(); loop_count = 0; } else if (fvc_producer_state.packet_corrupted) { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_BAD_HEADER); + write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX, TT_FABRIC_STATUS_BAD_HEADER); return; } @@ -200,16 +187,16 @@ void kernel_main() { } uint64_t cycles_elapsed = fvc_producer_state.packet_timestamp - start_timestamp; - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff000002); - write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000003); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff000003); - set_64b_result(kernel_status, cycles_elapsed, PQ_TEST_CYCLES_INDEX); + set_64b_result(kernel_status, cycles_elapsed, TT_FABRIC_CYCLES_INDEX); if (fvc_consumer_state.packet_in_progress) { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); + write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX, TT_FABRIC_STATUS_TIMEOUT); } else { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); + write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX, TT_FABRIC_STATUS_PASS); } - write_kernel_status(kernel_status, 
PQ_TEST_MISC_INDEX, 0xff00005); + write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff00005); } diff --git a/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp b/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp index d86086ad78d..f7be23a8d36 100644 --- a/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp +++ b/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once +#include constexpr uint32_t PACKET_WORD_SIZE_BYTES = 16; constexpr uint32_t MAX_SWITCH_FAN_IN = 4; @@ -32,6 +33,16 @@ constexpr uint32_t PQ_TEST_ITER_INDEX = 6; constexpr uint32_t PQ_TEST_MISC_INDEX = 16; +inline std::string_view packet_queue_test_status_to_string(uint32_t status) { + switch (status) { + case PACKET_QUEUE_TEST_STARTED: return "STARTED"; + case PACKET_QUEUE_TEST_PASS: return "DONE/OK"; + case PACKET_QUEUE_TEST_TIMEOUT: return "TIMEOUT"; + case PACKET_QUEUE_TEST_DATA_MISMATCH: return "DATA_MISMATCH"; + default: return "UNKNOWN"; + } +} + enum DispatchPacketFlag : uint32_t { PACKET_CMD_START = (0x1 << 1), PACKET_CMD_END = (0x1 << 2), From fa297cf6aba2edb698b06940c9364e8feb3539ad Mon Sep 17 00:00:00 2001 From: Joseph Chu Date: Thu, 6 Feb 2025 22:51:34 +0000 Subject: [PATCH 058/316] #0: Fix issue where traced llama models hanging --- tt_metal/distributed/mesh_device.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 312d164934b..82265d1f725 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -584,6 +584,12 @@ void MeshDevice::replay_trace(const uint8_t cq_id, const uint32_t tid, const boo for (auto& device : scoped_devices_->get_devices()) { device->replay_trace(cq_id, tid, blocking); } + // If blocking, wait until worker threads have completed + if (blocking) { + for (auto& device : scoped_devices_->get_devices()) { + device->synchronize(); + } + } } void MeshDevice::release_trace(const uint32_t tid) { for (auto& device : scoped_devices_->get_devices()) { From 0e02f7b6e9c23e28bffa6871da93d7b206833a69 Mon Sep 17 00:00:00 2001 From: Joseph Chu Date: Fri, 7 Feb 2025 01:52:23 +0000 Subject: [PATCH 059/316] #0: Fix non-deterministic hangs caused by MeshDevice trace replay --- tt_metal/api/tt-metalium/device.hpp | 3 ++- tt_metal/api/tt-metalium/device_impl.hpp | 6 +++++- tt_metal/api/tt-metalium/mesh_device.hpp | 6 +++++- tt_metal/distributed/mesh_device.cpp | 7 ++++--- tt_metal/impl/device/device.cpp | 14 +++++--------- tt_metal/tt_metal.cpp | 2 +- ttnn/cpp/ttnn/operations/core/core.cpp | 2 +- 7 files changed, 23 insertions(+), 17 deletions(-) diff --git a/tt_metal/api/tt-metalium/device.hpp b/tt_metal/api/tt-metalium/device.hpp index 821eeaf5c9d..3c0eaae0bb8 100644 --- a/tt_metal/api/tt-metalium/device.hpp +++ b/tt_metal/api/tt-metalium/device.hpp @@ -141,7 +141,8 @@ class IDevice { // Metal trace device capture mode virtual void begin_trace(const uint8_t cq_id, const uint32_t tid) = 0; virtual void end_trace(const uint8_t cq_id, const uint32_t tid) = 0; - virtual void replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking) = 0; + virtual void replay_trace( + const uint8_t cq_id, const uint32_t tid, const bool block_on_device, const bool block_on_worker_thread) = 0; virtual void release_trace(const uint32_t tid) = 0; virtual std::shared_ptr get_trace(uint32_t tid) = 0; diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 
375e515ad62..8b486f6010f 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -130,7 +130,11 @@ class Device : public IDevice { // Metal trace device capture mode void begin_trace(const uint8_t cq_id, const uint32_t tid) override; void end_trace(const uint8_t cq_id, const uint32_t tid) override; - void replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking) override; + void replay_trace( + const uint8_t cq_id, + const uint32_t tid, + const bool block_on_device, + const bool block_on_worker_thread) override; void release_trace(const uint32_t tid) override; std::shared_ptr get_trace(uint32_t tid) override; uint32_t get_trace_buffers_size() const override { return trace_buffers_size_; } diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index c4f1469ee46..493d0ede6d5 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -139,7 +139,11 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this get_trace(uint32_t tid) override; uint32_t get_trace_buffers_size() const override; diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 82265d1f725..099c7c8f34b 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -580,12 +580,13 @@ void MeshDevice::end_trace(const uint8_t cq_id, const uint32_t tid) { device->end_trace(cq_id, tid); } } -void MeshDevice::replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking) { +void MeshDevice::replay_trace( + const uint8_t cq_id, const uint32_t tid, const bool block_on_device, const bool block_on_worker_thread) { for (auto& device : scoped_devices_->get_devices()) { - device->replay_trace(cq_id, tid, blocking); + device->replay_trace(cq_id, tid, block_on_device, false /* block_on_worker_thread */); } // If blocking, wait until worker threads have completed - if (blocking) { + if (block_on_worker_thread) { for (auto& device : scoped_devices_->get_devices()) { device->synchronize(); } diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index c544bf00a3c..f1d8125e259 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -1494,10 +1494,11 @@ void Device::load_trace(const uint8_t cq_id, const uint32_t trace_id, const Trac this->mark_allocations_unsafe(); } -void Device::replay_trace(const uint8_t cq_id, const uint32_t tid, const bool blocking) { +void Device::replay_trace( + const uint8_t cq_id, const uint32_t tid, const bool block_on_device, const bool block_on_worker_thread) { // If blocking, ensure that worker thread blocks until trace is completed this->push_work( - [this, cq_id, tid, blocking]() mutable { + [this, cq_id, tid, block_on_device]() mutable { ZoneScoped; TracyTTMetalReplayTrace(this->id(), tid); constexpr bool check = false; @@ -1512,14 +1513,9 @@ void Device::replay_trace(const uint8_t cq_id, const uint32_t tid, const bool bl if constexpr (check) { Trace::validate_instance(*trace_buffer); } - EnqueueTrace(this->command_queue(cq_id), tid, blocking); + EnqueueTrace(this->command_queue(cq_id), tid, block_on_device); }, - blocking); - - // If blocking, wait until worker threads have completed - if (blocking) { - this->synchronize(); - } + block_on_worker_thread); } void Device::release_trace(const uint32_t tid) { diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index f1a36ce8f7a..f4d0f6cbb54 100644 --- 
a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -1326,7 +1326,7 @@ void EndTraceCapture(IDevice* device, const uint8_t cq_id, const uint32_t tid) { void ReplayTrace(IDevice* device, const uint8_t cq_id, const uint32_t tid, const bool blocking) { LIGHT_METAL_TRACE_FUNCTION_ENTRY(); LIGHT_METAL_TRACE_FUNCTION_CALL(CaptureReplayTrace, device, cq_id, tid, blocking); - device->replay_trace(cq_id, tid, blocking); + device->replay_trace(cq_id, tid, blocking /* block_on_device */, blocking /* block_on_worker_thread */); } void ReleaseTrace(IDevice* device, const uint32_t tid) { diff --git a/ttnn/cpp/ttnn/operations/core/core.cpp b/ttnn/cpp/ttnn/operations/core/core.cpp index eb8370acf78..a9ad99356c8 100644 --- a/ttnn/cpp/ttnn/operations/core/core.cpp +++ b/ttnn/cpp/ttnn/operations/core/core.cpp @@ -142,7 +142,7 @@ void end_trace_capture(IDevice* device, const uint32_t tid, const QueueId cq_id) void execute_trace(IDevice* device, const uint32_t tid, const QueueId cq_id, bool blocking) { ZoneScoped; - device->replay_trace(*cq_id, tid, blocking); + device->replay_trace(*cq_id, tid, blocking /* block_on_device */, blocking /* block_on_worker_thread */); } void release_trace(IDevice* device, const uint32_t tid) { From 05b16aa1bba169551050b7482dd7964d18a631c5 Mon Sep 17 00:00:00 2001 From: Joseph Chu Date: Fri, 7 Feb 2025 02:18:51 +0000 Subject: [PATCH 060/316] #0: add comment about deprecating --- tt_metal/api/tt-metalium/mesh_device.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index 493d0ede6d5..de088e22685 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -139,6 +139,8 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this Date: Sun, 9 Feb 2025 20:30:45 +0000 Subject: [PATCH 061/316] LightMetal - SetRuntimeArgsUint32VecPerCore Trace + Replay support (some TTNN ops use) (Issue #17779) - Add C++ lightmetal unit test that lightly tests this SetRuntimeArgs() API --- .../tt_metal/lightmetal/test_lightmetal.cpp | 86 +++++++++++-------- tt_metal/impl/flatbuffer/command.fbs | 8 ++ tt_metal/impl/flatbuffer/program_types.fbs | 4 + .../program_types_from_flatbuffer.cpp | 23 +++++ .../program_types_from_flatbuffer.hpp | 5 ++ .../program_types_to_flatbuffer.cpp | 21 +++++ .../program_types_to_flatbuffer.hpp | 10 +++ .../lightmetal/host_api_capture_helpers.cpp | 31 ++++++- .../lightmetal/host_api_capture_helpers.hpp | 6 ++ .../impl/lightmetal/lightmetal_replay.cpp | 26 ++++++ .../impl/lightmetal/lightmetal_replay.hpp | 2 + tt_metal/tt_metal.cpp | 2 + 12 files changed, 187 insertions(+), 37 deletions(-) diff --git a/tests/tt_metal/tt_metal/lightmetal/test_lightmetal.cpp b/tests/tt_metal/tt_metal/lightmetal/test_lightmetal.cpp index 083e072a322..7096e73a0f2 100644 --- a/tests/tt_metal/tt_metal/lightmetal/test_lightmetal.cpp +++ b/tests/tt_metal/tt_metal/lightmetal/test_lightmetal.cpp @@ -25,7 +25,8 @@ namespace tt::tt_metal { namespace { // Single RISC, no CB's here. Very simple. 
-Program create_simple_datamovement_program(Buffer& input, Buffer& output, Buffer& l1_buffer) { +Program create_simple_datamovement_program( + const Buffer& input, const Buffer& output, const Buffer& l1_buffer, bool rt_arg_per_core_vec = false) { Program program = CreateProgram(); IDevice* device = input.device(); constexpr CoreCoord core = {0, 0}; @@ -44,8 +45,15 @@ Program create_simple_datamovement_program(Buffer& input, Buffer& output, Buffer const std::vector runtime_args = { l1_buffer.address(), input.address(), input_bank_id, output.address(), output_bank_id, l1_buffer.size()}; - // Note - this interface doesn't take Buffer, just data. - SetRuntimeArgs(program, dram_copy_kernel_id, core, runtime_args); + // Very minimal testing/usage of other SetRuntimeArgs API that TTNN uses for ops here, j + // just to see it go through the light-metal capture + replay flow. + if (rt_arg_per_core_vec) { + const std::vector> runtime_args_per_core = {runtime_args}; + SetRuntimeArgs(program, dram_copy_kernel_id, {core}, runtime_args_per_core); + } else { + // Note - this interface doesn't take Buffer, just data. + SetRuntimeArgs(program, dram_copy_kernel_id, core, runtime_args); + } return program; } @@ -125,7 +133,7 @@ using LightMetalBasicTest = SingleDeviceLightMetalFixture; TEST_F(LightMetalBasicTest, CreateBufferEnqueueWriteRead) { CreateDeviceAndBeginCapture(4096); - CommandQueue& command_queue = this->device_->command_queue(); + CommandQueue& command_queue = device_->command_queue(); uint32_t num_loops = 5; bool keep_buffers_alive = true; std::vector> buffers_vec; @@ -135,7 +143,7 @@ TEST_F(LightMetalBasicTest, CreateBufferEnqueueWriteRead) { // Switch to use top level CreateBuffer API that has trace support. uint32_t size_bytes = 64; // 16 elements. - auto buffer = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto buffer = CreateBuffer(InterleavedBufferConfig{device_, size_bytes, size_bytes, BufferType::DRAM}); log_debug( tt::LogTest, "created buffer loop: {} with size: {} bytes addr: 0x{:x}", @@ -182,14 +190,11 @@ TEST_F(LightMetalBasicTest, CreateBufferEnqueueWriteRead) { Finish(command_queue); } -// Test simple case of single datamovement program on single RISC works for trace + replay. -TEST_F(LightMetalBasicTest, SingleRISCDataMovement) { - CreateDeviceAndBeginCapture(4096); - +void SingleRISCDataMovement_test(tt::tt_metal::IDevice* device, bool rt_arg_per_core_vec) { uint32_t size_bytes = 64; // 16 elements. - auto input = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); - auto output = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); - auto l1_buffer = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::L1}); + auto input = CreateBuffer(InterleavedBufferConfig{device, size_bytes, size_bytes, BufferType::DRAM}); + auto output = CreateBuffer(InterleavedBufferConfig{device, size_bytes, size_bytes, BufferType::DRAM}); + auto l1_buffer = CreateBuffer(InterleavedBufferConfig{device, size_bytes, size_bytes, BufferType::L1}); log_debug( tt::LogTest, "Created 3 Buffers. 
input: 0x{:x} output: 0x{:x} l1_buffer: 0x{:x}", @@ -197,9 +202,9 @@ TEST_F(LightMetalBasicTest, SingleRISCDataMovement) { output->address(), l1_buffer->address()); - CommandQueue& command_queue = this->device_->command_queue(); + CommandQueue& command_queue = device->command_queue(); - Program simple_program = create_simple_datamovement_program(*input, *output, *l1_buffer); + Program simple_program = create_simple_datamovement_program(*input, *output, *l1_buffer, rt_arg_per_core_vec); vector input_data(input->size() / sizeof(uint32_t), 0); for (uint32_t i = 0; i < input_data.size(); i++) { input_data[i] = i; @@ -224,15 +229,27 @@ TEST_F(LightMetalBasicTest, SingleRISCDataMovement) { Finish(command_queue); } +// Test simple case of single datamovement program on single RISC works for trace + replay. +TEST_F(LightMetalBasicTest, SingleRISCDataMovement) { + CreateDeviceAndBeginCapture(4096); + SingleRISCDataMovement_test(device_, false); +} + +// Same as above but with SetRuntimeArgs API that uses vec of CoreCoord and vec of vec rtargs. +TEST_F(LightMetalBasicTest, SingleRISCDataMovementRtArgsPerCoreVec) { + CreateDeviceAndBeginCapture(4096); + SingleRISCDataMovement_test(device_, true); +} + // Test simple case of 3 riscs used for datamovement and compute works for trace + replay. TEST_F(LightMetalBasicTest, ThreeRISCDataMovementCompute) { CreateDeviceAndBeginCapture(4096); uint32_t size_bytes = 64; // 16 elements. - auto input = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); - auto output = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto input = CreateBuffer(InterleavedBufferConfig{device_, size_bytes, size_bytes, BufferType::DRAM}); + auto output = CreateBuffer(InterleavedBufferConfig{device_, size_bytes, size_bytes, BufferType::DRAM}); - CommandQueue& command_queue = this->device_->command_queue(); + CommandQueue& command_queue = device_->command_queue(); // TODO (kmabee) - There is issue with using make_shared, revisit this. // auto simple_program = std::make_shared(create_simple_unary_program(*input, @@ -259,10 +276,9 @@ TEST_F(LightMetalBasicTest, ThreeRISCDataMovementComputeDynamicCB) { uint32_t buf_size_bytes = 64; // 16 elements. uint32_t cb_size_bytes = 2048; - auto input = CreateBuffer(InterleavedBufferConfig{this->device_, buf_size_bytes, buf_size_bytes, BufferType::DRAM}); - auto output = - CreateBuffer(InterleavedBufferConfig{this->device_, buf_size_bytes, buf_size_bytes, BufferType::DRAM}); - auto cb_in_buf = CreateBuffer(InterleavedBufferConfig{this->device_, cb_size_bytes, cb_size_bytes, BufferType::L1}); + auto input = CreateBuffer(InterleavedBufferConfig{device_, buf_size_bytes, buf_size_bytes, BufferType::DRAM}); + auto output = CreateBuffer(InterleavedBufferConfig{device_, buf_size_bytes, buf_size_bytes, BufferType::DRAM}); + auto cb_in_buf = CreateBuffer(InterleavedBufferConfig{device_, cb_size_bytes, cb_size_bytes, BufferType::L1}); log_info( tt::LogTest, "Created 3 Buffers. 
0x{:x} 0x{:x} 0x{:x}", @@ -270,7 +286,7 @@ TEST_F(LightMetalBasicTest, ThreeRISCDataMovementComputeDynamicCB) { output->address(), cb_in_buf->address()); - CommandQueue& command_queue = this->device_->command_queue(); + CommandQueue& command_queue = device_->command_queue(); auto simple_program = create_simple_unary_program(*input, *output, cb_in_buf.get()); vector input_data(input->size() / sizeof(uint32_t), 0); @@ -292,10 +308,10 @@ TEST_F(LightMetalBasicTest, SingleProgramTraceCapture) { CreateDeviceAndBeginCapture(4096); uint32_t size_bytes = 64; // 16 elements. Was 2048 in original test. - auto input = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); - auto output = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto input = CreateBuffer(InterleavedBufferConfig{device_, size_bytes, size_bytes, BufferType::DRAM}); + auto output = CreateBuffer(InterleavedBufferConfig{device_, size_bytes, size_bytes, BufferType::DRAM}); - CommandQueue& command_queue = this->device_->command_queue(); + CommandQueue& command_queue = device_->command_queue(); Program simple_program = create_simple_unary_program(*input, *output); // Setup input data for program with some simple values. @@ -316,16 +332,16 @@ TEST_F(LightMetalBasicTest, SingleProgramTraceCapture) { write_junk_to_buffer(command_queue, *output); // Now enable Metal Trace and run program again for capture. - uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); + uint32_t tid = BeginTraceCapture(device_, command_queue.id()); EnqueueProgram(command_queue, simple_program, false); - EndTraceCapture(this->device_, command_queue.id(), tid); + EndTraceCapture(device_, command_queue.id(), tid); // Verify trace output during replay matches expected output from original capture. LightMetalCompareToGolden(command_queue, *output, eager_output_data.data()); // Done Finish(command_queue); - ReleaseTrace(this->device_, tid); + ReleaseTrace(device_, tid); } // Test simple compute test with metal trace, but no explicit trace replay (added automatically by light metal trace). @@ -333,11 +349,11 @@ TEST_F(LightMetalBasicTest, TwoProgramTraceCapture) { CreateDeviceAndBeginCapture(4096); uint32_t size_bytes = 64; // 16 elements. Was 2048 in original test. - auto input = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); - auto interm = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); - auto output = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto input = CreateBuffer(InterleavedBufferConfig{device_, size_bytes, size_bytes, BufferType::DRAM}); + auto interm = CreateBuffer(InterleavedBufferConfig{device_, size_bytes, size_bytes, BufferType::DRAM}); + auto output = CreateBuffer(InterleavedBufferConfig{device_, size_bytes, size_bytes, BufferType::DRAM}); - CommandQueue& command_queue = this->device_->command_queue(); + CommandQueue& command_queue = device_->command_queue(); Program op0 = create_simple_unary_program(*input, *interm); Program op1 = create_simple_unary_program(*interm, *output); @@ -362,17 +378,17 @@ TEST_F(LightMetalBasicTest, TwoProgramTraceCapture) { write_junk_to_buffer(command_queue, *output); // Now enable Metal Trace and run program again for capture. 
- uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); + uint32_t tid = BeginTraceCapture(device_, command_queue.id()); EnqueueProgram(command_queue, op0, false); EnqueueProgram(command_queue, op1, false); - EndTraceCapture(this->device_, command_queue.id(), tid); + EndTraceCapture(device_, command_queue.id(), tid); // Verify trace output during replay matches expected output from original capture. LightMetalCompareToGolden(command_queue, *output, eager_output_data.data()); // Done Finish(command_queue); - ReleaseTrace(this->device_, tid); + ReleaseTrace(device_, tid); } } // namespace diff --git a/tt_metal/impl/flatbuffer/command.fbs b/tt_metal/impl/flatbuffer/command.fbs index b21a4a5dba2..2ab147c3d63 100644 --- a/tt_metal/impl/flatbuffer/command.fbs +++ b/tt_metal/impl/flatbuffer/command.fbs @@ -81,6 +81,13 @@ table SetRuntimeArgsUint32Command { args: [uint32]; // Arguments to be passed to kernel } +table SetRuntimeArgsUint32VecPerCoreCommand { + program_global_id: uint32; // Reference to Program + kernel_global_id: uint32; // Reference to Kernel + core_spec: [CoreCoord]; + args: [UInt32Vector]; // vector of vector of uint32_t +} + table SetRuntimeArgsCommand { kernel_global_id: uint32; // Reference to Kernel core_spec: CoreSpec; @@ -115,6 +122,7 @@ union CommandType { EnqueueProgramCommand, CreateKernelCommand, SetRuntimeArgsUint32Command, + SetRuntimeArgsUint32VecPerCoreCommand, SetRuntimeArgsCommand, CreateCircularBufferCommand, LightMetalCompareCommand, diff --git a/tt_metal/impl/flatbuffer/program_types.fbs b/tt_metal/impl/flatbuffer/program_types.fbs index 0d3b338fc90..8712a5e6c29 100644 --- a/tt_metal/impl/flatbuffer/program_types.fbs +++ b/tt_metal/impl/flatbuffer/program_types.fbs @@ -72,3 +72,7 @@ union RuntimeArgValue { table RuntimeArg { value: RuntimeArgValue; } + +table UInt32Vector { + values: [uint32]; +} diff --git a/tt_metal/impl/flatbuffer/program_types_from_flatbuffer.cpp b/tt_metal/impl/flatbuffer/program_types_from_flatbuffer.cpp index 8aff12e3bed..d47354f0d1d 100644 --- a/tt_metal/impl/flatbuffer/program_types_from_flatbuffer.cpp +++ b/tt_metal/impl/flatbuffer/program_types_from_flatbuffer.cpp @@ -92,4 +92,27 @@ std::vector from_flatbuffer(const flatbuffers::Vector* fb_ return sub_device_ids; } +std::vector from_flatbuffer( + const flatbuffers::Vector>* core_spec_fbs) { + TT_FATAL(core_spec_fbs, "Invalid Vector of CoreCoord data from flatbuffer."); + + std::vector core_spec(core_spec_fbs->size()); + for (const auto* coord_fbs : *core_spec_fbs) { + core_spec.emplace_back(coord_fbs->x(), coord_fbs->y()); + } + return core_spec; +} + +std::vector> from_flatbuffer( + const flatbuffers::Vector>* vec_of_vec_fbs) { + TT_FATAL(vec_of_vec_fbs, "Invalid FlatBuffer data: expected a vector of vector of uint32_t."); + + std::vector> result(vec_of_vec_fbs->size()); + for (const auto* sub_vector_fbs : *vec_of_vec_fbs) { + std::vector sub_vector(sub_vector_fbs->values()->begin(), sub_vector_fbs->values()->end()); + result.push_back(std::move(sub_vector)); + } + return result; +} + } // namespace tt::tt_metal diff --git a/tt_metal/impl/flatbuffer/program_types_from_flatbuffer.hpp b/tt_metal/impl/flatbuffer/program_types_from_flatbuffer.hpp index 930ebe230e7..4486fb5eba5 100644 --- a/tt_metal/impl/flatbuffer/program_types_from_flatbuffer.hpp +++ b/tt_metal/impl/flatbuffer/program_types_from_flatbuffer.hpp @@ -16,6 +16,11 @@ ComputeConfig from_flatbuffer(const flatbuffer::ComputeConfig* fb_config); EthernetConfig from_flatbuffer(const flatbuffer::EthernetConfig* 
fb_config); std::vector from_flatbuffer(const flatbuffers::Vector* fb_sub_device_ids); +std::vector from_flatbuffer( + const flatbuffers::Vector>* core_spec_fbs); +std::vector> from_flatbuffer( + const flatbuffers::Vector>* vec_of_vec_fbs); + template std::variant core_spec_from_flatbuffer(const CommandType* cmd) { switch (cmd->core_spec_type()) { diff --git a/tt_metal/impl/flatbuffer/program_types_to_flatbuffer.cpp b/tt_metal/impl/flatbuffer/program_types_to_flatbuffer.cpp index 6c8f1570604..c1abb57cfe7 100644 --- a/tt_metal/impl/flatbuffer/program_types_to_flatbuffer.cpp +++ b/tt_metal/impl/flatbuffer/program_types_to_flatbuffer.cpp @@ -38,6 +38,27 @@ std::pair> to_flatbuffer( core_spec); } +FlatbufferCoreCoordVector to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const std::vector& core_spec) { + std::vector> core_offsets; + for (const auto& coord : core_spec) { + core_offsets.push_back(flatbuffer::CreateCoreCoord(builder, coord.x, coord.y)); + } + return builder.CreateVector(core_offsets); +} + +FlatbufferUInt32VecOfVec to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const std::vector>& vec_of_vec) { + std::vector> vec_offsets; + + for (const auto& sub_vector : vec_of_vec) { + auto values_offset = builder.CreateVector(sub_vector); + vec_offsets.push_back(flatbuffer::CreateUInt32Vector(builder, values_offset)); + } + + return builder.CreateVector(vec_offsets); +} + // Original types defined in kernel_types.hpp std::pair> to_flatbuffer( flatbuffers::FlatBufferBuilder& builder, const DataMovementConfig& config) { diff --git a/tt_metal/impl/flatbuffer/program_types_to_flatbuffer.hpp b/tt_metal/impl/flatbuffer/program_types_to_flatbuffer.hpp index 858cdfdc0da..d381ef1cc9f 100644 --- a/tt_metal/impl/flatbuffer/program_types_to_flatbuffer.hpp +++ b/tt_metal/impl/flatbuffer/program_types_to_flatbuffer.hpp @@ -14,9 +14,19 @@ namespace tt::tt_metal { +using FlatbufferCoreCoordVector = flatbuffers::Offset>>; +using FlatbufferUInt32VecOfVec = + flatbuffers::Offset>>; + std::pair> to_flatbuffer( flatbuffers::FlatBufferBuilder& builder, const std::variant& core_spec); +FlatbufferCoreCoordVector to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const std::vector& core_spec); + +FlatbufferUInt32VecOfVec to_flatbuffer( + flatbuffers::FlatBufferBuilder& builder, const std::vector>& vec_of_vec); + std::pair> to_flatbuffer( flatbuffers::FlatBufferBuilder& builder, const DataMovementConfig& config); diff --git a/tt_metal/impl/lightmetal/host_api_capture_helpers.cpp b/tt_metal/impl/lightmetal/host_api_capture_helpers.cpp index 9d4905bb2c6..43fd54d3fee 100644 --- a/tt_metal/impl/lightmetal/host_api_capture_helpers.cpp +++ b/tt_metal/impl/lightmetal/host_api_capture_helpers.cpp @@ -295,7 +295,7 @@ void CaptureSetRuntimeArgsUint32( uint32_t kernel_global_id = ctx.get_global_id(kernel.get()); log_debug( tt::LogMetalTrace, - "{}(uint32): kernel_global_id: {} program_global_id: {} rt_args: {}", + "{}: kernel_global_id: {} program_global_id: {} rt_args: {}", __FUNCTION__, kernel_global_id, program_global_id, @@ -310,6 +310,33 @@ void CaptureSetRuntimeArgsUint32( CaptureCommand(tt::tt_metal::flatbuffer::CommandType::SetRuntimeArgsUint32Command, cmd.Union()); } +void CaptureSetRuntimeArgsUint32VecPerCore( + const Program& program, + KernelHandle kernel_id, + const std::vector& core_spec, + const std::vector>& runtime_args) { + auto& ctx = LightMetalCaptureContext::get(); + + std::shared_ptr kernel = program.get_kernel(kernel_id); + uint32_t program_global_id = ctx.get_global_id(&program); 
+ uint32_t kernel_global_id = ctx.get_global_id(kernel.get()); + log_debug( + tt::LogMetalTrace, + "{}: kernel_global_id: {} program_global_id: {} num_cores: {}", + __FUNCTION__, + kernel_global_id, + program_global_id, + core_spec.size()); + + auto& fbb = ctx.get_builder(); + auto core_spec_offset = to_flatbuffer(fbb, core_spec); + auto runtime_args_offset = to_flatbuffer(fbb, runtime_args); + + auto cmd = tt::tt_metal::flatbuffer::CreateSetRuntimeArgsUint32VecPerCoreCommand( + fbb, program_global_id, kernel_global_id, core_spec_offset, runtime_args_offset); + + CaptureCommand(tt::tt_metal::flatbuffer::CommandType::SetRuntimeArgsUint32VecPerCoreCommand, cmd.Union()); +} void CaptureSetRuntimeArgs( IDevice* device, const std::shared_ptr& kernel, @@ -322,7 +349,7 @@ void CaptureSetRuntimeArgs( auto rt_args_offset = to_flatbuffer(fbb, runtime_args); log_debug( tt::LogMetalTrace, - "{}(RuntimeArgs): kernel_global_id: {} rt_args_size: {}", + "{}: kernel_global_id: {} rt_args_size: {}", __FUNCTION__, kernel_global_id, runtime_args->size()); diff --git a/tt_metal/impl/lightmetal/host_api_capture_helpers.hpp b/tt_metal/impl/lightmetal/host_api_capture_helpers.hpp index 3639fd3b90b..7b2c982f42c 100644 --- a/tt_metal/impl/lightmetal/host_api_capture_helpers.hpp +++ b/tt_metal/impl/lightmetal/host_api_capture_helpers.hpp @@ -112,6 +112,12 @@ void CaptureSetRuntimeArgsUint32( const std::variant& core_spec, tt::stl::Span runtime_args); +void CaptureSetRuntimeArgsUint32VecPerCore( + const Program& program, + KernelHandle kernel_id, + const std::vector& core_spec, + const std::vector>& runtime_args); + void CaptureSetRuntimeArgs( IDevice* device, const std::shared_ptr& kernel, diff --git a/tt_metal/impl/lightmetal/lightmetal_replay.cpp b/tt_metal/impl/lightmetal/lightmetal_replay.cpp index 2971f438fa4..d42805161ae 100644 --- a/tt_metal/impl/lightmetal/lightmetal_replay.cpp +++ b/tt_metal/impl/lightmetal/lightmetal_replay.cpp @@ -300,6 +300,10 @@ void LightMetalReplay::execute(const tt::tt_metal::flatbuffer::Command* command) execute(command->cmd_as_SetRuntimeArgsUint32Command()); break; } + case ::tt::tt_metal::flatbuffer::CommandType::SetRuntimeArgsUint32VecPerCoreCommand: { + execute(command->cmd_as_SetRuntimeArgsUint32VecPerCoreCommand()); + break; + } case ::tt::tt_metal::flatbuffer::CommandType::SetRuntimeArgsCommand: { execute(command->cmd_as_SetRuntimeArgsCommand()); break; @@ -517,6 +521,28 @@ void LightMetalReplay::execute(const tt::tt_metal::flatbuffer::SetRuntimeArgsUin SetRuntimeArgs(*program, kernel_id, core_spec, args_span); } +void LightMetalReplay::execute(const tt::tt_metal::flatbuffer::SetRuntimeArgsUint32VecPerCoreCommand* cmd) { + log_debug( + tt::LogMetalTrace, + "LightMetalReplay(SetRuntimeArgs). 
program_global_id: {} kernel_global_id: {}", + cmd->program_global_id(), + cmd->kernel_global_id()); + auto program = get_program_from_map(cmd->program_global_id()); + auto kernel_id = get_kernel_handle_from_map(cmd->kernel_global_id()); + TT_FATAL( + program, + "Attempted to SetRuntimeArgs() using a program w/ global_id: {} that was not previously created.", + cmd->program_global_id()); + TT_FATAL( + kernel_id != UINT32_MAX, + "Attempted to SetRuntimeArgs() using a kernel w/ global_id: {} that was not previously created.", + cmd->kernel_global_id()); + + auto core_spec = from_flatbuffer(cmd->core_spec()); + auto runtime_args = from_flatbuffer(cmd->args()); + SetRuntimeArgs(*program, kernel_id, core_spec, runtime_args); +} + void LightMetalReplay::execute(const tt::tt_metal::flatbuffer::SetRuntimeArgsCommand* cmd) { log_debug( tt::LogMetalTrace, diff --git a/tt_metal/impl/lightmetal/lightmetal_replay.hpp b/tt_metal/impl/lightmetal/lightmetal_replay.hpp index a2c96ecdbe8..5089a6ba999 100644 --- a/tt_metal/impl/lightmetal/lightmetal_replay.hpp +++ b/tt_metal/impl/lightmetal/lightmetal_replay.hpp @@ -33,6 +33,7 @@ struct CreateProgramCommand; struct EnqueueProgramCommand; struct CreateKernelCommand; struct SetRuntimeArgsUint32Command; +struct SetRuntimeArgsUint32VecPerCoreCommand; struct SetRuntimeArgsCommand; struct CreateCircularBufferCommand; struct LightMetalCompareCommand; @@ -76,6 +77,7 @@ class LightMetalReplay { void execute(const tt::tt_metal::flatbuffer::EnqueueProgramCommand* command); void execute(const tt::tt_metal::flatbuffer::CreateKernelCommand* command); void execute(const tt::tt_metal::flatbuffer::SetRuntimeArgsUint32Command* command); + void execute(const tt::tt_metal::flatbuffer::SetRuntimeArgsUint32VecPerCoreCommand* cmd); void execute(const tt::tt_metal::flatbuffer::SetRuntimeArgsCommand* command); void execute(const tt::tt_metal::flatbuffer::CreateCircularBufferCommand* command); void execute(const tt::tt_metal::flatbuffer::LightMetalCompareCommand* command); diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index f4d0f6cbb54..4caeae9b22c 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -1253,6 +1253,8 @@ void SetRuntimeArgs( const std::vector& core_spec, const std::vector>& runtime_args) { ZoneScoped; + LIGHT_METAL_TRACE_FUNCTION_ENTRY(); + LIGHT_METAL_TRACE_FUNCTION_CALL(CaptureSetRuntimeArgsUint32VecPerCore, program, kernel, core_spec, runtime_args); TT_FATAL( core_spec.size() == runtime_args.size(), "Mistmatch between number of cores {} and number of runtime args {} getting updated", From 50325e8744fa7b4bded441f0f0bcd338c3f8f285 Mon Sep 17 00:00:00 2001 From: asaigal Date: Mon, 10 Feb 2025 23:28:11 -0800 Subject: [PATCH 062/316] #0: Make DispatchQueryManager::get_dispatch_core thread-safe since its called in the worker threads --- tt_metal/impl/dispatch/dispatch_query_manager.cpp | 1 + tt_metal/impl/dispatch/dispatch_query_manager.hpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tt_metal/impl/dispatch/dispatch_query_manager.cpp b/tt_metal/impl/dispatch/dispatch_query_manager.cpp index 4ffa7597b31..a2d35e09f01 100644 --- a/tt_metal/impl/dispatch/dispatch_query_manager.cpp +++ b/tt_metal/impl/dispatch/dispatch_query_manager.cpp @@ -112,6 +112,7 @@ const std::vector& DispatchQueryManager::get_logical_dispatch_cores(u } tt_cxy_pair DispatchQueryManager::get_dispatch_core(uint8_t cq_id) const { + std::scoped_lock lock(modifier_mutex); if (dispatch_cores_.empty()) { for (auto cq = 0; cq < num_hw_cqs_; cq++) { // Populate when queried. 
Statically allocating at diff --git a/tt_metal/impl/dispatch/dispatch_query_manager.hpp b/tt_metal/impl/dispatch/dispatch_query_manager.hpp index 9435871461f..af091a6b427 100644 --- a/tt_metal/impl/dispatch/dispatch_query_manager.hpp +++ b/tt_metal/impl/dispatch/dispatch_query_manager.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#include + #include namespace tt::tt_metal { @@ -45,6 +47,7 @@ class DispatchQueryManager { // Make this mutable, since this is JIT populated // through a const instance when queried mutable std::vector dispatch_cores_; + mutable std::mutex modifier_mutex; }; } // namespace tt::tt_metal From bc262e5cb23a06f104969d3d7c766167b3937835 Mon Sep 17 00:00:00 2001 From: William Ly Date: Tue, 11 Feb 2025 12:16:33 -0600 Subject: [PATCH 063/316] [skip ci] #17811: Change job_success criteria so skipped jobs are not failing jobs (#17819) ### Ticket [17811](https://github.com/tenstorrent/tt-metal/issues/17811) ### Problem description Skipped jobs (such as build jobs) are being pushed as failing jobs in superset. ### What's changed Changed the criteria in the python workflow so that jobs that have github API `conclusion` field set to `success` or `skipped` have `job_success=true` ### TODO: Schema change to add `job_conclusion` as a new column in `sw_test.cicd_jobs` to distinguish between passing jobs and skipped jobs. ### Checklist - [x] New/Existing tests provide coverage for changes Unit test changes --- infra/data_collection/github/utils.py | 3 ++- infra/tests/data_collection/test_cicd.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/infra/data_collection/github/utils.py b/infra/data_collection/github/utils.py index b898ca00cd3..1761285f225 100644 --- a/infra/data_collection/github/utils.py +++ b/infra/data_collection/github/utils.py @@ -232,7 +232,8 @@ def get_job_row_from_github_job(github_job, github_job_id_to_annotations): job_end_ts = github_job["completed_at"] - job_success = github_job["conclusion"] == "success" + # skipped jobs are considered passing jobs (nothing was run) + job_success = github_job["conclusion"] in ["success", "skipped"] is_build_job = "build" in name or "build" in labels diff --git a/infra/tests/data_collection/test_cicd.py b/infra/tests/data_collection/test_cicd.py index 5256c516850..bd47c10fb37 100644 --- a/infra/tests/data_collection/test_cicd.py +++ b/infra/tests/data_collection/test_cicd.py @@ -104,8 +104,8 @@ def test_create_pipeline_json_to_detect_runner_comm_error_v1_among_other_failure failing_jobs = get_non_success_jobs_(pipeline) - # some are skipped - assert len(failing_jobs) == 4 + # some are skipped (skipped jobs are considered success) + assert len(failing_jobs) == 2 assert pipeline.github_pipeline_id == 11110261767 From f6d246107466041169fb53aa6d5e8e69a2d7af3f Mon Sep 17 00:00:00 2001 From: Dimitri Gnidash <119051828+dimitri-tenstorrent@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:34:59 -0500 Subject: [PATCH 064/316] #17433: Part 1 of Versioned Documentation PR - Checking links (#17810) ### Ticket #17433 ### Problem description This PR is the first part to make pr #17434 more digestible. We do not want the links in Readme and Installation guide to be broken and would like the users to be alerted about the breakage. ### What's changed This PR adds a markdown linter that checks for links as one of the static code checks. 
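For reviewers who want to reproduce the check before pushing, a sketch of the equivalent local invocation is below. This is an assumption, not part of the PR: it presumes the standalone `lychee` CLI is installed locally (e.g. via `cargo install lychee`) and simply mirrors the `args` passed to `lycheeverse/lychee-action` in the workflow change further down.

```bash
# Scan the same files the new CI step checks; --verbose prints each link as it is visited.
# A non-zero exit code means at least one broken link was found, which is what fails the CI job.
lychee --verbose './README.md' './INSTALLING.md' './docs/source/**/*.rst' './docs/source/**/*.md'
```
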
### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .github/workflows/all-static-checks.yaml | 13 +++++++++++++ .github/workflows/docs-latest-public.yaml | 6 ++++++ README.md | 4 ++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/.github/workflows/all-static-checks.yaml b/.github/workflows/all-static-checks.yaml index c46bb1b8c39..7f079d23b6a 100644 --- a/.github/workflows/all-static-checks.yaml +++ b/.github/workflows/all-static-checks.yaml @@ -67,6 +67,19 @@ jobs: run: sudo apt-get install -y aspell - name: Run checks on docs run: TT_METAL_HOME=$(pwd) docs/spellcheck.sh + check-docs-links: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + with: + submodules: recursive + clean: false + - name: Link Checker + uses: lycheeverse/lychee-action@v2 + with: + args: --verbose './README.md' './INSTALLING.md' './docs/source/**/*.rst' './docs/source/**/*.md' + fail: true check-forbidden-imports: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/docs-latest-public.yaml b/.github/workflows/docs-latest-public.yaml index 85e76a877c7..c092a50ffc8 100644 --- a/.github/workflows/docs-latest-public.yaml +++ b/.github/workflows/docs-latest-public.yaml @@ -83,3 +83,9 @@ jobs: continue-on-error: true with: name: github-pages + - name: Check the docs deployment is up + if: ${{ github.ref == 'refs/heads/main' }} + # TODO: Enhance this by looping over all the published versions in docs/published_versions.json + run: | + set -eu # basic shell hygiene + curl --fail -LI https://docs.tenstorrent.com/tt-metal/latest/ttnn/index.html -o /dev/null -s diff --git a/README.md b/README.md index fc4e313237a..db6c978ea98 100644 --- a/README.md +++ b/README.md @@ -74,8 +74,8 @@ |-----------------------------------------------------|-------|----------------------------------------------------|---------|----------------|---------| | [BERT-Large](./models/demos/metal_BERT_large_11/) | 12 | [e150](https://tenstorrent.com/hardware/grayskull) | 370 | 410 | | | [BERT-Large](./models/demos/metal_BERT_large_11/) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 270 | 400 | | -| [T5 small](.models/demos/grayskull/t5) | | [e150](https://tenstorrent.com/hardware/grayskull) | 140 | | | -| [Bloom](.models/demos/grayskull/functional_bloom) | | [e150](https://tenstorrent.com/hardware/grayskull) | 70 | | | +| [T5 small](./models/demos/grayskull/t5) | | [e150](https://tenstorrent.com/hardware/grayskull) | 140 | | | +| [Bloom](./models/demos/grayskull/functional_bloom) | | [e150](https://tenstorrent.com/hardware/grayskull) | 70 | | | ## Model Updates From b89d7fa171a44537822868aa63ba9f26218daefe Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Tue, 11 Feb 2025 14:47:05 -0700 Subject: [PATCH 065/316] [skip ci] Update metal-api-surface 
workflow (#17823) --- .github/workflows/metal-api-surface.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/metal-api-surface.yaml b/.github/workflows/metal-api-surface.yaml index 2a3376c1154..a295376e2c3 100644 --- a/.github/workflows/metal-api-surface.yaml +++ b/.github/workflows/metal-api-surface.yaml @@ -57,9 +57,7 @@ jobs: with: payload: | { - "text": "\nTT_METAL_API_SURFACE:\ndate: ${{ env.DATE }} \nnum_files: ${{ env.NUM_FILES }} \nnum_types: ${{ env.NUM_TYPES }} \nnum_methods: ${{ env.NUM_METHODS }}", - "owner": "U07J3K6KS1K" + "text": "date: ${{ env.DATE }} \nnum_files: ${{ env.NUM_FILES }} \nnum_types: ${{ env.NUM_TYPES }} \nnum_methods: ${{ env.NUM_METHODS }}" } env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - SLACK_CHANNEL_ID: C08BAGE4410 + SLACK_WEBHOOK_URL: ${{ secrets.METAL_API_SURFACE_WEBHOOK }} From 441142fcf5ceccb07136c58fb86fbab926f71f07 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Tue, 11 Feb 2025 15:58:53 -0700 Subject: [PATCH 066/316] [skip ci] Update t3000-nightly-tests-impl.yaml (#17778) --- .../workflows/t3000-nightly-tests-impl.yaml | 3 +-- .../scripts/t3000/run_t3000_nightly_tests.sh | 20 ------------------- 2 files changed, 1 insertion(+), 22 deletions(-) delete mode 100755 tests/scripts/t3000/run_t3000_nightly_tests.sh diff --git a/.github/workflows/t3000-nightly-tests-impl.yaml b/.github/workflows/t3000-nightly-tests-impl.yaml index d2bc182e92f..b09dfcc6318 100644 --- a/.github/workflows/t3000-nightly-tests-impl.yaml +++ b/.github/workflows/t3000-nightly-tests-impl.yaml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: test-group: [ - { name: "t3k_ccl_tests", arch: wormhole_b0, cmd: run_t3000_ccl_tests, timeout: 180, owner_id: ULMEPM2MA}, # Sean Nijjar + { name: "t3k_ccl_tests", arch: wormhole_b0, cmd: pytest -n auto tests/nightly/t3000/ccl, timeout: 180, owner_id: ULMEPM2MA}, # Sean Nijjar ] name: ${{ matrix.test-group.name }} @@ -46,7 +46,6 @@ jobs: source ${{ github.workspace }}/python_env/bin/activate cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME - source ${{ github.workspace }}/tests/scripts/t3000/run_t3000_nightly_tests.sh ${{ matrix.test-group.cmd }} - uses: ./.github/actions/slack-report if: ${{ failure() }} diff --git a/tests/scripts/t3000/run_t3000_nightly_tests.sh b/tests/scripts/t3000/run_t3000_nightly_tests.sh deleted file mode 100755 index 006555e0cf8..00000000000 --- a/tests/scripts/t3000/run_t3000_nightly_tests.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -run_t3000_ccl_tests() { - # Record the start time - fail=0 - start_time=$(date +%s) - - echo "LOG_METAL: Running run_t3000_ccl_tests" - - # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size - pytest -n auto tests/nightly/t3000/ccl --timeout=180 ; fail+=$? 
- - # Record the end time - end_time=$(date +%s) - duration=$((end_time - start_time)) - echo "LOG_METAL: run_t3000_ccl_tests $duration seconds to complete" - if [[ $fail -ne 0 ]]; then - exit 1 - fi -} From d2f0b15273732d0c987b9cb83cfca4673aa096af Mon Sep 17 00:00:00 2001 From: Mark O'Connor Date: Wed, 12 Feb 2025 04:25:53 +0100 Subject: [PATCH 067/316] [TT-Transformer] Add HF_MODEL to load models directly from huggingface Co-authored-by: mtairum --- models/demos/llama3/PERF.md | 16 +++---- .../demos/llama3/tests/test_llama_accuracy.py | 29 ++++++------- models/demos/llama3/tt/llama_attention.py | 6 +-- models/demos/llama3/tt/load_checkpoints.py | 11 +++-- models/demos/llama3/tt/model_config.py | 42 +++++++++++++++---- 5 files changed, 67 insertions(+), 37 deletions(-) diff --git a/models/demos/llama3/PERF.md b/models/demos/llama3/PERF.md index 2aefa56be3c..8fb3be2baf7 100644 --- a/models/demos/llama3/PERF.md +++ b/models/demos/llama3/PERF.md @@ -11,16 +11,16 @@ This configuration uses bfp4 MLP FF1+FF3 for all models. | Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | |----------------|--------|-----------|-----------|---------------| | Llama3.2-1B | N150 | 89 | 98 | 86.9 | -| Llama3.2-1B | N300 | 91 | 98 | 104.3 | -| Llama3.2-1B | T3K | 91 | 98 | 118.5 | +| Llama3.2-1B | N300 | 90 | 98 | 104.3 | +| Llama3.2-1B | T3K | 87 | 98 | 118.5 | | Llama3.2-1B | TG | | | 72.3 | -| Llama3.2-3B | N150 | 92 | 96 | 53.3 | +| Llama3.2-3B | N150 | 91 | 96 | 53.3 | | Llama3.2-3B | N300 | 91 | 96 | 66.1 | | Llama3.2-3B | T3K | 91 | 96 | 66.9 | | Llama3.2-3B | TG | | | 48.5 | | Llama3.1-8B | N150 | 87 | 99 | 27.9 | | Llama3.1-8B | N300 | 88 | 99 | 43.7 | -| Llama3.1-8B | T3K | 88 | 100 | 64.2 | +| Llama3.1-8B | T3K | 88 | 99 | 64.2 | | Llama3.1-8B | TG | | | 41.0 | | Llama3.2-11B | N300 | 89 | 99 | 43.5 | | Llama3.2-11B | T3K | 88 | 99 | 63.4 | @@ -37,12 +37,12 @@ This configuration uses bfp4 MLP FF1+FF3 only for the Llama-3.1-70B model and th | Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | |----------------|--------|-----------|-----------|---------------| | Llama3.2-1B | N150 | 88 | 98 | 86.8 | -| Llama3.2-1B | N300 | 90 | 98 | 98.1 | -| Llama3.2-1B | T3K | 90 | 98 | 97.5 | +| Llama3.2-1B | N300 | 88 | 98 | 98.1 | +| Llama3.2-1B | T3K | 89 | 99 | 97.5 | | Llama3.2-1B | TG | 87 | 98 | 51.3 | -| Llama3.2-3B | N150 | 93 | 99 | 44.2 | +| Llama3.2-3B | N150 | 92 | 99 | 44.2 | | Llama3.2-3B | N300 | 92 | 98 | 54.2 | -| Llama3.2-3B | T3K | 93 | 98 | 55.6 | +| Llama3.2-3B | T3K | 91 | 100 | 55.6 | | Llama3.2-3B | TG | 91 | 98 | 33.6 | | Llama3.1-8B | N150 | 93 | 100 | 23.6 | | Llama3.1-8B | N300 | 93 | 100 | 34.5 | diff --git a/models/demos/llama3/tests/test_llama_accuracy.py b/models/demos/llama3/tests/test_llama_accuracy.py index d0fd2d2a15b..5a40dec57ac 100644 --- a/models/demos/llama3/tests/test_llama_accuracy.py +++ b/models/demos/llama3/tests/test_llama_accuracy.py @@ -157,7 +157,7 @@ def test_tt_model_acc( text = f.read() # Encode text to tokens - encoded_tokens = tokenizer.encode(text, bos=True, eos=False) + encoded_tokens = model_args.encode_prompt(text, system_prompt_text=None, instruct=False) total_length = prefill_len + decode_len + 1 reference_tokens = torch.tensor(encoded_tokens[:total_length]).unsqueeze(0) top5_tokens = None # Will be computed during inference @@ -439,17 +439,18 @@ def test_tt_model_acc( true_word = sanitize(tokenizer.decode([true_token])) logger.info(f"{error['position']}: {context}[{incorrect}] != [{expected}], true: [{true_word}]") - # Get accuracy thresholds 
from PERF.md - min_top1_acc, min_top5_acc = get_accuracy_thresholds( - model_args.base_model_name, - model_args.device_name, - optimizations, - ) + if use_reference_file: + # Get accuracy thresholds from PERF.md + min_top1_acc, min_top5_acc = get_accuracy_thresholds( + model_args.base_model_name, + model_args.device_name, + optimizations, + ) - logger.info(f"Top-1: {total_top1_acc:.0f}% | Top-5: {total_top5_acc:.0f}%") - assert ( - total_top1_acc >= min_top1_acc - ), f"Top-1 accuracy {total_top1_acc:.1f}% is too low (expected >={min_top1_acc}%)" - assert ( - total_top5_acc >= min_top5_acc - ), f"Top-5 accuracy {total_top5_acc:.1f}% is too low (expected >={min_top5_acc}%)" + logger.info(f"Top-1: {total_top1_acc:.0f}% | Top-5: {total_top5_acc:.0f}%") + assert ( + total_top1_acc >= min_top1_acc + ), f"Top-1 accuracy {total_top1_acc:.1f}% is too low (expected >={min_top1_acc}%)" + assert ( + total_top5_acc >= min_top5_acc + ), f"Top-5 accuracy {total_top5_acc:.1f}% is too low (expected >={min_top5_acc}%)" diff --git a/models/demos/llama3/tt/llama_attention.py b/models/demos/llama3/tt/llama_attention.py index ac67c80f1c2..a8c8581dc98 100644 --- a/models/demos/llama3/tt/llama_attention.py +++ b/models/demos/llama3/tt/llama_attention.py @@ -8,8 +8,6 @@ import ttnn from models.common.lightweightmodule import LightweightModule from models.demos.llama3.tt.llama_ccl import tt_all_reduce, tt_all_gather -from models.demos.llama3.tt.llama_common import first_five -from models.demos.llama3.tt.load_checkpoints import permute class TtLlamaAttention(LightweightModule): @@ -138,7 +136,9 @@ def __init__( ) # as_tensor returns (32, dim) which is incorrect, this reshape updates the padded size to the correct size self.wqkv_bias_prefill = ttnn.reshape( - self.wqkv_bias_prefill, ttnn.Shape([1, 1, 1, self.wqkv_bias_prefill.shape[-1]]) + self.wqkv_bias_prefill, + (1, 1, 1, self.wqkv_bias_prefill.shape[-1]), + (1, 1, self.wqkv_bias_prefill.shape[-2], self.wqkv_bias_prefill.shape[-1]), ) # Broadcasting does not seem to be supported inside execute_trace so expand to the whole batch size diff --git a/models/demos/llama3/tt/load_checkpoints.py b/models/demos/llama3/tt/load_checkpoints.py index 7e330a2e18d..f85788ee1e3 100644 --- a/models/demos/llama3/tt/load_checkpoints.py +++ b/models/demos/llama3/tt/load_checkpoints.py @@ -37,13 +37,16 @@ def load_hf_state_dict(ckpt_dir): raise FileNotFoundError(f"Neither model.safetensors.index.json nor model.safetensors found in {ckpt_dir}") loaded_weights = safetensors_load_file(safetensor_path) - if not "lm_head.weight" in loaded_weights: - # Assume tied to the embeddings if not present - loaded_weights["lm_head.weight"] = loaded_weights["model.embed_tokens.weight"] - return loaded_weights +def standardize_hf_keys(state_dict): + if not "lm_head.weight" in state_dict: + # Assume tied to the embeddings if not present + state_dict["lm_head.weight"] = state_dict["model.embed_tokens.weight"] + return state_dict + + def convert_hf_to_meta(state_dict, head_dim): state_dict = convert_hf_qkv_to_meta_format(state_dict, head_dim) state_dict = map_hf_to_meta_keys(state_dict) diff --git a/models/demos/llama3/tt/model_config.py b/models/demos/llama3/tt/model_config.py index db7b9e207c5..c58ea0a9eaa 100644 --- a/models/demos/llama3/tt/model_config.py +++ b/models/demos/llama3/tt/model_config.py @@ -31,6 +31,7 @@ convert_hf_to_meta, convert_meta_to_hf, reverse_permute, + standardize_hf_keys, ) @@ -114,8 +115,10 @@ def __init__( self.max_batch_size = max_batch_size self.tile_size = 32 self.is_70b 
= False + self.from_hf_url = False # updated below if true LLAMA_DIR = os.getenv("LLAMA_DIR") + HF_MODEL = os.getenv("HF_MODEL") if LLAMA_DIR: if any([os.getenv("LLAMA_CKPT_DIR"), os.getenv("LLAMA_TOKENIZER_PATH"), os.getenv("LLAMA_CACHE_PATH")]): logger.warning( @@ -125,10 +128,18 @@ def __init__( self.DEFAULT_TOKENIZER_PATH = LLAMA_DIR self.DEFAULT_CACHE_PATH = os.path.join(LLAMA_DIR, self.device_name) self.model_name = os.path.basename(LLAMA_DIR) # May be overridden by config + elif HF_MODEL: + self.DEFAULT_CKPT_DIR = HF_MODEL + self.DEFAULT_TOKENIZER_PATH = HF_MODEL + self.DEFAULT_CACHE_PATH = os.getenv("LLAMA_CACHE_PATH") + if not self.DEFAULT_CACHE_PATH: + self.DEFAULT_CACHE_PATH = os.path.join("model_cache", HF_MODEL, self.device_name) + self.model_name = HF_MODEL # May be overridden by config + self.from_hf_url = True else: assert "Please set $LLAMA_DIR to a valid checkpoint directory" - if not dummy_weights: + if not dummy_weights and not HF_MODEL: # Assert if all folders and files exist assert os.path.exists( self.DEFAULT_CKPT_DIR @@ -157,7 +168,10 @@ def __init__( self.instruct = True # Load model params - if not dummy_weights: + if HF_MODEL: + self.checkpoint_type = CheckpointType.HuggingFace + self._set_hf_params(self.DEFAULT_CKPT_DIR) + elif not dummy_weights: self.checkpoint_type = self.detect_checkpoint_type() self._set_model_params(self.DEFAULT_CKPT_DIR) else: # With Dummy weights, set the params from the local copy inside the model folder. This is required for CI pipeline that doesn't mount the external folders. @@ -1107,10 +1121,15 @@ def _set_llama_params(self, checkpoint_dir): self.orig_context_len = 8192 def _set_hf_params(self, checkpoint_dir): - config_file = os.path.join(checkpoint_dir, "config.json") - assert os.path.exists(config_file), f"config.json file not found at {config_file}" - with open(config_file, "r") as f: - config = json.load(f) + if self.from_hf_url: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(self.model_name).to_dict() + else: + config_file = os.path.join(checkpoint_dir, "config.json") + assert os.path.exists(config_file), f"config.json file not found at {config_file}" + with open(config_file, "r") as f: + config = json.load(f) self._set_params_from_dict(config) def __repr__(self): @@ -1172,7 +1191,14 @@ def load_state_dict(self): state_dict = load_meta_state_dict(self.DEFAULT_CKPT_DIR, self.n_layers) else: assert self.checkpoint_type == CheckpointType.HuggingFace - state_dict = load_hf_state_dict(self.DEFAULT_CKPT_DIR) + if self.from_hf_url: + from transformers import AutoModelForCausalLM + + model = AutoModelForCausalLM.from_pretrained(self.DEFAULT_CKPT_DIR) + state_dict = model.state_dict() + else: + state_dict = load_hf_state_dict(self.DEFAULT_CKPT_DIR) + state_dict = standardize_hf_keys(state_dict) state_dict = convert_hf_to_meta(state_dict, self.head_dim) keys_dict = list(state_dict.keys())[:] remv = [f"layers.{i}." 
for i in list(range(self.n_layers, self.full_model_n_layers))] @@ -1210,7 +1236,7 @@ def matmul_config( ) # TODO: Needed for TG hang workaround if in0_block_w is None: - in0_block_w = min(4, max(1, k // (self.tile_size * grid_size[0]))) + in0_block_w = self.find_largest_divisor(k // (self.tile_size * grid_size[1])) return ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( compute_with_storage_grid_size=grid_size, From d221ac28c4ed0d25616f5e6aeff50c12b9b9592b Mon Sep 17 00:00:00 2001 From: William Ly Date: Tue, 11 Feb 2025 21:48:54 -0600 Subject: [PATCH 068/316] #17731: generate gtest testcase xml and upload as artifacts during cpp/sd unit test workflows (#17732) ### Ticket [17731 ](https://github.com/tenstorrent/tt-metal/issues/17731) ### Problem description C++ test data from cpp-unit-tests and sd-unit-tests do not get uploaded to superset since they're: - not generating test result xml artifacts that get read in during the produce_data workflow - running with gtest instead of pytest - current produce_data flow only supports pytest test result format (junit test xml) ### What's changed - Create and upload test result artifacts during cpp and sd unit test workflow ### Checklist SD unit tests: https://github.com/tenstorrent/tt-metal/actions/runs/13209167409 C++ tests: https://github.com/tenstorrent/tt-metal/actions/runs/13209164955 --- .github/workflows/build-and-unit-tests.yaml | 8 ++++++++ .github/workflows/cpp-post-commit.yaml | 8 ++++++++ infra/data_collection/github/workflows.py | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-unit-tests.yaml b/.github/workflows/build-and-unit-tests.yaml index 20d649d5d49..aa0a14264b4 100644 --- a/.github/workflows/build-and-unit-tests.yaml +++ b/.github/workflows/build-and-unit-tests.yaml @@ -87,16 +87,24 @@ jobs: -e TT_METAL_HOME=${{ github.workspace }} -e TT_METAL_SLOW_DISPATCH_MODE=1 -e LD_LIBRARY_PATH=${{ github.workspace }}/build/lib + -e GTEST_OUTPUT=xml:generated/test_reports/ run_args: | pip install --force-reinstall pip==21.2.4 pip install -r tt_metal/python_env/requirements-dev.txt pip install -e . + mkdir -p generated/test_reports ${{ matrix.test-group.cmd }} - uses: ./.github/actions/slack-report if: ${{ failure() }} with: slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }} owner: U06CXU895AP # Michael Chiou + - uses: ./.github/actions/upload-artifact-with-job-uuid + if: ${{ !cancelled() }} + with: + path: | + generated/test_reports/ + prefix: "test_reports_" - name: Generate system logs on failure uses: ./.github/actions/generate-system-logs if: ${{ failure() }} diff --git a/.github/workflows/cpp-post-commit.yaml b/.github/workflows/cpp-post-commit.yaml index 0feaa3b80cb..93744a0bc7b 100644 --- a/.github/workflows/cpp-post-commit.yaml +++ b/.github/workflows/cpp-post-commit.yaml @@ -94,16 +94,24 @@ jobs: -e TT_METAL_HOME=${{ github.workspace }} -e ARCH_NAME=${{ inputs.arch }} -e LD_LIBRARY_PATH=${{ github.workspace }}/build/lib + -e GTEST_OUTPUT=xml:generated/test_reports/ run_args: | pip install --force-reinstall pip==21.2.4 pip install -r tt_metal/python_env/requirements-dev.txt pip install -e . 
+ mkdir -p generated/test_reports ${{ matrix.test-group.cmd }} - uses: ./.github/actions/slack-report if: ${{ failure() }} with: slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }} owner: U06CXU895AP # Michael Chiou + - uses: ./.github/actions/upload-artifact-with-job-uuid + if: ${{ !cancelled() }} + with: + path: | + generated/test_reports/ + prefix: "test_reports_" - name: Generate system logs on failure uses: ./.github/actions/generate-system-logs if: ${{ failure() }} diff --git a/infra/data_collection/github/workflows.py b/infra/data_collection/github/workflows.py index d5a2ea5adf7..64bf9bb0d0a 100644 --- a/infra/data_collection/github/workflows.py +++ b/infra/data_collection/github/workflows.py @@ -237,4 +237,5 @@ def get_tests_from_test_report_path(test_report_path): return tests else: - raise Exception("We only support pytest junit xml outputs for now") + logger.warning("XML is not pytest junit format (gtest?), skipping for now") + return [] From 8b265e985f016d5741fd7eb5badfb786a63640f8 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Wed, 12 Feb 2025 01:36:58 -0600 Subject: [PATCH 069/316] [skip ci] Fix the version tag in python wheel (#17830) --- .github/workflows/build-artifact.yaml | 1 + .github/workflows/package-and-release.yaml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml index c9fed1b5405..5d8b458c636 100644 --- a/.github/workflows/build-artifact.yaml +++ b/.github/workflows/build-artifact.yaml @@ -143,6 +143,7 @@ jobs: - name: ⬇️ Checkout uses: actions/checkout@v4 with: + fetch-depth: 0 submodules: recursive path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end diff --git a/.github/workflows/package-and-release.yaml b/.github/workflows/package-and-release.yaml index c5dfdcb0f50..6a44ac31ded 100644 --- a/.github/workflows/package-and-release.yaml +++ b/.github/workflows/package-and-release.yaml @@ -14,6 +14,8 @@ jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml secrets: inherit + with: + build-wheel: true build-artifact-profiler: uses: ./.github/workflows/build-artifact.yaml with: From a0fa9d0bda8ea1558996e872cef78acfa9f1e977 Mon Sep 17 00:00:00 2001 From: Virdhatchani Narayanamoorthy <138196495+VirdhatchaniKN@users.noreply.github.com> Date: Wed, 12 Feb 2025 13:49:09 +0530 Subject: [PATCH 070/316] #17768: Documentation update for Batch Normalization (#17818) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Ticket https://github.com/tenstorrent/tt-metal/issues/17768 ### What's changed Documentation update for BN Screenshot 2025-02-09 at 4 57 27 PM ### Checklist - [ ] [All post-commit tests](https://github.com/tenstorrent/tt-metal/actions/runs/13261168764) - [ ] [Blackhole post-commit tests](https://github.com/tenstorrent/tt-metal/actions/runs/13261170584) - [ ] [(Single-card) Tests for new models](https://github.com/tenstorrent/tt-metal/actions/runs/13261171898) - [ ] [(Single-card) Demo tests](https://github.com/tenstorrent/tt-metal/actions/runs/13261173226) - [ ] [(Single-card) Device perf regressions](https://github.com/tenstorrent/tt-metal/actions/runs/13261174799) - [ ] [(Single-card) Model perf tests](https://github.com/tenstorrent/tt-metal/actions/runs/13261177707) --- .../batch_norm/batch_norm_pybind.cpp | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git 
a/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm_pybind.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm_pybind.cpp index 2523f8b15c5..0a9250ac123 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm_pybind.cpp @@ -37,6 +37,32 @@ void bind_batch_norm_operation(pybind11::module& module) { ttnn.Tensor: the output tensor. + Note: + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - BFLOAT16, FLOAT32 + - TILE + - 4 + + + Example: + + >>> input_tensor = ttnn.from_torch(torch.rand([2, 3, 4, 5], dtype=torch.bfloat16)), layout=ttnn.TILE_LAYOUT, device=device) + >>> running_mean = ttnn.from_torch(torch.rand([1, 3, 1, 1], dtype=torch.bfloat16)), layout=ttnn.TILE_LAYOUT, device=device) + >>> running_var = ttnn.from_torch(torch.rand([1, 3, 1, 1], dtype=torch.bfloat16)), layout=ttnn.TILE_LAYOUT, device=device) + >>> weight = ttnn.from_torch(torch.rand([1, 3, 1, 1], dtype=torch.bfloat16)), layout=ttnn.TILE_LAYOUT, device=device) + >>> bias = ttnn.from_torch(torch.rand([1, 3, 1, 1], dtype=torch.bfloat16)), layout=ttnn.TILE_LAYOUT, device=device) + >>> eps = 1e-05 + >>> momentum = 0.1 + >>> output = ttnn.batch_norm(input_tensor, running_mean = running_mean, running_var = running_var, weight = weight, bias = bias, eps = eps, momentum = momentum, training = True) + + )doc", ttnn::pybind_arguments_t{ py::arg("input"), From 21f589b2fedd8b79c08cf805b8b4e8e8f0937f28 Mon Sep 17 00:00:00 2001 From: William Ly Date: Wed, 12 Feb 2025 11:37:02 -0600 Subject: [PATCH 071/316] [skip ci] #0: Fix crash due to strict xml filename checking (#17842) Fix crash due to strict filename checking (gtest xmls don't necessarily match the name) ### Ticket Link to Github Issue ### Problem description Temporary fix for the strict filename checking (https://github.com/tenstorrent/tt-metal/actions/runs/13290740712/job/37110666839). 
Correct checking of all xml paths is WIP in https://github.com/tenstorrent/tt-metal/tree/williamly/data-pipeline-gtest-upload ### What's changed Add try-except block to catch FileNotFound exception thrown ### Checklist - [x] New/Existing tests provide coverage for changes https://github.com/tenstorrent/tt-metal/actions/runs/13291104817/job/37111891262 --- infra/data_collection/github/workflows.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/infra/data_collection/github/workflows.py b/infra/data_collection/github/workflows.py index 64bf9bb0d0a..0fc9a823a5a 100644 --- a/infra/data_collection/github/workflows.py +++ b/infra/data_collection/github/workflows.py @@ -24,9 +24,15 @@ def get_workflow_run_uuids_to_test_reports_paths_(workflow_outputs_dir, workflow assert test_report_dir.is_dir(), f"{test_report_dir} is not dir" test_report_uuid = test_report_dir.name.replace("test_reports_", "") - workflow_run_test_reports_path[test_report_uuid] = (test_report_dir / "most_recent_tests.xml").resolve( - strict=True - ) + + try: + xml_file_paths = (test_report_dir / "most_recent_tests.xml").resolve(strict=True) + except FileNotFoundError as e: + logger.warning( + f"no pytest xml file found matching most_recent_tests.xml (likely gtest xml) in {test_report_dir}" + ) + else: + workflow_run_test_reports_path[test_report_uuid] = xml_file_paths return workflow_run_test_reports_path From a71646441b2d92757dd7ab62d3bfdf2b6816bb9b Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Wed, 12 Feb 2025 11:46:50 -0600 Subject: [PATCH 072/316] [skip ci] Dockerize tt-train cpp tests workflow (#17834) --- .../workflows/all-post-commit-workflows.yaml | 1 + .../tt-train-post-commit-wrapper.yaml | 1 + .github/workflows/tt-train-post-commit.yaml | 78 ++++++++++--------- 3 files changed, 44 insertions(+), 36 deletions(-) diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml index f4bd6f0dc6d..e873132cdb1 100644 --- a/.github/workflows/all-post-commit-workflows.yaml +++ b/.github/workflows/all-post-commit-workflows.yaml @@ -157,6 +157,7 @@ jobs: with: arch: ${{ matrix.test-group.arch }} runner-label: ${{ matrix.test-group.runner-label }} + docker-image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} run-profiler-regression: needs: build-artifact-profiler strategy: diff --git a/.github/workflows/tt-train-post-commit-wrapper.yaml b/.github/workflows/tt-train-post-commit-wrapper.yaml index 1e101a10725..b9acb83f608 100644 --- a/.github/workflows/tt-train-post-commit-wrapper.yaml +++ b/.github/workflows/tt-train-post-commit-wrapper.yaml @@ -26,3 +26,4 @@ jobs: with: arch: ${{ matrix.test-group.arch}} runner-label: ${{ matrix.test-group.runner-label}} + docker-image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} diff --git a/.github/workflows/tt-train-post-commit.yaml b/.github/workflows/tt-train-post-commit.yaml index 1ecdcabfd17..d8de5434479 100644 --- a/.github/workflows/tt-train-post-commit.yaml +++ b/.github/workflows/tt-train-post-commit.yaml @@ -9,23 +9,9 @@ on: runner-label: required: true type: string - timeout: - required: false - type: number - default: 20 - workflow_dispatch: - inputs: - arch: - required: true - type: choice - options: - - wormhole_b0 - runner-label: + docker-image: required: true - type: choice - options: - - N150 - - N300 + type: string timeout: required: false type: number @@ -42,39 +28,59 @@ jobs: {name: tt-train, cmd: ctest --no-tests=error --output-on-failure}, ] name: ${{ 
matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }} - env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ inputs.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - TEST_DATA_DIR: ${{ github.workspace }}/data - ENABLE_CI_ONLY_TT_TRAIN_TESTS: 1 runs-on: - ${{ inputs.runner-label }} - cloud-virtual-machine - in-service + container: + image: ${{ inputs.docker-image }} + env: + TT_METAL_HOME: /work + LD_LIBRARY_PATH: /work/build/lib + TEST_DATA_DIR: /work/data + ENABLE_CI_ONLY_TT_TRAIN_TESTS: 1 + volumes: + - ${{ github.workspace }}/docker-job:/work # Subdir to workaround https://github.com/actions/runner/issues/691 + - /dev/hugepages-1G:/dev/hugepages-1G + options: "--device /dev/tenstorrent" + defaults: + run: + shell: bash + working-directory: /work # https://github.com/actions/runner/issues/878 steps: - - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - - uses: ./.github/actions/prepare-metal-run + - name: ⬇️ Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end + - uses: actions/download-artifact@v4 + with: + name: TTMetal_build_any + path: docker-job + - name: Extract files + run: tar -xvf ttm_any.tar - name: ${{ matrix.test-group.name }} tests timeout-minutes: ${{ inputs.timeout }} run: | - source ${{ github.workspace }}/python_env/bin/activate - export PYTHONPATH=$TT_METAL_HOME - cd $TT_METAL_HOME cp ./build/tt-train/3rd_party/wandb-cpp/libwandbcpp.so build/lib/ - find ./build -type f -name "*.tcl" -o -name "*.cmake" -exec sed -i "s|/home/ubuntu/[^/]*/_work/tt-metal/tt-metal/build_Release|${TT_METAL_HOME}/build|g" {} + - cd $TT_METAL_HOME/build/tt-train + find ./build -type f -name "*.tcl" -o -name "*.cmake" -exec sed -i "s|/work/build_Release|/work/build|g" {} + + cd /work/build/tt-train ldd tests/ttml_tests || true ${{ matrix.test-group.cmd }} + - uses: ./.github/actions/slack-report if: ${{ failure() }} with: slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }} owner: U07ASPTGJTS # Denys - - name: Generate system logs on failure - uses: ./.github/actions/generate-system-logs - if: ${{ failure() }} + + - name: Cleanup + if: always() + run: | + # We are forced to checkout the repo into a subdir of the host's workdir; this pollutes the host + # with root-owned files. Be sure to clean up after ourselves in case we're on a non-ephemeral runner. 
+ echo "pre rm" + ls -al /__w/tt-metal/tt-metal + rm -rf /__w/tt-metal/tt-metal/docker-job + echo "post rm" + ls -al /__w/tt-metal/tt-metal From 66603f29f8846cb9af7070850dbc99d45780ed72 Mon Sep 17 00:00:00 2001 From: Mark O'Connor Date: Wed, 12 Feb 2025 19:09:22 +0000 Subject: [PATCH 073/316] #0: Fall back to non-instruct prompt encoding if required --- models/demos/llama3/tt/model_config.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/models/demos/llama3/tt/model_config.py b/models/demos/llama3/tt/model_config.py index c58ea0a9eaa..dceb72a2ecf 100644 --- a/models/demos/llama3/tt/model_config.py +++ b/models/demos/llama3/tt/model_config.py @@ -1540,9 +1540,13 @@ def encode_prompt(self, prompt_text, system_prompt_text=None, instruct=True): return self.tokenizer.encode(prompt_text, bos=True, eos=False) else: if instruct: - return encode_prompt_hf(self.tokenizer, prompt_text, system_prompt_text) - else: - return self.tokenizer.encode(prompt_text, add_special_tokens=False) + try: + return encode_prompt_hf(self.tokenizer, prompt_text, system_prompt_text) + except ValueError as e: + logger.warning(f"Failed to encode chat prompt, are you sure this is an instruct model? Error: {e}") + logger.warning(f"Falling back to base model encoding with no chat template") + + return self.tokenizer.encode(prompt_text, add_special_tokens=False) def reference_lm_head(self): if self.checkpoint_type == CheckpointType.Meta: From 56dc66f1394d376b3c571e2a9b0e58b3623ccff6 Mon Sep 17 00:00:00 2001 From: Dimitri Gnidash <119051828+dimitri-tenstorrent@users.noreply.github.com> Date: Wed, 12 Feb 2025 14:40:35 -0500 Subject: [PATCH 074/316] #17846: Make build_artifact dependent on create-tag (#17847) ### Ticket #17846 ### Problem description The build file created would have the older tag being part of the name. See example in https://github.com/tenstorrent/tt-metal/releases/tag/v0.56.0-rc24 with the wheel file having the name of `ttnn-0.56.0rc19.dev3+any-cp38-cp38-linux_x86_64.whl` ### What's changed In setup.py, there is a class that determines the version name and it relies on inferring the available tags in the git checkout. We have unbound the job named `build-artifact` from `create-tag`. This meant that the tag was not created before the wheel was created. Adding the dependency fixed the problem. 
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .github/workflows/package-and-release.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/package-and-release.yaml b/.github/workflows/package-and-release.yaml index 6a44ac31ded..0a1c6cbd8ea 100644 --- a/.github/workflows/package-and-release.yaml +++ b/.github/workflows/package-and-release.yaml @@ -12,6 +12,7 @@ permissions: jobs: build-artifact: + needs: create-tag uses: ./.github/workflows/build-artifact.yaml secrets: inherit with: From 8acfe7ce2c7f0fa06ecde56250568cc396eace47 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:59:02 -0600 Subject: [PATCH 075/316] Create workflow to mirror a branch from fork (#17856) ### Ticket None ### Problem description We can't run CI on a PR from a branch coming from a fork. This is the typical workflow when community creates a PR. ### What's changed Adding a workflow that helps to mirror a branch. ### Checklist - [ ] Workflow run --- .github/workflows/mirror-fork-branch.yaml | 83 +++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 .github/workflows/mirror-fork-branch.yaml diff --git a/.github/workflows/mirror-fork-branch.yaml b/.github/workflows/mirror-fork-branch.yaml new file mode 100644 index 00000000000..0e5da31c18d --- /dev/null +++ b/.github/workflows/mirror-fork-branch.yaml @@ -0,0 +1,83 @@ +name: Mirror Fork Branch to Origin + +on: + workflow_dispatch: + inputs: + source: + description: 'Source in format : (e.g., user:branch)' + required: true + target_branch: + description: > + Optional. Target branch name in origin. If not provided, the branch will be named + `mirror//`. + required: false + +jobs: + mirror-fork-branch: + runs-on: ubuntu-latest + steps: + - name: Parse input + id: parse_input + shell: bash + run: | + # Expect input format: : + IFS=":" read -r FORK_OWNER SRC_BRANCH <<< "${{ github.event.inputs.source }}" + if [ -z "$FORK_OWNER" ] || [ -z "$SRC_BRANCH" ]; then + echo "Error: Input must be in the format :" + exit 1 + fi + # Derive the fork repository name from the current repository. 
+ ORIGIN_REPO_NAME=$(echo "${{ github.repository }}" | cut -d'/' -f2) + FORK_REPO="${FORK_OWNER}/${ORIGIN_REPO_NAME}" + echo "FORK_OWNER: $FORK_OWNER" + echo "Source branch: $SRC_BRANCH" + echo "Fork repository: $FORK_REPO" + echo "fork_owner=$FORK_OWNER" >> $GITHUB_OUTPUT + echo "src_branch=$SRC_BRANCH" >> $GITHUB_OUTPUT + echo "fork_repo=$FORK_REPO" >> $GITHUB_OUTPUT + + - name: Checkout base repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Configure Git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Add fork remote and fetch branch + run: | + echo "Adding remote for fork: ${{ steps.parse_input.outputs.fork_repo }}" + git remote add fork https://github.com/${{ steps.parse_input.outputs.fork_repo }}.git || echo "Remote 'fork' already exists" + echo "Fetching branch: ${{ steps.parse_input.outputs.src_branch }}" + git fetch fork ${{ steps.parse_input.outputs.src_branch }} + + - name: Create or update local branch from fork branch + id: create_branch + shell: bash + run: | + # Determine the target branch name. + if [ -n "${{ github.event.inputs.target_branch }}" ]; then + TARGET_BRANCH="${{ github.event.inputs.target_branch }}" + else + TARGET_BRANCH="mirror/${{ steps.parse_input.outputs.fork_owner }}/${{ steps.parse_input.outputs.src_branch }}" + fi + echo "Using target branch: $TARGET_BRANCH" + + # If the branch exists locally, reset it; otherwise, create it. + if git show-ref --verify --quiet "refs/heads/$TARGET_BRANCH"; then + echo "Branch '$TARGET_BRANCH' exists. Updating with latest commits from fork." + git checkout "$TARGET_BRANCH" + git reset --hard "fork/${{ steps.parse_input.outputs.src_branch }}" + else + echo "Branch '$TARGET_BRANCH' does not exist. Creating it from fork branch." + git checkout -b "$TARGET_BRANCH" "fork/${{ steps.parse_input.outputs.src_branch }}" + fi + echo "target_branch=$TARGET_BRANCH" >> $GITHUB_OUTPUT + + - name: Push branch to origin + run: | + TARGET_BRANCH="${{ steps.create_branch.outputs.target_branch }}" + echo "Pushing branch '$TARGET_BRANCH' to origin" + git push origin "$TARGET_BRANCH" --force From 32d2c2592b5a2e6daab13f2393807be1a64226c9 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 22:23:25 -0600 Subject: [PATCH 076/316] Create a comment in a PR from a fork when branch is mirrored (#17857) ### Ticket None ### What's changed We now will post a comment when a fork-branch is mirrored --- .github/workflows/mirror-fork-branch.yaml | 34 +++++++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mirror-fork-branch.yaml b/.github/workflows/mirror-fork-branch.yaml index 0e5da31c18d..f89b1c7c14a 100644 --- a/.github/workflows/mirror-fork-branch.yaml +++ b/.github/workflows/mirror-fork-branch.yaml @@ -4,7 +4,7 @@ on: workflow_dispatch: inputs: source: - description: 'Source in format : (e.g., user:branch)' + description: 'Source in format :' required: true target_branch: description: > @@ -64,8 +64,7 @@ jobs: TARGET_BRANCH="mirror/${{ steps.parse_input.outputs.fork_owner }}/${{ steps.parse_input.outputs.src_branch }}" fi echo "Using target branch: $TARGET_BRANCH" - - # If the branch exists locally, reset it; otherwise, create it. + # If the branch exists locally, update it; if not, create it. if git show-ref --verify --quiet "refs/heads/$TARGET_BRANCH"; then echo "Branch '$TARGET_BRANCH' exists. 
Updating with latest commits from fork." git checkout "$TARGET_BRANCH" @@ -81,3 +80,32 @@ jobs: TARGET_BRANCH="${{ steps.create_branch.outputs.target_branch }}" echo "Pushing branch '$TARGET_BRANCH' to origin" git push origin "$TARGET_BRANCH" --force + + - name: Find PR + id: pr + shell: bash + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Look for a PR with head "fork_owner:src_branch" + HEAD_QUERY="${{ steps.parse_input.outputs.fork_owner }}:${{ steps.parse_input.outputs.src_branch }}" + echo "Searching for PR with head: ${HEAD_QUERY}" + PR_JSON=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/${{ github.repository }}/pulls?head=${HEAD_QUERY}") + PR_NUMBER=$(echo "$PR_JSON" | jq '.[0].number // empty') + if [ -z "$PR_NUMBER" ]; then + echo "No PR found" + echo "issue=" >> $GITHUB_OUTPUT + else + echo "Found PR #$PR_NUMBER" + echo "issue=$PR_NUMBER" >> $GITHUB_OUTPUT + fi + + - name: Post comment on PR + if: steps.pr.outputs.issue != '' + uses: mshick/add-pr-comment@v2 + with: + issue: ${{ steps.pr.outputs.issue }} + message: | + ✨ A mirror branch has been created/updated for this PR: [`${{ steps.create_branch.outputs.target_branch }}`](https://github.com/${{ github.repository }}/tree/${{ steps.create_branch.outputs.target_branch }}) From 3e5adf3918ed724c84d547c00a10f9fb6166d410 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Wed, 12 Feb 2025 23:33:46 -0500 Subject: [PATCH 077/316] #0: Delete data_transfer op and use Tensor methods instead (#17839) ### Ticket N/A ### Problem description "data_transfer" op can be deleted. ### What's changed Delete "data_transfer" op and use `Tensor::to_device` / `Tensor::cpu` directly. Adjust `Tensor::to` -> `Tensor::to_layout` / `Tensor::to_device` graph tracker labels. 
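For reference, the replacement pattern at the updated call sites is simply the pair of `Tensor` methods that the removed op used to wrap. A minimal sketch (the function name and namespace handling are illustrative, not part of this diff):

```cpp
// Sketch only: shows the direct Tensor::to_device / Tensor::cpu calls that replace
// ttnn::data_transfer_to_device / ttnn::data_transfer_to_host.
#include "ttnn/tensor/tensor.hpp"

Tensor round_trip(const Tensor& input, IDevice* device, const MemoryConfig& mem_config) {
    // Host -> device (no-op if the tensor already lives on the target device)
    Tensor on_device =
        (input.storage_type() == StorageType::DEVICE) ? input : input.to_device(device, mem_config);
    // Device -> host
    return on_device.cpu();
}
```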
### Checklist - [X] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13278861423) CI passes --- ttnn/CMakeLists.txt | 1 - ttnn/cpp/ttnn/operations/core/core.cpp | 1 - .../data_transfer/data_transfer.cpp | 33 ------------------- .../data_transfer/data_transfer.hpp | 29 ---------------- .../data_movement/reshape_view/reshape.cpp | 1 - .../experimental/auto_format/auto_format.cpp | 9 +++-- ttnn/cpp/ttnn/tensor/tensor_ops.cpp | 10 +++--- 7 files changed, 9 insertions(+), 75 deletions(-) delete mode 100644 ttnn/cpp/ttnn/operations/data_movement/data_transfer/data_transfer.cpp delete mode 100644 ttnn/cpp/ttnn/operations/data_movement/data_transfer/data_transfer.hpp diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 9d750c67593..e8a6f887a09 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -46,7 +46,6 @@ set(TTNN_OP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/copy/copy_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/copy/typecast/typecast.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/copy/typecast/typecast_pybind.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/data_transfer/data_transfer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/fill_pad/fill_pad_pybind.cpp diff --git a/ttnn/cpp/ttnn/operations/core/core.cpp b/ttnn/cpp/ttnn/operations/core/core.cpp index a9ad99356c8..bf18d293652 100644 --- a/ttnn/cpp/ttnn/operations/core/core.cpp +++ b/ttnn/cpp/ttnn/operations/core/core.cpp @@ -11,7 +11,6 @@ #include "cpp/ttnn/operations/data_movement/move/move.hpp" #include "cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp" #include "cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp" -#include "ttnn/operations/data_movement/data_transfer/data_transfer.hpp" #include "ttnn/distributed/types.hpp" #include "ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.hpp" #include "ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp" diff --git a/ttnn/cpp/ttnn/operations/data_movement/data_transfer/data_transfer.cpp b/ttnn/cpp/ttnn/operations/data_movement/data_transfer/data_transfer.cpp deleted file mode 100644 index cca84c20ed8..00000000000 --- a/ttnn/cpp/ttnn/operations/data_movement/data_transfer/data_transfer.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "ttnn/operations/data_movement/data_transfer/data_transfer.hpp" -#include "ttnn/tensor/tensor.hpp" - -namespace ttnn::operations::data_movement { - -Tensor DataTransferToHostOperation::invoke(const Tensor& input_tensor) { - if (input_tensor.storage_type() != StorageType::DEVICE) { - return input_tensor; - } - - return input_tensor.cpu(); -} - -Tensor DataTransferToDeviceOperation::invoke( - const Tensor& input_tensor, IDevice* device, const MemoryConfig& memory_config) { - TT_FATAL(device != nullptr, "Error"); - - if (input_tensor.get_layout() == Layout::ROW_MAJOR) { - TT_FATAL(input_tensor.get_padded_shape()[-1] * input_tensor.element_size() % sizeof(uint32_t) == 0, "Error"); - } - - if (input_tensor.storage_type() == StorageType::DEVICE && input_tensor.device() == device) { - return {input_tensor}; - } - - return input_tensor.to_device(device, memory_config); -} - -} // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/data_transfer/data_transfer.hpp b/ttnn/cpp/ttnn/operations/data_movement/data_transfer/data_transfer.hpp deleted file mode 100644 index 2d8ec4701ea..00000000000 --- a/ttnn/cpp/ttnn/operations/data_movement/data_transfer/data_transfer.hpp +++ /dev/null @@ -1,29 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "ttnn/decorators.hpp" - -namespace ttnn { -namespace operations::data_movement { - -struct DataTransferToHostOperation { - static Tensor invoke(const Tensor& input_tensor); -}; - -struct DataTransferToDeviceOperation { - static Tensor invoke(const Tensor& input_tensor, IDevice* device, const MemoryConfig& memory_config); -}; - -} // namespace operations::data_movement - -constexpr auto data_transfer_to_host = ttnn::register_operation_with_auto_launch_op< - "ttnn::data_transfer_to_host", - ttnn::operations::data_movement::DataTransferToHostOperation>(); -constexpr auto data_transfer_to_device = ttnn::register_operation_with_auto_launch_op< - "ttnn::data_transfer_to_device", - ttnn::operations::data_movement::DataTransferToDeviceOperation>(); - -} // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index 2f3b2f33d2c..6bb2d3f1398 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -12,7 +12,6 @@ #include "ttnn/operations/experimental/auto_format/auto_format.hpp" #include "ttnn/tensor/tensor_utils.hpp" #include "cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp" -#include "ttnn/operations/data_movement/data_transfer/data_transfer.hpp" #include "ttnn/operations/data_movement/slice/slice.hpp" #include "ttnn/operations/core/core.hpp" #include "device/reshape_rm_op.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp index 9a3a24b2d80..0301fb8eef7 100644 --- a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp @@ -8,7 +8,6 @@ #include #include "ttnn/operations/data_movement/clone/clone.hpp" -#include "ttnn/operations/data_movement/data_transfer/data_transfer.hpp" #include "ttnn/operations/data_movement/pad/pad.hpp" #include "ttnn/operations/data_movement/slice/slice.hpp" #include 
"ttnn/operations/data_movement/tilize/tilize.hpp" @@ -23,7 +22,7 @@ namespace ttnn::operations::experimental::auto_format { Tensor AutoFormat::move_tensor_to_device(const Tensor& input, IDevice* device, const MemoryConfig& mem_config) { if (input.storage_type() != StorageType::DEVICE) { - return ttnn::data_transfer_to_device(input, device, mem_config); + return input.to_device(device, mem_config); } else { return input; } @@ -31,7 +30,7 @@ Tensor AutoFormat::move_tensor_to_device(const Tensor& input, IDevice* device, c Tensor AutoFormat::move_tensor_to_mem_config(const Tensor& input, const MemoryConfig& mem_config) { if (input.storage_type() != StorageType::DEVICE) { - return ttnn::data_transfer_to_device(input, AutoFormat::GetDefaultDevice(), mem_config); + return input.to_device(AutoFormat::GetDefaultDevice(), mem_config); } else if (input.memory_config() != mem_config) { return ttnn::clone(input, std::nullopt, mem_config, std::nullopt); } else { @@ -123,7 +122,7 @@ Tensor AutoFormat::format_input_tensor( } } // Fall back to host conversions - formatted_input = ttnn::data_transfer_to_host(formatted_input); + formatted_input = formatted_input.cpu(); } // Host side conversions @@ -218,7 +217,7 @@ Tensor AutoFormat::format_output_tensor( } } // Fall back to host conversions - formatted_output = ttnn::data_transfer_to_host(formatted_output); + formatted_output = formatted_output.cpu(); } // Host side conversions diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp index 5f250738ed4..913d67c136e 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp @@ -30,7 +30,7 @@ namespace tt::tt_metal::tensor_ops { Tensor tensor_to_device( const Tensor& input_tensor, IDevice* target_device, const MemoryConfig& mem_config, QueueId cq_id) { ZoneScoped; - GraphTracker::instance().track_function_start("Tensor::to", input_tensor, target_device, mem_config); + GraphTracker::instance().track_function_start("Tensor::to_device", input_tensor, target_device, mem_config); // Tensor can be using borrowed storage. If so, when running in async mode, copy this tensor to owned storage. Tensor async_safe_tensor = copy_borrowed_tensor_in_async_mode(target_device, input_tensor); // Populate device storage outside of thread, so that downstream @@ -67,7 +67,7 @@ Tensor tensor_to_device( Tensor tensor_to_device( const Tensor& input_tensor, const std::vector& workers, const MemoryConfig& mem_config, QueueId cq_id) { ZoneScoped; - GraphTracker::instance().track_function_start("Tensor::to", input_tensor, workers, mem_config); + GraphTracker::instance().track_function_start("Tensor::to_device", input_tensor, workers, mem_config); TT_FATAL( validate_worker_modes(workers), "All device threads/workers must be running in the same mode (ASYNC or SYNC)"); Tensor device_tensor = Tensor(workers); @@ -144,7 +144,7 @@ Tensor tensor_cpu(const Tensor& input_tensor, bool blocking, QueueId cq_id) { Tensor tensor_to_layout(const Tensor& input_tensor, Layout target_layout, IDevice* worker) { ZoneScoped; - GraphTracker::instance().track_function_start("Tensor::to", input_tensor, target_layout, worker); + GraphTracker::instance().track_function_start("Tensor::to_layout", input_tensor, target_layout, worker); // Only push layout conversion to worker if running in async mode if (worker and worker->get_worker_mode() == WorkExecutorMode::ASYNCHRONOUS) { // Tensor can be using borrowed storage. If so, when running in async mode, copy this tensor to owned storage. 
@@ -154,7 +154,7 @@ Tensor tensor_to_layout(const Tensor& input_tensor, Layout target_layout, IDevic TT_ASSERT( async_safe_tensor.storage_type() == StorageType::OWNED or async_safe_tensor.storage_type() == StorageType::BORROWED && - "to(layout) must be called on host tensors with a single buffer when a single worker is specified"); + "to_layout must be called on host tensors with a single buffer when a single worker is specified"); auto local_tensor = tensor_impl::to_layout_wrapper(async_safe_tensor, target_layout); // Populate modified layout tensor tensor_modified_layout.populate_buffers_and_metadata(local_tensor); @@ -176,7 +176,7 @@ Tensor tensor_to_layout(const Tensor& input_tensor, Layout target_layout, IDevic Tensor tensor_to_layout(const Tensor& input_tensor, Layout target_layout, distributed::MeshDevice* mesh_device) { ZoneScoped; - GraphTracker::instance().track_function_start("Tensor::to", input_tensor, target_layout, mesh_device); + GraphTracker::instance().track_function_start("Tensor::to_layout", input_tensor, target_layout, mesh_device); if (mesh_device) { auto workers = ttnn::distributed::get_mapped_devices(input_tensor, *mesh_device); TT_FATAL( From fc4ae37e92034c7b19da4bc25da424ffd51b1c94 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 22:49:22 -0600 Subject: [PATCH 078/316] [skip ci] Create pr-comment-trigger.yaml (#17858) ### Problem description This PR introduces a new GitHub Actions workflow (PR Comment Trigger Workflow) that automates specific tasks based on comments made on pull requests (PRs). The workflow is designed to trigger actions like mirroring a branch or running tests when specific keywords (:mirror: or :test:) are detected in PR comments. --- .github/workflows/pr-comment-trigger.yaml | 63 +++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 .github/workflows/pr-comment-trigger.yaml diff --git a/.github/workflows/pr-comment-trigger.yaml b/.github/workflows/pr-comment-trigger.yaml new file mode 100644 index 00000000000..f98c02fd97e --- /dev/null +++ b/.github/workflows/pr-comment-trigger.yaml @@ -0,0 +1,63 @@ +name: PR Comment Trigger Workflow + +on: + pull_request: + types: [opened, edited, reopened] + issue_comment: + types: [created] + +jobs: + detect_trigger: + runs-on: ubuntu-latest + outputs: + mirror_triggered: ${{ steps.mirror_check.outputs.triggered }} + test_triggered: ${{ steps.test_check.outputs.triggered }} + steps: + - name: Check for trigger (mirror) + uses: khan/pull-request-comment-trigger@v1.1.0 + id: mirror_check + with: + trigger: ':mirror:' + reaction: eyes + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Check for trigger (test) + uses: khan/pull-request-comment-trigger@v1.1.0 + id: test_check + with: + trigger: ':test:' + reaction: rocket + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + mirror: + needs: detect_trigger + if: needs.detect_trigger.outputs.mirror_triggered == 'true' + uses: ./.github/workflows/mirror-branch-workflow.yml + with: + # Build the source input as ":" using the PR head info. + source: "${{ github.event.pull_request.head.repo.owner.login }}:${{ github.event.pull_request.head.ref }}" + + test: + needs: detect_trigger + if: needs.detect_trigger.outputs.test_triggered == 'true' + uses: ./.github/workflows/all-post-commit-tests.yml + with: + build-type: Release + # For PRs from a fork, run tests on the mirror branch in our repo; + # otherwise, run on the PR’s head branch. 
+ branch: ${{ github.event.pull_request.head.repo.full_name != github.repository && 'mirror/' + github.event.pull_request.head.repo.owner.login + '/' + github.event.pull_request.head.ref || github.event.pull_request.head.ref }} + + post_comment: + needs: [mirror, test] + if: (needs.detect_trigger.outputs.test_triggered == 'true') + runs-on: ubuntu-latest + steps: + - name: Post workflow run link comment on PR + uses: mshick/add-pr-comment@v2 + with: + # If this is a PR event, use its number. + issue: ${{ github.event.pull_request.number }} + message: | + ✨ Tests workflow run is available [here](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) From 224c5c6142651f398fb2274b42a2c98f8fbd54bb Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 20:55:22 -0800 Subject: [PATCH 079/316] Update pr-comment-trigger.yaml --- .github/workflows/pr-comment-trigger.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-comment-trigger.yaml b/.github/workflows/pr-comment-trigger.yaml index f98c02fd97e..c8bb8ec0f6c 100644 --- a/.github/workflows/pr-comment-trigger.yaml +++ b/.github/workflows/pr-comment-trigger.yaml @@ -34,7 +34,7 @@ jobs: mirror: needs: detect_trigger if: needs.detect_trigger.outputs.mirror_triggered == 'true' - uses: ./.github/workflows/mirror-branch-workflow.yml + uses: ./.github/workflows/mirror-fork-branch.yaml with: # Build the source input as ":" using the PR head info. source: "${{ github.event.pull_request.head.repo.owner.login }}:${{ github.event.pull_request.head.ref }}" @@ -42,7 +42,7 @@ jobs: test: needs: detect_trigger if: needs.detect_trigger.outputs.test_triggered == 'true' - uses: ./.github/workflows/all-post-commit-tests.yml + uses: ./.github/workflows/all-post-commit-workflows.yaml with: build-type: Release # For PRs from a fork, run tests on the mirror branch in our repo; From adf5f602d8aef31e821890d403c003b927a12c78 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 23:07:44 -0600 Subject: [PATCH 080/316] Comment trigger fixes (#17859) ### Ticket None ### Problem description It is not possible to trigger mirror workflow from a different workflow. ### What's changed Add workflow_call support. I researched if its possible to share inputs, but its not possible. Sad. --- .github/workflows/mirror-fork-branch.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/mirror-fork-branch.yaml b/.github/workflows/mirror-fork-branch.yaml index f89b1c7c14a..d4a2e0b72b7 100644 --- a/.github/workflows/mirror-fork-branch.yaml +++ b/.github/workflows/mirror-fork-branch.yaml @@ -1,6 +1,18 @@ name: Mirror Fork Branch to Origin on: + workflow_call: + inputs: + source: + description: 'Source in format :' + required: true + type: string + target_branch: + description: > + Optional. Target branch name in origin. If not provided, the branch will be named + `mirror//`. 
+ required: false + type: string workflow_dispatch: inputs: source: From f87aa8049ef84e3fffbb0521c7769a99088c0108 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:14:26 -0800 Subject: [PATCH 081/316] Update pr-comment-trigger.yaml to properly extract branch name --- .github/workflows/pr-comment-trigger.yaml | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr-comment-trigger.yaml b/.github/workflows/pr-comment-trigger.yaml index c8bb8ec0f6c..0b6cf55af10 100644 --- a/.github/workflows/pr-comment-trigger.yaml +++ b/.github/workflows/pr-comment-trigger.yaml @@ -42,15 +42,28 @@ jobs: test: needs: detect_trigger if: needs.detect_trigger.outputs.test_triggered == 'true' - uses: ./.github/workflows/all-post-commit-workflows.yaml + runs-on: ubuntu-latest + outputs: + branch_name: ${{ steps.set_branch.outputs.branch_name }} + steps: + - name: Set branch name + id: set_branch + run: | + if [ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]; then + echo "branch_name=mirror/${{ github.event.pull_request.head.repo.owner.login }}/${{ github.event.pull_request.head.ref }}" >> $GITHUB_OUTPUT + else + echo "branch_name=${{ github.event.pull_request.head.ref }}" >> $GITHUB_OUTPUT + fi + + run_tests: + needs: test + uses: ./.github/workflows/all-post-commit-tests.yml with: build-type: Release - # For PRs from a fork, run tests on the mirror branch in our repo; - # otherwise, run on the PR’s head branch. - branch: ${{ github.event.pull_request.head.repo.full_name != github.repository && 'mirror/' + github.event.pull_request.head.repo.owner.login + '/' + github.event.pull_request.head.ref || github.event.pull_request.head.ref }} + branch: ${{ needs.test.outputs.branch_name }} post_comment: - needs: [mirror, test] + needs: [mirror, run_tests] if: (needs.detect_trigger.outputs.test_triggered == 'true') runs-on: ubuntu-latest steps: From babcd0decc42847c77897a70203dd60fd9264674 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:16:28 -0800 Subject: [PATCH 082/316] Fix apc workflow name in pr-comment-trigger workflow --- .github/workflows/pr-comment-trigger.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-comment-trigger.yaml b/.github/workflows/pr-comment-trigger.yaml index 0b6cf55af10..4084ce51a5c 100644 --- a/.github/workflows/pr-comment-trigger.yaml +++ b/.github/workflows/pr-comment-trigger.yaml @@ -57,7 +57,7 @@ jobs: run_tests: needs: test - uses: ./.github/workflows/all-post-commit-tests.yml + uses: ./.github/workflows/all-post-commit-workflows.yaml with: build-type: Release branch: ${{ needs.test.outputs.branch_name }} From df8ccb0355cc39b83a1cb6a8f22dca7286a25947 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:26:40 -0800 Subject: [PATCH 083/316] Simplify pr-comment-trigger.yaml. Don't handle test from a fork for now. 
--- .github/workflows/pr-comment-trigger.yaml | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/.github/workflows/pr-comment-trigger.yaml b/.github/workflows/pr-comment-trigger.yaml index 4084ce51a5c..e6133c2ffb7 100644 --- a/.github/workflows/pr-comment-trigger.yaml +++ b/.github/workflows/pr-comment-trigger.yaml @@ -40,31 +40,14 @@ jobs: source: "${{ github.event.pull_request.head.repo.owner.login }}:${{ github.event.pull_request.head.ref }}" test: - needs: detect_trigger if: needs.detect_trigger.outputs.test_triggered == 'true' - runs-on: ubuntu-latest - outputs: - branch_name: ${{ steps.set_branch.outputs.branch_name }} - steps: - - name: Set branch name - id: set_branch - run: | - if [ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]; then - echo "branch_name=mirror/${{ github.event.pull_request.head.repo.owner.login }}/${{ github.event.pull_request.head.ref }}" >> $GITHUB_OUTPUT - else - echo "branch_name=${{ github.event.pull_request.head.ref }}" >> $GITHUB_OUTPUT - fi - - run_tests: - needs: test uses: ./.github/workflows/all-post-commit-workflows.yaml with: build-type: Release - branch: ${{ needs.test.outputs.branch_name }} post_comment: - needs: [mirror, run_tests] - if: (needs.detect_trigger.outputs.test_triggered == 'true') + needs: [mirror, test] + if: needs.detect_trigger.outputs.mirror_triggered == 'true' || needs.detect_trigger.outputs.test_triggered == 'true' runs-on: ubuntu-latest steps: - name: Post workflow run link comment on PR @@ -73,4 +56,4 @@ jobs: # If this is a PR event, use its number. issue: ${{ github.event.pull_request.number }} message: | - ✨ Tests workflow run is available [here](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) + ✨ Workflow run is available [here](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) From 14d48c6e446b05119dcdf6f57831a33b26f90a12 Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Thu, 13 Feb 2025 05:33:35 +0000 Subject: [PATCH 084/316] Remove unused kernel parameter from moreh_nll_loss_step1 (#17849) ### Ticket ### Problem description During an unrelated investigation we found out that N/origin_N were unintentionally changed during porting moreh ops to the new shape. But later I realized that N/origin_N is actually unused, so removing it. 
### What's changed Removed unused kernel parameter from moreh_nll_loss_step1 ### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13295261261) - [x] New/Existing tests provide coverage for changes --------- Co-authored-by: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> --- .../device/kernels/reader_moreh_nll_loss_step1.cpp | 1 - .../device/kernels/reader_moreh_nll_loss_step1_large.cpp | 1 - .../device/moreh_nll_loss_step1_program_factory.cpp | 7 ------- 3 files changed, 9 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1.cpp index 6e89dca5be9..85b1e7e847f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1.cpp @@ -11,7 +11,6 @@ void kernel_main() { auto ignore_index = static_cast(get_arg_val(i++)); auto num_units_per_core = get_arg_val(i++); auto start_id = get_arg_val(i++); - auto N = get_arg_val(i++); auto C = get_arg_val(i++); auto weight_num_tile = get_arg_val(i++); auto element_size = get_arg_val(i++); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1_large.cpp index 15748fd4527..7e74cc4f98c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1_large.cpp @@ -11,7 +11,6 @@ void kernel_main() { auto ignore_index = static_cast(get_arg_val(i++)); auto num_units_per_core = get_arg_val(i++); auto start_id = get_arg_val(i++); - auto N = get_arg_val(i++); auto C = get_arg_val(i++); auto weight_num_tile = get_arg_val(i++); auto element_size = get_arg_val(i++); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp index c16e372f182..6d970f46bc9 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp @@ -27,13 +27,7 @@ MorehNllLossStep1DeviceOperation::Factory::cached_program_t MorehNllLossStep1Dev const auto& compute_kernel_config = operation_attributes.compute_kernel_config; auto target_shape = target.get_padded_shape(); - auto N = target_shape[-3]; - - const auto target_shape_without_padding = target.get_logical_shape(); - const auto origin_N = target_shape_without_padding[-3]; - const bool weight_has_value = weight.has_value(); - auto H = target_shape[-2]; auto W = target_shape[-1]; auto Ht = H / tt::constants::TILE_HEIGHT; @@ -154,7 +148,6 @@ MorehNllLossStep1DeviceOperation::Factory::cached_program_t MorehNllLossStep1Dev static_cast(ignore_index), num_units_per_core, tile_offset, - origin_N, channel_size, weight_num_tile, element_size, From 
c0075465b508d68c20b5aa2b99db52f8a31ac1bc Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:42:34 -0800 Subject: [PATCH 085/316] [skip ci] Let pr-comment-trigger.yaml fetch pr info from a comment --- .github/workflows/pr-comment-trigger.yaml | 25 ++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr-comment-trigger.yaml b/.github/workflows/pr-comment-trigger.yaml index e6133c2ffb7..04efde80856 100644 --- a/.github/workflows/pr-comment-trigger.yaml +++ b/.github/workflows/pr-comment-trigger.yaml @@ -31,13 +31,29 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + mirror_fetch_pr_info: + if: needs.detect_trigger.outputs.mirror_triggered == 'true' + runs-on: ubuntu-latest + outputs: + source: ${{ steps.set_source.outputs.source }} + steps: + - name: Find pull request + id: find_pr + uses: peter-evans/find-pull-request@v2 + with: + token: ${{ secrets.GITHUB_TOKEN }} + issue-number: ${{ github.event.issue.number }} + - name: Set source + id: set_source + run: | + echo "source=${{ steps.find_pr.outputs.head_repo_owner }}:${{ steps.find_pr.outputs.head_ref }}" >> $GITHUB_OUTPUT + mirror: - needs: detect_trigger + needs: mirror_fetch_pr_info if: needs.detect_trigger.outputs.mirror_triggered == 'true' uses: ./.github/workflows/mirror-fork-branch.yaml with: - # Build the source input as ":" using the PR head info. - source: "${{ github.event.pull_request.head.repo.owner.login }}:${{ github.event.pull_request.head.ref }}" + source: "${{ needs.mirror_fetch_pr_info.outputs.source }}" test: if: needs.detect_trigger.outputs.test_triggered == 'true' @@ -53,7 +69,6 @@ jobs: - name: Post workflow run link comment on PR uses: mshick/add-pr-comment@v2 with: - # If this is a PR event, use its number. 
- issue: ${{ github.event.pull_request.number }} + issue: ${{ github.event.issue.number || github.event.pull_request.number }} message: | ✨ Workflow run is available [here](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) From 41a1a8b0ff83b275ce7a2ba21d6fd295c76e109f Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:50:35 -0800 Subject: [PATCH 086/316] [skip ci] Debug pr-comment-trigger.yaml --- .github/workflows/pr-comment-trigger.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/pr-comment-trigger.yaml b/.github/workflows/pr-comment-trigger.yaml index 04efde80856..4103f41f870 100644 --- a/.github/workflows/pr-comment-trigger.yaml +++ b/.github/workflows/pr-comment-trigger.yaml @@ -31,6 +31,11 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Debug trigger output + run: | + echo "Mirror Triggered: ${{ steps.mirror_check.outputs.triggered }}" + echo "Test Triggered: ${{ steps.test_check.outputs.triggered }}" + mirror_fetch_pr_info: if: needs.detect_trigger.outputs.mirror_triggered == 'true' runs-on: ubuntu-latest From 7fe2764ae519084817342949a1307cf163d030f4 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 22:01:44 -0800 Subject: [PATCH 087/316] [skip ci] Test alternative dependency setting in pr-comment-trigger.yaml --- .github/workflows/pr-comment-trigger.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr-comment-trigger.yaml b/.github/workflows/pr-comment-trigger.yaml index 4103f41f870..0ca79c9367d 100644 --- a/.github/workflows/pr-comment-trigger.yaml +++ b/.github/workflows/pr-comment-trigger.yaml @@ -37,6 +37,7 @@ jobs: echo "Test Triggered: ${{ steps.test_check.outputs.triggered }}" mirror_fetch_pr_info: + needs: detect_trigger if: needs.detect_trigger.outputs.mirror_triggered == 'true' runs-on: ubuntu-latest outputs: @@ -54,20 +55,20 @@ jobs: echo "source=${{ steps.find_pr.outputs.head_repo_owner }}:${{ steps.find_pr.outputs.head_ref }}" >> $GITHUB_OUTPUT mirror: - needs: mirror_fetch_pr_info - if: needs.detect_trigger.outputs.mirror_triggered == 'true' + needs: [detect_trigger, mirror_fetch_pr_info] uses: ./.github/workflows/mirror-fork-branch.yaml with: source: "${{ needs.mirror_fetch_pr_info.outputs.source }}" test: + needs: detect_trigger if: needs.detect_trigger.outputs.test_triggered == 'true' uses: ./.github/workflows/all-post-commit-workflows.yaml with: build-type: Release post_comment: - needs: [mirror, test] + needs: detect_trigger if: needs.detect_trigger.outputs.mirror_triggered == 'true' || needs.detect_trigger.outputs.test_triggered == 'true' runs-on: ubuntu-latest steps: From 6c740c78b1dcd8ec6d9beb6b7aa1010f15a914d5 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Wed, 12 Feb 2025 22:06:37 -0800 Subject: [PATCH 088/316] [skip ci] Delete .github/workflows/pr-comment-trigger.yaml --- .github/workflows/pr-comment-trigger.yaml | 80 ----------------------- 1 file changed, 80 deletions(-) delete mode 100644 .github/workflows/pr-comment-trigger.yaml diff --git a/.github/workflows/pr-comment-trigger.yaml b/.github/workflows/pr-comment-trigger.yaml deleted file mode 100644 index 0ca79c9367d..00000000000 --- a/.github/workflows/pr-comment-trigger.yaml +++ /dev/null @@ -1,80 +0,0 @@ -name: PR Comment Trigger Workflow - -on: - pull_request: - types: 
[opened, edited, reopened] - issue_comment: - types: [created] - -jobs: - detect_trigger: - runs-on: ubuntu-latest - outputs: - mirror_triggered: ${{ steps.mirror_check.outputs.triggered }} - test_triggered: ${{ steps.test_check.outputs.triggered }} - steps: - - name: Check for trigger (mirror) - uses: khan/pull-request-comment-trigger@v1.1.0 - id: mirror_check - with: - trigger: ':mirror:' - reaction: eyes - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Check for trigger (test) - uses: khan/pull-request-comment-trigger@v1.1.0 - id: test_check - with: - trigger: ':test:' - reaction: rocket - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Debug trigger output - run: | - echo "Mirror Triggered: ${{ steps.mirror_check.outputs.triggered }}" - echo "Test Triggered: ${{ steps.test_check.outputs.triggered }}" - - mirror_fetch_pr_info: - needs: detect_trigger - if: needs.detect_trigger.outputs.mirror_triggered == 'true' - runs-on: ubuntu-latest - outputs: - source: ${{ steps.set_source.outputs.source }} - steps: - - name: Find pull request - id: find_pr - uses: peter-evans/find-pull-request@v2 - with: - token: ${{ secrets.GITHUB_TOKEN }} - issue-number: ${{ github.event.issue.number }} - - name: Set source - id: set_source - run: | - echo "source=${{ steps.find_pr.outputs.head_repo_owner }}:${{ steps.find_pr.outputs.head_ref }}" >> $GITHUB_OUTPUT - - mirror: - needs: [detect_trigger, mirror_fetch_pr_info] - uses: ./.github/workflows/mirror-fork-branch.yaml - with: - source: "${{ needs.mirror_fetch_pr_info.outputs.source }}" - - test: - needs: detect_trigger - if: needs.detect_trigger.outputs.test_triggered == 'true' - uses: ./.github/workflows/all-post-commit-workflows.yaml - with: - build-type: Release - - post_comment: - needs: detect_trigger - if: needs.detect_trigger.outputs.mirror_triggered == 'true' || needs.detect_trigger.outputs.test_triggered == 'true' - runs-on: ubuntu-latest - steps: - - name: Post workflow run link comment on PR - uses: mshick/add-pr-comment@v2 - with: - issue: ${{ github.event.issue.number || github.event.pull_request.number }} - message: | - ✨ Workflow run is available [here](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) From b924d02c51e6f7a1b5f85640db9559a263815069 Mon Sep 17 00:00:00 2001 From: David Ma Date: Thu, 6 Feb 2025 22:53:35 +0000 Subject: [PATCH 089/316] #17167: Remove build APIs from Device Do this by adding a build env/key/state manager outside of Device. Build dependencies are on device ID only. 
--- .../eth/test_erisc_app_direct_send.cpp | 13 +- tests/tt_metal/tt_metal/test_compile_args.cpp | 5 +- .../tt_metal/test_compile_program.cpp | 37 ++- .../test_compile_sets_kernel_binaries.cpp | 46 ++-- tt_metal/api/tt-metalium/device.hpp | 18 +- tt_metal/api/tt-metalium/device_impl.hpp | 30 +- tt_metal/api/tt-metalium/mesh_device.hpp | 15 +- tt_metal/distributed/mesh_device.cpp | 32 --- tt_metal/impl/device/device.cpp | 214 ++------------- tt_metal/impl/device/device_pool.cpp | 13 +- tt_metal/impl/kernels/kernel.cpp | 47 ++-- tt_metal/impl/program/dispatch.cpp | 4 +- tt_metal/impl/program/program.cpp | 19 +- tt_metal/jit_build/CMakeLists.txt | 1 + tt_metal/jit_build/build_env_manager.cpp | 257 ++++++++++++++++++ tt_metal/jit_build/build_env_manager.hpp | 55 ++++ 16 files changed, 471 insertions(+), 335 deletions(-) create mode 100644 tt_metal/jit_build/build_env_manager.cpp create mode 100644 tt_metal/jit_build/build_env_manager.hpp diff --git a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp index dce9a0a2ddb..8f62ce75ce9 100644 --- a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp +++ b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp @@ -16,6 +16,7 @@ #include #include #include "tt_metal/test_utils/stimulus.hpp" +#include "tt_metal/jit_build/build_env_manager.hpp" // TODO: ARCH_NAME specific, must remove #include "eth_l1_address_map.h" @@ -227,10 +228,14 @@ bool send_over_eth( // TODO: this should be updated to use kernel api uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - ll_api::memory const& binary_mem_send = - llrt::get_risc_binary(sender_device->build_firmware_target_path(active_eth_index, 0, 0)); - ll_api::memory const& binary_mem_receive = - llrt::get_risc_binary(receiver_device->build_firmware_target_path(active_eth_index, 0, 0)); + auto sender_firmware_path = BuildEnvManager::get_instance() + .get_firmware_build_state(sender_device->id(), active_eth_index, 0, 0) + .get_target_out_path(""); + auto receiver_firmware_path = BuildEnvManager::get_instance() + .get_firmware_build_state(receiver_device->id(), active_eth_index, 0, 0) + .get_target_out_path(""); + const ll_api::memory& binary_mem_send = llrt::get_risc_binary(sender_firmware_path); + const ll_api::memory& binary_mem_receive = llrt::get_risc_binary(receiver_firmware_path); for (const auto& eth_core : eth_cores) { llrt::write_hex_vec_to_core( diff --git a/tests/tt_metal/tt_metal/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_compile_args.cpp index ce1424b520d..60421324c1e 100644 --- a/tests/tt_metal/tt_metal/test_compile_args.cpp +++ b/tests/tt_metal/tt_metal/test_compile_args.cpp @@ -12,6 +12,7 @@ #include #include "dprint_server.hpp" #include +#include "tt_metal/jit_build/build_env_manager.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does @@ -67,7 +68,9 @@ int main(int argc, char** argv) { tt_metal::IDevice* device = tt_metal::CreateDevice(device_id); // Remove old compiled kernels static const std::string kernel_name = "test_compile_args"; - auto binary_path_str = device->build_env().get_out_kernel_root_path() + kernel_name; + auto binary_path_str = + kernel->binaries(BuildEnvManager::get_instance().get_build_env(device->id())).get_out_kernel_root_path() + + kernel_name; std::filesystem::remove_all(binary_path_str); pass &= test_compile_args({0, 68, 0, 124}, device); diff --git 
a/tests/tt_metal/tt_metal/test_compile_program.cpp b/tests/tt_metal/tt_metal/test_compile_program.cpp index c2426ae2fbc..4c01ee62762 100644 --- a/tests/tt_metal/tt_metal/test_compile_program.cpp +++ b/tests/tt_metal/tt_metal/test_compile_program.cpp @@ -16,6 +16,7 @@ #include #include +#include "tt_metal/jit_build/build_env_manager.hpp" using std::vector; using namespace tt; @@ -59,13 +60,13 @@ std::unordered_map get_last_program_binary_path(const // TODO: Replace this when we have debug/test hooks (GH: #964) to inspect inside CompileProgram KernelCacheStatus CompileProgramTestWrapper(IDevice* device, Program& program, bool profile_kernel = false) { // Check - std::unordered_map pre_compile_kernel_to_hash_str = - get_last_program_binary_path(program, device->build_env().get_out_kernel_root_path()); + std::unordered_map pre_compile_kernel_to_hash_str = get_last_program_binary_path( + program, BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); detail::CompileProgram(device, program); - std::unordered_map post_compile_kernel_to_hash_str = - get_last_program_binary_path(program, device->build_env().get_out_kernel_root_path()); + std::unordered_map post_compile_kernel_to_hash_str = get_last_program_binary_path( + program, BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); KernelCacheStatus kernel_cache_status; for (const auto& [kernel_name, hash_str] : post_compile_kernel_to_hash_str) { @@ -186,7 +187,7 @@ void assert_kernel_hash_matches( bool test_compile_program_in_loop(IDevice* device) { bool pass = true; - ClearKernelCache(device->build_env().get_out_kernel_root_path()); + ClearKernelCache(BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); ProgramAttributes default_attributes; auto program = create_program(device, default_attributes); @@ -195,7 +196,10 @@ bool test_compile_program_in_loop(IDevice* device) { for (int compile_idx = 0; compile_idx < num_compiles; compile_idx++) { auto kernel_cache_status = CompileProgramTestWrapper(device, program); if (compile_idx == 0) { - assert_kernel_binary_path_exists(program, device->build_env().get_out_kernel_root_path(), kernel_cache_status); + assert_kernel_binary_path_exists( + program, + BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + kernel_cache_status); assert_program_cache_hit_status(program, /*hit_expected=*/false, kernel_cache_status); kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; } else { @@ -210,18 +214,21 @@ bool test_compile_program_in_loop(IDevice* device) { bool test_compile_program_after_clean_kernel_binary_directory(IDevice* device) { bool pass = true; - ClearKernelCache(device->build_env().get_out_kernel_root_path()); + ClearKernelCache(BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); ProgramAttributes default_attributes; auto program = create_program(device, default_attributes); auto kernel_cache_status = CompileProgramTestWrapper(device, program); - assert_kernel_binary_path_exists(program, device->build_env().get_out_kernel_root_path(), kernel_cache_status); + assert_kernel_binary_path_exists( + program, + BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + kernel_cache_status); assert_program_cache_hit_status(program, /*hit_expected=*/false, kernel_cache_status); std::unordered_map kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; - 
ClearKernelCache(device->build_env().get_out_kernel_root_path()); + ClearKernelCache(BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); auto second_program = create_program(device, default_attributes); auto second_kernel_cache_status = CompileProgramTestWrapper(device, second_program); assert_program_cache_hit_status(second_program, /*hit_expected=*/false, second_kernel_cache_status); @@ -273,7 +280,10 @@ std::unordered_map compile_program_with_modified_kerne const std::unordered_map& kernel_type_to_cache_hit_status) { auto program = create_program(device, attributes); auto kernel_cache_status = CompileProgramTestWrapper(device, program); - assert_kernel_binary_path_exists(program, device->build_env().get_out_kernel_root_path(), kernel_cache_status); + assert_kernel_binary_path_exists( + program, + BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + kernel_cache_status); assert_cache_hit_status_for_kernel_type(program, kernel_type_to_cache_hit_status, kernel_cache_status); assert_hash_comparison_for_kernel_type( program, prev_kernel_name_to_hash, kernel_type_to_cache_hit_status, kernel_cache_status); @@ -296,12 +306,15 @@ bool test_compile_program_with_modified_program(IDevice* device) { const static std::unordered_map compute_miss_data_movement_miss = { {tt::RISCV::COMPUTE, false}, {tt::RISCV::BRISC, false}, {tt::RISCV::NCRISC, false}}; - ClearKernelCache(device->build_env().get_out_kernel_root_path()); + ClearKernelCache(BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); ProgramAttributes attributes; auto program = create_program(device, attributes); auto kernel_cache_status = CompileProgramTestWrapper(device, program); - assert_kernel_binary_path_exists(program, device->build_env().get_out_kernel_root_path(), kernel_cache_status); + assert_kernel_binary_path_exists( + program, + BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + kernel_cache_status); assert_program_cache_hit_status(program, /*hit_expected=*/false, kernel_cache_status); std::unordered_map kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 1bd9ea2f9b5..52f6053922e 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -15,6 +15,7 @@ #include #include #include +#include "tt_metal/jit_build/build_env_manager.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does @@ -150,7 +151,7 @@ int main(int argc, char** argv) { tt_metal::detail::GetKernel(program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM1].value()); // Run iteration to get golden - uint32_t mask = device->build_key(); + uint32_t mask = BuildEnvManager::get_instance().get_build_key(device->id()); tt_metal::detail::CompileProgram(device, program); compute_binaries.insert({mask, compute_kernel->binaries(mask)}); TT_FATAL(compute_binaries.at(mask).size() == 3, "Expected 3 Compute binaries!"); @@ -165,7 +166,9 @@ int main(int argc, char** argv) { std::vector kernel_names = {"reader_unary_push_4", "writer_unary", "eltwise_copy_3m"}; for (int i = 0; i < num_devices; i++) { for (const auto& kernel_name : kernel_names) { - std::filesystem::remove_all(devices[i]->build_env().get_out_kernel_root_path() 
+ kernel_name); + std::filesystem::remove_all( + BuildEnvManager::get_instance().get_build_env(devices[i]->id()).get_out_kernel_root_path() + + kernel_name); } } tt_metal::detail::ClearKernelCache(); @@ -186,7 +189,7 @@ int main(int argc, char** argv) { auto& program = new_programs[i]; ths.emplace_back([&] { for (int j = 0; j < num_compiles; j++) { - uint32_t mask = device->build_key(); + uint32_t mask = BuildEnvManager::get_instance().get_build_key(device->id()); tt_metal::detail::CompileProgram(device, program); uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); @@ -201,21 +204,25 @@ int main(int argc, char** argv) { TT_FATAL(riscv0_kernel->binaries(mask) == brisc_binaries.at(mask), "Error"); TT_FATAL(riscv1_kernel->binaries(mask) == ncrisc_binaries.at(mask), "Error"); - std::string brisc_hex_path = device->build_kernel_target_path( - programmable_core_index, - dm_class_idx, - 0, - get_latest_kernel_binary_path(device->build_env().get_out_kernel_root_path(), riscv0_kernel)); + std::string kernel_name = get_latest_kernel_binary_path( + BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + riscv0_kernel); + std::string brisc_hex_path = + BuildEnvManager::get_instance() + .get_kernel_build_state(device->id(), programmable_core_index, dm_class_idx, 0) + .get_target_out_path(kernel_name); ll_api::memory const& brisc_binary = llrt::get_risc_binary(brisc_hex_path, ll_api::memory::Loading::CONTIGUOUS_XIP); TT_FATAL( brisc_binary == *brisc_binaries.at(mask).at(0), "Expected saved BRISC binary to be the same as binary in persistent cache"); - std::string ncrisc_hex_path = device->build_kernel_target_path( - programmable_core_index, - dm_class_idx, - 1, - get_latest_kernel_binary_path(device->build_env().get_out_kernel_root_path(), riscv1_kernel)); + kernel_name = get_latest_kernel_binary_path( + BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + riscv1_kernel); + std::string ncrisc_hex_path = + BuildEnvManager::get_instance() + .get_kernel_build_state(device->id(), programmable_core_index, dm_class_idx, 1) + .get_target_out_path(kernel_name); auto load_type = (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0) ? 
ll_api::memory::Loading::CONTIGUOUS @@ -225,12 +232,15 @@ int main(int argc, char** argv) { ncrisc_binary == *ncrisc_binaries.at(mask).at(0), "Expected saved NCRISC binary to be the same as binary in persistent cache"); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { + kernel_name = get_latest_kernel_binary_path( + BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + compute_kernel); std::string trisc_id_str = std::to_string(trisc_id); - std::string trisc_hex_path = device->build_kernel_target_path( - programmable_core_index, - compute_class_idx, - trisc_id, - get_latest_kernel_binary_path(device->build_env().get_out_kernel_root_path(), compute_kernel)); + std::string trisc_hex_path = + BuildEnvManager::get_instance() + .get_kernel_build_state( + device->id(), programmable_core_index, compute_class_idx, trisc_id) + .get_target_out_path(kernel_name); ll_api::memory const& trisc_binary = llrt::get_risc_binary(trisc_hex_path, ll_api::memory::Loading::CONTIGUOUS_XIP); TT_FATAL( diff --git a/tt_metal/api/tt-metalium/device.hpp b/tt_metal/api/tt-metalium/device.hpp index 3c0eaae0bb8..35dffa444ea 100644 --- a/tt_metal/api/tt-metalium/device.hpp +++ b/tt_metal/api/tt-metalium/device.hpp @@ -43,7 +43,6 @@ class SubDevice; } // namespace v0 -class JitBuildEnv; class CommandQueue; class TraceBuffer; struct TraceDescriptor; @@ -69,8 +68,6 @@ class IDevice { virtual chip_id_t id() const = 0; - virtual uint32_t build_key() const = 0; - virtual uint8_t num_hw_cqs() const = 0; virtual bool is_initialized() const = 0; @@ -128,13 +125,6 @@ class IDevice { virtual uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const = 0; virtual uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const = 0; - virtual const JitBuildEnv& build_env() const = 0; - virtual const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const = 0; - virtual const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const = 0; - virtual const JitBuildState& build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const = 0; - virtual const JitBuildState& build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const = 0; - virtual const JitBuildStateSubset build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const = 0; - virtual SystemMemoryManager& sysmem_manager() = 0; virtual CommandQueue& command_queue(size_t cq_id = 0) = 0; @@ -156,8 +146,12 @@ class IDevice { // Checks that the given arch is on the given pci_slot and that it's responding // Puts device into reset - virtual bool initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}, bool minimal = false) = 0; - virtual void build_firmware() = 0; + virtual bool initialize( + const uint8_t num_hw_cqs, + size_t l1_small_size, + size_t trace_region_size, + tt::stl::Span l1_bank_remap = {}, + bool minimal = false) = 0; virtual void reset_cores() = 0; virtual void initialize_and_launch_firmware() = 0; virtual void init_command_queue_host() = 0; diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 8b486f6010f..ae2aeef578e 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -57,8 +57,6 @@ class Device : public IDevice { chip_id_t id() const override { 
return id_; } - uint32_t build_key() const override { return build_key_; } - uint8_t num_hw_cqs() const override { return num_hw_cqs_; } bool is_initialized() const override { return this->initialized_; } @@ -117,13 +115,6 @@ class Device : public IDevice { uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const override; uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const override; - const JitBuildEnv& build_env() const override { return this->build_env_; } - const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const override; - const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const override; - const JitBuildState& build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const override; - const JitBuildState& build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const override; - const JitBuildStateSubset build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const override; - SystemMemoryManager& sysmem_manager() override { return *sysmem_manager_; } CommandQueue& command_queue(size_t cq_id = 0) override; @@ -147,8 +138,12 @@ class Device : public IDevice { // Checks that the given arch is on the given pci_slot and that it's responding // Puts device into reset - bool initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}, bool minimal = false) override; - void build_firmware() override; + bool initialize( + const uint8_t num_hw_cqs, + size_t l1_small_size, + size_t trace_region_size, + tt::stl::Span l1_bank_remap = {}, + bool minimal = false) override; void reset_cores() override; void initialize_and_launch_firmware() override; void init_command_queue_host() override; @@ -207,8 +202,6 @@ class Device : public IDevice { void initialize_cluster(); std::unique_ptr initialize_allocator( size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}); - void initialize_build(); - void initialize_device_kernel_defines(); void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core); void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg); @@ -220,9 +213,8 @@ class Device : public IDevice { void configure_command_queue_programs(); void clear_l1_state(); void get_associated_dispatch_virtual_cores( - std::unordered_map> &my_dispatch_cores, - std::unordered_map> &other_dispatch_cores); - std::pair build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const; + std::unordered_map>& my_dispatch_cores, + std::unordered_map>& other_dispatch_cores); void set_worker_mode(const WorkExecutorMode& mode); @@ -237,7 +229,6 @@ class Device : public IDevice { CoreCoord virtual_core_from_physical_core(const CoreCoord& physical_coord) const; chip_id_t id_; - uint32_t build_key_ = 0; std::vector> tunnels_from_mmio_; std::unique_ptr sub_device_manager_tracker_; @@ -258,11 +249,6 @@ class Device : public IDevice { // SystemMemoryManager is the interface to the hardware command queue std::vector> command_queues_; - JitBuildEnv build_env_; - JitBuildStateSet firmware_build_states_; - JitBuildStateSet kernel_build_states_; - std::vector>> build_state_indices_; - std::set compute_cores_; std::set storage_only_cores_; std::set 
ethernet_cores_; diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index de088e22685..b115f58a6d8 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -83,7 +83,6 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this& storage_only_cores() const override; uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const override; uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const override; - const JitBuildEnv& build_env() const override; - const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const override; - const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const override; - const JitBuildState& build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const override; - const JitBuildState& build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const override; - const JitBuildStateSubset build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const override; SystemMemoryManager& sysmem_manager() override; CommandQueue& command_queue(size_t cq_id = 0) override; @@ -157,8 +150,12 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this l1_bank_remap = {}, bool minimal = false) override; - void build_firmware() override; + bool initialize( + const uint8_t num_hw_cqs, + size_t l1_small_size, + size_t trace_region_size, + tt::stl::Span l1_bank_remap = {}, + bool minimal = false) override; void reset_cores() override; void initialize_and_launch_firmware() override; void init_command_queue_host() override; diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 099c7c8f34b..9e20a8bde93 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -90,12 +90,6 @@ MeshDevice::ScopedDevices::~ScopedDevices() { const std::vector& MeshDevice::ScopedDevices::get_devices() const { return devices_; } -uint32_t MeshDevice::build_key() const { - TT_FATAL(tt::tt_metal::hal.is_coordinate_virtualization_enabled(), "MeshDevice::build_key() expects coordinate virtualization to be enabled"); - return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->build_key(); }); -} - uint8_t MeshDevice::num_hw_cqs() const { return validate_and_get_reference_value( scoped_devices_->get_devices(), [](const auto& device) { return device->num_hw_cqs(); }); @@ -536,28 +530,6 @@ uint32_t MeshDevice::get_noc_multicast_encoding(uint8_t noc_index, const CoreRan }); } -// Floating point and build environment -const JitBuildEnv& MeshDevice::build_env() const { return reference_device()->build_env(); } - -// Build and firmware paths -const string MeshDevice::build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const { - return reference_device()->build_firmware_target_path(programmable_core, processor_class, i); -} -const string MeshDevice::build_kernel_target_path( - uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const { - return reference_device()->build_kernel_target_path(programmable_core, processor_class, i, kernel_name); -} -const JitBuildState& MeshDevice::build_firmware_state( - uint32_t programmable_core, uint32_t processor_class, int i) const { - return 
reference_device()->build_firmware_state(programmable_core, processor_class, i); -} -const JitBuildState& MeshDevice::build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const { - return reference_device()->build_kernel_state(programmable_core, processor_class, i); -} -const JitBuildStateSubset MeshDevice::build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const { - return reference_device()->build_kernel_states(programmable_core, processor_class); -} - // System memory and command queue management SystemMemoryManager& MeshDevice::sysmem_manager() { TT_THROW("sysmem_manager() is not supported on MeshDevice - use individual devices instead"); @@ -643,10 +615,6 @@ bool MeshDevice::initialize( return true; } -void MeshDevice::build_firmware() { - TT_THROW("build_firmware() is not supported on MeshDevice - use individual devices instead"); - reference_device()->build_firmware(); -} void MeshDevice::reset_cores() { TT_THROW("reset_cores() is not supported on MeshDevice - use individual devices instead"); reference_device()->reset_cores(); diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index f1d8125e259..e87352c4b59 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -35,6 +35,7 @@ #include "impl/dispatch/topology.hpp" #include "impl/dispatch/hardware_command_queue.hpp" +#include "tt_metal/jit_build/build_env_manager.hpp" namespace tt { @@ -334,130 +335,6 @@ std::unique_ptr Device::initialize_allocator(size_t l1_small_size, si return std::make_unique(config); } -void Device::initialize_device_kernel_defines() -{ - // Clear previously stored defines, in case we are running with different configuration this time. - // This is needed to handle the case where the number of L1 banks on GS can be changed in each run. - this->device_kernel_defines_.clear(); - const size_t num_dram_banks = this->allocator()->get_num_banks(BufferType::DRAM); - const size_t num_l1_banks = this->allocator()->get_num_banks(BufferType::L1); - - bool is_dram_pow2 = ceil(log2(num_dram_banks)) == log2(num_dram_banks); - bool is_l1_pow2 = ceil(log2(num_l1_banks)) == log2(num_l1_banks); - - this->device_kernel_defines_.emplace("NUM_DRAM_BANKS", std::to_string(num_dram_banks)); - this->device_kernel_defines_.emplace("NUM_L1_BANKS", std::to_string(num_l1_banks)); - - if (is_dram_pow2) { - this->device_kernel_defines_.emplace("LOG_BASE_2_OF_NUM_DRAM_BANKS", std::to_string(static_cast(log2(num_dram_banks)))); - } else { - this->device_kernel_defines_.emplace("IS_NOT_POW2_NUM_DRAM_BANKS", "1"); - } - if (is_l1_pow2) { - this->device_kernel_defines_.emplace("LOG_BASE_2_OF_NUM_L1_BANKS", std::to_string(static_cast(log2(num_l1_banks)))); - } else { - this->device_kernel_defines_.emplace("IS_NOT_POW2_NUM_L1_BANKS", "1"); - } - - // TODO (abhullar): Until we switch to virtual coordinates, we need to pass physical PCIe coordinates to device - // because Blackhole PCIe endpoint is dependent on board type - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(this->id()); - auto pcie_cores = soc_d.get_pcie_cores(); - auto grid_size = this->grid_size(); - - CoreCoord pcie_core = pcie_cores.empty() ? 
grid_size : pcie_cores[0]; - - this->device_kernel_defines_.emplace("PCIE_NOC_X", std::to_string(pcie_core.x)); - this->device_kernel_defines_.emplace("PCIE_NOC_Y", std::to_string(pcie_core.y)); -} - -void Device::initialize_build() { - ZoneScoped; - - this->initialize_device_kernel_defines(); - this->build_env_.init(this->build_key(), this->arch(), this->device_kernel_defines_); - - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->id()); - uint32_t dispatch_message_addr = DispatchMemMap::get(dispatch_core_type, this->num_hw_cqs_) - .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); - - uint32_t num_build_states = hal.get_num_risc_processors(); - - auto init_helper = [this, dispatch_message_addr, num_build_states] (bool is_fw) -> JitBuildStateSet { - std::vector> build_states; - - build_states.resize(num_build_states); - uint32_t programmable_core_type_count = hal.get_programmable_core_type_count(); - if (is_fw) { - this->build_state_indices_.resize(programmable_core_type_count); - } - - uint32_t index = 0; - for (uint32_t programmable_core = 0; programmable_core < programmable_core_type_count; programmable_core++) { - HalProgrammableCoreType core_type = magic_enum::enum_value(programmable_core); - uint32_t processor_class_count = hal.get_processor_classes_count(programmable_core); - if (is_fw) { - this->build_state_indices_[programmable_core].resize(processor_class_count); - } - for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { - auto compute_proc_class = magic_enum::enum_cast(processor_class); - bool is_compute_processor = compute_proc_class.has_value() and compute_proc_class.value() == HalProcessorClassType::COMPUTE; - uint32_t processor_types_count = hal.get_processor_types_count(programmable_core, processor_class); - if (is_fw) { - this->build_state_indices_[programmable_core][processor_class] = {index, processor_types_count}; - } - for (uint32_t processor_type = 0; processor_type < processor_types_count; processor_type++) { - switch (core_type) { - case HalProgrammableCoreType::TENSIX: { - if (is_compute_processor) { - build_states[index] = std::make_shared( - this->build_env_, JitBuiltStateConfig{.processor_id = processor_type, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); - } else { - // TODO: Make .processor_id = processor_type when brisc and ncrisc are considered one processor class - build_states[index] = std::make_shared( - this->build_env_, JitBuiltStateConfig{.processor_id = processor_class, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); - } - break; - } - case HalProgrammableCoreType::ACTIVE_ETH: { - // Cooperative means active erisc FW needs to context switch to base FW - bool is_cooperative = this->arch() == ARCH::WORMHOLE_B0; - build_states[index] = std::make_shared( - this->build_env_, - JitBuiltStateConfig{ - .processor_id = processor_class, - .is_fw = is_fw, - .dispatch_message_addr = dispatch_message_addr, - .is_cooperative = is_cooperative}); - break; - } - case HalProgrammableCoreType::IDLE_ETH: { - build_states[index] = std::make_shared( - this->build_env_, JitBuiltStateConfig{.processor_id = processor_class, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); - break; - } - default: - TT_THROW("Unsupported programable core type {} to initialize build states", magic_enum::enum_name(core_type)); - } - index++; - } - } - } - - return build_states; - }; - - this->firmware_build_states_ = init_helper(true); - 
this->kernel_build_states_ = init_helper(false); -} - -void Device::build_firmware() { - log_debug(tt::LogMetal, "Building base firmware for device {}", this->id_); - ZoneScoped; - - jit_build_set(this->firmware_build_states_, nullptr); -} - void Device::initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core) { const uint32_t dram_to_noc_sz_in_bytes = dram_bank_to_noc_xy_.size() * sizeof(uint16_t); @@ -492,19 +369,23 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC switch (core_type) { case HalProgrammableCoreType::TENSIX: { for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { - auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); - for (uint32_t riscv_id = build_idx; riscv_id < (build_idx + num_build_states); riscv_id++) { - ll_api::memory const& binary_mem = llrt::get_risc_binary( - firmware_build_states_[riscv_id]->get_target_out_path("")); + auto [build_idx, num_build_states] = + BuildEnvManager::get_instance().get_build_index_and_state_count(core_type_idx, processor_class); + for (uint32_t riscv_id = 0; riscv_id < num_build_states; riscv_id++) { + auto fw_path = BuildEnvManager::get_instance() + .get_firmware_build_state(id_, core_type_idx, processor_class, riscv_id) + .get_target_out_path(""); + const ll_api::memory& binary_mem = llrt::get_risc_binary(fw_path); uint32_t fw_size = binary_mem.get_text_size(); - if (riscv_id == 1) { // TODO: clean up how brisc/ncrisc are handled + if (riscv_id + build_idx == 1) { // TODO: clean up how brisc/ncrisc are handled // In this context, ncrisc_kernel_size16 is the size of the fw launch_msg->kernel_config.ncrisc_kernel_size16 = (fw_size + 15) >> 4; } log_debug(LogDevice, "RISC {} fw binary size: {} in bytes", riscv_id, fw_size); if (not llrt::RunTimeOptions::get_instance().get_skip_loading_fw()) { - llrt::test_load_write_read_risc_binary(binary_mem, this->id(), virtual_core, core_type_idx, processor_class, (riscv_id - build_idx)); + llrt::test_load_write_read_risc_binary( + binary_mem, this->id(), virtual_core, core_type_idx, processor_class, riscv_id); } } } @@ -536,13 +417,16 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC } if (not llrt::RunTimeOptions::get_instance().get_skip_loading_fw()) { for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { - auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); - for (uint32_t eriscv_id = build_idx; eriscv_id < (build_idx + num_build_states); eriscv_id++) { - ll_api::memory const& binary_mem = llrt::get_risc_binary( - firmware_build_states_[eriscv_id]->get_target_out_path("")); + auto num_build_states = hal.get_processor_types_count(core_type_idx, processor_class); + for (uint32_t eriscv_id = 0; eriscv_id < num_build_states; eriscv_id++) { + auto fw_path = BuildEnvManager::get_instance() + .get_firmware_build_state(id_, core_type_idx, processor_class, eriscv_id) + .get_target_out_path(""); + const ll_api::memory& binary_mem = llrt::get_risc_binary(fw_path); uint32_t fw_size = binary_mem.get_text_size(); log_debug(LogDevice, "ERISC fw binary size: {} in bytes", fw_size); - llrt::test_load_write_read_risc_binary(binary_mem, this->id(), virtual_core, core_type_idx, processor_class, (eriscv_id - build_idx)); + llrt::test_load_write_read_risc_binary( + binary_mem, this->id(), virtual_core, core_type_idx, 
processor_class, eriscv_id); } } } @@ -1030,31 +914,9 @@ bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t t update_dispatch_cores_for_multi_cq_eth_dispatch(); } this->num_hw_cqs_ = num_hw_cqs; - constexpr uint32_t harvesting_map_bits = 12; - constexpr uint32_t num_hw_cq_bits = 8; - constexpr uint32_t dispatch_core_axis_bits = 1; - constexpr uint32_t dispatch_core_type_bits = 1; - static_assert(dispatch_core_manager::MAX_NUM_HW_CQS <= (1 << num_hw_cq_bits)); - static_assert(static_cast(DispatchCoreAxis::COUNT) <= (1 << dispatch_core_axis_bits)); - static_assert(static_cast(DispatchCoreType::COUNT) <= (1 << dispatch_core_type_bits)); - static_assert(harvesting_map_bits + num_hw_cq_bits + dispatch_core_axis_bits + dispatch_core_type_bits <= sizeof(this->build_key_) * CHAR_BIT); - - // num_hw_cqs, dispatch_core_axis, dispatch_core_type all change the number of banks, so need to be part of the - // build key since we have defines based on number of banks. - const auto& dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(this->id_); - this->build_key_ = (static_cast(dispatch_core_config.get_dispatch_core_type()) << (harvesting_map_bits + num_hw_cq_bits + dispatch_core_axis_bits)) | - (static_cast(dispatch_core_config.get_dispatch_core_axis()) << (harvesting_map_bits + num_hw_cq_bits)) | - (static_cast(num_hw_cqs_) << harvesting_map_bits); - if (not hal.is_coordinate_virtualization_enabled()) { - // Coordinate virtualization is not enabled. For a single program, its associated binaries will vary across devices with different cores harvested. - this->build_key_ = (this->build_key_) | tt::Cluster::instance().get_harvesting_mask(this->id()); - } else { - // Coordinate Virtualization is enabled. Track only the number of harvested cores, instead of the exact harvesting configuration (this is not needed). - this->build_key_ = (this->build_key_) | (std::bitset(tt::Cluster::instance().get_harvesting_mask(this->id())).count()); - } + BuildEnvManager::get_instance().add_build_env(this->id(), this->num_hw_cqs()); this->initialize_cluster(); this->initialize_default_sub_device_state(l1_small_size, trace_region_size, l1_bank_remap); - this->initialize_build(); this->generate_device_bank_to_noc_tables(); // For minimal setup, don't initialize FW, watcher, dprint. They won't work if we're attaching to a hung chip. 
@@ -1341,42 +1203,6 @@ std::optional Device::lowest_occupied_compute_l1_address(tt::stl::Sp return sub_device_manager_tracker_->lowest_occupied_compute_l1_address(sub_device_ids); } -std::pair Device::build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const { - TT_ASSERT(programmable_core < this->build_state_indices_.size(), - "Programmable core type {} is not included in the FW or Kernel build state", programmable_core); - TT_ASSERT(processor_class < this->build_state_indices_[programmable_core].size(), - "Processor class type {} is not included in the FW or Kernel build state", processor_class); - return this->build_state_indices_[programmable_core][processor_class]; -} - -// Ideally the firmware getter would be private to the device, however, tests look for this -const JitBuildState& Device::build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const { - return *(this->firmware_build_states_[build_processor_type_to_index(programmable_core, processor_class).first + i]); -} - -const JitBuildState& Device::build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const { - return *(this->kernel_build_states_[build_processor_type_to_index(programmable_core, processor_class).first + i]); -} - -const JitBuildStateSubset Device::build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const { - std::pair bptti = build_processor_type_to_index(programmable_core, processor_class); - JitBuildStateSubset subset = { - &this->kernel_build_states_[bptti.first], - bptti.second - }; - return subset; -} - -const string Device::build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const { - const JitBuildState& bs = build_firmware_state(programmable_core, processor_class, i); - return bs.get_target_out_path(""); -} - -const string Device::build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const { - const JitBuildState& bs = build_kernel_state(programmable_core, processor_class, i); - return bs.get_target_out_path(kernel_name); -} - CommandQueue& Device::command_queue(size_t cq_id) { detail::DispatchStateCheck(using_fast_dispatch_); TT_FATAL(cq_id < command_queues_.size(), "cq_id {} is out of range", cq_id); diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index 753631cc992..e0e24f67710 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -21,6 +21,7 @@ #include "tt_metal/impl/debug/watcher_server.hpp" #include "tt_metal/impl/dispatch/topology.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" +#include "tt_metal/jit_build/build_env_manager.hpp" using namespace tt::tt_metal; @@ -304,18 +305,18 @@ void DevicePool::activate_device(chip_id_t id) { false, worker_core_thread_core, completion_queue_reader_core); - if (!this->firmware_built_keys.contains(device->build_key())) { - device->build_firmware(); - this->firmware_built_keys.insert(device->build_key()); + if (!this->firmware_built_keys.contains(BuildEnvManager::get_instance().get_build_key(device->id()))) { + BuildEnvManager::get_instance().build_firmware(device->id()); + this->firmware_built_keys.insert(BuildEnvManager::get_instance().get_build_key(device->id())); } this->devices.emplace_back(std::unique_ptr(device)); } else { log_debug(tt::LogMetal, "DevicePool re-initialize device {}", id); if (not device->is_initialized()) { device->initialize(num_hw_cqs, 
this->l1_small_size, this->trace_region_size, this->l1_bank_remap); - if (!this->firmware_built_keys.contains(device->build_key())) { - device->build_firmware(); - this->firmware_built_keys.insert(device->build_key()); + if (!this->firmware_built_keys.contains(BuildEnvManager::get_instance().get_build_key(device->id()))) { + BuildEnvManager::get_instance().build_firmware(device->id()); + this->firmware_built_keys.insert(BuildEnvManager::get_instance().get_build_key(device->id())); } } else { TT_THROW("Cannot re-initialize device {}, must first call close()", id); diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index 24221f4d9c4..a95a7d18c8d 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -17,6 +17,7 @@ #include #include #include "tt_metal/jit_build/genfiles.hpp" +#include "tt_metal/jit_build/build_env_manager.hpp" namespace tt { namespace tt_metal { @@ -317,13 +318,13 @@ bool Kernel::is_idle_eth() const { uint32_t Kernel::get_binary_packed_size(IDevice* device, int index) const { // In testing situations we can query the size w/o a binary - auto iter = binaries_.find(device->build_key()); + auto iter = binaries_.find(BuildEnvManager::get_instance().get_build_key(device->id())); return iter != this->binaries_.end() ? iter->second[index]->get_packed_size() : 0; } uint32_t Kernel::get_binary_text_size(IDevice* device, int index) const { // In testing situations we can query the size w/o a binary - auto iter = binaries_.find(device->build_key()); + auto iter = binaries_.find(BuildEnvManager::get_instance().get_build_key(device->id())); return iter != this->binaries_.end() ? iter->second[index]->get_text_size() : 0; } @@ -337,26 +338,34 @@ void ComputeKernel::set_build_options(JitBuildOptions &build_options) const { } void DataMovementKernel::generate_binaries(IDevice* device, JitBuildOptions &build_options) const { - jit_build_genfiles_kernel_include(device->build_env(), *this, this->kernel_src_); + jit_build_genfiles_kernel_include( + BuildEnvManager::get_instance().get_build_env(device->id()), *this, this->kernel_src_); uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int riscv_id = static_cast::type>(this->config_.processor); - jit_build(device->build_kernel_state(tensix_core_type, dm_class_idx, riscv_id), this); + jit_build( + BuildEnvManager::get_instance().get_kernel_build_state(device->id(), tensix_core_type, dm_class_idx, riscv_id), + this); } void EthernetKernel::generate_binaries(IDevice* device, JitBuildOptions &build_options) const { - jit_build_genfiles_kernel_include(device->build_env(), *this, this->kernel_src_); + jit_build_genfiles_kernel_include( + BuildEnvManager::get_instance().get_build_env(device->id()), *this, this->kernel_src_); uint32_t erisc_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int erisc_id = magic_enum::enum_integer(this->config_.processor); - jit_build(device->build_kernel_state(erisc_core_type, dm_class_idx, erisc_id), this); + jit_build( + BuildEnvManager::get_instance().get_kernel_build_state(device->id(), erisc_core_type, dm_class_idx, erisc_id), + this); } void ComputeKernel::generate_binaries(IDevice* device, JitBuildOptions &build_options) const { - jit_build_genfiles_triscs_src(device->build_env(), *this, 
this->kernel_src_); + jit_build_genfiles_triscs_src( + BuildEnvManager::get_instance().get_build_env(device->id()), *this, this->kernel_src_); uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); - JitBuildStateSubset build_states = device->build_kernel_states(tensix_core_type, compute_class_idx); + JitBuildStateSubset build_states = + BuildEnvManager::get_instance().get_kernel_build_states(device->id(), tensix_core_type, compute_class_idx); jit_build_subset(build_states, this); } @@ -379,7 +388,8 @@ void DataMovementKernel::read_binaries(IDevice* device) { uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int riscv_id = static_cast::type>(this->config_.processor); - const JitBuildState &build_state = device->build_kernel_state(tensix_core_type, dm_class_idx, riscv_id); + const JitBuildState& build_state = + BuildEnvManager::get_instance().get_kernel_build_state(device->id(), tensix_core_type, dm_class_idx, riscv_id); // TODO: from HAL auto load_type = (riscv_id == 1 && (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0)) ? @@ -390,7 +400,7 @@ void DataMovementKernel::read_binaries(IDevice* device) { binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", riscv_id, binary_size); - this->set_binaries(device->build_key(), std::move(binaries)); + this->set_binaries(BuildEnvManager::get_instance().get_build_key(device->id()), std::move(binaries)); } void EthernetKernel::read_binaries(IDevice* device) { @@ -400,7 +410,8 @@ void EthernetKernel::read_binaries(IDevice* device) { uint32_t erisc_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int erisc_id = magic_enum::enum_integer(this->config_.processor); - const JitBuildState &build_state = device->build_kernel_state(erisc_core_type, dm_class_idx, erisc_id); + const JitBuildState& build_state = + BuildEnvManager::get_instance().get_kernel_build_state(device->id(), erisc_core_type, dm_class_idx, erisc_id); int risc_id = erisc_id + (this->config_.eth_mode == Eth::IDLE ? 6 : 5); // TODO (abhullar): clean this up when llrt helpers use HAL // TODO: fix when active eth supports relo auto load_type = (this->config_.eth_mode == Eth::IDLE) ? 
@@ -411,7 +422,7 @@ void EthernetKernel::read_binaries(IDevice* device) { binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "ERISC {} kernel binary size: {} in bytes", erisc_id, binary_size); - this->set_binaries(device->build_key(), std::move(binaries)); + this->set_binaries(BuildEnvManager::get_instance().get_build_key(device->id()), std::move(binaries)); } void ComputeKernel::read_binaries(IDevice* device) { @@ -420,7 +431,8 @@ void ComputeKernel::read_binaries(IDevice* device) { uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { - const JitBuildState &build_state = device->build_kernel_state(tensix_core_type, compute_class_idx, trisc_id); + const JitBuildState& build_state = BuildEnvManager::get_instance().get_kernel_build_state( + device->id(), tensix_core_type, compute_class_idx, trisc_id); ll_api::memory const& binary_mem = llrt::get_risc_binary( build_state.get_target_out_path(this->kernel_full_name_), ll_api::memory::Loading::CONTIGUOUS_XIP); @@ -428,7 +440,7 @@ void ComputeKernel::read_binaries(IDevice* device) { uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", trisc_id + 2, binary_size); } - this->set_binaries(device->build_key(), std::move(binaries)); + this->set_binaries(BuildEnvManager::get_instance().get_build_key(device->id()), std::move(binaries)); } RISCV DataMovementKernel::processor() const { @@ -450,7 +462,7 @@ bool DataMovementKernel::configure(IDevice* device, const CoreCoord &logical_cor } auto device_id = device->id(); auto worker_core = device->worker_core_from_logical_core(logical_core); - ll_api::memory const& binary_mem = *this->binaries(device->build_key())[0]; + const ll_api::memory& binary_mem = *this->binaries(BuildEnvManager::get_instance().get_build_key(device->id()))[0]; int riscv_id = static_cast::type>(this->config_.processor); llrt::write_binary_to_address(binary_mem, device_id, worker_core, base_address + offsets[riscv_id]); @@ -460,7 +472,7 @@ bool DataMovementKernel::configure(IDevice* device, const CoreCoord &logical_cor bool EthernetKernel::configure(IDevice* device, const CoreCoord &logical_core, uint32_t base_address, const uint32_t offsets[]) const { auto device_id = device->id(); auto ethernet_core = device->ethernet_core_from_logical_core(logical_core); - ll_api::memory const& binary_mem = *this->binaries(device->build_key())[0]; + const ll_api::memory& binary_mem = *this->binaries(BuildEnvManager::get_instance().get_build_key(device->id()))[0]; if (this->config_.eth_mode == Eth::IDLE) { uint32_t offset_idx = magic_enum::enum_integer(HalProcessorClassType::DM) + magic_enum::enum_integer(this->config_.processor); @@ -482,7 +494,8 @@ bool ComputeKernel::configure(IDevice* device, const CoreCoord &logical_core, ui } auto device_id = device->id(); auto worker_core = device->worker_core_from_logical_core(logical_core); - std::vector const& binaries = this->binaries(device->build_key()); + const std::vector& binaries = + this->binaries(BuildEnvManager::get_instance().get_build_key(device->id())); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { llrt::write_binary_to_address( *binaries[trisc_id], device_id, worker_core, base_address + offsets[2 + trisc_id]); diff --git a/tt_metal/impl/program/dispatch.cpp 
b/tt_metal/impl/program/dispatch.cpp index 2416aede1e0..39a1fa208ce 100644 --- a/tt_metal/impl/program/dispatch.cpp +++ b/tt_metal/impl/program/dispatch.cpp @@ -15,6 +15,7 @@ #include "tt_metal/impl/dispatch/data_collection.hpp" #include "tt_metal/impl/dispatch/device_command_calculator.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" +#include "tt_metal/jit_build/build_env_manager.hpp" namespace tt::tt_metal { namespace program_dispatch { @@ -217,7 +218,8 @@ uint32_t finalize_kernel_bins( auto& optional_id = kg->kernel_ids[class_id]; if (optional_id) { const auto kernel = kernels.at(optional_id.value()); - const std::vector& binaries = kernel->binaries(device->build_key()); + const std::vector& binaries = + kernel->binaries(BuildEnvManager::get_instance().get_build_key(device->id())); // TODO: this is really ugly, save me future-HAL! if (programmable_core_type_index == hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)) { diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 0e4f20b137c..6e4af7110df 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -25,6 +25,7 @@ #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include "tt_metal/impl/program/dispatch.hpp" #include "tt_metal/jit_build/genfiles.hpp" +#include "tt_metal/jit_build/build_env_manager.hpp" #include "llrt.hpp" #include "tt_metal/program.hpp" #include "tracy/Tracy.hpp" @@ -41,7 +42,7 @@ void GenerateBinaries(IDevice* device, JitBuildOptions &build_options, const std //const std::string tracyPrefix = "GenerateBinaries_"; //ZoneName((tracyPrefix + build_options.name).c_str(), build_options.name.length() + tracyPrefix.length()); try { - jit_build_genfiles_descriptors(device->build_env(), build_options); + jit_build_genfiles_descriptors(BuildEnvManager::get_instance().get_build_env(device->id()), build_options); kernel->generate_binaries(device, build_options); } catch (std::runtime_error &ex) { TT_THROW("Failed to generate binaries for {} {}", kernel->name(), ex.what()); @@ -1114,7 +1115,7 @@ void detail::Program_::populate_dispatch_data(IDevice* device) { } else { sub_kernels = {kernel->processor()}; } - const auto &binaries = kernel->binaries(device->build_key()); + const auto& binaries = kernel->binaries(BuildEnvManager::get_instance().get_build_key(device->id())); std::vector dst_base_addrs; std::vector page_offsets; std::vector lengths; @@ -1307,7 +1308,7 @@ void Program::populate_dispatch_data(IDevice* device) { pimpl_->populate_dispatc void Program::generate_dispatch_commands(IDevice* device) { bool is_cached = this->is_cached(); - uint64_t command_hash = device->build_key(); + uint64_t command_hash = BuildEnvManager::get_instance().get_build_key(device->id()); if (not hal.is_coordinate_virtualization_enabled()) { // When coordinate virtualization is not enabled, explicitly encode the device // id into the command hash, to always assert on programs being reused across devices. 
@@ -1333,7 +1334,7 @@ void Program::allocate_kernel_bin_buf_on_device(IDevice* device) { pimpl_->alloc void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { //ZoneScoped; - if (compiled_.contains(device->build_key())) { + if (compiled_.contains(BuildEnvManager::get_instance().get_build_key(device->id()))) { return; } // Clear the determined sub_device_ids when we compile the program for the first time @@ -1393,7 +1394,7 @@ void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { validate_kernel_placement(kernel); launch_build_step( [kernel, device, this] { - JitBuildOptions build_options(device->build_env()); + JitBuildOptions build_options(BuildEnvManager::get_instance().get_build_env(device->id())); kernel->set_build_options(build_options); if (this->compiled_.empty()) { this->set_remote_circular_buffer_init(kernel); @@ -1401,7 +1402,11 @@ void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { this->set_cb_data_fmt(kernel->logical_coreranges(), build_options); this->set_cb_tile_dims(kernel->logical_coreranges(), build_options); - auto kernel_hash = KernelCompileHash(kernel, build_options, device->build_key(), device->get_device_kernel_defines_hash()); + auto kernel_hash = KernelCompileHash( + kernel, + build_options, + BuildEnvManager::get_instance().get_build_key(device->id()), + device->get_device_kernel_defines_hash()); std::string kernel_path_suffix = kernel->name() + "/" + std::to_string(kernel_hash) + "/"; kernel->set_full_name(kernel_path_suffix); build_options.set_name(kernel_path_suffix); @@ -1446,7 +1451,7 @@ void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { if (detail::MemoryReporter::enabled()) { detail::MemoryReporter::inst().flush_program_memory_usage(get_id(), device); } - compiled_.insert(device->build_key()); + compiled_.insert(BuildEnvManager::get_instance().get_build_key(device->id())); } void Program::compile(IDevice* device, bool fd_bootloader_mode) { pimpl_->compile(device, fd_bootloader_mode); } diff --git a/tt_metal/jit_build/CMakeLists.txt b/tt_metal/jit_build/CMakeLists.txt index d69d99a1ba6..9d15f575899 100644 --- a/tt_metal/jit_build/CMakeLists.txt +++ b/tt_metal/jit_build/CMakeLists.txt @@ -1,5 +1,6 @@ set(JIT_BUILD_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/build.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/build_env_manager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/data_format.cpp ${CMAKE_CURRENT_SOURCE_DIR}/genfiles.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kernel_args.cpp diff --git a/tt_metal/jit_build/build_env_manager.cpp b/tt_metal/jit_build/build_env_manager.cpp new file mode 100644 index 00000000000..c21d7b96544 --- /dev/null +++ b/tt_metal/jit_build/build_env_manager.cpp @@ -0,0 +1,257 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "build_env_manager.hpp" +#include +#include + +namespace tt::tt_metal { + +BuildEnvManager::BuildEnvManager() { + // Initialize build_state_indices_ + uint32_t index = 0; + uint32_t programmable_core_type_count = hal.get_programmable_core_type_count(); + build_state_indices_.resize(programmable_core_type_count); + for (uint32_t programmable_core = 0; programmable_core < programmable_core_type_count; programmable_core++) { + uint32_t processor_class_count = hal.get_processor_classes_count(programmable_core); + build_state_indices_[programmable_core].resize(processor_class_count); + for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { + uint32_t processor_types_count = hal.get_processor_types_count(programmable_core, processor_class); + build_state_indices_[programmable_core][processor_class] = {index, processor_types_count}; + index += processor_types_count; + } + } +} + +BuildEnvManager::~BuildEnvManager() {} + +std::map initialize_device_kernel_defines(chip_id_t device_id, uint8_t num_hw_cqs) { + std::map device_kernel_defines; + + const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device_id); + const size_t num_dram_banks = static_cast(soc_d.get_num_dram_views()); + // # of L1 banks needs to match allocator. For L1BankingAllocator this is the # of storage cores. TODO: when + // allocator is pulled out of device, use it to get that info here. + const auto& dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device_id); + const size_t num_l1_banks = tt::get_logical_compute_cores(device_id, num_hw_cqs, dispatch_core_config).size() + + tt::get_logical_storage_cores(device_id, num_hw_cqs, dispatch_core_config).size(); + + bool is_dram_pow2 = ceil(log2(num_dram_banks)) == log2(num_dram_banks); + bool is_l1_pow2 = ceil(log2(num_l1_banks)) == log2(num_l1_banks); + + device_kernel_defines.emplace("NUM_DRAM_BANKS", std::to_string(num_dram_banks)); + device_kernel_defines.emplace("NUM_L1_BANKS", std::to_string(num_l1_banks)); + + if (is_dram_pow2) { + device_kernel_defines.emplace( + "LOG_BASE_2_OF_NUM_DRAM_BANKS", std::to_string(static_cast(log2(num_dram_banks)))); + } else { + device_kernel_defines.emplace("IS_NOT_POW2_NUM_DRAM_BANKS", "1"); + } + if (is_l1_pow2) { + device_kernel_defines.emplace( + "LOG_BASE_2_OF_NUM_L1_BANKS", std::to_string(static_cast(log2(num_l1_banks)))); + } else { + device_kernel_defines.emplace("IS_NOT_POW2_NUM_L1_BANKS", "1"); + } + + // TODO (abhullar): Until we switch to virtual coordinates, we need to pass physical PCIe coordinates to device + // because Blackhole PCIe endpoint is dependent on board type + auto pcie_cores = soc_d.get_pcie_cores(); + CoreCoord pcie_core = pcie_cores.empty() ? 
soc_d.grid_size : pcie_cores[0]; + + device_kernel_defines.emplace("PCIE_NOC_X", std::to_string(pcie_core.x)); + device_kernel_defines.emplace("PCIE_NOC_Y", std::to_string(pcie_core.y)); + + return device_kernel_defines; +} + +uint32_t compute_build_key(chip_id_t device_id, uint8_t num_hw_cqs) { + uint32_t build_key = 0; + constexpr uint32_t harvesting_map_bits = 12; + constexpr uint32_t num_hw_cq_bits = 8; + constexpr uint32_t dispatch_core_axis_bits = 1; + constexpr uint32_t dispatch_core_type_bits = 1; + static_assert(dispatch_core_manager::MAX_NUM_HW_CQS <= (1 << num_hw_cq_bits)); + static_assert(static_cast(DispatchCoreAxis::COUNT) <= (1 << dispatch_core_axis_bits)); + static_assert(static_cast(DispatchCoreType::COUNT) <= (1 << dispatch_core_type_bits)); + static_assert( + harvesting_map_bits + num_hw_cq_bits + dispatch_core_axis_bits + dispatch_core_type_bits <= + sizeof(build_key) * CHAR_BIT); + + // num_hw_cqs, dispatch_core_axis, dispatch_core_type all change the number of banks, so need to be part of the + // build key since we have defines based on number of banks. + const auto& dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device_id); + build_key = (static_cast(dispatch_core_config.get_dispatch_core_type()) + << (harvesting_map_bits + num_hw_cq_bits + dispatch_core_axis_bits)) | + (static_cast(dispatch_core_config.get_dispatch_core_axis()) + << (harvesting_map_bits + num_hw_cq_bits)) | + (static_cast(num_hw_cqs) << harvesting_map_bits); + if (not hal.is_coordinate_virtualization_enabled()) { + // Coordinate virtualization is not enabled. For a single program, its associated binaries will vary across + // devices with different cores harvested. + build_key |= tt::Cluster::instance().get_harvesting_mask(device_id); + } else { + // Coordinate Virtualization is enabled. Track only the number of harvested cores, instead of the exact + // harvesting configuration (this is not needed). 
+ build_key |= (std::bitset(tt::Cluster::instance().get_harvesting_mask(device_id)).count()); + } + return build_key; +} + +JitBuildStateSet create_build_state(JitBuildEnv& build_env, chip_id_t device_id, uint8_t num_hw_cqs, bool is_fw) { + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device_id); + uint32_t dispatch_message_addr = DispatchMemMap::get(dispatch_core_type, num_hw_cqs) + .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); + + uint32_t num_build_states = hal.get_num_risc_processors(); + std::vector> build_states; + build_states.resize(num_build_states); + + uint32_t index = 0; + uint32_t programmable_core_type_count = hal.get_programmable_core_type_count(); + for (uint32_t programmable_core = 0; programmable_core < programmable_core_type_count; programmable_core++) { + HalProgrammableCoreType core_type = magic_enum::enum_value(programmable_core); + uint32_t processor_class_count = hal.get_processor_classes_count(programmable_core); + for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { + auto compute_proc_class = magic_enum::enum_cast(processor_class); + bool is_compute_processor = + compute_proc_class.has_value() and compute_proc_class.value() == HalProcessorClassType::COMPUTE; + uint32_t processor_types_count = hal.get_processor_types_count(programmable_core, processor_class); + for (uint32_t processor_type = 0; processor_type < processor_types_count; processor_type++) { + switch (core_type) { + case HalProgrammableCoreType::TENSIX: { + if (is_compute_processor) { + build_states[index] = std::make_shared( + build_env, + JitBuiltStateConfig{ + .processor_id = processor_type, + .is_fw = is_fw, + .dispatch_message_addr = dispatch_message_addr}); + } else { + // TODO: Make .processor_id = processor_type when brisc and ncrisc are considered one + // processor class + build_states[index] = std::make_shared( + build_env, + JitBuiltStateConfig{ + .processor_id = processor_class, + .is_fw = is_fw, + .dispatch_message_addr = dispatch_message_addr}); + } + break; + } + case HalProgrammableCoreType::ACTIVE_ETH: { + // Cooperative means active erisc FW needs to context switch to base FW + bool is_cooperative = tt::Cluster::instance().arch() == ARCH::WORMHOLE_B0; + build_states[index] = std::make_shared( + build_env, + JitBuiltStateConfig{ + .processor_id = processor_class, + .is_fw = is_fw, + .dispatch_message_addr = dispatch_message_addr, + .is_cooperative = is_cooperative}); + break; + } + case HalProgrammableCoreType::IDLE_ETH: { + build_states[index] = std::make_shared( + build_env, + JitBuiltStateConfig{ + .processor_id = processor_class, + .is_fw = is_fw, + .dispatch_message_addr = dispatch_message_addr}); + break; + } + default: + TT_THROW( + "Unsupported programable core type {} to initialize build states", + magic_enum::enum_name(core_type)); + } + index++; + } + } + } + + return build_states; +} + +void BuildEnvManager::add_build_env(chip_id_t device_id, uint8_t num_hw_cqs) { + uint32_t build_key = compute_build_key(device_id, num_hw_cqs); + device_id_to_build_key_[device_id] = build_key; + + auto device_kernel_defines = initialize_device_kernel_defines(device_id, num_hw_cqs); + device_id_to_build_env_[device_id].init(build_key, tt::Cluster::instance().arch(), device_kernel_defines); + + device_id_to_firmware_build_states_[device_id] = + create_build_state(device_id_to_build_env_[device_id], device_id, num_hw_cqs, true); + device_id_to_kernel_build_states_[device_id] = 
+        create_build_state(device_id_to_build_env_[device_id], device_id, num_hw_cqs, false);
+}
+
+const JitBuildEnv& BuildEnvManager::get_build_env(chip_id_t device_id) {
+    TT_ASSERT(device_id_to_build_env_.count(device_id) != 0, "Couldn't find build env for device {}.", device_id);
+    return device_id_to_build_env_[device_id];
+}
+
+uint32_t BuildEnvManager::get_build_key(chip_id_t device_id) {
+    TT_ASSERT(device_id_to_build_key_.count(device_id) != 0, "Couldn't find build key for device {}.", device_id);
+    return device_id_to_build_key_[device_id];
+}
+
+const JitBuildState& BuildEnvManager::get_firmware_build_state(
+    chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class, int processor_id) {
+    TT_ASSERT(
+        device_id_to_firmware_build_states_.count(device_id) != 0,
+        "Couldn't find firmware build state for device {}.",
+        device_id);
+    uint32_t state_idx = get_build_index_and_state_count(programmable_core, processor_class).first + processor_id;
+    return *device_id_to_firmware_build_states_[device_id][state_idx];
+}
+
+const JitBuildState& BuildEnvManager::get_kernel_build_state(
+    chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class, int processor_id) {
+    TT_ASSERT(
+        device_id_to_kernel_build_states_.count(device_id) != 0,
+        "Couldn't find kernel build state for device {}.",
+        device_id);
+    uint32_t state_idx = get_build_index_and_state_count(programmable_core, processor_class).first + processor_id;
+    return *device_id_to_kernel_build_states_[device_id][state_idx];
+}
+
+const JitBuildStateSubset BuildEnvManager::get_kernel_build_states(
+    chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class) {
+    TT_ASSERT(
+        device_id_to_kernel_build_states_.count(device_id) != 0,
+        "Couldn't find kernel build state for device {}.",
+        device_id);
+    std::pair<uint32_t, uint32_t> b_id_and_count = get_build_index_and_state_count(programmable_core, processor_class);
+    JitBuildStateSubset subset = {
+        &device_id_to_kernel_build_states_[device_id][b_id_and_count.first], b_id_and_count.second};
+    return subset;
+}
+
+std::pair<uint32_t, uint32_t> BuildEnvManager::get_build_index_and_state_count(
+    uint32_t programmable_core, uint32_t processor_class) {
+    TT_ASSERT(
+        programmable_core < build_state_indices_.size(),
+        "Programmable core type {} is not included in the FW or Kernel build state",
+        programmable_core);
+    TT_ASSERT(
+        processor_class < build_state_indices_[programmable_core].size(),
+        "Processor class type {} is not included in the FW or Kernel build state",
+        processor_class);
+    return build_state_indices_[programmable_core][processor_class];
+}
+
+void BuildEnvManager::build_firmware(chip_id_t device_id) {
+    TT_ASSERT(
+        device_id_to_firmware_build_states_.count(device_id) != 0,
+        "Couldn't find firmware build state for device {}.",
+        device_id);
+    log_debug(tt::LogMetal, "Building base firmware for device {}", device_id);
+    ZoneScoped;
+
+    jit_build_set(device_id_to_firmware_build_states_[device_id], nullptr);
+}
+
+}  // namespace tt::tt_metal
diff --git a/tt_metal/jit_build/build_env_manager.hpp b/tt_metal/jit_build/build_env_manager.hpp
new file mode 100644
index 00000000000..52169be72b8
--- /dev/null
+++ b/tt_metal/jit_build/build_env_manager.hpp
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "build.hpp"
+
+namespace tt::tt_metal {
+
+// Singleton class to generate and hold build environments, build keys, and build states.
+class BuildEnvManager {
+public:
+    BuildEnvManager(const BuildEnvManager&) = delete;
+    BuildEnvManager& operator=(const BuildEnvManager&) = delete;
+    static BuildEnvManager& get_instance() {
+        static BuildEnvManager instance;
+        return instance;
+    }
+
+    // Add a new build environment for the corresponding device id and num_hw_cqs. Also generates the build key and
+    // build states.
+    void add_build_env(chip_id_t device_id, uint8_t num_hw_cqs);
+
+    // Getter functions for build envs/keys/states
+    const JitBuildEnv& get_build_env(chip_id_t device_id);
+    uint32_t get_build_key(chip_id_t device_id);
+    const JitBuildState& get_firmware_build_state(
+        chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class, int processor_id);
+    const JitBuildState& get_kernel_build_state(
+        chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class, int processor_id);
+    const JitBuildStateSubset get_kernel_build_states(
+        chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class);
+
+    void build_firmware(chip_id_t device_id);
+
+    // Helper function to get the unique build id and number of states for a given programmable_core and
+    // processor_class.
+    std::pair<uint32_t, uint32_t> get_build_index_and_state_count(uint32_t programmable_core, uint32_t processor_class);
+
+private:
+    BuildEnvManager();
+    ~BuildEnvManager();
+
+    std::unordered_map<chip_id_t, JitBuildEnv> device_id_to_build_env_;
+    std::unordered_map<chip_id_t, uint32_t> device_id_to_build_key_;
+    std::unordered_map<chip_id_t, JitBuildStateSet> device_id_to_firmware_build_states_;
+    std::unordered_map<chip_id_t, JitBuildStateSet> device_id_to_kernel_build_states_;
+
+    // A device-agnostic mapping from programmable_core_type and processor_class to unique index + processor_type_count.
+    // TODO: processor_type_count can be looked up in the hal, do we need this in here?
+    std::vector<std::vector<std::pair<uint32_t, uint32_t>>> build_state_indices_;
+};
+
+}  // namespace tt::tt_metal

From 49a3328c8cc5783bec3c238da8404797647587a9 Mon Sep 17 00:00:00 2001
From: David Ma
Date: Fri, 7 Feb 2025 23:26:26 +0000
Subject: [PATCH 090/316] #0: GS bugfix

---
 tt_metal/jit_build/build_env_manager.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tt_metal/jit_build/build_env_manager.cpp b/tt_metal/jit_build/build_env_manager.cpp
index c21d7b96544..4c92b21899f 100644
--- a/tt_metal/jit_build/build_env_manager.cpp
+++ b/tt_metal/jit_build/build_env_manager.cpp
@@ -34,8 +34,17 @@ std::map initialize_device_kernel_defines(chip_id_t de
     // # of L1 banks needs to match allocator. For L1BankingAllocator this is the # of storage cores. TODO: when
     // allocator is pulled out of device, use it to get that info here.
const auto& dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device_id); - const size_t num_l1_banks = tt::get_logical_compute_cores(device_id, num_hw_cqs, dispatch_core_config).size() + - tt::get_logical_storage_cores(device_id, num_hw_cqs, dispatch_core_config).size(); + const size_t num_compute_and_storage_cores = + tt::get_logical_compute_cores(device_id, num_hw_cqs, dispatch_core_config).size(); + const size_t num_storage_only_cores = + tt::get_logical_storage_cores(device_id, num_hw_cqs, dispatch_core_config).size(); + size_t num_banks_per_storage_core = 0; + if (num_storage_only_cores > 0) { + num_banks_per_storage_core = + static_cast(soc_d.worker_l1_size) / + tt::get_storage_core_bank_size(device_id, num_hw_cqs, dispatch_core_config).value(); + } + const size_t num_l1_banks = num_compute_and_storage_cores + num_storage_only_cores * num_banks_per_storage_core; bool is_dram_pow2 = ceil(log2(num_dram_banks)) == log2(num_dram_banks); bool is_l1_pow2 = ceil(log2(num_l1_banks)) == log2(num_l1_banks); From 7b1a84df41fdeb9f7952d328c83da099cd746a91 Mon Sep 17 00:00:00 2001 From: David Ma Date: Fri, 7 Feb 2025 23:44:08 +0000 Subject: [PATCH 091/316] #0: PR feedback --- tt_metal/jit_build/build_env_manager.cpp | 4 +--- tt_metal/jit_build/build_env_manager.hpp | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tt_metal/jit_build/build_env_manager.cpp b/tt_metal/jit_build/build_env_manager.cpp index 4c92b21899f..30032ee9435 100644 --- a/tt_metal/jit_build/build_env_manager.cpp +++ b/tt_metal/jit_build/build_env_manager.cpp @@ -24,8 +24,6 @@ BuildEnvManager::BuildEnvManager() { } } -BuildEnvManager::~BuildEnvManager() {} - std::map initialize_device_kernel_defines(chip_id_t device_id, uint8_t num_hw_cqs) { std::map device_kernel_defines; @@ -227,7 +225,7 @@ const JitBuildState& BuildEnvManager::get_kernel_build_state( return *device_id_to_kernel_build_states_[device_id][state_idx]; } -const JitBuildStateSubset BuildEnvManager::get_kernel_build_states( +JitBuildStateSubset BuildEnvManager::get_kernel_build_states( chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class) { TT_ASSERT( device_id_to_kernel_build_states_.count(device_id) != 0, diff --git a/tt_metal/jit_build/build_env_manager.hpp b/tt_metal/jit_build/build_env_manager.hpp index 52169be72b8..b8035f5327d 100644 --- a/tt_metal/jit_build/build_env_manager.hpp +++ b/tt_metal/jit_build/build_env_manager.hpp @@ -29,7 +29,7 @@ class BuildEnvManager { chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class, int processor_id); const JitBuildState& get_kernel_build_state( chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class, int processor_id); - const JitBuildStateSubset get_kernel_build_states( + JitBuildStateSubset get_kernel_build_states( chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class); void build_firmware(chip_id_t device_id); @@ -40,7 +40,7 @@ class BuildEnvManager { private: BuildEnvManager(); - ~BuildEnvManager(); + ~BuildEnvManager() = default; std::unordered_map device_id_to_build_env_; std::unordered_map device_id_to_build_key_; From 6b7869acfc2f07000521636a1db9d93ff3bbb79c Mon Sep 17 00:00:00 2001 From: David Ma Date: Mon, 10 Feb 2025 00:48:06 +0000 Subject: [PATCH 092/316] #0: PR feedback part 2 --- tests/tt_metal/tt_metal/test_compile_args.cpp | 3 +- .../tt_metal/test_compile_program.cpp | 26 +-- .../test_compile_sets_kernel_binaries.cpp | 20 ++- tt_metal/impl/device/device_pool.cpp | 12 +- 
tt_metal/impl/kernels/kernel.cpp | 27 +-- tt_metal/impl/program/dispatch.cpp | 2 +- tt_metal/impl/program/program.cpp | 17 +- tt_metal/jit_build/build_env_manager.cpp | 162 +++++++++--------- tt_metal/jit_build/build_env_manager.hpp | 32 ++-- 9 files changed, 164 insertions(+), 137 deletions(-) diff --git a/tests/tt_metal/tt_metal/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_compile_args.cpp index 60421324c1e..f1b8dccb478 100644 --- a/tests/tt_metal/tt_metal/test_compile_args.cpp +++ b/tests/tt_metal/tt_metal/test_compile_args.cpp @@ -69,7 +69,8 @@ int main(int argc, char** argv) { // Remove old compiled kernels static const std::string kernel_name = "test_compile_args"; auto binary_path_str = - kernel->binaries(BuildEnvManager::get_instance().get_build_env(device->id())).get_out_kernel_root_path() + + kernel->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env) + .get_out_kernel_root_path() + kernel_name; std::filesystem::remove_all(binary_path_str); diff --git a/tests/tt_metal/tt_metal/test_compile_program.cpp b/tests/tt_metal/tt_metal/test_compile_program.cpp index 4c01ee62762..fb00d21b1f5 100644 --- a/tests/tt_metal/tt_metal/test_compile_program.cpp +++ b/tests/tt_metal/tt_metal/test_compile_program.cpp @@ -61,12 +61,14 @@ std::unordered_map get_last_program_binary_path(const KernelCacheStatus CompileProgramTestWrapper(IDevice* device, Program& program, bool profile_kernel = false) { // Check std::unordered_map pre_compile_kernel_to_hash_str = get_last_program_binary_path( - program, BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); + program, + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); detail::CompileProgram(device, program); std::unordered_map post_compile_kernel_to_hash_str = get_last_program_binary_path( - program, BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); + program, + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); KernelCacheStatus kernel_cache_status; for (const auto& [kernel_name, hash_str] : post_compile_kernel_to_hash_str) { @@ -187,7 +189,8 @@ void assert_kernel_hash_matches( bool test_compile_program_in_loop(IDevice* device) { bool pass = true; - ClearKernelCache(BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); + ClearKernelCache( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); ProgramAttributes default_attributes; auto program = create_program(device, default_attributes); @@ -198,7 +201,7 @@ bool test_compile_program_in_loop(IDevice* device) { if (compile_idx == 0) { assert_kernel_binary_path_exists( program, - BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path(), kernel_cache_status); assert_program_cache_hit_status(program, /*hit_expected=*/false, kernel_cache_status); kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; @@ -214,7 +217,8 @@ bool test_compile_program_in_loop(IDevice* device) { bool test_compile_program_after_clean_kernel_binary_directory(IDevice* device) { bool pass = true; - ClearKernelCache(BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); + ClearKernelCache( + 
BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); ProgramAttributes default_attributes; auto program = create_program(device, default_attributes); @@ -223,12 +227,13 @@ bool test_compile_program_after_clean_kernel_binary_directory(IDevice* device) { assert_kernel_binary_path_exists( program, - BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path(), kernel_cache_status); assert_program_cache_hit_status(program, /*hit_expected=*/false, kernel_cache_status); std::unordered_map kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; - ClearKernelCache(BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); + ClearKernelCache( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); auto second_program = create_program(device, default_attributes); auto second_kernel_cache_status = CompileProgramTestWrapper(device, second_program); assert_program_cache_hit_status(second_program, /*hit_expected=*/false, second_kernel_cache_status); @@ -282,7 +287,7 @@ std::unordered_map compile_program_with_modified_kerne auto kernel_cache_status = CompileProgramTestWrapper(device, program); assert_kernel_binary_path_exists( program, - BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path(), kernel_cache_status); assert_cache_hit_status_for_kernel_type(program, kernel_type_to_cache_hit_status, kernel_cache_status); assert_hash_comparison_for_kernel_type( @@ -306,14 +311,15 @@ bool test_compile_program_with_modified_program(IDevice* device) { const static std::unordered_map compute_miss_data_movement_miss = { {tt::RISCV::COMPUTE, false}, {tt::RISCV::BRISC, false}, {tt::RISCV::NCRISC, false}}; - ClearKernelCache(BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path()); + ClearKernelCache( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); ProgramAttributes attributes; auto program = create_program(device, attributes); auto kernel_cache_status = CompileProgramTestWrapper(device, program); assert_kernel_binary_path_exists( program, - BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path(), kernel_cache_status); assert_program_cache_hit_status(program, /*hit_expected=*/false, kernel_cache_status); std::unordered_map kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 52f6053922e..0e70f8551d8 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -151,7 +151,7 @@ int main(int argc, char** argv) { tt_metal::detail::GetKernel(program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM1].value()); // Run iteration to get golden - uint32_t mask = BuildEnvManager::get_instance().get_build_key(device->id()); + uint32_t mask = BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key; 
tt_metal::detail::CompileProgram(device, program); compute_binaries.insert({mask, compute_kernel->binaries(mask)}); TT_FATAL(compute_binaries.at(mask).size() == 3, "Expected 3 Compute binaries!"); @@ -167,7 +167,9 @@ int main(int argc, char** argv) { for (int i = 0; i < num_devices; i++) { for (const auto& kernel_name : kernel_names) { std::filesystem::remove_all( - BuildEnvManager::get_instance().get_build_env(devices[i]->id()).get_out_kernel_root_path() + + BuildEnvManager::get_instance() + .get_device_build_env(devices[i]->id()) + .build_env.get_out_kernel_root_path() + kernel_name); } } @@ -189,7 +191,7 @@ int main(int argc, char** argv) { auto& program = new_programs[i]; ths.emplace_back([&] { for (int j = 0; j < num_compiles; j++) { - uint32_t mask = BuildEnvManager::get_instance().get_build_key(device->id()); + uint32_t mask = BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key; tt_metal::detail::CompileProgram(device, program); uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); @@ -205,7 +207,9 @@ int main(int argc, char** argv) { TT_FATAL(riscv1_kernel->binaries(mask) == ncrisc_binaries.at(mask), "Error"); std::string kernel_name = get_latest_kernel_binary_path( - BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + BuildEnvManager::get_instance() + .get_device_build_env(device->id()) + .build_env.get_out_kernel_root_path(), riscv0_kernel); std::string brisc_hex_path = BuildEnvManager::get_instance() @@ -217,7 +221,9 @@ int main(int argc, char** argv) { brisc_binary == *brisc_binaries.at(mask).at(0), "Expected saved BRISC binary to be the same as binary in persistent cache"); kernel_name = get_latest_kernel_binary_path( - BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + BuildEnvManager::get_instance() + .get_device_build_env(device->id()) + .build_env.get_out_kernel_root_path(), riscv1_kernel); std::string ncrisc_hex_path = BuildEnvManager::get_instance() @@ -233,7 +239,9 @@ int main(int argc, char** argv) { "Expected saved NCRISC binary to be the same as binary in persistent cache"); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { kernel_name = get_latest_kernel_binary_path( - BuildEnvManager::get_instance().get_build_env(device->id()).get_out_kernel_root_path(), + BuildEnvManager::get_instance() + .get_device_build_env(device->id()) + .build_env.get_out_kernel_root_path(), compute_kernel); std::string trisc_id_str = std::to_string(trisc_id); std::string trisc_hex_path = diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index e0e24f67710..fe3d699f59d 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -305,18 +305,22 @@ void DevicePool::activate_device(chip_id_t id) { false, worker_core_thread_core, completion_queue_reader_core); - if (!this->firmware_built_keys.contains(BuildEnvManager::get_instance().get_build_key(device->id()))) { + if (!this->firmware_built_keys.contains( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)) { BuildEnvManager::get_instance().build_firmware(device->id()); - this->firmware_built_keys.insert(BuildEnvManager::get_instance().get_build_key(device->id())); + this->firmware_built_keys.insert( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); } this->devices.emplace_back(std::unique_ptr(device)); } else { log_debug(tt::LogMetal, "DevicePool re-initialize 
device {}", id); if (not device->is_initialized()) { device->initialize(num_hw_cqs, this->l1_small_size, this->trace_region_size, this->l1_bank_remap); - if (!this->firmware_built_keys.contains(BuildEnvManager::get_instance().get_build_key(device->id()))) { + if (!this->firmware_built_keys.contains( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)) { BuildEnvManager::get_instance().build_firmware(device->id()); - this->firmware_built_keys.insert(BuildEnvManager::get_instance().get_build_key(device->id())); + this->firmware_built_keys.insert( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); } } else { TT_THROW("Cannot re-initialize device {}, must first call close()", id); diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index a95a7d18c8d..2900624b204 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -318,13 +318,13 @@ bool Kernel::is_idle_eth() const { uint32_t Kernel::get_binary_packed_size(IDevice* device, int index) const { // In testing situations we can query the size w/o a binary - auto iter = binaries_.find(BuildEnvManager::get_instance().get_build_key(device->id())); + auto iter = binaries_.find(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); return iter != this->binaries_.end() ? iter->second[index]->get_packed_size() : 0; } uint32_t Kernel::get_binary_text_size(IDevice* device, int index) const { // In testing situations we can query the size w/o a binary - auto iter = binaries_.find(BuildEnvManager::get_instance().get_build_key(device->id())); + auto iter = binaries_.find(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); return iter != this->binaries_.end() ? 
iter->second[index]->get_text_size() : 0; } @@ -339,7 +339,7 @@ void ComputeKernel::set_build_options(JitBuildOptions &build_options) const { void DataMovementKernel::generate_binaries(IDevice* device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include( - BuildEnvManager::get_instance().get_build_env(device->id()), *this, this->kernel_src_); + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env, *this, this->kernel_src_); uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int riscv_id = static_cast::type>(this->config_.processor); @@ -350,7 +350,7 @@ void DataMovementKernel::generate_binaries(IDevice* device, JitBuildOptions &bui void EthernetKernel::generate_binaries(IDevice* device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include( - BuildEnvManager::get_instance().get_build_env(device->id()), *this, this->kernel_src_); + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env, *this, this->kernel_src_); uint32_t erisc_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int erisc_id = magic_enum::enum_integer(this->config_.processor); @@ -361,7 +361,7 @@ void EthernetKernel::generate_binaries(IDevice* device, JitBuildOptions &build_o void ComputeKernel::generate_binaries(IDevice* device, JitBuildOptions &build_options) const { jit_build_genfiles_triscs_src( - BuildEnvManager::get_instance().get_build_env(device->id()), *this, this->kernel_src_); + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env, *this, this->kernel_src_); uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); JitBuildStateSubset build_states = @@ -400,7 +400,8 @@ void DataMovementKernel::read_binaries(IDevice* device) { binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", riscv_id, binary_size); - this->set_binaries(BuildEnvManager::get_instance().get_build_key(device->id()), std::move(binaries)); + this->set_binaries( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key, std::move(binaries)); } void EthernetKernel::read_binaries(IDevice* device) { @@ -422,7 +423,8 @@ void EthernetKernel::read_binaries(IDevice* device) { binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "ERISC {} kernel binary size: {} in bytes", erisc_id, binary_size); - this->set_binaries(BuildEnvManager::get_instance().get_build_key(device->id()), std::move(binaries)); + this->set_binaries( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key, std::move(binaries)); } void ComputeKernel::read_binaries(IDevice* device) { @@ -440,7 +442,8 @@ void ComputeKernel::read_binaries(IDevice* device) { uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", trisc_id + 2, binary_size); } - this->set_binaries(BuildEnvManager::get_instance().get_build_key(device->id()), std::move(binaries)); + this->set_binaries( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key, 
std::move(binaries)); } RISCV DataMovementKernel::processor() const { @@ -462,7 +465,8 @@ bool DataMovementKernel::configure(IDevice* device, const CoreCoord &logical_cor } auto device_id = device->id(); auto worker_core = device->worker_core_from_logical_core(logical_core); - const ll_api::memory& binary_mem = *this->binaries(BuildEnvManager::get_instance().get_build_key(device->id()))[0]; + const ll_api::memory& binary_mem = + *this->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)[0]; int riscv_id = static_cast::type>(this->config_.processor); llrt::write_binary_to_address(binary_mem, device_id, worker_core, base_address + offsets[riscv_id]); @@ -472,7 +476,8 @@ bool DataMovementKernel::configure(IDevice* device, const CoreCoord &logical_cor bool EthernetKernel::configure(IDevice* device, const CoreCoord &logical_core, uint32_t base_address, const uint32_t offsets[]) const { auto device_id = device->id(); auto ethernet_core = device->ethernet_core_from_logical_core(logical_core); - const ll_api::memory& binary_mem = *this->binaries(BuildEnvManager::get_instance().get_build_key(device->id()))[0]; + const ll_api::memory& binary_mem = + *this->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)[0]; if (this->config_.eth_mode == Eth::IDLE) { uint32_t offset_idx = magic_enum::enum_integer(HalProcessorClassType::DM) + magic_enum::enum_integer(this->config_.processor); @@ -495,7 +500,7 @@ bool ComputeKernel::configure(IDevice* device, const CoreCoord &logical_core, ui auto device_id = device->id(); auto worker_core = device->worker_core_from_logical_core(logical_core); const std::vector& binaries = - this->binaries(BuildEnvManager::get_instance().get_build_key(device->id())); + this->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { llrt::write_binary_to_address( *binaries[trisc_id], device_id, worker_core, base_address + offsets[2 + trisc_id]); diff --git a/tt_metal/impl/program/dispatch.cpp b/tt_metal/impl/program/dispatch.cpp index 39a1fa208ce..d711ac28e2e 100644 --- a/tt_metal/impl/program/dispatch.cpp +++ b/tt_metal/impl/program/dispatch.cpp @@ -219,7 +219,7 @@ uint32_t finalize_kernel_bins( if (optional_id) { const auto kernel = kernels.at(optional_id.value()); const std::vector& binaries = - kernel->binaries(BuildEnvManager::get_instance().get_build_key(device->id())); + kernel->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); // TODO: this is really ugly, save me future-HAL! 
if (programmable_core_type_index == hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)) { diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 6e4af7110df..b054e1b5167 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -42,7 +42,8 @@ void GenerateBinaries(IDevice* device, JitBuildOptions &build_options, const std //const std::string tracyPrefix = "GenerateBinaries_"; //ZoneName((tracyPrefix + build_options.name).c_str(), build_options.name.length() + tracyPrefix.length()); try { - jit_build_genfiles_descriptors(BuildEnvManager::get_instance().get_build_env(device->id()), build_options); + jit_build_genfiles_descriptors( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env, build_options); kernel->generate_binaries(device, build_options); } catch (std::runtime_error &ex) { TT_THROW("Failed to generate binaries for {} {}", kernel->name(), ex.what()); @@ -1115,7 +1116,8 @@ void detail::Program_::populate_dispatch_data(IDevice* device) { } else { sub_kernels = {kernel->processor()}; } - const auto& binaries = kernel->binaries(BuildEnvManager::get_instance().get_build_key(device->id())); + const auto& binaries = + kernel->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); std::vector dst_base_addrs; std::vector page_offsets; std::vector lengths; @@ -1308,7 +1310,7 @@ void Program::populate_dispatch_data(IDevice* device) { pimpl_->populate_dispatc void Program::generate_dispatch_commands(IDevice* device) { bool is_cached = this->is_cached(); - uint64_t command_hash = BuildEnvManager::get_instance().get_build_key(device->id()); + uint64_t command_hash = BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key; if (not hal.is_coordinate_virtualization_enabled()) { // When coordinate virtualization is not enabled, explicitly encode the device // id into the command hash, to always assert on programs being reused across devices. 
@@ -1334,7 +1336,7 @@ void Program::allocate_kernel_bin_buf_on_device(IDevice* device) { pimpl_->alloc void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { //ZoneScoped; - if (compiled_.contains(BuildEnvManager::get_instance().get_build_key(device->id()))) { + if (compiled_.contains(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)) { return; } // Clear the determined sub_device_ids when we compile the program for the first time @@ -1394,7 +1396,8 @@ void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { validate_kernel_placement(kernel); launch_build_step( [kernel, device, this] { - JitBuildOptions build_options(BuildEnvManager::get_instance().get_build_env(device->id())); + JitBuildOptions build_options( + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env); kernel->set_build_options(build_options); if (this->compiled_.empty()) { this->set_remote_circular_buffer_init(kernel); @@ -1405,7 +1408,7 @@ void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { auto kernel_hash = KernelCompileHash( kernel, build_options, - BuildEnvManager::get_instance().get_build_key(device->id()), + BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key, device->get_device_kernel_defines_hash()); std::string kernel_path_suffix = kernel->name() + "/" + std::to_string(kernel_hash) + "/"; kernel->set_full_name(kernel_path_suffix); @@ -1451,7 +1454,7 @@ void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { if (detail::MemoryReporter::enabled()) { detail::MemoryReporter::inst().flush_program_memory_usage(get_id(), device); } - compiled_.insert(BuildEnvManager::get_instance().get_build_key(device->id())); + compiled_.insert(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); } void Program::compile(IDevice* device, bool fd_bootloader_mode) { pimpl_->compile(device, fd_bootloader_mode); } diff --git a/tt_metal/jit_build/build_env_manager.cpp b/tt_metal/jit_build/build_env_manager.cpp index 30032ee9435..2ac938613f2 100644 --- a/tt_metal/jit_build/build_env_manager.cpp +++ b/tt_metal/jit_build/build_env_manager.cpp @@ -8,6 +8,11 @@ namespace tt::tt_metal { +BuildEnvManager& BuildEnvManager::get_instance() { + static BuildEnvManager instance; + return instance; +} + BuildEnvManager::BuildEnvManager() { // Initialize build_state_indices_ uint32_t index = 0; @@ -108,14 +113,71 @@ uint32_t compute_build_key(chip_id_t device_id, uint8_t num_hw_cqs) { } JitBuildStateSet create_build_state(JitBuildEnv& build_env, chip_id_t device_id, uint8_t num_hw_cqs, bool is_fw) { + // Get the dispatch message address for this device CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device_id); uint32_t dispatch_message_addr = DispatchMemMap::get(dispatch_core_type, num_hw_cqs) .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); + // Prepare the container for build states uint32_t num_build_states = hal.get_num_risc_processors(); - std::vector> build_states; - build_states.resize(num_build_states); + std::vector> build_states(num_build_states); + ; + // Helper lambda to create a build state based on the core type and processor info. 
+ auto create_jit_build_state = [&](HalProgrammableCoreType core_type, + uint32_t processor_class, + uint32_t processor_type, + bool is_compute_processor) -> std::shared_ptr { + switch (core_type) { + case HalProgrammableCoreType::TENSIX: { + if (is_compute_processor) { + return std::make_shared( + build_env, + JitBuiltStateConfig{ + .processor_id = processor_type, + .is_fw = is_fw, + .dispatch_message_addr = dispatch_message_addr}); + } else { + // TODO: Make .processor_id = processor_type when brisc and ncrisc are considered one + // processor class + return std::make_shared( + build_env, + JitBuiltStateConfig{ + .processor_id = processor_class, + .is_fw = is_fw, + .dispatch_message_addr = dispatch_message_addr}); + } + break; + } + case HalProgrammableCoreType::ACTIVE_ETH: { + // Cooperative means active erisc FW needs to context switch to base FW + bool is_cooperative = tt::Cluster::instance().arch() == ARCH::WORMHOLE_B0; + return std::make_shared( + build_env, + JitBuiltStateConfig{ + .processor_id = processor_class, + .is_fw = is_fw, + .dispatch_message_addr = dispatch_message_addr, + .is_cooperative = is_cooperative}); + break; + } + case HalProgrammableCoreType::IDLE_ETH: { + return std::make_shared( + build_env, + JitBuiltStateConfig{ + .processor_id = processor_class, + .is_fw = is_fw, + .dispatch_message_addr = dispatch_message_addr}); + break; + } + default: + TT_THROW( + "Unsupported programable core type {} to initialize build states", + magic_enum::enum_name(core_type)); + } + }; + + // Loop through programmable core types and their processor classes/types. uint32_t index = 0; uint32_t programmable_core_type_count = hal.get_programmable_core_type_count(); for (uint32_t programmable_core = 0; programmable_core < programmable_core_type_count; programmable_core++) { @@ -127,54 +189,8 @@ JitBuildStateSet create_build_state(JitBuildEnv& build_env, chip_id_t device_id, compute_proc_class.has_value() and compute_proc_class.value() == HalProcessorClassType::COMPUTE; uint32_t processor_types_count = hal.get_processor_types_count(programmable_core, processor_class); for (uint32_t processor_type = 0; processor_type < processor_types_count; processor_type++) { - switch (core_type) { - case HalProgrammableCoreType::TENSIX: { - if (is_compute_processor) { - build_states[index] = std::make_shared( - build_env, - JitBuiltStateConfig{ - .processor_id = processor_type, - .is_fw = is_fw, - .dispatch_message_addr = dispatch_message_addr}); - } else { - // TODO: Make .processor_id = processor_type when brisc and ncrisc are considered one - // processor class - build_states[index] = std::make_shared( - build_env, - JitBuiltStateConfig{ - .processor_id = processor_class, - .is_fw = is_fw, - .dispatch_message_addr = dispatch_message_addr}); - } - break; - } - case HalProgrammableCoreType::ACTIVE_ETH: { - // Cooperative means active erisc FW needs to context switch to base FW - bool is_cooperative = tt::Cluster::instance().arch() == ARCH::WORMHOLE_B0; - build_states[index] = std::make_shared( - build_env, - JitBuiltStateConfig{ - .processor_id = processor_class, - .is_fw = is_fw, - .dispatch_message_addr = dispatch_message_addr, - .is_cooperative = is_cooperative}); - break; - } - case HalProgrammableCoreType::IDLE_ETH: { - build_states[index] = std::make_shared( - build_env, - JitBuiltStateConfig{ - .processor_id = processor_class, - .is_fw = is_fw, - .dispatch_message_addr = dispatch_message_addr}); - break; - } - default: - TT_THROW( - "Unsupported programable core type {} to initialize build 
states", - magic_enum::enum_name(core_type)); - } - index++; + build_states[index++] = + create_jit_build_state(core_type, processor_class, processor_type, is_compute_processor); } } } @@ -184,56 +200,38 @@ JitBuildStateSet create_build_state(JitBuildEnv& build_env, chip_id_t device_id, void BuildEnvManager::add_build_env(chip_id_t device_id, uint8_t num_hw_cqs) { uint32_t build_key = compute_build_key(device_id, num_hw_cqs); - device_id_to_build_key_[device_id] = build_key; - auto device_kernel_defines = initialize_device_kernel_defines(device_id, num_hw_cqs); - device_id_to_build_env_[device_id].init(build_key, tt::Cluster::instance().arch(), device_kernel_defines); - device_id_to_firmware_build_states_[device_id] = - create_build_state(device_id_to_build_env_[device_id], device_id, num_hw_cqs, true); - device_id_to_kernel_build_states_[device_id] = - create_build_state(device_id_to_build_env_[device_id], device_id, num_hw_cqs, false); + device_id_to_build_env_[device_id].build_key = build_key; + device_id_to_build_env_[device_id].build_env.init(build_key, tt::Cluster::instance().arch(), device_kernel_defines); + device_id_to_build_env_[device_id].firmware_build_states = + create_build_state(device_id_to_build_env_[device_id].build_env, device_id, num_hw_cqs, true); + device_id_to_build_env_[device_id].kernel_build_states = + create_build_state(device_id_to_build_env_[device_id].build_env, device_id, num_hw_cqs, false); } -const JitBuildEnv& BuildEnvManager::get_build_env(chip_id_t device_id) { +const DeviceBuildEnv& BuildEnvManager::get_device_build_env(chip_id_t device_id) { TT_ASSERT(device_id_to_build_env_.count(device_id) != 0, "Couldn't find build env for device {}.", device_id); return device_id_to_build_env_[device_id]; } -uint32_t BuildEnvManager::get_build_key(chip_id_t device_id) { - TT_ASSERT(device_id_to_build_key_.count(device_id) != 0, "Couldn't find build key for device {}.", device_id); - return device_id_to_build_key_[device_id]; -} - const JitBuildState& BuildEnvManager::get_firmware_build_state( chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class, int processor_id) { - TT_ASSERT( - device_id_to_firmware_build_states_.count(device_id) != 0, - "Couldn't find firmware build state for device {}.", - device_id); uint32_t state_idx = get_build_index_and_state_count(programmable_core, processor_class).first + processor_id; - return *device_id_to_firmware_build_states_[device_id][state_idx]; + return *get_device_build_env(device_id).firmware_build_states[state_idx]; } const JitBuildState& BuildEnvManager::get_kernel_build_state( chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class, int processor_id) { - TT_ASSERT( - device_id_to_kernel_build_states_.count(device_id) != 0, - "Couldn't find kernel build state for device {}.", - device_id); uint32_t state_idx = get_build_index_and_state_count(programmable_core, processor_class).first + processor_id; - return *device_id_to_kernel_build_states_[device_id][state_idx]; + return *get_device_build_env(device_id).kernel_build_states[state_idx]; } JitBuildStateSubset BuildEnvManager::get_kernel_build_states( chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class) { - TT_ASSERT( - device_id_to_kernel_build_states_.count(device_id) != 0, - "Couldn't find kernel build state for device {}.", - device_id); std::pair b_id_and_count = get_build_index_and_state_count(programmable_core, processor_class); JitBuildStateSubset subset = { - 
&device_id_to_kernel_build_states_[device_id][b_id_and_count.first], b_id_and_count.second}; + &get_device_build_env(device_id).kernel_build_states[b_id_and_count.first], b_id_and_count.second}; return subset; } @@ -251,14 +249,8 @@ std::pair BuildEnvManager::get_build_index_and_state_count( } void BuildEnvManager::build_firmware(chip_id_t device_id) { - TT_ASSERT( - device_id_to_firmware_build_states_.count(device_id) != 0, - "Couldn't find firmware build state for device {}.", - device_id); - log_debug(tt::LogMetal, "Building base firmware for device {}", device_id); ZoneScoped; - - jit_build_set(device_id_to_firmware_build_states_[device_id], nullptr); + jit_build_set(get_device_build_env(device_id).firmware_build_states, nullptr); } } // namespace tt::tt_metal diff --git a/tt_metal/jit_build/build_env_manager.hpp b/tt_metal/jit_build/build_env_manager.hpp index b8035f5327d..4a88cf118da 100644 --- a/tt_metal/jit_build/build_env_manager.hpp +++ b/tt_metal/jit_build/build_env_manager.hpp @@ -8,23 +8,34 @@ namespace tt::tt_metal { +using BuildIndexAndTypeCount = std::pair; // Build index and processor type count +using ProcClassMapping = std::vector; // Processor class to BuildIndexAndTypeCount +using ProgCoreMapping = + std::vector; // Programmable core and processor class to BuildIndexAndTypeCount + +// A struct to hold device-specific build environment +struct DeviceBuildEnv { + uint32_t build_key = 0; + JitBuildEnv build_env; + JitBuildStateSet firmware_build_states; + JitBuildStateSet kernel_build_states; +}; + // Singleton class to generate and hold build environments, build keys, and build states. class BuildEnvManager { public: BuildEnvManager(const BuildEnvManager&) = delete; BuildEnvManager& operator=(const BuildEnvManager&) = delete; - static BuildEnvManager& get_instance() { - static BuildEnvManager instance; - return instance; - } + static BuildEnvManager& get_instance(); // Add a new build environment for the corresponding device id and num_hw_cqs. Also generates the build key and // build states. void add_build_env(chip_id_t device_id, uint8_t num_hw_cqs); // Getter functions for build envs/keys/states - const JitBuildEnv& get_build_env(chip_id_t device_id); - uint32_t get_build_key(chip_id_t device_id); + const DeviceBuildEnv& get_device_build_env(chip_id_t device_id); + + // Helper functions to extract build states from the build env. const JitBuildState& get_firmware_build_state( chip_id_t device_id, uint32_t programmable_core, uint32_t processor_class, int processor_id); const JitBuildState& get_kernel_build_state( @@ -36,20 +47,17 @@ class BuildEnvManager { // Helper function to get the unique build id and number of states for a given programmable_core and // processor_class. - std::pair get_build_index_and_state_count(uint32_t programmable_core, uint32_t processor_class); + BuildIndexAndTypeCount get_build_index_and_state_count(uint32_t programmable_core, uint32_t processor_class); private: BuildEnvManager(); ~BuildEnvManager() = default; - std::unordered_map device_id_to_build_env_; - std::unordered_map device_id_to_build_key_; - std::unordered_map device_id_to_firmware_build_states_; - std::unordered_map device_id_to_kernel_build_states_; + std::unordered_map device_id_to_build_env_; // A device-agnostic mapping from programmable_core_type and processor_class to unique index + processor_type_count. // TODO: processor_type_count can be looked up in the hal, do we need this in here? 
- std::vector>> build_state_indices_; + ProgCoreMapping build_state_indices_; }; } // namespace tt::tt_metal From aa674c1387e01fa08ec40ae86465aae15d918df1 Mon Sep 17 00:00:00 2001 From: David Ma Date: Mon, 10 Feb 2025 02:04:33 +0000 Subject: [PATCH 093/316] #0: Add a lock on BuildEnvManager --- tt_metal/jit_build/build_env_manager.cpp | 3 +++ tt_metal/jit_build/build_env_manager.hpp | 1 + 2 files changed, 4 insertions(+) diff --git a/tt_metal/jit_build/build_env_manager.cpp b/tt_metal/jit_build/build_env_manager.cpp index 2ac938613f2..6cb7d59e105 100644 --- a/tt_metal/jit_build/build_env_manager.cpp +++ b/tt_metal/jit_build/build_env_manager.cpp @@ -199,6 +199,7 @@ JitBuildStateSet create_build_state(JitBuildEnv& build_env, chip_id_t device_id, } void BuildEnvManager::add_build_env(chip_id_t device_id, uint8_t num_hw_cqs) { + const std::lock_guard lock(this->lock); uint32_t build_key = compute_build_key(device_id, num_hw_cqs); auto device_kernel_defines = initialize_device_kernel_defines(device_id, num_hw_cqs); @@ -211,6 +212,7 @@ void BuildEnvManager::add_build_env(chip_id_t device_id, uint8_t num_hw_cqs) { } const DeviceBuildEnv& BuildEnvManager::get_device_build_env(chip_id_t device_id) { + const std::lock_guard lock(this->lock); TT_ASSERT(device_id_to_build_env_.count(device_id) != 0, "Couldn't find build env for device {}.", device_id); return device_id_to_build_env_[device_id]; } @@ -237,6 +239,7 @@ JitBuildStateSubset BuildEnvManager::get_kernel_build_states( std::pair BuildEnvManager::get_build_index_and_state_count( uint32_t programmable_core, uint32_t processor_class) { + const std::lock_guard lock(this->lock); TT_ASSERT( programmable_core < build_state_indices_.size(), "Programmable core type {} is not included in the FW or Kernel build state", diff --git a/tt_metal/jit_build/build_env_manager.hpp b/tt_metal/jit_build/build_env_manager.hpp index 4a88cf118da..c9be160032a 100644 --- a/tt_metal/jit_build/build_env_manager.hpp +++ b/tt_metal/jit_build/build_env_manager.hpp @@ -58,6 +58,7 @@ class BuildEnvManager { // A device-agnostic mapping from programmable_core_type and processor_class to unique index + processor_type_count. // TODO: processor_type_count can be looked up in the hal, do we need this in here? 
ProgCoreMapping build_state_indices_; + std::mutex lock; }; } // namespace tt::tt_metal From 550e3113a29eeeacfa76f2385e69fb22ad806f02 Mon Sep 17 00:00:00 2001 From: David Ma Date: Mon, 10 Feb 2025 08:32:48 +0000 Subject: [PATCH 094/316] #0: Mesh bugfix --- .../eth/test_erisc_app_direct_send.cpp | 4 +- tests/tt_metal/tt_metal/test_compile_args.cpp | 2 +- .../tt_metal/test_compile_program.cpp | 22 +++++----- .../test_compile_sets_kernel_binaries.cpp | 15 +++---- tt_metal/api/tt-metalium/device.hpp | 1 + tt_metal/api/tt-metalium/device_impl.hpp | 2 + tt_metal/api/tt-metalium/mesh_device.hpp | 1 + tt_metal/distributed/mesh_device.cpp | 2 + tt_metal/impl/device/device_pool.cpp | 12 +++--- tt_metal/impl/kernels/kernel.cpp | 40 ++++++++++--------- tt_metal/impl/program/dispatch.cpp | 4 +- tt_metal/impl/program/program.cpp | 14 +++---- 12 files changed, 65 insertions(+), 54 deletions(-) diff --git a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp index 8f62ce75ce9..9c96515a0f1 100644 --- a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp +++ b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp @@ -229,10 +229,10 @@ bool send_over_eth( // TODO: this should be updated to use kernel api uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); auto sender_firmware_path = BuildEnvManager::get_instance() - .get_firmware_build_state(sender_device->id(), active_eth_index, 0, 0) + .get_firmware_build_state(sender_device->build_id(), active_eth_index, 0, 0) .get_target_out_path(""); auto receiver_firmware_path = BuildEnvManager::get_instance() - .get_firmware_build_state(receiver_device->id(), active_eth_index, 0, 0) + .get_firmware_build_state(receiver_device->build_id(), active_eth_index, 0, 0) .get_target_out_path(""); const ll_api::memory& binary_mem_send = llrt::get_risc_binary(sender_firmware_path); const ll_api::memory& binary_mem_receive = llrt::get_risc_binary(receiver_firmware_path); diff --git a/tests/tt_metal/tt_metal/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_compile_args.cpp index f1b8dccb478..f52ea268b5a 100644 --- a/tests/tt_metal/tt_metal/test_compile_args.cpp +++ b/tests/tt_metal/tt_metal/test_compile_args.cpp @@ -69,7 +69,7 @@ int main(int argc, char** argv) { // Remove old compiled kernels static const std::string kernel_name = "test_compile_args"; auto binary_path_str = - kernel->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env) + kernel->binaries(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env) .get_out_kernel_root_path() + kernel_name; std::filesystem::remove_all(binary_path_str); diff --git a/tests/tt_metal/tt_metal/test_compile_program.cpp b/tests/tt_metal/tt_metal/test_compile_program.cpp index fb00d21b1f5..ab70a1d7a0a 100644 --- a/tests/tt_metal/tt_metal/test_compile_program.cpp +++ b/tests/tt_metal/tt_metal/test_compile_program.cpp @@ -62,13 +62,13 @@ KernelCacheStatus CompileProgramTestWrapper(IDevice* device, Program& program, b // Check std::unordered_map pre_compile_kernel_to_hash_str = get_last_program_binary_path( program, - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env.get_out_kernel_root_path()); detail::CompileProgram(device, program); std::unordered_map post_compile_kernel_to_hash_str = 
get_last_program_binary_path( program, - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env.get_out_kernel_root_path()); KernelCacheStatus kernel_cache_status; for (const auto& [kernel_name, hash_str] : post_compile_kernel_to_hash_str) { @@ -190,7 +190,7 @@ bool test_compile_program_in_loop(IDevice* device) { bool pass = true; ClearKernelCache( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env.get_out_kernel_root_path()); ProgramAttributes default_attributes; auto program = create_program(device, default_attributes); @@ -201,7 +201,9 @@ bool test_compile_program_in_loop(IDevice* device) { if (compile_idx == 0) { assert_kernel_binary_path_exists( program, - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path(), + BuildEnvManager::get_instance() + .get_device_build_env(device->build_id()) + .build_env.get_out_kernel_root_path(), kernel_cache_status); assert_program_cache_hit_status(program, /*hit_expected=*/false, kernel_cache_status); kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; @@ -218,7 +220,7 @@ bool test_compile_program_after_clean_kernel_binary_directory(IDevice* device) { bool pass = true; ClearKernelCache( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env.get_out_kernel_root_path()); ProgramAttributes default_attributes; auto program = create_program(device, default_attributes); @@ -227,13 +229,13 @@ bool test_compile_program_after_clean_kernel_binary_directory(IDevice* device) { assert_kernel_binary_path_exists( program, - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path(), + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env.get_out_kernel_root_path(), kernel_cache_status); assert_program_cache_hit_status(program, /*hit_expected=*/false, kernel_cache_status); std::unordered_map kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; ClearKernelCache( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env.get_out_kernel_root_path()); auto second_program = create_program(device, default_attributes); auto second_kernel_cache_status = CompileProgramTestWrapper(device, second_program); assert_program_cache_hit_status(second_program, /*hit_expected=*/false, second_kernel_cache_status); @@ -287,7 +289,7 @@ std::unordered_map compile_program_with_modified_kerne auto kernel_cache_status = CompileProgramTestWrapper(device, program); assert_kernel_binary_path_exists( program, - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path(), + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env.get_out_kernel_root_path(), kernel_cache_status); assert_cache_hit_status_for_kernel_type(program, kernel_type_to_cache_hit_status, kernel_cache_status); assert_hash_comparison_for_kernel_type( @@ -312,14 +314,14 @@ bool test_compile_program_with_modified_program(IDevice* device) { {tt::RISCV::COMPUTE, false}, 
{tt::RISCV::BRISC, false}, {tt::RISCV::NCRISC, false}}; ClearKernelCache( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path()); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env.get_out_kernel_root_path()); ProgramAttributes attributes; auto program = create_program(device, attributes); auto kernel_cache_status = CompileProgramTestWrapper(device, program); assert_kernel_binary_path_exists( program, - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env.get_out_kernel_root_path(), + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env.get_out_kernel_root_path(), kernel_cache_status); assert_program_cache_hit_status(program, /*hit_expected=*/false, kernel_cache_status); std::unordered_map kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 0e70f8551d8..78c36188188 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -151,7 +151,7 @@ int main(int argc, char** argv) { tt_metal::detail::GetKernel(program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM1].value()); // Run iteration to get golden - uint32_t mask = BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key; + uint32_t mask = BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key; tt_metal::detail::CompileProgram(device, program); compute_binaries.insert({mask, compute_kernel->binaries(mask)}); TT_FATAL(compute_binaries.at(mask).size() == 3, "Expected 3 Compute binaries!"); @@ -191,7 +191,8 @@ int main(int argc, char** argv) { auto& program = new_programs[i]; ths.emplace_back([&] { for (int j = 0; j < num_compiles; j++) { - uint32_t mask = BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key; + uint32_t mask = + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key; tt_metal::detail::CompileProgram(device, program); uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); @@ -208,12 +209,12 @@ int main(int argc, char** argv) { std::string kernel_name = get_latest_kernel_binary_path( BuildEnvManager::get_instance() - .get_device_build_env(device->id()) + .get_device_build_env(device->build_id()) .build_env.get_out_kernel_root_path(), riscv0_kernel); std::string brisc_hex_path = BuildEnvManager::get_instance() - .get_kernel_build_state(device->id(), programmable_core_index, dm_class_idx, 0) + .get_kernel_build_state(device->build_id(), programmable_core_index, dm_class_idx, 0) .get_target_out_path(kernel_name); ll_api::memory const& brisc_binary = llrt::get_risc_binary(brisc_hex_path, ll_api::memory::Loading::CONTIGUOUS_XIP); @@ -222,12 +223,12 @@ int main(int argc, char** argv) { "Expected saved BRISC binary to be the same as binary in persistent cache"); kernel_name = get_latest_kernel_binary_path( BuildEnvManager::get_instance() - .get_device_build_env(device->id()) + .get_device_build_env(device->build_id()) .build_env.get_out_kernel_root_path(), riscv1_kernel); std::string ncrisc_hex_path = BuildEnvManager::get_instance() - .get_kernel_build_state(device->id(), programmable_core_index, dm_class_idx, 1) + .get_kernel_build_state(device->build_id(), programmable_core_index, dm_class_idx, 1) 
.get_target_out_path(kernel_name); auto load_type = (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0) @@ -240,7 +241,7 @@ int main(int argc, char** argv) { for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { kernel_name = get_latest_kernel_binary_path( BuildEnvManager::get_instance() - .get_device_build_env(device->id()) + .get_device_build_env(device->build_id()) .build_env.get_out_kernel_root_path(), compute_kernel); std::string trisc_id_str = std::to_string(trisc_id); diff --git a/tt_metal/api/tt-metalium/device.hpp b/tt_metal/api/tt-metalium/device.hpp index 35dffa444ea..be8e9af943f 100644 --- a/tt_metal/api/tt-metalium/device.hpp +++ b/tt_metal/api/tt-metalium/device.hpp @@ -67,6 +67,7 @@ class IDevice { virtual tt::ARCH arch() const = 0; virtual chip_id_t id() const = 0; + virtual chip_id_t build_id() const = 0; virtual uint8_t num_hw_cqs() const = 0; diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index ae2aeef578e..88dd1d44bc4 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -56,6 +56,8 @@ class Device : public IDevice { tt::ARCH arch() const override; chip_id_t id() const override { return id_; } + // For a single device, build id is the same as device id + chip_id_t build_id() const override { return id_; } uint8_t num_hw_cqs() const override { return num_hw_cqs_; } diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index b115f58a6d8..91638a57cb6 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -83,6 +83,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_thisid(); } bool MeshDevice::is_parent_mesh() const { return parent_mesh_.expired(); } diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index fe3d699f59d..cd73f565e73 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -306,10 +306,10 @@ void DevicePool::activate_device(chip_id_t id) { worker_core_thread_core, completion_queue_reader_core); if (!this->firmware_built_keys.contains( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)) { - BuildEnvManager::get_instance().build_firmware(device->id()); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key)) { + BuildEnvManager::get_instance().build_firmware(device->build_id()); this->firmware_built_keys.insert( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key); } this->devices.emplace_back(std::unique_ptr(device)); } else { @@ -317,10 +317,10 @@ void DevicePool::activate_device(chip_id_t id) { if (not device->is_initialized()) { device->initialize(num_hw_cqs, this->l1_small_size, this->trace_region_size, this->l1_bank_remap); if (!this->firmware_built_keys.contains( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)) { - BuildEnvManager::get_instance().build_firmware(device->id()); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key)) { + BuildEnvManager::get_instance().build_firmware(device->build_id()); this->firmware_built_keys.insert( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key); } } else { 
TT_THROW("Cannot re-initialize device {}, must first call close()", id); diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index 2900624b204..9014661fa9c 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -318,13 +318,13 @@ bool Kernel::is_idle_eth() const { uint32_t Kernel::get_binary_packed_size(IDevice* device, int index) const { // In testing situations we can query the size w/o a binary - auto iter = binaries_.find(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); + auto iter = binaries_.find(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key); return iter != this->binaries_.end() ? iter->second[index]->get_packed_size() : 0; } uint32_t Kernel::get_binary_text_size(IDevice* device, int index) const { // In testing situations we can query the size w/o a binary - auto iter = binaries_.find(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); + auto iter = binaries_.find(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key); return iter != this->binaries_.end() ? iter->second[index]->get_text_size() : 0; } @@ -339,33 +339,35 @@ void ComputeKernel::set_build_options(JitBuildOptions &build_options) const { void DataMovementKernel::generate_binaries(IDevice* device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env, *this, this->kernel_src_); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env, *this, this->kernel_src_); uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int riscv_id = static_cast::type>(this->config_.processor); jit_build( - BuildEnvManager::get_instance().get_kernel_build_state(device->id(), tensix_core_type, dm_class_idx, riscv_id), + BuildEnvManager::get_instance().get_kernel_build_state( + device->build_id(), tensix_core_type, dm_class_idx, riscv_id), this); } void EthernetKernel::generate_binaries(IDevice* device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env, *this, this->kernel_src_); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env, *this, this->kernel_src_); uint32_t erisc_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int erisc_id = magic_enum::enum_integer(this->config_.processor); jit_build( - BuildEnvManager::get_instance().get_kernel_build_state(device->id(), erisc_core_type, dm_class_idx, erisc_id), + BuildEnvManager::get_instance().get_kernel_build_state( + device->build_id(), erisc_core_type, dm_class_idx, erisc_id), this); } void ComputeKernel::generate_binaries(IDevice* device, JitBuildOptions &build_options) const { jit_build_genfiles_triscs_src( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env, *this, this->kernel_src_); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env, *this, this->kernel_src_); uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t compute_class_idx = 
magic_enum::enum_integer(HalProcessorClassType::COMPUTE); - JitBuildStateSubset build_states = - BuildEnvManager::get_instance().get_kernel_build_states(device->id(), tensix_core_type, compute_class_idx); + JitBuildStateSubset build_states = BuildEnvManager::get_instance().get_kernel_build_states( + device->build_id(), tensix_core_type, compute_class_idx); jit_build_subset(build_states, this); } @@ -388,8 +390,8 @@ void DataMovementKernel::read_binaries(IDevice* device) { uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int riscv_id = static_cast::type>(this->config_.processor); - const JitBuildState& build_state = - BuildEnvManager::get_instance().get_kernel_build_state(device->id(), tensix_core_type, dm_class_idx, riscv_id); + const JitBuildState& build_state = BuildEnvManager::get_instance().get_kernel_build_state( + device->build_id(), tensix_core_type, dm_class_idx, riscv_id); // TODO: from HAL auto load_type = (riscv_id == 1 && (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0)) ? @@ -401,7 +403,7 @@ void DataMovementKernel::read_binaries(IDevice* device) { uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", riscv_id, binary_size); this->set_binaries( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key, std::move(binaries)); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key, std::move(binaries)); } void EthernetKernel::read_binaries(IDevice* device) { @@ -411,8 +413,8 @@ void EthernetKernel::read_binaries(IDevice* device) { uint32_t erisc_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int erisc_id = magic_enum::enum_integer(this->config_.processor); - const JitBuildState& build_state = - BuildEnvManager::get_instance().get_kernel_build_state(device->id(), erisc_core_type, dm_class_idx, erisc_id); + const JitBuildState& build_state = BuildEnvManager::get_instance().get_kernel_build_state( + device->build_id(), erisc_core_type, dm_class_idx, erisc_id); int risc_id = erisc_id + (this->config_.eth_mode == Eth::IDLE ? 6 : 5); // TODO (abhullar): clean this up when llrt helpers use HAL // TODO: fix when active eth supports relo auto load_type = (this->config_.eth_mode == Eth::IDLE) ? 
@@ -424,7 +426,7 @@ void EthernetKernel::read_binaries(IDevice* device) { uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "ERISC {} kernel binary size: {} in bytes", erisc_id, binary_size); this->set_binaries( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key, std::move(binaries)); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key, std::move(binaries)); } void ComputeKernel::read_binaries(IDevice* device) { @@ -443,7 +445,7 @@ void ComputeKernel::read_binaries(IDevice* device) { log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", trisc_id + 2, binary_size); } this->set_binaries( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key, std::move(binaries)); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key, std::move(binaries)); } RISCV DataMovementKernel::processor() const { @@ -466,7 +468,7 @@ bool DataMovementKernel::configure(IDevice* device, const CoreCoord &logical_cor auto device_id = device->id(); auto worker_core = device->worker_core_from_logical_core(logical_core); const ll_api::memory& binary_mem = - *this->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)[0]; + *this->binaries(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key)[0]; int riscv_id = static_cast::type>(this->config_.processor); llrt::write_binary_to_address(binary_mem, device_id, worker_core, base_address + offsets[riscv_id]); @@ -477,7 +479,7 @@ bool EthernetKernel::configure(IDevice* device, const CoreCoord &logical_core, u auto device_id = device->id(); auto ethernet_core = device->ethernet_core_from_logical_core(logical_core); const ll_api::memory& binary_mem = - *this->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)[0]; + *this->binaries(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key)[0]; if (this->config_.eth_mode == Eth::IDLE) { uint32_t offset_idx = magic_enum::enum_integer(HalProcessorClassType::DM) + magic_enum::enum_integer(this->config_.processor); @@ -500,7 +502,7 @@ bool ComputeKernel::configure(IDevice* device, const CoreCoord &logical_core, ui auto device_id = device->id(); auto worker_core = device->worker_core_from_logical_core(logical_core); const std::vector& binaries = - this->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); + this->binaries(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { llrt::write_binary_to_address( *binaries[trisc_id], device_id, worker_core, base_address + offsets[2 + trisc_id]); diff --git a/tt_metal/impl/program/dispatch.cpp b/tt_metal/impl/program/dispatch.cpp index d711ac28e2e..fdf9e4ee5ab 100644 --- a/tt_metal/impl/program/dispatch.cpp +++ b/tt_metal/impl/program/dispatch.cpp @@ -218,8 +218,8 @@ uint32_t finalize_kernel_bins( auto& optional_id = kg->kernel_ids[class_id]; if (optional_id) { const auto kernel = kernels.at(optional_id.value()); - const std::vector& binaries = - kernel->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); + const std::vector& binaries = kernel->binaries( + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key); // TODO: this is really ugly, save me future-HAL! 
if (programmable_core_type_index == hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)) { diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index b054e1b5167..66c44b84018 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -43,7 +43,7 @@ void GenerateBinaries(IDevice* device, JitBuildOptions &build_options, const std //ZoneName((tracyPrefix + build_options.name).c_str(), build_options.name.length() + tracyPrefix.length()); try { jit_build_genfiles_descriptors( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env, build_options); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env, build_options); kernel->generate_binaries(device, build_options); } catch (std::runtime_error &ex) { TT_THROW("Failed to generate binaries for {} {}", kernel->name(), ex.what()); @@ -1117,7 +1117,7 @@ void detail::Program_::populate_dispatch_data(IDevice* device) { sub_kernels = {kernel->processor()}; } const auto& binaries = - kernel->binaries(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); + kernel->binaries(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key); std::vector dst_base_addrs; std::vector page_offsets; std::vector lengths; @@ -1310,7 +1310,7 @@ void Program::populate_dispatch_data(IDevice* device) { pimpl_->populate_dispatc void Program::generate_dispatch_commands(IDevice* device) { bool is_cached = this->is_cached(); - uint64_t command_hash = BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key; + uint64_t command_hash = BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key; if (not hal.is_coordinate_virtualization_enabled()) { // When coordinate virtualization is not enabled, explicitly encode the device // id into the command hash, to always assert on programs being reused across devices. 
@@ -1336,7 +1336,7 @@ void Program::allocate_kernel_bin_buf_on_device(IDevice* device) { pimpl_->alloc void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { //ZoneScoped; - if (compiled_.contains(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key)) { + if (compiled_.contains(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key)) { return; } // Clear the determined sub_device_ids when we compile the program for the first time @@ -1397,7 +1397,7 @@ void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { launch_build_step( [kernel, device, this] { JitBuildOptions build_options( - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_env); + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env); kernel->set_build_options(build_options); if (this->compiled_.empty()) { this->set_remote_circular_buffer_init(kernel); @@ -1408,7 +1408,7 @@ void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { auto kernel_hash = KernelCompileHash( kernel, build_options, - BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key, + BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key, device->get_device_kernel_defines_hash()); std::string kernel_path_suffix = kernel->name() + "/" + std::to_string(kernel_hash) + "/"; kernel->set_full_name(kernel_path_suffix); @@ -1454,7 +1454,7 @@ void detail::Program_::compile(IDevice* device, bool fd_bootloader_mode) { if (detail::MemoryReporter::enabled()) { detail::MemoryReporter::inst().flush_program_memory_usage(get_id(), device); } - compiled_.insert(BuildEnvManager::get_instance().get_device_build_env(device->id()).build_key); + compiled_.insert(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key); } void Program::compile(IDevice* device, bool fd_bootloader_mode) { pimpl_->compile(device, fd_bootloader_mode); } From 422c7ec7314cebdbceca72e9d05ca4210f2d4a71 Mon Sep 17 00:00:00 2001 From: David Ma Date: Mon, 10 Feb 2025 09:03:14 +0000 Subject: [PATCH 095/316] #0: Bugfix --- tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp | 2 +- tt_metal/impl/kernels/kernel.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 78c36188188..e0cab094ff7 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -248,7 +248,7 @@ int main(int argc, char** argv) { std::string trisc_hex_path = BuildEnvManager::get_instance() .get_kernel_build_state( - device->id(), programmable_core_index, compute_class_idx, trisc_id) + device->build_id(), programmable_core_index, compute_class_idx, trisc_id) .get_target_out_path(kernel_name); ll_api::memory const& trisc_binary = llrt::get_risc_binary(trisc_hex_path, ll_api::memory::Loading::CONTIGUOUS_XIP); diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index 9014661fa9c..6299cd38e73 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -436,7 +436,7 @@ void ComputeKernel::read_binaries(IDevice* device) { uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { const JitBuildState& build_state = 
BuildEnvManager::get_instance().get_kernel_build_state( - device->id(), tensix_core_type, compute_class_idx, trisc_id); + device->build_id(), tensix_core_type, compute_class_idx, trisc_id); ll_api::memory const& binary_mem = llrt::get_risc_binary( build_state.get_target_out_path(this->kernel_full_name_), ll_api::memory::Loading::CONTIGUOUS_XIP); From 8653cf80781e964358d4e2aaccb1e63d8f84bbb6 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Thu, 13 Feb 2025 01:13:44 -0600 Subject: [PATCH 096/316] Remove `tt_cluster.hpp` from public API (#17813) --- .../device/test_galaxy_cluster_api.cpp | 2 +- .../dispatch/test_bw_and_latency.cpp | 3 + .../dispatch/test_pgm_dispatch.cpp | 1 + .../test_ethernet_read_and_send_data.cpp | 2 + ...ers_and_erisc_datamover_unidirectional.cpp | 2 + ...st_vs_multicast_to_single_core_latency.cpp | 1 + .../old/matmul/matmul_global_l1.cpp | 1 + .../old/matmul/matmul_local_l1.cpp | 1 + .../old/noc/test_noc_read_global_l1.cpp | 1 + .../old/noc/test_noc_read_local_l1.cpp | 1 + .../old/pcie/test_enqueue_rw_buffer.cpp | 1 + .../old/pcie/test_rw_buffer.cpp | 1 + .../old/pcie/test_rw_device_dram.cpp | 1 + .../old/pcie/test_rw_device_l1.cpp | 1 + .../tt_metal/test_stress_noc_mcast.cpp | 2 + .../unit_tests/gtests/test_ccl_on_galaxy.cpp | 2 + tt-train/tests/core/n300_utils_test.cpp | 4 +- .../model/linear_regression_ddp_test.cpp | 3 +- .../tests/modules/distributed/linear_test.cpp | 3 +- .../tests/ops/distributed/comm_ops_test.cpp | 3 +- .../distributed/distributed_ttnn_ops_test.cpp | 3 +- tt_fabric/CMakeLists.txt | 8 +- tt_fabric/control_plane.cpp | 2 + tt_fabric/mesh_graph.hpp | 4 +- tt_metal/api/tt-metalium/core_descriptor.hpp | 18 +-- tt_metal/api/tt-metalium/device.hpp | 1 - tt_metal/api/tt-metalium/device_impl.hpp | 1 - .../api/tt-metalium/dispatch_core_common.hpp | 2 + .../api/tt-metalium/dispatch_settings.hpp | 6 +- tt_metal/api/tt-metalium/hal_exp.hpp | 8 ++ tt_metal/common/CMakeLists.txt | 1 - tt_metal/common/core_assignment.cpp | 1 + tt_metal/common/core_assignment.hpp | 5 +- tt_metal/distributed/CMakeLists.txt | 1 + .../distributed/coordinate_translation.cpp | 2 + tt_metal/distributed/mesh_command_queue.cpp | 1 + tt_metal/distributed/system_mesh.cpp | 2 + tt_metal/experimental/hal.cpp | 2 + tt_metal/impl/buffers/dispatch.cpp | 2 + .../impl/buffers/global_circular_buffer.cpp | 2 + tt_metal/impl/buffers/global_semaphore.cpp | 2 + tt_metal/impl/debug/watcher_server.hpp | 2 + tt_metal/impl/device/device_pool.cpp | 2 + tt_metal/impl/dispatch/debug_tools.cpp | 3 + .../impl/dispatch/hardware_command_queue.cpp | 2 + .../impl/dispatch/kernel_config/fd_kernel.hpp | 1 + tt_metal/impl/dispatch/topology.cpp | 2 + tt_metal/impl/event/dispatch.cpp | 2 + .../impl/sub_device/sub_device_manager.cpp | 2 + tt_metal/llrt/CMakeLists.txt | 2 + tt_metal/{common => llrt}/core_descriptor.cpp | 14 +++ .../{api/tt-metalium => llrt}/tt_cluster.hpp | 112 +++++++++--------- ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp | 8 +- .../moreh/moreh_helper_functions.cpp | 7 +- .../reduction/prod/device/prod_op_all.cpp | 2 + 55 files changed, 187 insertions(+), 84 deletions(-) rename tt_metal/{common => llrt}/core_descriptor.cpp (94%) rename tt_metal/{api/tt-metalium => llrt}/tt_cluster.hpp (79%) diff --git a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp index 5a59b2c03f8..8c998b1705e 100644 --- a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp @@ 
-5,7 +5,7 @@ #include #include "galaxy_fixture.hpp" -#include +#include "tt_cluster.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 100534ab260..3053fd4c7ed 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -12,12 +12,15 @@ #include "logger.hpp" #include #include +#include #include #include #include #include #include +#include "tt_cluster.hpp" + constexpr uint32_t DEFAULT_ITERATIONS = 1000; constexpr uint32_t DEFAULT_WARMUP_ITERATIONS = 2; constexpr uint32_t DEFAULT_PAGE_SIZE = 2048; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index bedd3d9d8f8..416566e7655 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp index b8d8917462c..4eac223e08e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp @@ -21,6 +21,8 @@ #include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/test_utils/env_vars.hpp" +#include "tt_cluster.hpp" + // TODO: ARCH_NAME specific, must remove #include "eth_l1_address_map.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp index a06c59ca543..2e7a24662d2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp @@ -23,6 +23,8 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +#include "tt_cluster.hpp" + // TODO: ARCH_NAME specific, must remove #include "eth_l1_address_map.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp index 5cc3d654981..ef049ae2f0a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp @@ -9,6 +9,7 @@ #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" +#include "tt_cluster.hpp" using namespace tt; // diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp index 660e43fa781..13eb1015602 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "dprint_server.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp index 31b1ff6d780..b15d222a21d 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp index 9e333537946..24580476130 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp index be56b013dde..a08ec04c278 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp index 930199dd4e7..caa962ab89e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp index 02f4ba02ab2..714e0b2af26 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp index bc4cb0b2896..4ab4568663b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp @@ -9,6 +9,7 @@ #include #include +#include #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp index 193e687648e..04ae58dc362 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp @@ -9,6 +9,7 @@ #include #include +#include #include using namespace tt; diff --git 
a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp index df113d4c4d4..2ab7e642602 100644 --- a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp @@ -18,6 +18,7 @@ #include "logger.hpp" #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include "tt_cluster.hpp" using namespace tt; diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp index 8d5f455a4d2..69ba9810227 100644 --- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp +++ b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp @@ -13,6 +13,8 @@ #include "ttnn/tensor/layout/tensor_layout.hpp" #include "ttnn_multi_command_queue_fixture.hpp" +#include "tt_cluster.hpp" + using namespace tt; using namespace tt_metal; diff --git a/tt-train/tests/core/n300_utils_test.cpp b/tt-train/tests/core/n300_utils_test.cpp index 6dca6e9d811..e4f05a45bf0 100644 --- a/tt-train/tests/core/n300_utils_test.cpp +++ b/tt-train/tests/core/n300_utils_test.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include @@ -13,8 +14,9 @@ #include "core/tt_tensor_utils.hpp" auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } + class N300UtilsTest : public ::testing::Test { protected: void SetUp() override { diff --git a/tt-train/tests/model/linear_regression_ddp_test.cpp b/tt-train/tests/model/linear_regression_ddp_test.cpp index 082ebdba960..cb29f87b187 100644 --- a/tt-train/tests/model/linear_regression_ddp_test.cpp +++ b/tt-train/tests/model/linear_regression_ddp_test.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include @@ -22,7 +23,7 @@ namespace { auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } } // namespace diff --git a/tt-train/tests/modules/distributed/linear_test.cpp b/tt-train/tests/modules/distributed/linear_test.cpp index 39fc1c587f3..fb1c47c23be 100644 --- a/tt-train/tests/modules/distributed/linear_test.cpp +++ b/tt-train/tests/modules/distributed/linear_test.cpp @@ -5,6 +5,7 @@ #include "modules/distributed/linear.hpp" #include +#include #include #include @@ -16,7 +17,7 @@ namespace { auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } ttml::autograd::TensorPtr get_parameter(auto& parameters, const std::string& name_substring) { diff --git a/tt-train/tests/ops/distributed/comm_ops_test.cpp b/tt-train/tests/ops/distributed/comm_ops_test.cpp index e9ca096998e..e0d938d06eb 100644 --- a/tt-train/tests/ops/distributed/comm_ops_test.cpp +++ b/tt-train/tests/ops/distributed/comm_ops_test.cpp @@ -5,6 +5,7 @@ #include "ops/distributed/comm_ops.hpp" #include +#include #include #include @@ -17,7 +18,7 @@ namespace { auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } } // namespace diff --git a/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp b/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp index 
b52c099a586..ff3cf5f838d 100644 --- a/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp +++ b/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include @@ -17,7 +18,7 @@ namespace { auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } class TrivialTnnFixedDistributedTest : public ::testing::Test { diff --git a/tt_fabric/CMakeLists.txt b/tt_fabric/CMakeLists.txt index 34add9c0350..23cd638d49d 100644 --- a/tt_fabric/CMakeLists.txt +++ b/tt_fabric/CMakeLists.txt @@ -9,12 +9,18 @@ target_sources( mesh_graph.cpp ) -target_include_directories(tt_fabric PRIVATE .) +target_include_directories( + tt_fabric + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal/api/tt-metalium +) target_link_libraries( tt_fabric PRIVATE Metalium::Metal + Metalium::Metal::LLRT umd::device metal_common_libs magic_enum diff --git a/tt_fabric/control_plane.cpp b/tt_fabric/control_plane.cpp index 0bfede9f0a0..70bba401531 100644 --- a/tt_fabric/control_plane.cpp +++ b/tt_fabric/control_plane.cpp @@ -6,6 +6,8 @@ #include "control_plane.hpp" #include +#include "tt_cluster.hpp" + namespace tt::tt_fabric { // Get the physical chip ids for a mesh diff --git a/tt_fabric/mesh_graph.hpp b/tt_fabric/mesh_graph.hpp index 414b8947527..1b9ac9c6359 100644 --- a/tt_fabric/mesh_graph.hpp +++ b/tt_fabric/mesh_graph.hpp @@ -11,9 +11,11 @@ #include #include -#include #include +#include // tt::ARCH +#include // chip_id_t + namespace tt::tt_fabric { struct ChipSpec { tt::ARCH arch; diff --git a/tt_metal/api/tt-metalium/core_descriptor.hpp b/tt_metal/api/tt-metalium/core_descriptor.hpp index f403f7c23d6..9b45020a67d 100644 --- a/tt_metal/api/tt-metalium/core_descriptor.hpp +++ b/tt_metal/api/tt-metalium/core_descriptor.hpp @@ -5,10 +5,12 @@ #pragma once #include "core_coord.hpp" -#include "tt_cluster.hpp" #include "hal.hpp" #include "dispatch_core_common.hpp" +#include // tt::ARCH +#include // chip_id_t + namespace tt { struct core_descriptor_t { @@ -38,18 +40,8 @@ const core_descriptor_t& get_core_descriptor_config( const std::tuple& get_physical_worker_grid_config( chip_id_t chip, uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config); -inline std::optional get_storage_core_bank_size( - chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config) { - const core_descriptor_t& core_desc = get_core_descriptor_config(device_id, num_hw_cqs, dispatch_core_config); - const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); - if (core_desc.storage_core_bank_size.has_value()) { - TT_FATAL( - core_desc.storage_core_bank_size.value() % tt_metal::hal.get_alignment(tt_metal::HalMemType::L1) == 0, - "Storage core bank size must be {} B aligned", - tt_metal::hal.get_alignment(tt_metal::HalMemType::L1)); - } - return core_desc.storage_core_bank_size; -} +std::optional get_storage_core_bank_size( + chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config); inline const std::vector& get_logical_storage_cores( chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config) { diff --git a/tt_metal/api/tt-metalium/device.hpp b/tt_metal/api/tt-metalium/device.hpp index be8e9af943f..36df50bb957 100644 --- 
a/tt_metal/api/tt-metalium/device.hpp +++ b/tt_metal/api/tt-metalium/device.hpp @@ -15,7 +15,6 @@ #include "data_types.hpp" #include "program_device_map.hpp" #include "build.hpp" -#include "tt_cluster.hpp" #include "hal.hpp" #include "command_queue_interface.hpp" #include "sub_device_manager.hpp" diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 88dd1d44bc4..71cb322c39a 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -15,7 +15,6 @@ #include "data_types.hpp" #include "program_device_map.hpp" #include "build.hpp" -#include "tt_cluster.hpp" #include "hal.hpp" #include "command_queue_interface.hpp" #include "command_queue.hpp" diff --git a/tt_metal/api/tt-metalium/dispatch_core_common.hpp b/tt_metal/api/tt-metalium/dispatch_core_common.hpp index e6306d9238d..322d8d57641 100644 --- a/tt_metal/api/tt-metalium/dispatch_core_common.hpp +++ b/tt_metal/api/tt-metalium/dispatch_core_common.hpp @@ -9,6 +9,8 @@ #include "data_types.hpp" #include "reflection.hpp" +#include // CoreType + namespace tt::tt_metal { enum DispatchWorkerType : uint32_t { diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index 357e5220d16..fe91d61183f 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -7,12 +7,16 @@ #include #include #include +#include "dev_msgs.h" // go_msg_t #include "hal.hpp" -#include "tt_cluster.hpp" #include #include #include "umd/device/tt_core_coordinates.h" +namespace tt { +class Cluster; +} + namespace tt::tt_metal { // diff --git a/tt_metal/api/tt-metalium/hal_exp.hpp b/tt_metal/api/tt-metalium/hal_exp.hpp index a90a93cd8ea..5e14b0a5353 100644 --- a/tt_metal/api/tt-metalium/hal_exp.hpp +++ b/tt_metal/api/tt-metalium/hal_exp.hpp @@ -6,9 +6,17 @@ #include #include +#include namespace tt::tt_metal::experimental::hal { +/** + * @brief Uses the hardware abstraction layer to inform client of the architecture + * + * @return Architecture enum defined by UMD + */ +tt::ARCH get_arch(); + /** * @brief Uses the hardware abstraction layer to inform client of the architecture name * diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt index 551051ea52b..3a31f8e6e07 100644 --- a/tt_metal/common/CMakeLists.txt +++ b/tt_metal/common/CMakeLists.txt @@ -1,7 +1,6 @@ set(COMMON_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/core_assignment.cpp ${CMAKE_CURRENT_SOURCE_DIR}/core_coord.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/metal_soc_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape2d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape_base.cpp diff --git a/tt_metal/common/core_assignment.cpp b/tt_metal/common/core_assignment.cpp index 6131b31c9d8..0016850befe 100644 --- a/tt_metal/common/core_assignment.cpp +++ b/tt_metal/common/core_assignment.cpp @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "assert.hpp" #include "core_assignment.hpp" namespace tt { diff --git a/tt_metal/common/core_assignment.hpp b/tt_metal/common/core_assignment.hpp index 311a351d564..9ac23c17f28 100644 --- a/tt_metal/common/core_assignment.hpp +++ b/tt_metal/common/core_assignment.hpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include "core_coord.hpp" -#include + +#include // tt::ARCH namespace tt { namespace tt_metal { @@ -12,7 +13,7 @@ namespace tt_metal { // a DRAM read or write. 
// Worker cores are derived based on architecture, harvesting configurations and DRAM Controller placement. std::vector get_optimal_dram_to_physical_worker_assignment( - ARCH arch, + tt::ARCH arch, const std::vector& dram_phy_coords, uint32_t full_grid_size_x, uint32_t full_grid_size_y, diff --git a/tt_metal/distributed/CMakeLists.txt b/tt_metal/distributed/CMakeLists.txt index 62f068ca7cc..ba9dbb1a442 100644 --- a/tt_metal/distributed/CMakeLists.txt +++ b/tt_metal/distributed/CMakeLists.txt @@ -17,5 +17,6 @@ target_link_libraries( common PRIVATE Metalium::Metal::Impl + Metalium::Metal::LLRT TT::Metalium::HostDevCommon ) diff --git a/tt_metal/distributed/coordinate_translation.cpp b/tt_metal/distributed/coordinate_translation.cpp index 5e4be86b0b8..e834ae37e2d 100644 --- a/tt_metal/distributed/coordinate_translation.cpp +++ b/tt_metal/distributed/coordinate_translation.cpp @@ -4,6 +4,8 @@ #include "tt_metal/distributed/coordinate_translation.hpp" +#include "tt_cluster.hpp" + #include namespace tt::tt_metal::distributed { diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index d19911a3112..e60010e150a 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -14,6 +14,7 @@ #include "tt_metal/impl/program/dispatch.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" +#include "tt_cluster.hpp" namespace tt::tt_metal::distributed { struct MeshReadEventDescriptor { diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index 45185381ba6..e5399de7d69 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -7,6 +7,8 @@ #include "umd/device/types/cluster_descriptor_types.h" #include "tt_metal/distributed/coordinate_translation.hpp" +#include "tt_cluster.hpp" + namespace tt::tt_metal::distributed { class SystemMesh::Impl { diff --git a/tt_metal/experimental/hal.cpp b/tt_metal/experimental/hal.cpp index a93cfc65c70..d67c8d87e9c 100644 --- a/tt_metal/experimental/hal.cpp +++ b/tt_metal/experimental/hal.cpp @@ -17,6 +17,8 @@ using tt::tt_metal::HalSingleton; namespace tt::tt_metal::experimental::hal { +tt::ARCH get_arch() { return HalSingleton::getInstance().get_arch(); } + std::string get_arch_name() { auto arch_enum = HalSingleton::getInstance().get_arch(); return tt::get_string_lowercase(arch_enum); diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index 56b9e2a8c57..8655c830709 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -9,6 +9,8 @@ #include #include +#include "tt_cluster.hpp" + namespace tt::tt_metal { namespace buffer_dispatch { diff --git a/tt_metal/impl/buffers/global_circular_buffer.cpp b/tt_metal/impl/buffers/global_circular_buffer.cpp index 9759c6314ae..10974d388f9 100644 --- a/tt_metal/impl/buffers/global_circular_buffer.cpp +++ b/tt_metal/impl/buffers/global_circular_buffer.cpp @@ -18,6 +18,8 @@ #include #include +#include "tt_cluster.hpp" + namespace tt::tt_metal { namespace v1 { diff --git a/tt_metal/impl/buffers/global_semaphore.cpp b/tt_metal/impl/buffers/global_semaphore.cpp index 96164f64871..7102161571e 100644 --- a/tt_metal/impl/buffers/global_semaphore.cpp +++ b/tt_metal/impl/buffers/global_semaphore.cpp @@ -18,6 +18,8 @@ #include #include +#include "tt_cluster.hpp" + namespace tt::tt_metal { GlobalSemaphore::GlobalSemaphore( diff --git a/tt_metal/impl/debug/watcher_server.hpp 
b/tt_metal/impl/debug/watcher_server.hpp index 79f6680d4de..38a16e3c8ce 100644 --- a/tt_metal/impl/debug/watcher_server.hpp +++ b/tt_metal/impl/debug/watcher_server.hpp @@ -6,6 +6,8 @@ #include +struct metal_SocDescriptor; + namespace tt { void watcher_init(tt_metal::IDevice* device); diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index cd73f565e73..a269e823dd3 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -23,6 +23,8 @@ #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include "tt_metal/jit_build/build_env_manager.hpp" +#include "tt_cluster.hpp" + using namespace tt::tt_metal; namespace tt { diff --git a/tt_metal/impl/dispatch/debug_tools.cpp b/tt_metal/impl/dispatch/debug_tools.cpp index 95707965738..fc8980679e3 100644 --- a/tt_metal/impl/dispatch/debug_tools.cpp +++ b/tt_metal/impl/dispatch/debug_tools.cpp @@ -3,6 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #include "debug_tools.hpp" + +#include "tt_cluster.hpp" + namespace internal { using namespace tt::tt_metal; diff --git a/tt_metal/impl/dispatch/hardware_command_queue.cpp b/tt_metal/impl/dispatch/hardware_command_queue.cpp index 8a72db6e742..d0aa1824264 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.cpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.cpp @@ -12,6 +12,8 @@ #include #include +#include "tt_cluster.hpp" + // Because we are a Friend of Program, accessing Program::get_program_transfer_info() and Program::get_kernels_buffer() // MUST REMOVE #include diff --git a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp index 33d394abf91..d60d15c991b 100644 --- a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp +++ b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp @@ -6,6 +6,7 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" +#include "tt_cluster.hpp" #define UNUSED_LOGICAL_CORE tt_cxy_pair(device_->id(), 0, 0) #define UNUSED_SEM_ID 0 diff --git a/tt_metal/impl/dispatch/topology.cpp b/tt_metal/impl/dispatch/topology.cpp index 6a9ff796669..b8eff2dd822 100644 --- a/tt_metal/impl/dispatch/topology.cpp +++ b/tt_metal/impl/dispatch/topology.cpp @@ -15,6 +15,8 @@ #include "kernel_config/eth_router.hpp" #include "kernel_config/eth_tunneler.hpp" +#include "tt_cluster.hpp" + namespace tt::tt_metal { // For readablity, unset = x = -1 diff --git a/tt_metal/impl/event/dispatch.cpp b/tt_metal/impl/event/dispatch.cpp index 36a62181c60..dad0f24cb7e 100644 --- a/tt_metal/impl/event/dispatch.cpp +++ b/tt_metal/impl/event/dispatch.cpp @@ -7,6 +7,8 @@ #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include +#include "tt_cluster.hpp" + namespace tt::tt_metal { namespace event_dispatch { diff --git a/tt_metal/impl/sub_device/sub_device_manager.cpp b/tt_metal/impl/sub_device/sub_device_manager.cpp index 042e46ae828..0a29d896618 100644 --- a/tt_metal/impl/sub_device/sub_device_manager.cpp +++ b/tt_metal/impl/sub_device/sub_device_manager.cpp @@ -20,6 +20,8 @@ #include #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" +#include "tt_cluster.hpp" + namespace tt::tt_metal { // assert here to avoid the need to include command_queue_interface.hpp in header diff --git a/tt_metal/llrt/CMakeLists.txt b/tt_metal/llrt/CMakeLists.txt index 3f60ed70a06..439492cc309 100644 --- a/tt_metal/llrt/CMakeLists.txt +++ b/tt_metal/llrt/CMakeLists.txt @@ -82,6 +82,7 @@ target_link_libraries( set(LLRT_SRC 
${CMAKE_CURRENT_SOURCE_DIR}/llrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/rtoptions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tlb_config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tt_cluster.cpp @@ -105,6 +106,7 @@ target_link_libraries( Tracy::TracyClient nlohmann_json::nlohmann_json Reflect::Reflect + yaml-cpp::yaml-cpp magic_enum span common diff --git a/tt_metal/common/core_descriptor.cpp b/tt_metal/llrt/core_descriptor.cpp similarity index 94% rename from tt_metal/common/core_descriptor.cpp rename to tt_metal/llrt/core_descriptor.cpp index a54e5fbe818..99fd72ec096 100644 --- a/tt_metal/common/core_descriptor.cpp +++ b/tt_metal/llrt/core_descriptor.cpp @@ -4,6 +4,7 @@ #include "core_descriptor.hpp" #include "rtoptions.hpp" +#include "tt_cluster.hpp" #include "yaml-cpp/yaml.h" @@ -241,4 +242,17 @@ const std::tuple& get_physical_worker_grid_config( return physical_grid_config_cache.at(config_hash); } +std::optional get_storage_core_bank_size( + chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config) { + const core_descriptor_t& core_desc = get_core_descriptor_config(device_id, num_hw_cqs, dispatch_core_config); + const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); + if (core_desc.storage_core_bank_size.has_value()) { + TT_FATAL( + core_desc.storage_core_bank_size.value() % tt_metal::hal.get_alignment(tt_metal::HalMemType::L1) == 0, + "Storage core bank size must be {} B aligned", + tt_metal::hal.get_alignment(tt_metal::HalMemType::L1)); + } + return core_desc.storage_core_bank_size; +} + } // namespace tt diff --git a/tt_metal/api/tt-metalium/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp similarity index 79% rename from tt_metal/api/tt-metalium/tt_cluster.hpp rename to tt_metal/llrt/tt_cluster.hpp index cecb702cda6..666e9fa4eed 100644 --- a/tt_metal/api/tt-metalium/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -39,20 +39,20 @@ enum class TargetDevice : std::uint8_t { }; class Cluster { - public: - Cluster &operator=(const Cluster &) = delete; - Cluster &operator=(Cluster &&other) noexcept = delete; - Cluster(const Cluster &) = delete; - Cluster(Cluster &&other) noexcept = delete; +public: + Cluster& operator=(const Cluster&) = delete; + Cluster& operator=(Cluster&& other) noexcept = delete; + Cluster(const Cluster&) = delete; + Cluster(Cluster&& other) noexcept = delete; - static const Cluster &instance(); + static const Cluster& instance(); // For TG Galaxy systems, mmio chips are gateway chips that are only used for dispatc, so user_devices are meant for // user facing host apis size_t number_of_user_devices() const { if (this->is_tg_cluster_) { - const auto &chips = this->cluster_desc_->get_all_chips(); - return std::count_if(chips.begin(), chips.end(), [&](const auto &id) { + const auto& chips = this->cluster_desc_->get_all_chips(); + return std::count_if(chips.begin(), chips.end(), [&](const auto& id) { return this->cluster_desc_->get_board_type(id) == BoardType::GALAXY; }); } else { @@ -68,10 +68,12 @@ class Cluster { ARCH arch() const { return this->arch_; } - const metal_SocDescriptor &get_soc_desc(chip_id_t chip) const; - CoreCoord get_virtual_coordinate_from_logical_coordinates(chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const; + const metal_SocDescriptor& get_soc_desc(chip_id_t chip) const; + CoreCoord get_virtual_coordinate_from_logical_coordinates( + chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const; CoreCoord 
get_virtual_coordinate_from_physical_coordinates(chip_id_t chip_id, CoreCoord physical_coord) const; - tt_cxy_pair get_virtual_coordinate_from_logical_coordinates(tt_cxy_pair logical_coordinate, const CoreType& core_type) const; + tt_cxy_pair get_virtual_coordinate_from_logical_coordinates( + tt_cxy_pair logical_coordinate, const CoreType& core_type) const; CoreCoord get_physical_coordinate_from_logical_coordinates( chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type, bool no_warn = false) const; const std::unordered_set& get_virtual_worker_cores(chip_id_t chip_id) const; @@ -83,14 +85,15 @@ class Cluster { } //! device driver and misc apis - void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions) const; + void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions) const; - void deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const; - void assert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const; + void deassert_risc_reset_at_core(const tt_cxy_pair& physical_chip_coord) const; + void assert_risc_reset_at_core(const tt_cxy_pair& physical_chip_coord) const; - void write_dram_vec(std::vector &vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const; + void write_dram_vec( + std::vector& vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const; void read_dram_vec( - std::vector &vec, + std::vector& vec, uint32_t size_in_bytes, tt_target_dram dram, uint64_t addr, @@ -98,48 +101,52 @@ class Cluster { // Accepts physical noc coordinates void write_core( - const void *mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; + const void* mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; void read_core( - void *mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; + void* mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; void read_core( - std::vector &data, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; + std::vector& data, + uint32_t sz_in_bytes, + tt_cxy_pair core, + uint64_t addr, + bool small_access = false) const; - std::optional> get_tlb_data(const tt_cxy_pair &target) const { - tt::umd::Cluster *device = dynamic_cast(driver_.get()); + std::optional> get_tlb_data(const tt_cxy_pair& target) const { + tt::umd::Cluster* device = dynamic_cast(driver_.get()); tt::umd::CoreCoord target_coord = get_soc_desc(target.chip).get_coord_at(target, CoordSystem::TRANSLATED); return device->get_tlb_data_from_target(target.chip, target_coord); } - std::function get_fast_pcie_static_tlb_write_callable( - int chip_id) const { + std::function get_fast_pcie_static_tlb_write_callable(int chip_id) const { chip_id_t mmio_device_id = device_to_mmio_device_.at(chip_id); - tt::umd::Cluster *device = dynamic_cast(driver_.get()); + tt::umd::Cluster* device = dynamic_cast(driver_.get()); return device->get_fast_pcie_static_tlb_write_callable(mmio_device_id); } // Returns a writer object which holds a pointer to a static tlb - // Allows for fast writes when targeting same device core by only doing the lookup once and avoiding repeated stack traversals + // Allows for fast writes when targeting same device core by only doing the lookup once and avoiding repeated stack + // traversals tt::Writer get_static_tlb_writer(tt_cxy_pair target) const { - 
tt::umd::Cluster *device = dynamic_cast(driver_.get()); + tt::umd::Cluster* device = dynamic_cast(driver_.get()); tt::umd::CoreCoord target_coord = get_soc_desc(target.chip).get_coord_at(target, CoordSystem::TRANSLATED); return device->get_static_tlb_writer(target.chip, target_coord); } std::uint32_t get_numa_node_for_device(uint32_t device_id) const { uint32_t mmio_device_id = this->get_associated_mmio_device(device_id); - tt::umd::Cluster *device = dynamic_cast(driver_.get()); + tt::umd::Cluster* device = dynamic_cast(driver_.get()); return driver_->get_numa_node_for_pcie_device(mmio_device_id); } - void write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const; - void read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const; + void write_reg(const std::uint32_t* mem_ptr, tt_cxy_pair target, uint64_t addr) const; + void read_reg(std::uint32_t* mem_ptr, tt_cxy_pair target, uint64_t addr) const; void write_sysmem( - const void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; + const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; void read_sysmem( - void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; + void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; - int get_device_aiclk(const chip_id_t &chip_id) const; + int get_device_aiclk(const chip_id_t& chip_id) const; void dram_barrier(chip_id_t chip_id) const; void l1_barrier(chip_id_t chip_id) const; @@ -147,7 +154,7 @@ class Cluster { uint32_t get_num_host_channels(chip_id_t device_id) const; uint32_t get_host_channel_size(chip_id_t device_id, uint32_t channel) const; // Returns address in host space - void *host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + void* host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; uint64_t get_pcie_base_addr_from_device(chip_id_t chip_id) const; // Ethernet cluster api @@ -170,12 +177,11 @@ class Cluster { // get_ethernet_sockets(a, b)[0] is connected to get_ethernet_sockets(b, a)[0] std::vector get_ethernet_sockets(chip_id_t local_chip, chip_id_t remote_chip) const; // Converts logical ethernet core coord to physical ethernet core coord - CoreCoord ethernet_core_from_logical_core(chip_id_t chip_id, const CoreCoord &logical_core) const; + CoreCoord ethernet_core_from_logical_core(chip_id_t chip_id, const CoreCoord& logical_core) const; // Returns virtual eth coord from channel CoreCoord get_virtual_eth_core_from_channel(chip_id_t chip_id, int channel) const; - // Bookkeeping for mmio device tunnels uint32_t get_mmio_device_max_tunnel_depth(chip_id_t mmio_device) const; uint32_t get_mmio_device_tunnel_count(chip_id_t mmio_device) const; @@ -186,7 +192,8 @@ class Cluster { tt_cxy_pair get_eth_core_for_dispatch_core( tt_cxy_pair logical_dispatch_core, EthRouterMode mode, chip_id_t connected_chip_id) const; - std::tuple get_eth_tunnel_core(chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const; + std::tuple get_eth_tunnel_core( + chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const; // Internal routing for SD and FD enables launching user ethernet kernels and FD tunneling for all devices in the // cluster. 
When using multiple devices in a cluster, this should be the flow: @@ -196,14 +203,13 @@ class Cluster { // set_internal_routing_info_for_ethernet_cores(false); // CloseDevice(0) // CloseDevice(1) - void set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing, const std::vector& target_mmio_devices = {}) const; - + void set_internal_routing_info_for_ethernet_cores( + bool enable_internal_routing, const std::vector& target_mmio_devices = {}) const; std::unordered_map>> - get_ethernet_connections() const { - return this->cluster_desc_->get_ethernet_connections(); - } - + get_ethernet_connections() const { + return this->cluster_desc_->get_ethernet_connections(); + } // Returns MMIO device ID (logical) that controls given `device_id`. If `device_id` is MMIO device it is returned. chip_id_t get_associated_mmio_device(chip_id_t device_id) const { @@ -215,7 +221,7 @@ class Cluster { } // Returns collection of devices that are controlled by the specified MMIO device inclusive of the MMIO device - const std::set &get_devices_controlled_by_mmio_device(chip_id_t mmio_device_id) const { + const std::set& get_devices_controlled_by_mmio_device(chip_id_t mmio_device_id) const { TT_ASSERT( this->devices_grouped_by_assoc_mmio_device_.count(mmio_device_id), "Expected device {} to be an MMIO device!", @@ -239,8 +245,8 @@ class Cluster { // Returns Wormhole chip board type. BoardType get_board_type(chip_id_t chip_id) const; - bool is_worker_core(const CoreCoord &core, chip_id_t chip_id) const; - bool is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) const; + bool is_worker_core(const CoreCoord& core, chip_id_t chip_id) const; + bool is_ethernet_core(const CoreCoord& core, chip_id_t chip_id) const; CoreCoord get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const; // These two functions should be removed in favor of direct translation. 
@@ -248,7 +254,8 @@ class Cluster { const std::unordered_map get_worker_logical_to_virtual_y(chip_id_t chip_id) const; const std::unordered_map& get_virtual_routing_to_profiler_flat_id(chip_id_t chip_id) const; - private: + +private: Cluster(); ~Cluster(); @@ -256,14 +263,13 @@ class Cluster { void generate_cluster_descriptor(); void initialize_device_drivers(); void assert_risc_reset(); - void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set &controlled_device_ids); - void open_driver( - const bool &skip_driver_allocs = false); - void start_driver(tt_device_params &device_params) const; + void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set& controlled_device_ids); + void open_driver(const bool& skip_driver_allocs = false); + void start_driver(tt_device_params& device_params) const; void get_metal_desc_from_tt_desc( - const std::unordered_map &input, - const std::unordered_map &per_chip_id_harvesting_masks); + const std::unordered_map& input, + const std::unordered_map& per_chip_id_harvesting_masks); void generate_virtual_to_umd_coord_mapping(); void generate_virtual_to_profiler_flat_id_mapping(); @@ -326,4 +332,4 @@ class Cluster { } // namespace tt -std::ostream &operator<<(std::ostream &os, tt_target_dram const &dram); +std::ostream& operator<<(std::ostream& os, const tt_target_dram& dram); diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp index a8b1db8196b..3d684c08996 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp @@ -12,9 +12,13 @@ #include "ttnn/operations/data_movement/slice/slice.hpp" #include "ttnn/operations/data_movement/concat/concat.hpp" +#include "tt-metalium/hal_exp.hpp" + namespace ttnn { namespace ccl { +using namespace tt::tt_metal::experimental; + void SyncModeSpec::add_signal(uint32_t sem_id, uint32_t wait_count) { this->sem_ids.push_back(sem_id); this->wait_counts.push_back(wait_count); @@ -213,8 +217,8 @@ void generate_edm_kernels_for_ring_or_linear_topology( std::vector const& counter_clockwise_edm_builders, std::optional receiver_device_id, std::optional sender_device_id) { - auto sender_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(tt::Cluster::instance().arch()); - auto receiver_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(tt::Cluster::instance().arch()); + auto sender_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(hal::get_arch()); + auto receiver_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(hal::get_arch()); uint32_t sender_socket_idx = 0; uint32_t receiver_socket_idx = 0; if (receiver_device_id == sender_device_id) { diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp index 4964b963bf1..7429ff9efa9 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp @@ -11,11 +11,14 @@ #include #include +#include "tt-metalium/hal_exp.hpp" + namespace ttnn { namespace operations { using namespace tt; using namespace tt::tt_metal; +using namespace tt::tt_metal::experimental; using namespace constants; std::tuple add_core_offset( @@ -102,7 +105,7 @@ std::tuple #include "tools/profiler/op_profiler.hpp" +#include // tt_ClusterDescriptor + namespace tt { using namespace constants; namespace operations { From e1f46359e2b25b3ec5768915f9d810e8c1bc57dd Mon Sep 17 00:00:00 2001 From: Michael Chiou 
<156848643+ttmchiou@users.noreply.github.com> Date: Thu, 13 Feb 2025 02:06:18 -0600 Subject: [PATCH 097/316] Revert "Remove `tt_cluster.hpp` from public API (#17813)" This reverts commit 8653cf80781e964358d4e2aaccb1e63d8f84bbb6. --- .../device/test_galaxy_cluster_api.cpp | 2 +- .../dispatch/test_bw_and_latency.cpp | 3 - .../dispatch/test_pgm_dispatch.cpp | 1 - .../test_ethernet_read_and_send_data.cpp | 2 - ...ers_and_erisc_datamover_unidirectional.cpp | 2 - ...st_vs_multicast_to_single_core_latency.cpp | 1 - .../old/matmul/matmul_global_l1.cpp | 1 - .../old/matmul/matmul_local_l1.cpp | 1 - .../old/noc/test_noc_read_global_l1.cpp | 1 - .../old/noc/test_noc_read_local_l1.cpp | 1 - .../old/pcie/test_enqueue_rw_buffer.cpp | 1 - .../old/pcie/test_rw_buffer.cpp | 1 - .../old/pcie/test_rw_device_dram.cpp | 1 - .../old/pcie/test_rw_device_l1.cpp | 1 - .../tt_metal/test_stress_noc_mcast.cpp | 2 - .../unit_tests/gtests/test_ccl_on_galaxy.cpp | 2 - tt-train/tests/core/n300_utils_test.cpp | 4 +- .../model/linear_regression_ddp_test.cpp | 3 +- .../tests/modules/distributed/linear_test.cpp | 3 +- .../tests/ops/distributed/comm_ops_test.cpp | 3 +- .../distributed/distributed_ttnn_ops_test.cpp | 3 +- tt_fabric/CMakeLists.txt | 8 +- tt_fabric/control_plane.cpp | 2 - tt_fabric/mesh_graph.hpp | 4 +- tt_metal/api/tt-metalium/core_descriptor.hpp | 18 ++- tt_metal/api/tt-metalium/device.hpp | 1 + tt_metal/api/tt-metalium/device_impl.hpp | 1 + .../api/tt-metalium/dispatch_core_common.hpp | 2 - .../api/tt-metalium/dispatch_settings.hpp | 6 +- tt_metal/api/tt-metalium/hal_exp.hpp | 8 -- .../{llrt => api/tt-metalium}/tt_cluster.hpp | 112 +++++++++--------- tt_metal/common/CMakeLists.txt | 1 + tt_metal/common/core_assignment.cpp | 1 - tt_metal/common/core_assignment.hpp | 5 +- tt_metal/{llrt => common}/core_descriptor.cpp | 14 --- tt_metal/distributed/CMakeLists.txt | 1 - .../distributed/coordinate_translation.cpp | 2 - tt_metal/distributed/mesh_command_queue.cpp | 1 - tt_metal/distributed/system_mesh.cpp | 2 - tt_metal/experimental/hal.cpp | 2 - tt_metal/impl/buffers/dispatch.cpp | 2 - .../impl/buffers/global_circular_buffer.cpp | 2 - tt_metal/impl/buffers/global_semaphore.cpp | 2 - tt_metal/impl/debug/watcher_server.hpp | 2 - tt_metal/impl/device/device_pool.cpp | 2 - tt_metal/impl/dispatch/debug_tools.cpp | 3 - .../impl/dispatch/hardware_command_queue.cpp | 2 - .../impl/dispatch/kernel_config/fd_kernel.hpp | 1 - tt_metal/impl/dispatch/topology.cpp | 2 - tt_metal/impl/event/dispatch.cpp | 2 - .../impl/sub_device/sub_device_manager.cpp | 2 - tt_metal/llrt/CMakeLists.txt | 2 - ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp | 8 +- .../moreh/moreh_helper_functions.cpp | 7 +- .../reduction/prod/device/prod_op_all.cpp | 2 - 55 files changed, 84 insertions(+), 187 deletions(-) rename tt_metal/{llrt => api/tt-metalium}/tt_cluster.hpp (79%) rename tt_metal/{llrt => common}/core_descriptor.cpp (94%) diff --git a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp index 8c998b1705e..5a59b2c03f8 100644 --- a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp @@ -5,7 +5,7 @@ #include #include "galaxy_fixture.hpp" -#include "tt_cluster.hpp" +#include #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 3053fd4c7ed..100534ab260 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -12,15 +12,12 @@ #include "logger.hpp" #include #include -#include #include #include #include #include #include -#include "tt_cluster.hpp" - constexpr uint32_t DEFAULT_ITERATIONS = 1000; constexpr uint32_t DEFAULT_WARMUP_ITERATIONS = 2; constexpr uint32_t DEFAULT_PAGE_SIZE = 2048; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index 416566e7655..bedd3d9d8f8 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp index 4eac223e08e..b8d8917462c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp @@ -21,8 +21,6 @@ #include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/test_utils/env_vars.hpp" -#include "tt_cluster.hpp" - // TODO: ARCH_NAME specific, must remove #include "eth_l1_address_map.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp index 2e7a24662d2..a06c59ca543 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp @@ -23,8 +23,6 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" -#include "tt_cluster.hpp" - // TODO: ARCH_NAME specific, must remove #include "eth_l1_address_map.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp index ef049ae2f0a..5cc3d654981 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp @@ -9,7 +9,6 @@ #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" -#include "tt_cluster.hpp" using namespace tt; // diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp index 13eb1015602..660e43fa781 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include "dprint_server.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp index b15d222a21d..31b1ff6d780 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp index 24580476130..9e333537946 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp index a08ec04c278..be56b013dde 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp index caa962ab89e..930199dd4e7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp index 714e0b2af26..02f4ba02ab2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp index 4ab4568663b..bc4cb0b2896 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp @@ -9,7 +9,6 @@ #include #include -#include #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp index 04ae58dc362..193e687648e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp @@ -9,7 +9,6 @@ #include #include -#include #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp index 2ab7e642602..df113d4c4d4 100644 --- a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp @@ -18,7 +18,6 @@ #include "logger.hpp" #include #include -#include #include #include #include @@ -26,7 +25,6 @@ #include #include #include -#include 
"tt_cluster.hpp" using namespace tt; diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp index 69ba9810227..8d5f455a4d2 100644 --- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp +++ b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp @@ -13,8 +13,6 @@ #include "ttnn/tensor/layout/tensor_layout.hpp" #include "ttnn_multi_command_queue_fixture.hpp" -#include "tt_cluster.hpp" - using namespace tt; using namespace tt_metal; diff --git a/tt-train/tests/core/n300_utils_test.cpp b/tt-train/tests/core/n300_utils_test.cpp index e4f05a45bf0..6dca6e9d811 100644 --- a/tt-train/tests/core/n300_utils_test.cpp +++ b/tt-train/tests/core/n300_utils_test.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include #include #include @@ -14,9 +13,8 @@ #include "core/tt_tensor_utils.hpp" auto check_board_is_n300() { - return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; + return tt::Cluster::instance().get_board_type(0) == BoardType::N300; } - class N300UtilsTest : public ::testing::Test { protected: void SetUp() override { diff --git a/tt-train/tests/model/linear_regression_ddp_test.cpp b/tt-train/tests/model/linear_regression_ddp_test.cpp index cb29f87b187..082ebdba960 100644 --- a/tt-train/tests/model/linear_regression_ddp_test.cpp +++ b/tt-train/tests/model/linear_regression_ddp_test.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include #include #include @@ -23,7 +22,7 @@ namespace { auto check_board_is_n300() { - return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; + return tt::Cluster::instance().get_board_type(0) == BoardType::N300; } } // namespace diff --git a/tt-train/tests/modules/distributed/linear_test.cpp b/tt-train/tests/modules/distributed/linear_test.cpp index fb1c47c23be..39fc1c587f3 100644 --- a/tt-train/tests/modules/distributed/linear_test.cpp +++ b/tt-train/tests/modules/distributed/linear_test.cpp @@ -5,7 +5,6 @@ #include "modules/distributed/linear.hpp" #include -#include #include #include @@ -17,7 +16,7 @@ namespace { auto check_board_is_n300() { - return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; + return tt::Cluster::instance().get_board_type(0) == BoardType::N300; } ttml::autograd::TensorPtr get_parameter(auto& parameters, const std::string& name_substring) { diff --git a/tt-train/tests/ops/distributed/comm_ops_test.cpp b/tt-train/tests/ops/distributed/comm_ops_test.cpp index e0d938d06eb..e9ca096998e 100644 --- a/tt-train/tests/ops/distributed/comm_ops_test.cpp +++ b/tt-train/tests/ops/distributed/comm_ops_test.cpp @@ -5,7 +5,6 @@ #include "ops/distributed/comm_ops.hpp" #include -#include #include #include @@ -18,7 +17,7 @@ namespace { auto check_board_is_n300() { - return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; + return tt::Cluster::instance().get_board_type(0) == BoardType::N300; } } // namespace diff --git a/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp b/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp index ff3cf5f838d..b52c099a586 100644 --- a/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp +++ b/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include #include #include @@ -18,7 +17,7 @@ namespace { auto check_board_is_n300() { - return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; + return 
tt::Cluster::instance().get_board_type(0) == BoardType::N300; } class TrivialTnnFixedDistributedTest : public ::testing::Test { diff --git a/tt_fabric/CMakeLists.txt b/tt_fabric/CMakeLists.txt index 23cd638d49d..34add9c0350 100644 --- a/tt_fabric/CMakeLists.txt +++ b/tt_fabric/CMakeLists.txt @@ -9,18 +9,12 @@ target_sources( mesh_graph.cpp ) -target_include_directories( - tt_fabric - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal/api/tt-metalium -) +target_include_directories(tt_fabric PRIVATE .) target_link_libraries( tt_fabric PRIVATE Metalium::Metal - Metalium::Metal::LLRT umd::device metal_common_libs magic_enum diff --git a/tt_fabric/control_plane.cpp b/tt_fabric/control_plane.cpp index 70bba401531..0bfede9f0a0 100644 --- a/tt_fabric/control_plane.cpp +++ b/tt_fabric/control_plane.cpp @@ -6,8 +6,6 @@ #include "control_plane.hpp" #include -#include "tt_cluster.hpp" - namespace tt::tt_fabric { // Get the physical chip ids for a mesh diff --git a/tt_fabric/mesh_graph.hpp b/tt_fabric/mesh_graph.hpp index 1b9ac9c6359..414b8947527 100644 --- a/tt_fabric/mesh_graph.hpp +++ b/tt_fabric/mesh_graph.hpp @@ -11,11 +11,9 @@ #include #include +#include #include -#include // tt::ARCH -#include // chip_id_t - namespace tt::tt_fabric { struct ChipSpec { tt::ARCH arch; diff --git a/tt_metal/api/tt-metalium/core_descriptor.hpp b/tt_metal/api/tt-metalium/core_descriptor.hpp index 9b45020a67d..f403f7c23d6 100644 --- a/tt_metal/api/tt-metalium/core_descriptor.hpp +++ b/tt_metal/api/tt-metalium/core_descriptor.hpp @@ -5,12 +5,10 @@ #pragma once #include "core_coord.hpp" +#include "tt_cluster.hpp" #include "hal.hpp" #include "dispatch_core_common.hpp" -#include // tt::ARCH -#include // chip_id_t - namespace tt { struct core_descriptor_t { @@ -40,8 +38,18 @@ const core_descriptor_t& get_core_descriptor_config( const std::tuple& get_physical_worker_grid_config( chip_id_t chip, uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config); -std::optional get_storage_core_bank_size( - chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config); +inline std::optional get_storage_core_bank_size( + chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config) { + const core_descriptor_t& core_desc = get_core_descriptor_config(device_id, num_hw_cqs, dispatch_core_config); + const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); + if (core_desc.storage_core_bank_size.has_value()) { + TT_FATAL( + core_desc.storage_core_bank_size.value() % tt_metal::hal.get_alignment(tt_metal::HalMemType::L1) == 0, + "Storage core bank size must be {} B aligned", + tt_metal::hal.get_alignment(tt_metal::HalMemType::L1)); + } + return core_desc.storage_core_bank_size; +} inline const std::vector& get_logical_storage_cores( chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config) { diff --git a/tt_metal/api/tt-metalium/device.hpp b/tt_metal/api/tt-metalium/device.hpp index 36df50bb957..be8e9af943f 100644 --- a/tt_metal/api/tt-metalium/device.hpp +++ b/tt_metal/api/tt-metalium/device.hpp @@ -15,6 +15,7 @@ #include "data_types.hpp" #include "program_device_map.hpp" #include "build.hpp" +#include "tt_cluster.hpp" #include "hal.hpp" #include "command_queue_interface.hpp" #include "sub_device_manager.hpp" diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 71cb322c39a..88dd1d44bc4 100644 
--- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -15,6 +15,7 @@ #include "data_types.hpp" #include "program_device_map.hpp" #include "build.hpp" +#include "tt_cluster.hpp" #include "hal.hpp" #include "command_queue_interface.hpp" #include "command_queue.hpp" diff --git a/tt_metal/api/tt-metalium/dispatch_core_common.hpp b/tt_metal/api/tt-metalium/dispatch_core_common.hpp index 322d8d57641..e6306d9238d 100644 --- a/tt_metal/api/tt-metalium/dispatch_core_common.hpp +++ b/tt_metal/api/tt-metalium/dispatch_core_common.hpp @@ -9,8 +9,6 @@ #include "data_types.hpp" #include "reflection.hpp" -#include // CoreType - namespace tt::tt_metal { enum DispatchWorkerType : uint32_t { diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index fe91d61183f..357e5220d16 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -7,16 +7,12 @@ #include #include #include -#include "dev_msgs.h" // go_msg_t #include "hal.hpp" +#include "tt_cluster.hpp" #include #include #include "umd/device/tt_core_coordinates.h" -namespace tt { -class Cluster; -} - namespace tt::tt_metal { // diff --git a/tt_metal/api/tt-metalium/hal_exp.hpp b/tt_metal/api/tt-metalium/hal_exp.hpp index 5e14b0a5353..a90a93cd8ea 100644 --- a/tt_metal/api/tt-metalium/hal_exp.hpp +++ b/tt_metal/api/tt-metalium/hal_exp.hpp @@ -6,17 +6,9 @@ #include #include -#include namespace tt::tt_metal::experimental::hal { -/** - * @brief Uses the hardware abstraction layer to inform client of the architecture - * - * @return Architecture enum defined by UMD - */ -tt::ARCH get_arch(); - /** * @brief Uses the hardware abstraction layer to inform client of the architecture name * diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/api/tt-metalium/tt_cluster.hpp similarity index 79% rename from tt_metal/llrt/tt_cluster.hpp rename to tt_metal/api/tt-metalium/tt_cluster.hpp index 666e9fa4eed..cecb702cda6 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/api/tt-metalium/tt_cluster.hpp @@ -39,20 +39,20 @@ enum class TargetDevice : std::uint8_t { }; class Cluster { -public: - Cluster& operator=(const Cluster&) = delete; - Cluster& operator=(Cluster&& other) noexcept = delete; - Cluster(const Cluster&) = delete; - Cluster(Cluster&& other) noexcept = delete; + public: + Cluster &operator=(const Cluster &) = delete; + Cluster &operator=(Cluster &&other) noexcept = delete; + Cluster(const Cluster &) = delete; + Cluster(Cluster &&other) noexcept = delete; - static const Cluster& instance(); + static const Cluster &instance(); // For TG Galaxy systems, mmio chips are gateway chips that are only used for dispatc, so user_devices are meant for // user facing host apis size_t number_of_user_devices() const { if (this->is_tg_cluster_) { - const auto& chips = this->cluster_desc_->get_all_chips(); - return std::count_if(chips.begin(), chips.end(), [&](const auto& id) { + const auto &chips = this->cluster_desc_->get_all_chips(); + return std::count_if(chips.begin(), chips.end(), [&](const auto &id) { return this->cluster_desc_->get_board_type(id) == BoardType::GALAXY; }); } else { @@ -68,12 +68,10 @@ class Cluster { ARCH arch() const { return this->arch_; } - const metal_SocDescriptor& get_soc_desc(chip_id_t chip) const; - CoreCoord get_virtual_coordinate_from_logical_coordinates( - chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const; + const metal_SocDescriptor &get_soc_desc(chip_id_t chip) 
const; + CoreCoord get_virtual_coordinate_from_logical_coordinates(chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const; CoreCoord get_virtual_coordinate_from_physical_coordinates(chip_id_t chip_id, CoreCoord physical_coord) const; - tt_cxy_pair get_virtual_coordinate_from_logical_coordinates( - tt_cxy_pair logical_coordinate, const CoreType& core_type) const; + tt_cxy_pair get_virtual_coordinate_from_logical_coordinates(tt_cxy_pair logical_coordinate, const CoreType& core_type) const; CoreCoord get_physical_coordinate_from_logical_coordinates( chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type, bool no_warn = false) const; const std::unordered_set& get_virtual_worker_cores(chip_id_t chip_id) const; @@ -85,15 +83,14 @@ class Cluster { } //! device driver and misc apis - void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions) const; + void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions) const; - void deassert_risc_reset_at_core(const tt_cxy_pair& physical_chip_coord) const; - void assert_risc_reset_at_core(const tt_cxy_pair& physical_chip_coord) const; + void deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const; + void assert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const; - void write_dram_vec( - std::vector& vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const; + void write_dram_vec(std::vector &vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const; void read_dram_vec( - std::vector& vec, + std::vector &vec, uint32_t size_in_bytes, tt_target_dram dram, uint64_t addr, @@ -101,52 +98,48 @@ class Cluster { // Accepts physical noc coordinates void write_core( - const void* mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; + const void *mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; void read_core( - void* mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; + void *mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; void read_core( - std::vector& data, - uint32_t sz_in_bytes, - tt_cxy_pair core, - uint64_t addr, - bool small_access = false) const; + std::vector &data, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; - std::optional> get_tlb_data(const tt_cxy_pair& target) const { - tt::umd::Cluster* device = dynamic_cast(driver_.get()); + std::optional> get_tlb_data(const tt_cxy_pair &target) const { + tt::umd::Cluster *device = dynamic_cast(driver_.get()); tt::umd::CoreCoord target_coord = get_soc_desc(target.chip).get_coord_at(target, CoordSystem::TRANSLATED); return device->get_tlb_data_from_target(target.chip, target_coord); } - std::function get_fast_pcie_static_tlb_write_callable(int chip_id) const { + std::function get_fast_pcie_static_tlb_write_callable( + int chip_id) const { chip_id_t mmio_device_id = device_to_mmio_device_.at(chip_id); - tt::umd::Cluster* device = dynamic_cast(driver_.get()); + tt::umd::Cluster *device = dynamic_cast(driver_.get()); return device->get_fast_pcie_static_tlb_write_callable(mmio_device_id); } // Returns a writer object which holds a pointer to a static tlb - // Allows for fast writes when targeting same device core by only doing the lookup once and avoiding repeated stack - // traversals + // Allows for fast writes when 
targeting same device core by only doing the lookup once and avoiding repeated stack traversals tt::Writer get_static_tlb_writer(tt_cxy_pair target) const { - tt::umd::Cluster* device = dynamic_cast(driver_.get()); + tt::umd::Cluster *device = dynamic_cast(driver_.get()); tt::umd::CoreCoord target_coord = get_soc_desc(target.chip).get_coord_at(target, CoordSystem::TRANSLATED); return device->get_static_tlb_writer(target.chip, target_coord); } std::uint32_t get_numa_node_for_device(uint32_t device_id) const { uint32_t mmio_device_id = this->get_associated_mmio_device(device_id); - tt::umd::Cluster* device = dynamic_cast(driver_.get()); + tt::umd::Cluster *device = dynamic_cast(driver_.get()); return driver_->get_numa_node_for_pcie_device(mmio_device_id); } - void write_reg(const std::uint32_t* mem_ptr, tt_cxy_pair target, uint64_t addr) const; - void read_reg(std::uint32_t* mem_ptr, tt_cxy_pair target, uint64_t addr) const; + void write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const; + void read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const; void write_sysmem( - const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; + const void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; void read_sysmem( - void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; + void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; - int get_device_aiclk(const chip_id_t& chip_id) const; + int get_device_aiclk(const chip_id_t &chip_id) const; void dram_barrier(chip_id_t chip_id) const; void l1_barrier(chip_id_t chip_id) const; @@ -154,7 +147,7 @@ class Cluster { uint32_t get_num_host_channels(chip_id_t device_id) const; uint32_t get_host_channel_size(chip_id_t device_id, uint32_t channel) const; // Returns address in host space - void* host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + void *host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; uint64_t get_pcie_base_addr_from_device(chip_id_t chip_id) const; // Ethernet cluster api @@ -177,11 +170,12 @@ class Cluster { // get_ethernet_sockets(a, b)[0] is connected to get_ethernet_sockets(b, a)[0] std::vector get_ethernet_sockets(chip_id_t local_chip, chip_id_t remote_chip) const; // Converts logical ethernet core coord to physical ethernet core coord - CoreCoord ethernet_core_from_logical_core(chip_id_t chip_id, const CoreCoord& logical_core) const; + CoreCoord ethernet_core_from_logical_core(chip_id_t chip_id, const CoreCoord &logical_core) const; // Returns virtual eth coord from channel CoreCoord get_virtual_eth_core_from_channel(chip_id_t chip_id, int channel) const; + // Bookkeeping for mmio device tunnels uint32_t get_mmio_device_max_tunnel_depth(chip_id_t mmio_device) const; uint32_t get_mmio_device_tunnel_count(chip_id_t mmio_device) const; @@ -192,8 +186,7 @@ class Cluster { tt_cxy_pair get_eth_core_for_dispatch_core( tt_cxy_pair logical_dispatch_core, EthRouterMode mode, chip_id_t connected_chip_id) const; - std::tuple get_eth_tunnel_core( - chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const; + std::tuple get_eth_tunnel_core(chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const; // Internal routing for SD and FD enables launching user ethernet kernels and FD tunneling for all 
devices in the // cluster. When using multiple devices in a cluster, this should be the flow: @@ -203,13 +196,14 @@ class Cluster { // set_internal_routing_info_for_ethernet_cores(false); // CloseDevice(0) // CloseDevice(1) - void set_internal_routing_info_for_ethernet_cores( - bool enable_internal_routing, const std::vector& target_mmio_devices = {}) const; + void set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing, const std::vector& target_mmio_devices = {}) const; + std::unordered_map>> - get_ethernet_connections() const { - return this->cluster_desc_->get_ethernet_connections(); - } + get_ethernet_connections() const { + return this->cluster_desc_->get_ethernet_connections(); + } + // Returns MMIO device ID (logical) that controls given `device_id`. If `device_id` is MMIO device it is returned. chip_id_t get_associated_mmio_device(chip_id_t device_id) const { @@ -221,7 +215,7 @@ class Cluster { } // Returns collection of devices that are controlled by the specified MMIO device inclusive of the MMIO device - const std::set& get_devices_controlled_by_mmio_device(chip_id_t mmio_device_id) const { + const std::set &get_devices_controlled_by_mmio_device(chip_id_t mmio_device_id) const { TT_ASSERT( this->devices_grouped_by_assoc_mmio_device_.count(mmio_device_id), "Expected device {} to be an MMIO device!", @@ -245,8 +239,8 @@ class Cluster { // Returns Wormhole chip board type. BoardType get_board_type(chip_id_t chip_id) const; - bool is_worker_core(const CoreCoord& core, chip_id_t chip_id) const; - bool is_ethernet_core(const CoreCoord& core, chip_id_t chip_id) const; + bool is_worker_core(const CoreCoord &core, chip_id_t chip_id) const; + bool is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) const; CoreCoord get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const; // These two functions should be removed in favor of direct translation. 
@@ -254,8 +248,7 @@ class Cluster { const std::unordered_map get_worker_logical_to_virtual_y(chip_id_t chip_id) const; const std::unordered_map& get_virtual_routing_to_profiler_flat_id(chip_id_t chip_id) const; - -private: + private: Cluster(); ~Cluster(); @@ -263,13 +256,14 @@ class Cluster { void generate_cluster_descriptor(); void initialize_device_drivers(); void assert_risc_reset(); - void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set& controlled_device_ids); - void open_driver(const bool& skip_driver_allocs = false); - void start_driver(tt_device_params& device_params) const; + void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set &controlled_device_ids); + void open_driver( + const bool &skip_driver_allocs = false); + void start_driver(tt_device_params &device_params) const; void get_metal_desc_from_tt_desc( - const std::unordered_map& input, - const std::unordered_map& per_chip_id_harvesting_masks); + const std::unordered_map &input, + const std::unordered_map &per_chip_id_harvesting_masks); void generate_virtual_to_umd_coord_mapping(); void generate_virtual_to_profiler_flat_id_mapping(); @@ -332,4 +326,4 @@ class Cluster { } // namespace tt -std::ostream& operator<<(std::ostream& os, const tt_target_dram& dram); +std::ostream &operator<<(std::ostream &os, tt_target_dram const &dram); diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt index 3a31f8e6e07..551051ea52b 100644 --- a/tt_metal/common/CMakeLists.txt +++ b/tt_metal/common/CMakeLists.txt @@ -1,6 +1,7 @@ set(COMMON_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/core_assignment.cpp ${CMAKE_CURRENT_SOURCE_DIR}/core_coord.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/metal_soc_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape2d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape_base.cpp diff --git a/tt_metal/common/core_assignment.cpp b/tt_metal/common/core_assignment.cpp index 0016850befe..6131b31c9d8 100644 --- a/tt_metal/common/core_assignment.cpp +++ b/tt_metal/common/core_assignment.cpp @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "assert.hpp" #include "core_assignment.hpp" namespace tt { diff --git a/tt_metal/common/core_assignment.hpp b/tt_metal/common/core_assignment.hpp index 9ac23c17f28..311a351d564 100644 --- a/tt_metal/common/core_assignment.hpp +++ b/tt_metal/common/core_assignment.hpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "core_coord.hpp" - -#include // tt::ARCH +#include namespace tt { namespace tt_metal { @@ -13,7 +12,7 @@ namespace tt_metal { // a DRAM read or write. // Worker cores are derived based on architecture, harvesting configurations and DRAM Controller placement. 
std::vector get_optimal_dram_to_physical_worker_assignment( - tt::ARCH arch, + ARCH arch, const std::vector& dram_phy_coords, uint32_t full_grid_size_x, uint32_t full_grid_size_y, diff --git a/tt_metal/llrt/core_descriptor.cpp b/tt_metal/common/core_descriptor.cpp similarity index 94% rename from tt_metal/llrt/core_descriptor.cpp rename to tt_metal/common/core_descriptor.cpp index 99fd72ec096..a54e5fbe818 100644 --- a/tt_metal/llrt/core_descriptor.cpp +++ b/tt_metal/common/core_descriptor.cpp @@ -4,7 +4,6 @@ #include "core_descriptor.hpp" #include "rtoptions.hpp" -#include "tt_cluster.hpp" #include "yaml-cpp/yaml.h" @@ -242,17 +241,4 @@ const std::tuple& get_physical_worker_grid_config( return physical_grid_config_cache.at(config_hash); } -std::optional get_storage_core_bank_size( - chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config) { - const core_descriptor_t& core_desc = get_core_descriptor_config(device_id, num_hw_cqs, dispatch_core_config); - const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); - if (core_desc.storage_core_bank_size.has_value()) { - TT_FATAL( - core_desc.storage_core_bank_size.value() % tt_metal::hal.get_alignment(tt_metal::HalMemType::L1) == 0, - "Storage core bank size must be {} B aligned", - tt_metal::hal.get_alignment(tt_metal::HalMemType::L1)); - } - return core_desc.storage_core_bank_size; -} - } // namespace tt diff --git a/tt_metal/distributed/CMakeLists.txt b/tt_metal/distributed/CMakeLists.txt index ba9dbb1a442..62f068ca7cc 100644 --- a/tt_metal/distributed/CMakeLists.txt +++ b/tt_metal/distributed/CMakeLists.txt @@ -17,6 +17,5 @@ target_link_libraries( common PRIVATE Metalium::Metal::Impl - Metalium::Metal::LLRT TT::Metalium::HostDevCommon ) diff --git a/tt_metal/distributed/coordinate_translation.cpp b/tt_metal/distributed/coordinate_translation.cpp index e834ae37e2d..5e4be86b0b8 100644 --- a/tt_metal/distributed/coordinate_translation.cpp +++ b/tt_metal/distributed/coordinate_translation.cpp @@ -4,8 +4,6 @@ #include "tt_metal/distributed/coordinate_translation.hpp" -#include "tt_cluster.hpp" - #include namespace tt::tt_metal::distributed { diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index e60010e150a..d19911a3112 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -14,7 +14,6 @@ #include "tt_metal/impl/program/dispatch.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" -#include "tt_cluster.hpp" namespace tt::tt_metal::distributed { struct MeshReadEventDescriptor { diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index e5399de7d69..45185381ba6 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -7,8 +7,6 @@ #include "umd/device/types/cluster_descriptor_types.h" #include "tt_metal/distributed/coordinate_translation.hpp" -#include "tt_cluster.hpp" - namespace tt::tt_metal::distributed { class SystemMesh::Impl { diff --git a/tt_metal/experimental/hal.cpp b/tt_metal/experimental/hal.cpp index d67c8d87e9c..a93cfc65c70 100644 --- a/tt_metal/experimental/hal.cpp +++ b/tt_metal/experimental/hal.cpp @@ -17,8 +17,6 @@ using tt::tt_metal::HalSingleton; namespace tt::tt_metal::experimental::hal { -tt::ARCH get_arch() { return HalSingleton::getInstance().get_arch(); } - std::string get_arch_name() { auto arch_enum = HalSingleton::getInstance().get_arch(); return 
tt::get_string_lowercase(arch_enum); diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index 8655c830709..56b9e2a8c57 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -9,8 +9,6 @@ #include #include -#include "tt_cluster.hpp" - namespace tt::tt_metal { namespace buffer_dispatch { diff --git a/tt_metal/impl/buffers/global_circular_buffer.cpp b/tt_metal/impl/buffers/global_circular_buffer.cpp index 10974d388f9..9759c6314ae 100644 --- a/tt_metal/impl/buffers/global_circular_buffer.cpp +++ b/tt_metal/impl/buffers/global_circular_buffer.cpp @@ -18,8 +18,6 @@ #include #include -#include "tt_cluster.hpp" - namespace tt::tt_metal { namespace v1 { diff --git a/tt_metal/impl/buffers/global_semaphore.cpp b/tt_metal/impl/buffers/global_semaphore.cpp index 7102161571e..96164f64871 100644 --- a/tt_metal/impl/buffers/global_semaphore.cpp +++ b/tt_metal/impl/buffers/global_semaphore.cpp @@ -18,8 +18,6 @@ #include #include -#include "tt_cluster.hpp" - namespace tt::tt_metal { GlobalSemaphore::GlobalSemaphore( diff --git a/tt_metal/impl/debug/watcher_server.hpp b/tt_metal/impl/debug/watcher_server.hpp index 38a16e3c8ce..79f6680d4de 100644 --- a/tt_metal/impl/debug/watcher_server.hpp +++ b/tt_metal/impl/debug/watcher_server.hpp @@ -6,8 +6,6 @@ #include -struct metal_SocDescriptor; - namespace tt { void watcher_init(tt_metal::IDevice* device); diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index a269e823dd3..cd73f565e73 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -23,8 +23,6 @@ #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include "tt_metal/jit_build/build_env_manager.hpp" -#include "tt_cluster.hpp" - using namespace tt::tt_metal; namespace tt { diff --git a/tt_metal/impl/dispatch/debug_tools.cpp b/tt_metal/impl/dispatch/debug_tools.cpp index fc8980679e3..95707965738 100644 --- a/tt_metal/impl/dispatch/debug_tools.cpp +++ b/tt_metal/impl/dispatch/debug_tools.cpp @@ -3,9 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include "debug_tools.hpp" - -#include "tt_cluster.hpp" - namespace internal { using namespace tt::tt_metal; diff --git a/tt_metal/impl/dispatch/hardware_command_queue.cpp b/tt_metal/impl/dispatch/hardware_command_queue.cpp index d0aa1824264..8a72db6e742 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.cpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.cpp @@ -12,8 +12,6 @@ #include #include -#include "tt_cluster.hpp" - // Because we are a Friend of Program, accessing Program::get_program_transfer_info() and Program::get_kernels_buffer() // MUST REMOVE #include diff --git a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp index d60d15c991b..33d394abf91 100644 --- a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp +++ b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp @@ -6,7 +6,6 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "tt_cluster.hpp" #define UNUSED_LOGICAL_CORE tt_cxy_pair(device_->id(), 0, 0) #define UNUSED_SEM_ID 0 diff --git a/tt_metal/impl/dispatch/topology.cpp b/tt_metal/impl/dispatch/topology.cpp index b8eff2dd822..6a9ff796669 100644 --- a/tt_metal/impl/dispatch/topology.cpp +++ b/tt_metal/impl/dispatch/topology.cpp @@ -15,8 +15,6 @@ #include "kernel_config/eth_router.hpp" #include "kernel_config/eth_tunneler.hpp" -#include "tt_cluster.hpp" - namespace tt::tt_metal { // For 
readablity, unset = x = -1 diff --git a/tt_metal/impl/event/dispatch.cpp b/tt_metal/impl/event/dispatch.cpp index dad0f24cb7e..36a62181c60 100644 --- a/tt_metal/impl/event/dispatch.cpp +++ b/tt_metal/impl/event/dispatch.cpp @@ -7,8 +7,6 @@ #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include -#include "tt_cluster.hpp" - namespace tt::tt_metal { namespace event_dispatch { diff --git a/tt_metal/impl/sub_device/sub_device_manager.cpp b/tt_metal/impl/sub_device/sub_device_manager.cpp index 0a29d896618..042e46ae828 100644 --- a/tt_metal/impl/sub_device/sub_device_manager.cpp +++ b/tt_metal/impl/sub_device/sub_device_manager.cpp @@ -20,8 +20,6 @@ #include #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" -#include "tt_cluster.hpp" - namespace tt::tt_metal { // assert here to avoid the need to include command_queue_interface.hpp in header diff --git a/tt_metal/llrt/CMakeLists.txt b/tt_metal/llrt/CMakeLists.txt index 439492cc309..3f60ed70a06 100644 --- a/tt_metal/llrt/CMakeLists.txt +++ b/tt_metal/llrt/CMakeLists.txt @@ -82,7 +82,6 @@ target_link_libraries( set(LLRT_SRC ${CMAKE_CURRENT_SOURCE_DIR}/llrt.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/rtoptions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tlb_config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tt_cluster.cpp @@ -106,7 +105,6 @@ target_link_libraries( Tracy::TracyClient nlohmann_json::nlohmann_json Reflect::Reflect - yaml-cpp::yaml-cpp magic_enum span common diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp index 3d684c08996..a8b1db8196b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp @@ -12,13 +12,9 @@ #include "ttnn/operations/data_movement/slice/slice.hpp" #include "ttnn/operations/data_movement/concat/concat.hpp" -#include "tt-metalium/hal_exp.hpp" - namespace ttnn { namespace ccl { -using namespace tt::tt_metal::experimental; - void SyncModeSpec::add_signal(uint32_t sem_id, uint32_t wait_count) { this->sem_ids.push_back(sem_id); this->wait_counts.push_back(wait_count); @@ -217,8 +213,8 @@ void generate_edm_kernels_for_ring_or_linear_topology( std::vector const& counter_clockwise_edm_builders, std::optional receiver_device_id, std::optional sender_device_id) { - auto sender_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(hal::get_arch()); - auto receiver_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(hal::get_arch()); + auto sender_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(tt::Cluster::instance().arch()); + auto receiver_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(tt::Cluster::instance().arch()); uint32_t sender_socket_idx = 0; uint32_t receiver_socket_idx = 0; if (receiver_device_id == sender_device_id) { diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp index 7429ff9efa9..4964b963bf1 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp @@ -11,14 +11,11 @@ #include #include -#include "tt-metalium/hal_exp.hpp" - namespace ttnn { namespace operations { using namespace tt; using namespace tt::tt_metal; -using namespace tt::tt_metal::experimental; using namespace constants; std::tuple add_core_offset( @@ -105,7 +102,7 @@ std::tuple #include "tools/profiler/op_profiler.hpp" -#include // tt_ClusterDescriptor - namespace tt { using namespace constants; namespace operations { From 
0ef33f1109d58511fb2be9bcda333c3375bd152c Mon Sep 17 00:00:00 2001 From: VirdhatchaniKN Date: Wed, 5 Feb 2025 17:05:27 +0000 Subject: [PATCH 098/316] #17768: Float32 support for Training mode in Batch Norm --- .../unit_tests/operations/test_batch_norm.py | 108 +++++++++ .../compute/running_statistics_kernel.cpp | 4 +- .../running_statistics_sfpu_kernel.cpp | 228 ++++++++++++++++++ .../dataflow/reader_running_statistics.cpp | 15 +- .../dataflow/writer_running_statistics.cpp | 4 +- .../running_statistics_device_operation.cpp | 46 ++-- .../running_statistics_program_factory.cpp | 81 +++++-- 7 files changed, 442 insertions(+), 44 deletions(-) create mode 100644 ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp diff --git a/tests/ttnn/unit_tests/operations/test_batch_norm.py b/tests/ttnn/unit_tests/operations/test_batch_norm.py index 56922409d00..1305fc33005 100644 --- a/tests/ttnn/unit_tests/operations/test_batch_norm.py +++ b/tests/ttnn/unit_tests/operations/test_batch_norm.py @@ -13,6 +13,114 @@ from models.utility_functions import skip_for_grayskull +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "input_shapes", + [ + *(torch.Size([n, c, 32, 32]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), + *(torch.Size([n, c, 23, 23]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), + *(torch.Size([n, c, 64, 120]) for n, c in product([1, 2], [1, 2, 3])), + torch.Size([3, 1, 64, 120]), + torch.Size([3, 2, 64, 120]), + ], +) +@pytest.mark.parametrize( + "check_mean, check_var", + [ + (False, False), + (True, False), + (False, True), + (True, True), + ], +) +@pytest.mark.parametrize("weight", [True, False]) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("eps", [1.0, 0.0, 2.34, 1e-05]) +@pytest.mark.parametrize("momentum", [0.0, 0.1, 0.5]) +def test_batch_norm_training_fp32( + input_shapes, check_mean, check_var, weight, bias, eps, device, momentum, training=True, testing_dtype="float32" +): + in_data, input_tensor = data_gen_with_range_batch_norm( + input_shapes, 5, 10, device, is_input=True, testing_dtype=testing_dtype + ) + mean_data, mean_tensor = ( + data_gen_with_range_batch_norm(input_shapes, 4, 10, device, testing_dtype=testing_dtype) + if (check_mean) + else (None, None) + ) + var_data, var_tensor = ( + data_gen_with_range_batch_norm(input_shapes, 4, 20, device, testing_dtype=testing_dtype) + if (check_var) + else (None, None) + ) + weight_data, weight_tensor = ( + data_gen_with_range_batch_norm(input_shapes, 4, 10, device, testing_dtype=testing_dtype) + if weight + else (None, None) + ) + bias_data, bias_tensor = ( + data_gen_with_range_batch_norm(input_shapes, 4, 10, device, testing_dtype=testing_dtype) + if bias + else (None, None) + ) + + if (not training) and ((not check_mean) or (not check_var)): + pytest.xfail("running_mean and running_var must be defined in evaluation mode") + + tt_output_tensor_on_device = ttnn.batch_norm( + input_tensor, + running_mean=mean_tensor, + running_var=var_tensor, + training=training, + eps=eps, + weight=weight_tensor, + bias=bias_tensor, + momentum=momentum, + ) + tt_output = ttnn.to_torch(tt_output_tensor_on_device) + tt_updated_mean = None + tt_updated_var = None + if training: + if check_mean: + tt_updated_mean = ttnn.to_torch(mean_tensor) + if check_var: + tt_updated_var = ttnn.to_torch(var_tensor) + + torch_result = torch.nn.functional.batch_norm( + input=in_data, + running_mean=mean_data, + running_var=var_data, + 
weight=weight_data, + bias=bias_data, + training=training, + eps=eps, + momentum=momentum, + ) + comp_pass = compare_results_batch_norm([tt_output], [torch_result]) + if training: + channels = input_shapes[1] + if check_mean: + comp_pass_1 = compare_results_batch_norm( + [tt_updated_mean], [mean_data.view(1, channels, 1, 1)], stats=True + ) # Check Updated running mean + else: + if tt_updated_mean is None: + comp_pass_1 = True + else: + comp_pass_1 = False + if check_var: + comp_pass_2 = compare_results_batch_norm( + [tt_updated_var], [var_data.view(1, channels, 1, 1)], stats=True + ) # Check Updated running var + else: + if tt_updated_var is None: + comp_pass_2 = True + else: + comp_pass_2 = False + comp_pass = comp_pass and comp_pass_1 and comp_pass_2 + assert comp_pass + + @skip_for_grayskull("Unsupported dtype for Grayskull") @pytest.mark.parametrize("eps", [1.0, 0.0, 2.34, 1e-05]) @pytest.mark.parametrize("channel_size", [1, 2, 3, 4]) diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_kernel.cpp index f7955a6f81d..642a1c6f807 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_kernel.cpp @@ -39,13 +39,13 @@ void MAIN { sub_tiles_to_cb(cb_one, cb_momentum, cb_tmp1, 0, 0, 0, 0); // 1 - momentum mul_tiles_to_cb(cb_momentum, cb_batch_mean, cb_tmp2, 0, 0, 0, 1); // momentum * batch stat mul_tiles_to_cb(cb_tmp1, cb_old_running_mean, cb_tmp3, 0, 0, 1, 1); // cb_tmp1 * running stats - add_tiles_to_cb(cb_tmp2, cb_tmp3, cb_updated_running_mean, 0, 0, 1, 1); // cb_tmp2 * cb_tmp3 + add_tiles_to_cb(cb_tmp2, cb_tmp3, cb_updated_running_mean, 0, 0, 1, 1); // cb_tmp2 + cb_tmp3 } if constexpr (old_running_var_has_value) { sub_tiles_to_cb(cb_one, cb_momentum, cb_tmp1, 0, 0, 0, 0); // 1 - momentum mul_tiles_to_cb(cb_momentum, cb_batch_var, cb_tmp2, 0, 0, 0, 1); // momentum * batch stat mul_tiles_to_cb(cb_tmp1, cb_old_running_var, cb_tmp3, 0, 0, 1, 1); // cb_tmp1 * running stats - add_tiles_to_cb(cb_tmp2, cb_tmp3, cb_updated_running_var, 0, 0, 1, 1); // cb_tmp2 * cb_tmp3 + add_tiles_to_cb(cb_tmp2, cb_tmp3, cb_updated_running_var, 0, 0, 1, 1); // cb_tmp2 + cb_tmp3 } tile_regs_commit(); tile_regs_wait(); diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp new file mode 100644 index 00000000000..47256317ee8 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp @@ -0,0 +1,228 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/moreh_common.hpp" +#include "compute_kernel_api/eltwise_binary_sfpu.h" +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" + +namespace NAMESPACE { +void MAIN { + uint32_t num_tiles = get_arg_val(0); + constexpr uint32_t old_running_mean_has_value = get_compile_time_arg_val(0) == 1; + constexpr uint32_t old_running_var_has_value = get_compile_time_arg_val(1) == 1; + + constexpr auto cb_batch_mean = tt::CBIndex::c_0; // batch mean + constexpr auto cb_batch_var = tt::CBIndex::c_1; // batch var + constexpr auto cb_out0 = tt::CBIndex::c_2; + constexpr auto cb_old_running_mean = tt::CBIndex::c_3; // old running mean tensor + constexpr auto cb_old_running_var = tt::CBIndex::c_4; // old running var tensor + constexpr auto cb_updated_running_mean = tt::CBIndex::c_27; // updated running mean tensor + constexpr auto cb_updated_running_var = tt::CBIndex::c_28; // updated running var tensor + constexpr auto cb_momentum = tt::CBIndex::c_5; // momentum + constexpr auto cb_one = tt::CBIndex::c_6; // stores 1 + constexpr auto cb_tmp1 = tt::CBIndex::c_21; // tmp 1 + constexpr auto cb_tmp2 = tt::CBIndex::c_22; // tmp 2 + constexpr auto cb_tmp3 = tt::CBIndex::c_23; // tmp 3 + + unary_op_init_common(cb_batch_mean, cb_out0); + constexpr uint32_t onetile = 1; + + // updated_running_stat = (1 − momentum) × running_stat + momentum × batch_stat + for (uint32_t tile_id = 0; tile_id < num_tiles; ++tile_id) { + tile_regs_acquire(); + cb_wait_front(cb_one, 1); + cb_wait_front(cb_momentum, 1); + + if constexpr (old_running_mean_has_value) { + // 1 - momentum + cb_reserve_back(cb_tmp1, onetile); + sub_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_momentum, cb_one); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_one, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_one, cb_momentum); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_momentum, i, i * 2 + 1); + sub_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_tmp1); + } + tile_regs_release(); + cb_push_back(cb_tmp1, onetile); + + // momentum * batch stat + cb_wait_front(cb_batch_mean, onetile); + cb_reserve_back(cb_tmp2, onetile); + mul_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_momentum, cb_batch_mean); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_batch_mean, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_batch_mean, cb_momentum); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_momentum, i, i * 2 + 1); + mul_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_tmp2); + } + tile_regs_release(); + cb_push_back(cb_tmp2, onetile); + cb_pop_front(cb_batch_mean, onetile); + + // cb_tmp1 * running stats --> (1 - momentum) * running stats + cb_wait_front(cb_tmp1, onetile); + cb_wait_front(cb_old_running_mean, onetile); + cb_reserve_back(cb_tmp3, onetile); + mul_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_tmp1, cb_old_running_mean); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_old_running_mean, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_old_running_mean, cb_tmp1); + for (uint32_t i = 0; i < 
onetile; ++i) { + copy_tile(cb_tmp1, i, i * 2 + 1); + mul_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_tmp3); + } + tile_regs_release(); + cb_push_back(cb_tmp3, onetile); + cb_pop_front(cb_old_running_mean, onetile); + cb_pop_front(cb_tmp1, onetile); + + // cb_tmp2 + cb_tmp3 --> (momentum * batch stat) + ((1 - momentum) * running stats) + cb_wait_front(cb_tmp2, onetile); + cb_wait_front(cb_tmp3, onetile); + + cb_reserve_back(cb_updated_running_mean, onetile); + + add_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_tmp2, cb_tmp3); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_tmp3, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_tmp3, cb_tmp2); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_tmp2, i, i * 2 + 1); + add_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_updated_running_mean); + } + tile_regs_release(); + cb_push_back(cb_updated_running_mean, onetile); + cb_pop_front(cb_tmp3, onetile); + cb_pop_front(cb_tmp2, onetile); + } + if constexpr (old_running_var_has_value) { + // 1 - momentum + cb_reserve_back(cb_tmp1, onetile); + sub_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_momentum, cb_one); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_one, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_one, cb_momentum); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_momentum, i, i * 2 + 1); + sub_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_tmp1); + } + tile_regs_release(); + cb_push_back(cb_tmp1, onetile); + + // momentum * batch stat + cb_wait_front(cb_batch_var, onetile); + cb_reserve_back(cb_tmp2, onetile); + mul_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_momentum, cb_batch_var); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_batch_var, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_batch_var, cb_momentum); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_momentum, i, i * 2 + 1); + mul_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_tmp2); + } + tile_regs_release(); + cb_push_back(cb_tmp2, onetile); + cb_pop_front(cb_batch_var, onetile); + + // cb_tmp1 * running stats --> (1 - momentum) * running stats + cb_wait_front(cb_tmp1, onetile); + cb_wait_front(cb_old_running_var, onetile); + cb_reserve_back(cb_tmp3, onetile); + mul_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_tmp1, cb_old_running_var); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_old_running_var, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_old_running_var, cb_tmp1); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_tmp1, i, i * 2 + 1); + mul_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_tmp3); + } + tile_regs_release(); + cb_push_back(cb_tmp3, onetile); + cb_pop_front(cb_old_running_var, onetile); + cb_pop_front(cb_tmp1, onetile); + + // cb_tmp2 + cb_tmp3 --> (momentum * batch stat) + ((1 - momentum) * running stats) + cb_wait_front(cb_tmp2, onetile); + cb_wait_front(cb_tmp3, onetile); + + cb_reserve_back(cb_updated_running_var, onetile); + + add_binary_tile_init(); + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_tmp2, cb_tmp3); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_tmp3, i, i * 2); + 
} + copy_tile_to_dst_init_short_with_dt(cb_tmp3, cb_tmp2); + for (uint32_t i = 0; i < onetile; ++i) { + copy_tile(cb_tmp2, i, i * 2 + 1); + add_binary_tile(i * 2, i * 2 + 1); + tile_regs_commit(); + pack_tile(i * 2, cb_updated_running_var); + } + tile_regs_release(); + cb_push_back(cb_updated_running_var, onetile); + cb_pop_front(cb_tmp3, onetile); + cb_pop_front(cb_tmp2, onetile); + } + } + tile_regs_commit(); + tile_regs_wait(); + pack_tile(0, cb_out0); + tile_regs_release(); + cb_pop_front(cb_momentum, 1); + cb_pop_front(cb_one, 1); + cb_push_back(cb_out0, 1); +} +} // namespace NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp index e27719d5b5e..e3c457c13c6 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp @@ -46,12 +46,19 @@ void kernel_main() { union { float f; uint32_t u; - } scalar; - scalar.f = 1.0f; - fill_cb_with_value(cb_id_one, scalar.u); + } scalar_one, scalar_momentum; + scalar_one.f = 1.0f; + fill_cb_with_value(cb_id_one, scalar_one.u); + // momentum + scalar_momentum.u = momentum; cb_reserve_back(cb_id_momentum, onetile); - fill_with_val_bfloat16(cb_id_momentum, momentum); +#ifdef FILL_WITH_VALUE_FLOAT + FILL_WITH_VALUE_FLOAT(cb_id_momentum, scalar_momentum.f); +#endif +#ifdef FILL_WITH_VALUE + FILL_WITH_VALUE(cb_id_momentum, momentum); +#endif cb_push_back(cb_id_momentum, onetile); uint32_t num_tiles_read = 0; diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_running_statistics.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_running_statistics.cpp index dec7420448b..6924193e6f6 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_running_statistics.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_running_statistics.cpp @@ -93,7 +93,7 @@ void kernel_main() { uint32_t l1_old_running_mean_write_addr = get_write_ptr(cb_id_old_running_mean); noc_async_read_tile(tile_offset, old_running_mean, l1_old_running_mean_write_addr); noc_async_read_barrier(); - fill_tile_with_first_element_bfloat16(cb_id_old_running_mean); + FILL_TILE_WITH_FIRST_ELEMENT(cb_id_old_running_mean); cb_push_back(cb_id_old_running_mean, onetile); // write data @@ -110,7 +110,7 @@ void kernel_main() { uint32_t l1_old_running_var_write_addr = get_write_ptr(cb_id_old_running_var); noc_async_read_tile(tile_offset, old_running_var, l1_old_running_var_write_addr); noc_async_read_barrier(); - fill_tile_with_first_element_bfloat16(cb_id_old_running_var); + FILL_TILE_WITH_FIRST_ELEMENT(cb_id_old_running_var); cb_push_back(cb_id_old_running_var, onetile); // write data diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_device_operation.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_device_operation.cpp index 30341012f2e..d0e841dd288 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_device_operation.cpp @@ -8,38 +8,43 @@ #include "ttnn/tensor/tensor.hpp" namespace 
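Editorial note, not part of the patch content above or below: the new reader, writer, and compute kernels together implement the standard running-statistics update, updated_running_stat = (1 - momentum) * running_stat + momentum * batch_stat, once for the running mean and once for the running variance, staging the partial products in cb_tmp1, cb_tmp2, and cb_tmp3. As a minimal host-side reference for the per-element arithmetic only (plain C++ with illustrative names, no tt-metal APIs):

    #include <cstddef>
    #include <vector>

    // Per-element form of the update the tile loops above compute on device:
    //   updated = (1 - momentum) * running + momentum * batch
    std::vector<float> update_running_stat(
        const std::vector<float>& running, const std::vector<float>& batch, float momentum) {
        std::vector<float> updated(running.size());
        for (std::size_t i = 0; i < running.size(); ++i) {
            updated[i] = (1.0f - momentum) * running[i] + momentum * batch[i];
        }
        return updated;
    }
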
ttnn::operations::normalization { + +namespace { +inline void check_tensor_stat(const Tensor& tensor, std::string_view name, std::uint32_t input_c_dim) { + TT_FATAL( + tensor.get_layout() == Layout::TILE, "batch_norm only supports tiled layout. Got: {}", tensor.get_layout()); + TT_FATAL( + tensor.get_dtype() == DataType::BFLOAT16 || tensor.get_dtype() == DataType::FLOAT32, + "batch_norm only supports bfloat16, float32. Got: {}", + tensor.get_dtype()); + TT_FATAL( + tensor.storage_type() == StorageType::DEVICE, + "Operands to batch_norm need to be on device! Got: {}", + tensor.storage_type()); + TT_FATAL(tensor.buffer() != nullptr, "Operands to batch_norm need to be allocated in buffers on device!"); + TT_FATAL(tensor.get_logical_shape().rank() == 4, "batch_norm supports tensors of rank 4"); + TT_FATAL(tensor.get_logical_shape()[1] == input_c_dim, "{}[1] must be the same as input's channel size.", name); +} +} // namespace + void RunningStatistics::validate_tensors( const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { const auto& [batch_mean, batch_var, running_mean, running_var] = tensor_args; - check_tensor(batch_mean, "running_statistics", "batch_mean"); - check_tensor(batch_var, "running_statistics", "batch_var"); - check_tensor(running_mean, "running_statistics", "running_mean"); - check_tensor(running_var, "running_statistics", "running_var"); - // mean (1, C, 1, 1) auto C = batch_mean.get_logical_shape()[1]; - // var (1, C, 1, 1) - TT_FATAL(batch_var.get_logical_shape()[1] == C, "batch_var_shape[1] must be the same as input's channel size."); + + check_tensor_stat(batch_mean, "batch_mean_shape", C); + check_tensor_stat(batch_var, "batch_var_shape", C); // running_mean (1, C, 1, 1) if (running_mean.has_value()) { - TT_FATAL( - running_mean.value().get_logical_shape()[1] == C, - "running_mean_shape[1] must be the same as input's channel size."); - TT_FATAL( - running_mean.value().get_logical_shape()[1] == C, - "running_mean_shape[1] must be the same as input's channel size."); + check_tensor_stat(running_mean.value(), "running_mean_shape", C); } // running_var (1, C, 1, 1) if (running_var.has_value()) { - TT_FATAL( - running_var.value().get_logical_shape()[1] == C, - "running_var_shape[1] must be the same as input's channel size."); - TT_FATAL( - running_var.value().get_logical_shape()[1] == C, - "running_var_shape[1] must be the same as input's channel size."); + check_tensor_stat(running_var.value(), "running_var_shape", C); } } @@ -110,7 +115,8 @@ std::tuple running_mean, std::optional running_var, const std::optional& memory_config) { - operation_attributes_t operation_attributes{momentum, memory_config.value_or(batch_mean.memory_config())}; + operation_attributes_t operation_attributes{ + momentum, memory_config.value_or(batch_mean.memory_config()), batch_mean.get_dtype()}; tensor_args_t tensor_args{batch_mean, batch_var, std::move(running_mean), std::move(running_var)}; return {operation_attributes, tensor_args}; } diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp index 7f476e8f2ea..05ea322dc21 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp @@ -74,9 +74,10 @@ void set_or_update_runtime_arguments( } uint32_t cHtWt = cHt * cWt; - 
class bfloat16 bfloat_scalar_momentum(momentum); - uint32_t packed_scalar_momentum = - pack_two_bfloat16_into_uint32({bfloat_scalar_momentum, bfloat_scalar_momentum}); + const auto scalar = momentum; + const auto packed_scalar_momentum = batch_mean_tensor.get_dtype() == DataType::FLOAT32 + ? std::bit_cast(scalar) + : pack_two_bfloat16_into_uint32({scalar, scalar}); std::array reader_runtime_args = { packed_scalar_momentum, batch_mean_tensor.buffer()->address(), @@ -227,8 +228,7 @@ RunningStatistics::RunningStatisticsProgramFactory::create( b_num_tiles_per_cb, e_data_format); // updated running var - // Intermediate buffers required for uodation of running stats - + // Intermediate buffers required for updation of running stats auto [tmp1_cb, tmp1_cb_handle] = create_cb(tt::CBIndex::c_21, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); @@ -246,37 +246,86 @@ RunningStatistics::RunningStatisticsProgramFactory::create( const auto e_is_dram = running_var_has_value and running_var_tensor->buffer()->buffer_type() == tt_metal::BufferType::DRAM; + std::map dataflow_defines; // Currently support only for fp32, bf16 + if (batch_mean_tensor.get_dtype() == DataType::FLOAT32) { + dataflow_defines["FILL_TILE_WITH_FIRST_ELEMENT"] = "fill_tile_with_first_element"; + dataflow_defines["FILL_WITH_VALUE_FLOAT"] = "fill_with_val<1024, float>"; + } else { + dataflow_defines["FILL_TILE_WITH_FIRST_ELEMENT"] = "fill_tile_with_first_element_bfloat16"; + dataflow_defines["FILL_WITH_VALUE"] = "fill_with_val_bfloat16"; + } + // READER KERNEL + auto reader_defines = dataflow_defines; auto reader_kernel_id = tt_metal::CreateKernel( program, "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp", all_device_cores, - tt_metal::ReaderDataMovementConfig({a_is_dram})); + tt_metal::ReaderDataMovementConfig({a_is_dram}, std::move(reader_defines))); // WRITER KERNEL + auto writer_defines = dataflow_defines; auto writer_kernel_id = tt_metal::CreateKernel( program, "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_running_statistics.cpp", all_device_cores, - tt_metal::WriterDataMovementConfig({ - b_is_dram, - c_is_dram, - d_is_dram, - e_is_dram, - static_cast(running_mean_has_value), - static_cast(running_var_has_value), - })); + tt_metal::WriterDataMovementConfig( + { + b_is_dram, + c_is_dram, + d_is_dram, + e_is_dram, + static_cast(running_mean_has_value), + static_cast(running_var_has_value), + }, + std::move(writer_defines))); // COMPUTE KERNEL bool fp32_dest_acc_en = c_data_format == tt::DataFormat::UInt32 || c_data_format == tt::DataFormat::Int32 || c_data_format == tt::DataFormat::Float32; + + uint32_t src_batch_mean_cb_index = tt::CBIndex::c_0; + uint32_t src_batch_var_cb_index = tt::CBIndex::c_1; + uint32_t src_momentum_cb_index = tt::CBIndex::c_5; + uint32_t src_one_cb_index = tt::CBIndex::c_6; + uint32_t src_temp_1_cb_index = tt::CBIndex::c_21; + uint32_t src_temp_2_cb_index = tt::CBIndex::c_22; + uint32_t src_temp_3_cb_index = tt::CBIndex::c_23; + uint32_t src_updated_running_mean_cb_index = tt::CBIndex::c_27; + uint32_t src_old_running_mean_cb_index = tt::CBIndex::c_3; + uint32_t src_updated_running_var_cb_index = tt::CBIndex::c_28; + uint32_t src_old_running_var_cb_index = tt::CBIndex::c_4; + + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + if (fp32_dest_acc_en) { + for (const auto cb_index : + {src_batch_mean_cb_index, + src_batch_var_cb_index, + 
src_momentum_cb_index, + src_one_cb_index, + src_temp_1_cb_index, + src_temp_2_cb_index, + src_temp_3_cb_index, + src_updated_running_mean_cb_index, + src_old_running_mean_cb_index, + src_updated_running_var_cb_index, + src_old_running_var_cb_index}) { + unpack_to_dest_mode[cb_index] = UnpackToDestMode::UnpackToDestFp32; + } + } + std::vector compute_kernel_args = { static_cast(running_mean_has_value), static_cast(running_var_has_value)}; auto compute_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_kernel.cpp", + fmt::format( + "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_{}.cpp", + fp32_dest_acc_en ? "sfpu_kernel" : "kernel"), all_device_cores, - tt_metal::ComputeConfig{.fp32_dest_acc_en = fp32_dest_acc_en, .compile_args = compute_kernel_args}); + tt_metal::ComputeConfig{ + .fp32_dest_acc_en = fp32_dest_acc_en, + .unpack_to_dest_mode = std::move(unpack_to_dest_mode), + .compile_args = compute_kernel_args}); auto set_runtime_args = [](Program& program, KernelHandle kernel_id, CoreCoord core, auto&& args) { tt_metal::SetRuntimeArgs(program, kernel_id, core, args); From 1e52287de8c5a5610bb25f8114f9ba348a8cc5b3 Mon Sep 17 00:00:00 2001 From: VirdhatchaniKN Date: Wed, 12 Feb 2025 05:25:49 +0000 Subject: [PATCH 099/316] #12253: Add test for fp32 BN Testing --- .../unit_tests/operations/test_batch_norm.py | 44 +++++++++---------- .../running_statistics_sfpu_kernel.cpp | 2 - 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_batch_norm.py b/tests/ttnn/unit_tests/operations/test_batch_norm.py index 1305fc33005..377e32bc0af 100644 --- a/tests/ttnn/unit_tests/operations/test_batch_norm.py +++ b/tests/ttnn/unit_tests/operations/test_batch_norm.py @@ -96,29 +96,29 @@ def test_batch_norm_training_fp32( eps=eps, momentum=momentum, ) - comp_pass = compare_results_batch_norm([tt_output], [torch_result]) + comp_BN_Output = compare_results_batch_norm([tt_output], [torch_result]) if training: channels = input_shapes[1] if check_mean: - comp_pass_1 = compare_results_batch_norm( + comp_BN_running_mean = compare_results_batch_norm( [tt_updated_mean], [mean_data.view(1, channels, 1, 1)], stats=True ) # Check Updated running mean else: if tt_updated_mean is None: - comp_pass_1 = True + comp_BN_running_mean = True else: - comp_pass_1 = False + comp_BN_running_mean = False if check_var: - comp_pass_2 = compare_results_batch_norm( + comp_BN_running_var = compare_results_batch_norm( [tt_updated_var], [var_data.view(1, channels, 1, 1)], stats=True ) # Check Updated running var else: if tt_updated_var is None: - comp_pass_2 = True + comp_BN_running_var = True else: - comp_pass_2 = False - comp_pass = comp_pass and comp_pass_1 and comp_pass_2 - assert comp_pass + comp_BN_running_var = False + comp_BN_Output = comp_BN_Output and comp_BN_running_mean and comp_BN_running_var + assert comp_BN_Output @skip_for_grayskull("Unsupported dtype for Grayskull") @@ -237,10 +237,10 @@ def test_batch_norm_fp32( training=training, eps=eps, ) - comp_pass = compare_results_batch_norm([tt_output], [torch_result]) and torch.allclose( + comp_BN_Output = compare_results_batch_norm([tt_output], [torch_result]) and torch.allclose( torch_result, tt_output, atol=1e-6, rtol=1e-3 ) - assert comp_pass + assert comp_BN_Output @pytest.mark.parametrize( @@ -311,30 +311,30 @@ def test_batch_norm(input_shapes, training, check_mean, check_var, weight, bias, eps=eps, 
momentum=momentum, ) - comp_pass = compare_results_batch_norm([tt_output], [torch_result]) # Check BN Result + comp_BN_Output = compare_results_batch_norm([tt_output], [torch_result]) # Check BN Result if training: channels = input_shapes[1] if check_mean: - comp_pass_1 = compare_results_batch_norm( + comp_BN_running_mean = compare_results_batch_norm( [tt_updated_mean], [mean_data.view(1, channels, 1, 1)], stats=True ) # Check Updated running mean else: if tt_updated_mean is None: - comp_pass_1 = True + comp_BN_running_mean = True else: - comp_pass_1 = False + comp_BN_running_mean = False if check_var: - comp_pass_2 = compare_results_batch_norm( + comp_BN_running_var = compare_results_batch_norm( [tt_updated_var], [var_data.view(1, channels, 1, 1)], stats=True ) # Check Updated running var else: if tt_updated_var is None: - comp_pass_2 = True + comp_BN_running_var = True else: - comp_pass_2 = False - comp_pass = comp_pass and comp_pass_1 and comp_pass_2 + comp_BN_running_var = False + comp_BN_Output = comp_BN_Output and comp_BN_running_mean and comp_BN_running_var - assert comp_pass + assert comp_BN_Output @pytest.mark.parametrize( @@ -365,5 +365,5 @@ def test_batch_norm_program_cache_and_default(input_shapes, mem_layout, device): ) tt_output = ttnn.to_torch(tt_output_tensor_on_device) torch_result = torch.nn.functional.batch_norm(input=in_data, running_mean=mean_data, running_var=var_data) - comp_pass = compare_results_batch_norm([tt_output], [torch_result]) - assert comp_pass + comp_BN_Output = compare_results_batch_norm([tt_output], [torch_result]) + assert comp_BN_Output diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp index 47256317ee8..dd3fd1a5ba8 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp @@ -83,7 +83,6 @@ void MAIN { cb_wait_front(cb_tmp1, onetile); cb_wait_front(cb_old_running_mean, onetile); cb_reserve_back(cb_tmp3, onetile); - mul_binary_tile_init(); tile_regs_acquire(); tile_regs_wait(); copy_tile_to_dst_init_short_with_dt(cb_tmp1, cb_old_running_mean); @@ -172,7 +171,6 @@ void MAIN { cb_wait_front(cb_tmp1, onetile); cb_wait_front(cb_old_running_var, onetile); cb_reserve_back(cb_tmp3, onetile); - mul_binary_tile_init(); tile_regs_acquire(); tile_regs_wait(); copy_tile_to_dst_init_short_with_dt(cb_tmp1, cb_old_running_var); From 34a0bc1e6b5f21577521a382ef4b8c0e50eab950 Mon Sep 17 00:00:00 2001 From: VirdhatchaniKN Date: Mon, 10 Feb 2025 10:05:20 +0000 Subject: [PATCH 100/316] #17758: Refactor Training mode compute kernel Buffer Index --- .../compute/running_statistics_kernel.cpp | 24 +++---- .../running_statistics_sfpu_kernel.cpp | 24 +++---- .../running_statistics_program_factory.cpp | 62 ++++++++++--------- 3 files changed, 56 insertions(+), 54 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_kernel.cpp index 642a1c6f807..5895f8284d5 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_kernel.cpp +++ 
b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_kernel.cpp @@ -13,18 +13,18 @@ void MAIN { constexpr uint32_t old_running_mean_has_value = get_compile_time_arg_val(0) == 1; constexpr uint32_t old_running_var_has_value = get_compile_time_arg_val(1) == 1; - constexpr auto cb_batch_mean = tt::CBIndex::c_0; // batch mean - constexpr auto cb_batch_var = tt::CBIndex::c_1; // batch var - constexpr auto cb_out0 = tt::CBIndex::c_2; - constexpr auto cb_old_running_mean = tt::CBIndex::c_3; // old running mean tensor - constexpr auto cb_old_running_var = tt::CBIndex::c_4; // old running var tensor - constexpr auto cb_updated_running_mean = tt::CBIndex::c_27; // updated running mean tensor - constexpr auto cb_updated_running_var = tt::CBIndex::c_28; // updated running var tensor - constexpr auto cb_momentum = tt::CBIndex::c_5; // momentum - constexpr auto cb_one = tt::CBIndex::c_6; // stores 1 - constexpr auto cb_tmp1 = tt::CBIndex::c_21; // tmp 1 - constexpr auto cb_tmp2 = tt::CBIndex::c_22; // tmp 2 - constexpr auto cb_tmp3 = tt::CBIndex::c_23; // tmp 3 + constexpr auto cb_batch_mean = get_compile_time_arg_val(2); // batch mean + constexpr auto cb_batch_var = get_compile_time_arg_val(3); // batch var + constexpr auto cb_out0 = get_compile_time_arg_val(4); + constexpr auto cb_old_running_mean = get_compile_time_arg_val(5); // old running mean tensor + constexpr auto cb_old_running_var = get_compile_time_arg_val(6); // old running var tensor + constexpr auto cb_updated_running_mean = get_compile_time_arg_val(7); // updated running mean tensor + constexpr auto cb_updated_running_var = get_compile_time_arg_val(8); // updated running var tensor + constexpr auto cb_momentum = get_compile_time_arg_val(9); // momentum + constexpr auto cb_one = get_compile_time_arg_val(10); // stores 1 + constexpr auto cb_tmp1 = get_compile_time_arg_val(11); // tmp 1 + constexpr auto cb_tmp2 = get_compile_time_arg_val(12); // tmp 2 + constexpr auto cb_tmp3 = get_compile_time_arg_val(13); // tmp 3 binary_op_init_common(cb_batch_mean, cb_batch_var, cb_out0); constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp index dd3fd1a5ba8..d40ed7dd185 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp @@ -16,18 +16,18 @@ void MAIN { constexpr uint32_t old_running_mean_has_value = get_compile_time_arg_val(0) == 1; constexpr uint32_t old_running_var_has_value = get_compile_time_arg_val(1) == 1; - constexpr auto cb_batch_mean = tt::CBIndex::c_0; // batch mean - constexpr auto cb_batch_var = tt::CBIndex::c_1; // batch var - constexpr auto cb_out0 = tt::CBIndex::c_2; - constexpr auto cb_old_running_mean = tt::CBIndex::c_3; // old running mean tensor - constexpr auto cb_old_running_var = tt::CBIndex::c_4; // old running var tensor - constexpr auto cb_updated_running_mean = tt::CBIndex::c_27; // updated running mean tensor - constexpr auto cb_updated_running_var = tt::CBIndex::c_28; // updated running var tensor - constexpr auto cb_momentum = tt::CBIndex::c_5; // momentum - constexpr auto cb_one = tt::CBIndex::c_6; // stores 1 - constexpr auto cb_tmp1 = tt::CBIndex::c_21; // tmp 1 - constexpr auto cb_tmp2 = 
tt::CBIndex::c_22; // tmp 2 - constexpr auto cb_tmp3 = tt::CBIndex::c_23; // tmp 3 + constexpr auto cb_batch_mean = get_compile_time_arg_val(2); // batch mean + constexpr auto cb_batch_var = get_compile_time_arg_val(3); // batch var + constexpr auto cb_out0 = get_compile_time_arg_val(4); + constexpr auto cb_old_running_mean = get_compile_time_arg_val(5); // old running mean tensor + constexpr auto cb_old_running_var = get_compile_time_arg_val(6); // old running var tensor + constexpr auto cb_updated_running_mean = get_compile_time_arg_val(7); // updated running mean tensor + constexpr auto cb_updated_running_var = get_compile_time_arg_val(8); // updated running var tensor + constexpr auto cb_momentum = get_compile_time_arg_val(9); // momentum + constexpr auto cb_one = get_compile_time_arg_val(10); // stores 1 + constexpr auto cb_tmp1 = get_compile_time_arg_val(11); // tmp 1 + constexpr auto cb_tmp2 = get_compile_time_arg_val(12); // tmp 2 + constexpr auto cb_tmp3 = get_compile_time_arg_val(13); // tmp 3 unary_op_init_common(cb_batch_mean, cb_out0); constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp index 05ea322dc21..a8795ae63eb 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp @@ -169,37 +169,37 @@ RunningStatistics::RunningStatisticsProgramFactory::create( uint32_t b_num_tiles_per_cb = num_tiles_per_cb; // Input buffers - auto [a_cb, a_cb_handle] = create_cb( + auto [batch_mean_tensor_cb, batch_mean_tensor_cb_handle] = create_cb( tt::CBIndex::c_0, program, all_device_cores, a_single_tile_size, num_tiles_per_cb, a_data_format); // batch_mean - auto [b_cb, b_cb_handle] = create_cb( + auto [batch_var_tensor_cb, batch_var_tensor_cb_handle] = create_cb( tt::CBIndex::c_1, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); // batch_var - auto [c_cb, c_cb_handle] = create_cb( + auto [output_tensor_cb, output_tensor_cb_handle] = create_cb( tt::CBIndex::c_2, program, all_device_cores, c_single_tile_size, num_tiles_per_cb, c_data_format); // output - auto [d_cb, d_cb_handle] = create_cb( + auto [old_running_mean_tensor_cb, old_running_mean_tensor_cb_handle] = create_cb( tt::CBIndex::c_3, program, all_device_cores, d_single_tile_size, b_num_tiles_per_cb, d_data_format); // old running mean - auto [e_cb, e_cb_handle] = create_cb( + auto [old_running_var_tensor_cb, old_running_var_tensor_cb_handle] = create_cb( tt::CBIndex::c_4, program, all_device_cores, e_single_tile_size, b_num_tiles_per_cb, e_data_format); // old running var - auto [f_cb, f_cb_handle] = create_cb( + auto [momentum_cb, momentum_cb_handle] = create_cb( tt::CBIndex::c_5, program, all_device_cores, @@ -284,38 +284,40 @@ RunningStatistics::RunningStatisticsProgramFactory::create( bool fp32_dest_acc_en = c_data_format == tt::DataFormat::UInt32 || c_data_format == tt::DataFormat::Int32 || c_data_format == tt::DataFormat::Float32; - uint32_t src_batch_mean_cb_index = tt::CBIndex::c_0; - uint32_t src_batch_var_cb_index = tt::CBIndex::c_1; - uint32_t src_momentum_cb_index = tt::CBIndex::c_5; - uint32_t src_one_cb_index = tt::CBIndex::c_6; - uint32_t src_temp_1_cb_index = tt::CBIndex::c_21; - uint32_t src_temp_2_cb_index = tt::CBIndex::c_22; - uint32_t 
src_temp_3_cb_index = tt::CBIndex::c_23; - uint32_t src_updated_running_mean_cb_index = tt::CBIndex::c_27; - uint32_t src_old_running_mean_cb_index = tt::CBIndex::c_3; - uint32_t src_updated_running_var_cb_index = tt::CBIndex::c_28; - uint32_t src_old_running_var_cb_index = tt::CBIndex::c_4; - std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (fp32_dest_acc_en) { for (const auto cb_index : - {src_batch_mean_cb_index, - src_batch_var_cb_index, - src_momentum_cb_index, - src_one_cb_index, - src_temp_1_cb_index, - src_temp_2_cb_index, - src_temp_3_cb_index, - src_updated_running_mean_cb_index, - src_old_running_mean_cb_index, - src_updated_running_var_cb_index, - src_old_running_var_cb_index}) { + {batch_mean_tensor_cb, + batch_var_tensor_cb, + output_tensor_cb, + old_running_mean_tensor_cb, + old_running_var_tensor_cb, + updated_m_cb, + updated_v_cb, + momentum_cb, + one_cb, + tmp1_cb, + tmp2_cb, + tmp3_cb}) { unpack_to_dest_mode[cb_index] = UnpackToDestMode::UnpackToDestFp32; } } std::vector compute_kernel_args = { - static_cast(running_mean_has_value), static_cast(running_var_has_value)}; + static_cast(running_mean_has_value), + static_cast(running_var_has_value), + batch_mean_tensor_cb, + batch_var_tensor_cb, + output_tensor_cb, + old_running_mean_tensor_cb, + old_running_var_tensor_cb, + updated_m_cb, + updated_v_cb, + momentum_cb, + one_cb, + tmp1_cb, + tmp2_cb, + tmp3_cb}; auto compute_kernel_id = tt_metal::CreateKernel( program, fmt::format( From 54676beb0aee9a127777bb2f229f8b36fb60c024 Mon Sep 17 00:00:00 2001 From: VirdhatchaniKN Date: Mon, 10 Feb 2025 10:09:27 +0000 Subject: [PATCH 101/316] #17758: Switch reader buffer index to compile-time args --- .../device/kernels/dataflow/reader_running_statistics.cpp | 6 +++--- .../device/running_statistics_program_factory.cpp | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp index e3c457c13c6..02437e03d6e 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp @@ -21,9 +21,9 @@ void kernel_main() { constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; - constexpr auto cb_id_src = tt::CBIndex::c_0; - constexpr auto cb_id_momentum = tt::CBIndex::c_5; - constexpr auto cb_id_one = tt::CBIndex::c_6; + constexpr auto cb_id_src = get_compile_time_arg_val(1); + constexpr auto cb_id_momentum = get_compile_time_arg_val(2); + constexpr auto cb_id_one = get_compile_time_arg_val(3); constexpr uint32_t onetile = 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp index a8795ae63eb..a4d6ee3f27c 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp @@ -261,7 +261,8 @@ RunningStatistics::RunningStatisticsProgramFactory::create( program, "ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/reader_running_statistics.cpp", 
all_device_cores, - tt_metal::ReaderDataMovementConfig({a_is_dram}, std::move(reader_defines))); + tt_metal::ReaderDataMovementConfig( + {a_is_dram, batch_mean_tensor_cb, momentum_cb, one_cb}, std::move(reader_defines))); // WRITER KERNEL auto writer_defines = dataflow_defines; From 4cecf1a7093376eb5b138c5e712aea92c15fc397 Mon Sep 17 00:00:00 2001 From: VirdhatchaniKN Date: Mon, 10 Feb 2025 10:47:17 +0000 Subject: [PATCH 102/316] #17758: Update Running stats Writer kernel --- .../kernels/dataflow/writer_running_statistics.cpp | 12 ++++++------ .../device/running_statistics_program_factory.cpp | 6 ++++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_running_statistics.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_running_statistics.cpp index 6924193e6f6..03b2b474b36 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_running_statistics.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/dataflow/writer_running_statistics.cpp @@ -22,7 +22,7 @@ void kernel_main() { constexpr uint32_t onetile = 1; - constexpr auto cb_id_src = tt::CBIndex::c_1; + constexpr auto cb_id_src = get_compile_time_arg_val(6); constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); const DataFormat src_data_format = get_dataformat(cb_id_src); @@ -30,7 +30,7 @@ void kernel_main() { const InterleavedAddrGenFast src = { .bank_base_address = src_addr, .page_size = src_tile_bytes, .data_format = src_data_format}; - constexpr auto cb_id_dst = tt::CBIndex::c_2; + constexpr auto cb_id_dst = get_compile_time_arg_val(7); constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; const uint32_t dst_tile_bytes = get_tile_size(cb_id_dst); const DataFormat dst_data_format = get_dataformat(cb_id_dst); @@ -39,7 +39,7 @@ void kernel_main() { .bank_base_address = dst_addr, .page_size = dst_tile_bytes, .data_format = dst_data_format}; // old running mean - constexpr auto cb_id_old_running_mean = tt::CBIndex::c_3; + constexpr auto cb_id_old_running_mean = get_compile_time_arg_val(8); constexpr bool old_running_mean_is_dram = get_compile_time_arg_val(2) == 1; const uint32_t old_running_mean_tile_bytes = get_tile_size(cb_id_old_running_mean); const DataFormat old_running_mean_data_format = get_dataformat(cb_id_old_running_mean); @@ -50,7 +50,7 @@ void kernel_main() { .data_format = old_running_mean_data_format}; // old running var - constexpr auto cb_id_old_running_var = tt::CBIndex::c_4; + constexpr auto cb_id_old_running_var = get_compile_time_arg_val(9); constexpr bool old_running_var_is_dram = get_compile_time_arg_val(3) == 1; const uint32_t old_running_var_tile_bytes = get_tile_size(cb_id_old_running_var); const DataFormat old_running_var_data_format = get_dataformat(cb_id_old_running_var); @@ -62,8 +62,8 @@ void kernel_main() { constexpr bool old_running_mean_has_value = get_compile_time_arg_val(4) == 1; constexpr bool old_running_var_has_value = get_compile_time_arg_val(5) == 1; - constexpr auto cb_id_updated_running_mean = tt::CBIndex::c_27; - constexpr auto cb_id_updated_running_var = tt::CBIndex::c_28; + constexpr auto cb_id_updated_running_mean = get_compile_time_arg_val(10); + constexpr auto cb_id_updated_running_var = get_compile_time_arg_val(11); uint32_t tiles_per_batch = HtWt * C; uint32_t start_n = start_tile_id / tiles_per_batch; diff --git 
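Editorial note, not part of the patch content above or below: the #17758 patches in this series move the circular-buffer indices used by the compute, reader, and writer kernels out of hard-coded tt::CBIndex constants and into positional compile-time arguments, so the program factory becomes the single owner of buffer placement. A minimal stand-alone sketch of that pattern, with the device-side get_compile_time_arg_val emulated by a constexpr array (illustrative only, not the tt-metal API):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Host side: the program factory chooses the CB indices and passes them in order.
    constexpr std::array<uint32_t, 4> kCompileTimeArgs = {
        /*src_is_dram*/ 1, /*cb_src*/ 0, /*cb_momentum*/ 5, /*cb_one*/ 6};

    // Stand-in for the kernel's get_compile_time_arg_val(i).
    constexpr uint32_t ct_arg(std::size_t i) { return kCompileTimeArgs[i]; }

    // Kernel side: buffers are named from argument positions instead of fixed CBIndex values.
    constexpr uint32_t cb_id_src = ct_arg(1);
    constexpr uint32_t cb_id_momentum = ct_arg(2);
    constexpr uint32_t cb_id_one = ct_arg(3);
    static_assert(cb_id_momentum == 5, "momentum CB comes from compile-time arg 2");
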
a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp index a4d6ee3f27c..0dfa6b218b0 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp @@ -278,6 +278,12 @@ RunningStatistics::RunningStatisticsProgramFactory::create( e_is_dram, static_cast(running_mean_has_value), static_cast(running_var_has_value), + batch_var_tensor_cb, + output_tensor_cb, + old_running_mean_tensor_cb, + old_running_var_tensor_cb, + updated_m_cb, + updated_v_cb, }, std::move(writer_defines))); From 53d4192a5bb07df8ae8da5f2ff8ca60967d9118b Mon Sep 17 00:00:00 2001 From: VirdhatchaniKN Date: Mon, 10 Feb 2025 11:26:26 +0000 Subject: [PATCH 103/316] #17758: Sequential buffer Indexing for Training mode running stats --- .../device/running_statistics_program_factory.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp index 0dfa6b218b0..3263f995fd3 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp @@ -214,14 +214,14 @@ RunningStatistics::RunningStatisticsProgramFactory::create( b_num_tiles_per_cb, b_data_format); // to store 1 auto [updated_m_cb, updated_m_cb_handle] = create_cb( - tt::CBIndex::c_27, + tt::CBIndex::c_7, program, all_device_cores, d_single_tile_size, b_num_tiles_per_cb, d_data_format); // updated running mean auto [updated_v_cb, updated_v_cb_handle] = create_cb( - tt::CBIndex::c_28, + tt::CBIndex::c_8, program, all_device_cores, e_single_tile_size, @@ -230,13 +230,13 @@ RunningStatistics::RunningStatisticsProgramFactory::create( // Intermediate buffers required for updation of running stats auto [tmp1_cb, tmp1_cb_handle] = - create_cb(tt::CBIndex::c_21, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); + create_cb(tt::CBIndex::c_9, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); auto [tmp2_cb, tmp2_cb_handle] = - create_cb(tt::CBIndex::c_22, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); + create_cb(tt::CBIndex::c_10, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); auto [tmp3_cb, tmp3_cb_handle] = - create_cb(tt::CBIndex::c_23, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); + create_cb(tt::CBIndex::c_11, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); auto a_is_dram = static_cast(batch_mean_tensor.buffer()->buffer_type() == tt_metal::BufferType::DRAM); auto b_is_dram = static_cast(batch_var_tensor.buffer()->buffer_type() == tt_metal::BufferType::DRAM); From 0678c733771568e69062851c5037ce4facba289f Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Thu, 13 Feb 2025 08:41:12 -0600 Subject: [PATCH 104/316] Reland "Remove tt_cluster.hpp from public API (#17813)" (#17868) --- .../device/test_galaxy_cluster_api.cpp | 2 +- .../dispatch/test_bw_and_latency.cpp | 3 + .../dispatch/test_pgm_dispatch.cpp | 1 + 
.../test_ethernet_read_and_send_data.cpp | 2 + ...ers_and_erisc_datamover_unidirectional.cpp | 2 + ...st_vs_multicast_to_single_core_latency.cpp | 1 + .../old/matmul/matmul_global_l1.cpp | 1 + .../old/matmul/matmul_local_l1.cpp | 1 + .../old/noc/test_noc_read_global_l1.cpp | 1 + .../old/noc/test_noc_read_local_l1.cpp | 1 + .../old/pcie/test_enqueue_rw_buffer.cpp | 1 + .../old/pcie/test_rw_buffer.cpp | 1 + .../old/pcie/test_rw_device_dram.cpp | 1 + .../old/pcie/test_rw_device_l1.cpp | 1 + .../tt_metal/test_stress_noc_mcast.cpp | 2 + .../unit_tests/gtests/test_ccl_on_galaxy.cpp | 2 + tt-train/tests/core/n300_utils_test.cpp | 4 +- .../model/linear_regression_ddp_test.cpp | 3 +- .../tests/modules/distributed/linear_test.cpp | 3 +- .../tests/ops/distributed/comm_ops_test.cpp | 3 +- .../distributed/distributed_ttnn_ops_test.cpp | 3 +- tt_fabric/CMakeLists.txt | 8 +- tt_fabric/control_plane.cpp | 2 + tt_fabric/mesh_graph.hpp | 4 +- tt_metal/api/tt-metalium/core_descriptor.hpp | 18 +-- tt_metal/api/tt-metalium/device.hpp | 1 - tt_metal/api/tt-metalium/device_impl.hpp | 1 - .../api/tt-metalium/dispatch_core_common.hpp | 2 + .../api/tt-metalium/dispatch_settings.hpp | 6 +- tt_metal/api/tt-metalium/hal_exp.hpp | 8 ++ tt_metal/common/CMakeLists.txt | 1 - tt_metal/common/core_assignment.cpp | 1 + tt_metal/common/core_assignment.hpp | 5 +- tt_metal/distributed/CMakeLists.txt | 1 + .../distributed/coordinate_translation.cpp | 2 + tt_metal/distributed/mesh_command_queue.cpp | 1 + tt_metal/distributed/system_mesh.cpp | 2 + tt_metal/experimental/hal.cpp | 2 + tt_metal/impl/buffers/dispatch.cpp | 2 + .../impl/buffers/global_circular_buffer.cpp | 2 + tt_metal/impl/buffers/global_semaphore.cpp | 2 + tt_metal/impl/debug/watcher_server.hpp | 2 + tt_metal/impl/device/device_pool.cpp | 2 + tt_metal/impl/dispatch/debug_tools.cpp | 3 + .../impl/dispatch/hardware_command_queue.cpp | 2 + .../impl/dispatch/kernel_config/fd_kernel.hpp | 1 + tt_metal/impl/dispatch/topology.cpp | 2 + tt_metal/impl/event/dispatch.cpp | 2 + .../impl/sub_device/sub_device_manager.cpp | 2 + tt_metal/jit_build/CMakeLists.txt | 1 + tt_metal/jit_build/build_env_manager.cpp | 2 +- tt_metal/llrt/CMakeLists.txt | 2 + tt_metal/{common => llrt}/core_descriptor.cpp | 14 +++ .../{api/tt-metalium => llrt}/tt_cluster.hpp | 112 +++++++++--------- ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp | 8 +- .../moreh/moreh_helper_functions.cpp | 7 +- .../reduction/prod/device/prod_op_all.cpp | 2 + 57 files changed, 189 insertions(+), 85 deletions(-) rename tt_metal/{common => llrt}/core_descriptor.cpp (94%) rename tt_metal/{api/tt-metalium => llrt}/tt_cluster.hpp (79%) diff --git a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp index 5a59b2c03f8..8c998b1705e 100644 --- a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp @@ -5,7 +5,7 @@ #include #include "galaxy_fixture.hpp" -#include +#include "tt_cluster.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 100534ab260..3053fd4c7ed 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -12,12 +12,15 @@ #include "logger.hpp" #include #include +#include #include #include #include #include #include 
+#include "tt_cluster.hpp" + constexpr uint32_t DEFAULT_ITERATIONS = 1000; constexpr uint32_t DEFAULT_WARMUP_ITERATIONS = 2; constexpr uint32_t DEFAULT_PAGE_SIZE = 2048; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index bedd3d9d8f8..416566e7655 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp index b8d8917462c..4eac223e08e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp @@ -21,6 +21,8 @@ #include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/test_utils/env_vars.hpp" +#include "tt_cluster.hpp" + // TODO: ARCH_NAME specific, must remove #include "eth_l1_address_map.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp index a06c59ca543..2e7a24662d2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp @@ -23,6 +23,8 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +#include "tt_cluster.hpp" + // TODO: ARCH_NAME specific, must remove #include "eth_l1_address_map.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp index 5cc3d654981..ef049ae2f0a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp @@ -9,6 +9,7 @@ #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" +#include "tt_cluster.hpp" using namespace tt; // diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp index 660e43fa781..13eb1015602 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "dprint_server.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp index 31b1ff6d780..b15d222a21d 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "dprint_server.hpp" #include 
"tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp index 9e333537946..24580476130 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp index be56b013dde..a08ec04c278 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp index 930199dd4e7..caa962ab89e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp index 02f4ba02ab2..714e0b2af26 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp index bc4cb0b2896..4ab4568663b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp @@ -9,6 +9,7 @@ #include #include +#include #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp index 193e687648e..04ae58dc362 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp @@ -9,6 +9,7 @@ #include #include +#include #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp index df113d4c4d4..2ab7e642602 100644 --- a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp @@ -18,6 +18,7 @@ #include "logger.hpp" #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include "tt_cluster.hpp" using namespace tt; diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp index 8d5f455a4d2..69ba9810227 100644 --- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp +++ 
b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp @@ -13,6 +13,8 @@ #include "ttnn/tensor/layout/tensor_layout.hpp" #include "ttnn_multi_command_queue_fixture.hpp" +#include "tt_cluster.hpp" + using namespace tt; using namespace tt_metal; diff --git a/tt-train/tests/core/n300_utils_test.cpp b/tt-train/tests/core/n300_utils_test.cpp index 6dca6e9d811..e4f05a45bf0 100644 --- a/tt-train/tests/core/n300_utils_test.cpp +++ b/tt-train/tests/core/n300_utils_test.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include @@ -13,8 +14,9 @@ #include "core/tt_tensor_utils.hpp" auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } + class N300UtilsTest : public ::testing::Test { protected: void SetUp() override { diff --git a/tt-train/tests/model/linear_regression_ddp_test.cpp b/tt-train/tests/model/linear_regression_ddp_test.cpp index 082ebdba960..cb29f87b187 100644 --- a/tt-train/tests/model/linear_regression_ddp_test.cpp +++ b/tt-train/tests/model/linear_regression_ddp_test.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include @@ -22,7 +23,7 @@ namespace { auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } } // namespace diff --git a/tt-train/tests/modules/distributed/linear_test.cpp b/tt-train/tests/modules/distributed/linear_test.cpp index 39fc1c587f3..fb1c47c23be 100644 --- a/tt-train/tests/modules/distributed/linear_test.cpp +++ b/tt-train/tests/modules/distributed/linear_test.cpp @@ -5,6 +5,7 @@ #include "modules/distributed/linear.hpp" #include +#include #include #include @@ -16,7 +17,7 @@ namespace { auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } ttml::autograd::TensorPtr get_parameter(auto& parameters, const std::string& name_substring) { diff --git a/tt-train/tests/ops/distributed/comm_ops_test.cpp b/tt-train/tests/ops/distributed/comm_ops_test.cpp index e9ca096998e..e0d938d06eb 100644 --- a/tt-train/tests/ops/distributed/comm_ops_test.cpp +++ b/tt-train/tests/ops/distributed/comm_ops_test.cpp @@ -5,6 +5,7 @@ #include "ops/distributed/comm_ops.hpp" #include +#include #include #include @@ -17,7 +18,7 @@ namespace { auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } } // namespace diff --git a/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp b/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp index b52c099a586..ff3cf5f838d 100644 --- a/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp +++ b/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include @@ -17,7 +18,7 @@ namespace { auto check_board_is_n300() { - return tt::Cluster::instance().get_board_type(0) == BoardType::N300; + return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } class TrivialTnnFixedDistributedTest : public ::testing::Test { diff --git a/tt_fabric/CMakeLists.txt b/tt_fabric/CMakeLists.txt index 34add9c0350..23cd638d49d 100644 --- 
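Editorial note, not part of the patch content above or below: the reland of "Remove tt_cluster.hpp from public API" switches internal tt-metal sources to a private #include "tt_cluster.hpp" and updates out-of-tree callers, such as the tt-train tests shown here, to query the cluster descriptor directly instead of the tt::Cluster singleton. A small sketch of the updated board check, with the UMD header path stated as an assumption:

    // Assumed UMD header providing tt_ClusterDescriptor and BoardType.
    #include <umd/device/tt_cluster_descriptor.h>

    // Before this series: tt::Cluster::instance().get_board_type(0) == BoardType::N300
    // After, as in the tt-train tests above:
    bool board_is_n300() {
        return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300;
    }
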
a/tt_fabric/CMakeLists.txt +++ b/tt_fabric/CMakeLists.txt @@ -9,12 +9,18 @@ target_sources( mesh_graph.cpp ) -target_include_directories(tt_fabric PRIVATE .) +target_include_directories( + tt_fabric + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal/api/tt-metalium +) target_link_libraries( tt_fabric PRIVATE Metalium::Metal + Metalium::Metal::LLRT umd::device metal_common_libs magic_enum diff --git a/tt_fabric/control_plane.cpp b/tt_fabric/control_plane.cpp index 0bfede9f0a0..70bba401531 100644 --- a/tt_fabric/control_plane.cpp +++ b/tt_fabric/control_plane.cpp @@ -6,6 +6,8 @@ #include "control_plane.hpp" #include +#include "tt_cluster.hpp" + namespace tt::tt_fabric { // Get the physical chip ids for a mesh diff --git a/tt_fabric/mesh_graph.hpp b/tt_fabric/mesh_graph.hpp index 414b8947527..1b9ac9c6359 100644 --- a/tt_fabric/mesh_graph.hpp +++ b/tt_fabric/mesh_graph.hpp @@ -11,9 +11,11 @@ #include #include -#include #include +#include // tt::ARCH +#include // chip_id_t + namespace tt::tt_fabric { struct ChipSpec { tt::ARCH arch; diff --git a/tt_metal/api/tt-metalium/core_descriptor.hpp b/tt_metal/api/tt-metalium/core_descriptor.hpp index f403f7c23d6..9b45020a67d 100644 --- a/tt_metal/api/tt-metalium/core_descriptor.hpp +++ b/tt_metal/api/tt-metalium/core_descriptor.hpp @@ -5,10 +5,12 @@ #pragma once #include "core_coord.hpp" -#include "tt_cluster.hpp" #include "hal.hpp" #include "dispatch_core_common.hpp" +#include // tt::ARCH +#include // chip_id_t + namespace tt { struct core_descriptor_t { @@ -38,18 +40,8 @@ const core_descriptor_t& get_core_descriptor_config( const std::tuple& get_physical_worker_grid_config( chip_id_t chip, uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config); -inline std::optional get_storage_core_bank_size( - chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config) { - const core_descriptor_t& core_desc = get_core_descriptor_config(device_id, num_hw_cqs, dispatch_core_config); - const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); - if (core_desc.storage_core_bank_size.has_value()) { - TT_FATAL( - core_desc.storage_core_bank_size.value() % tt_metal::hal.get_alignment(tt_metal::HalMemType::L1) == 0, - "Storage core bank size must be {} B aligned", - tt_metal::hal.get_alignment(tt_metal::HalMemType::L1)); - } - return core_desc.storage_core_bank_size; -} +std::optional get_storage_core_bank_size( + chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config); inline const std::vector& get_logical_storage_cores( chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config) { diff --git a/tt_metal/api/tt-metalium/device.hpp b/tt_metal/api/tt-metalium/device.hpp index be8e9af943f..36df50bb957 100644 --- a/tt_metal/api/tt-metalium/device.hpp +++ b/tt_metal/api/tt-metalium/device.hpp @@ -15,7 +15,6 @@ #include "data_types.hpp" #include "program_device_map.hpp" #include "build.hpp" -#include "tt_cluster.hpp" #include "hal.hpp" #include "command_queue_interface.hpp" #include "sub_device_manager.hpp" diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 88dd1d44bc4..71cb322c39a 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -15,7 +15,6 @@ #include "data_types.hpp" #include "program_device_map.hpp" #include "build.hpp" -#include "tt_cluster.hpp" #include 
"hal.hpp" #include "command_queue_interface.hpp" #include "command_queue.hpp" diff --git a/tt_metal/api/tt-metalium/dispatch_core_common.hpp b/tt_metal/api/tt-metalium/dispatch_core_common.hpp index e6306d9238d..322d8d57641 100644 --- a/tt_metal/api/tt-metalium/dispatch_core_common.hpp +++ b/tt_metal/api/tt-metalium/dispatch_core_common.hpp @@ -9,6 +9,8 @@ #include "data_types.hpp" #include "reflection.hpp" +#include // CoreType + namespace tt::tt_metal { enum DispatchWorkerType : uint32_t { diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index 357e5220d16..fe91d61183f 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -7,12 +7,16 @@ #include #include #include +#include "dev_msgs.h" // go_msg_t #include "hal.hpp" -#include "tt_cluster.hpp" #include #include #include "umd/device/tt_core_coordinates.h" +namespace tt { +class Cluster; +} + namespace tt::tt_metal { // diff --git a/tt_metal/api/tt-metalium/hal_exp.hpp b/tt_metal/api/tt-metalium/hal_exp.hpp index a90a93cd8ea..5e14b0a5353 100644 --- a/tt_metal/api/tt-metalium/hal_exp.hpp +++ b/tt_metal/api/tt-metalium/hal_exp.hpp @@ -6,9 +6,17 @@ #include #include +#include namespace tt::tt_metal::experimental::hal { +/** + * @brief Uses the hardware abstraction layer to inform client of the architecture + * + * @return Architecture enum defined by UMD + */ +tt::ARCH get_arch(); + /** * @brief Uses the hardware abstraction layer to inform client of the architecture name * diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt index 551051ea52b..3a31f8e6e07 100644 --- a/tt_metal/common/CMakeLists.txt +++ b/tt_metal/common/CMakeLists.txt @@ -1,7 +1,6 @@ set(COMMON_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/core_assignment.cpp ${CMAKE_CURRENT_SOURCE_DIR}/core_coord.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/metal_soc_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape2d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape_base.cpp diff --git a/tt_metal/common/core_assignment.cpp b/tt_metal/common/core_assignment.cpp index 6131b31c9d8..0016850befe 100644 --- a/tt_metal/common/core_assignment.cpp +++ b/tt_metal/common/core_assignment.cpp @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "assert.hpp" #include "core_assignment.hpp" namespace tt { diff --git a/tt_metal/common/core_assignment.hpp b/tt_metal/common/core_assignment.hpp index 311a351d564..9ac23c17f28 100644 --- a/tt_metal/common/core_assignment.hpp +++ b/tt_metal/common/core_assignment.hpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include "core_coord.hpp" -#include + +#include // tt::ARCH namespace tt { namespace tt_metal { @@ -12,7 +13,7 @@ namespace tt_metal { // a DRAM read or write. // Worker cores are derived based on architecture, harvesting configurations and DRAM Controller placement. 
std::vector get_optimal_dram_to_physical_worker_assignment( - ARCH arch, + tt::ARCH arch, const std::vector& dram_phy_coords, uint32_t full_grid_size_x, uint32_t full_grid_size_y, diff --git a/tt_metal/distributed/CMakeLists.txt b/tt_metal/distributed/CMakeLists.txt index 62f068ca7cc..ba9dbb1a442 100644 --- a/tt_metal/distributed/CMakeLists.txt +++ b/tt_metal/distributed/CMakeLists.txt @@ -17,5 +17,6 @@ target_link_libraries( common PRIVATE Metalium::Metal::Impl + Metalium::Metal::LLRT TT::Metalium::HostDevCommon ) diff --git a/tt_metal/distributed/coordinate_translation.cpp b/tt_metal/distributed/coordinate_translation.cpp index 5e4be86b0b8..e834ae37e2d 100644 --- a/tt_metal/distributed/coordinate_translation.cpp +++ b/tt_metal/distributed/coordinate_translation.cpp @@ -4,6 +4,8 @@ #include "tt_metal/distributed/coordinate_translation.hpp" +#include "tt_cluster.hpp" + #include namespace tt::tt_metal::distributed { diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index d19911a3112..e60010e150a 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -14,6 +14,7 @@ #include "tt_metal/impl/program/dispatch.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" +#include "tt_cluster.hpp" namespace tt::tt_metal::distributed { struct MeshReadEventDescriptor { diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index 45185381ba6..e5399de7d69 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -7,6 +7,8 @@ #include "umd/device/types/cluster_descriptor_types.h" #include "tt_metal/distributed/coordinate_translation.hpp" +#include "tt_cluster.hpp" + namespace tt::tt_metal::distributed { class SystemMesh::Impl { diff --git a/tt_metal/experimental/hal.cpp b/tt_metal/experimental/hal.cpp index a93cfc65c70..d67c8d87e9c 100644 --- a/tt_metal/experimental/hal.cpp +++ b/tt_metal/experimental/hal.cpp @@ -17,6 +17,8 @@ using tt::tt_metal::HalSingleton; namespace tt::tt_metal::experimental::hal { +tt::ARCH get_arch() { return HalSingleton::getInstance().get_arch(); } + std::string get_arch_name() { auto arch_enum = HalSingleton::getInstance().get_arch(); return tt::get_string_lowercase(arch_enum); diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index 56b9e2a8c57..8655c830709 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -9,6 +9,8 @@ #include #include +#include "tt_cluster.hpp" + namespace tt::tt_metal { namespace buffer_dispatch { diff --git a/tt_metal/impl/buffers/global_circular_buffer.cpp b/tt_metal/impl/buffers/global_circular_buffer.cpp index 9759c6314ae..10974d388f9 100644 --- a/tt_metal/impl/buffers/global_circular_buffer.cpp +++ b/tt_metal/impl/buffers/global_circular_buffer.cpp @@ -18,6 +18,8 @@ #include #include +#include "tt_cluster.hpp" + namespace tt::tt_metal { namespace v1 { diff --git a/tt_metal/impl/buffers/global_semaphore.cpp b/tt_metal/impl/buffers/global_semaphore.cpp index 96164f64871..7102161571e 100644 --- a/tt_metal/impl/buffers/global_semaphore.cpp +++ b/tt_metal/impl/buffers/global_semaphore.cpp @@ -18,6 +18,8 @@ #include #include +#include "tt_cluster.hpp" + namespace tt::tt_metal { GlobalSemaphore::GlobalSemaphore( diff --git a/tt_metal/impl/debug/watcher_server.hpp b/tt_metal/impl/debug/watcher_server.hpp index 79f6680d4de..38a16e3c8ce 100644 --- a/tt_metal/impl/debug/watcher_server.hpp +++ 
b/tt_metal/impl/debug/watcher_server.hpp @@ -6,6 +6,8 @@ #include +struct metal_SocDescriptor; + namespace tt { void watcher_init(tt_metal::IDevice* device); diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index cd73f565e73..a269e823dd3 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -23,6 +23,8 @@ #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include "tt_metal/jit_build/build_env_manager.hpp" +#include "tt_cluster.hpp" + using namespace tt::tt_metal; namespace tt { diff --git a/tt_metal/impl/dispatch/debug_tools.cpp b/tt_metal/impl/dispatch/debug_tools.cpp index 95707965738..fc8980679e3 100644 --- a/tt_metal/impl/dispatch/debug_tools.cpp +++ b/tt_metal/impl/dispatch/debug_tools.cpp @@ -3,6 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #include "debug_tools.hpp" + +#include "tt_cluster.hpp" + namespace internal { using namespace tt::tt_metal; diff --git a/tt_metal/impl/dispatch/hardware_command_queue.cpp b/tt_metal/impl/dispatch/hardware_command_queue.cpp index 8a72db6e742..d0aa1824264 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.cpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.cpp @@ -12,6 +12,8 @@ #include #include +#include "tt_cluster.hpp" + // Because we are a Friend of Program, accessing Program::get_program_transfer_info() and Program::get_kernels_buffer() // MUST REMOVE #include diff --git a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp index 33d394abf91..d60d15c991b 100644 --- a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp +++ b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp @@ -6,6 +6,7 @@ #include #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" +#include "tt_cluster.hpp" #define UNUSED_LOGICAL_CORE tt_cxy_pair(device_->id(), 0, 0) #define UNUSED_SEM_ID 0 diff --git a/tt_metal/impl/dispatch/topology.cpp b/tt_metal/impl/dispatch/topology.cpp index 6a9ff796669..b8eff2dd822 100644 --- a/tt_metal/impl/dispatch/topology.cpp +++ b/tt_metal/impl/dispatch/topology.cpp @@ -15,6 +15,8 @@ #include "kernel_config/eth_router.hpp" #include "kernel_config/eth_tunneler.hpp" +#include "tt_cluster.hpp" + namespace tt::tt_metal { // For readablity, unset = x = -1 diff --git a/tt_metal/impl/event/dispatch.cpp b/tt_metal/impl/event/dispatch.cpp index 36a62181c60..dad0f24cb7e 100644 --- a/tt_metal/impl/event/dispatch.cpp +++ b/tt_metal/impl/event/dispatch.cpp @@ -7,6 +7,8 @@ #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include +#include "tt_cluster.hpp" + namespace tt::tt_metal { namespace event_dispatch { diff --git a/tt_metal/impl/sub_device/sub_device_manager.cpp b/tt_metal/impl/sub_device/sub_device_manager.cpp index 042e46ae828..0a29d896618 100644 --- a/tt_metal/impl/sub_device/sub_device_manager.cpp +++ b/tt_metal/impl/sub_device/sub_device_manager.cpp @@ -20,6 +20,8 @@ #include #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" +#include "tt_cluster.hpp" + namespace tt::tt_metal { // assert here to avoid the need to include command_queue_interface.hpp in header diff --git a/tt_metal/jit_build/CMakeLists.txt b/tt_metal/jit_build/CMakeLists.txt index 9d15f575899..80533221018 100644 --- a/tt_metal/jit_build/CMakeLists.txt +++ b/tt_metal/jit_build/CMakeLists.txt @@ -14,6 +14,7 @@ target_link_libraries( common PRIVATE Metalium::Metal::Common + Metalium::Metal::LLRT Tracy::TracyClient Taskflow::Taskflow TT::Metalium::HostDevCommon diff --git 
a/tt_metal/jit_build/build_env_manager.cpp b/tt_metal/jit_build/build_env_manager.cpp index 6cb7d59e105..0d0c0217ac0 100644 --- a/tt_metal/jit_build/build_env_manager.cpp +++ b/tt_metal/jit_build/build_env_manager.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include "build_env_manager.hpp" -#include #include +#include "tt_cluster.hpp" namespace tt::tt_metal { diff --git a/tt_metal/llrt/CMakeLists.txt b/tt_metal/llrt/CMakeLists.txt index 3f60ed70a06..439492cc309 100644 --- a/tt_metal/llrt/CMakeLists.txt +++ b/tt_metal/llrt/CMakeLists.txt @@ -82,6 +82,7 @@ target_link_libraries( set(LLRT_SRC ${CMAKE_CURRENT_SOURCE_DIR}/llrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/rtoptions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tlb_config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tt_cluster.cpp @@ -105,6 +106,7 @@ target_link_libraries( Tracy::TracyClient nlohmann_json::nlohmann_json Reflect::Reflect + yaml-cpp::yaml-cpp magic_enum span common diff --git a/tt_metal/common/core_descriptor.cpp b/tt_metal/llrt/core_descriptor.cpp similarity index 94% rename from tt_metal/common/core_descriptor.cpp rename to tt_metal/llrt/core_descriptor.cpp index a54e5fbe818..99fd72ec096 100644 --- a/tt_metal/common/core_descriptor.cpp +++ b/tt_metal/llrt/core_descriptor.cpp @@ -4,6 +4,7 @@ #include "core_descriptor.hpp" #include "rtoptions.hpp" +#include "tt_cluster.hpp" #include "yaml-cpp/yaml.h" @@ -241,4 +242,17 @@ const std::tuple& get_physical_worker_grid_config( return physical_grid_config_cache.at(config_hash); } +std::optional get_storage_core_bank_size( + chip_id_t device_id, const uint8_t num_hw_cqs, const tt_metal::DispatchCoreConfig& dispatch_core_config) { + const core_descriptor_t& core_desc = get_core_descriptor_config(device_id, num_hw_cqs, dispatch_core_config); + const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); + if (core_desc.storage_core_bank_size.has_value()) { + TT_FATAL( + core_desc.storage_core_bank_size.value() % tt_metal::hal.get_alignment(tt_metal::HalMemType::L1) == 0, + "Storage core bank size must be {} B aligned", + tt_metal::hal.get_alignment(tt_metal::HalMemType::L1)); + } + return core_desc.storage_core_bank_size; +} + } // namespace tt diff --git a/tt_metal/api/tt-metalium/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp similarity index 79% rename from tt_metal/api/tt-metalium/tt_cluster.hpp rename to tt_metal/llrt/tt_cluster.hpp index cecb702cda6..666e9fa4eed 100644 --- a/tt_metal/api/tt-metalium/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -39,20 +39,20 @@ enum class TargetDevice : std::uint8_t { }; class Cluster { - public: - Cluster &operator=(const Cluster &) = delete; - Cluster &operator=(Cluster &&other) noexcept = delete; - Cluster(const Cluster &) = delete; - Cluster(Cluster &&other) noexcept = delete; +public: + Cluster& operator=(const Cluster&) = delete; + Cluster& operator=(Cluster&& other) noexcept = delete; + Cluster(const Cluster&) = delete; + Cluster(Cluster&& other) noexcept = delete; - static const Cluster &instance(); + static const Cluster& instance(); // For TG Galaxy systems, mmio chips are gateway chips that are only used for dispatc, so user_devices are meant for // user facing host apis size_t number_of_user_devices() const { if (this->is_tg_cluster_) { - const auto &chips = this->cluster_desc_->get_all_chips(); - return std::count_if(chips.begin(), chips.end(), [&](const auto &id) { + const auto& chips = this->cluster_desc_->get_all_chips(); + return std::count_if(chips.begin(), 
chips.end(), [&](const auto& id) { return this->cluster_desc_->get_board_type(id) == BoardType::GALAXY; }); } else { @@ -68,10 +68,12 @@ class Cluster { ARCH arch() const { return this->arch_; } - const metal_SocDescriptor &get_soc_desc(chip_id_t chip) const; - CoreCoord get_virtual_coordinate_from_logical_coordinates(chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const; + const metal_SocDescriptor& get_soc_desc(chip_id_t chip) const; + CoreCoord get_virtual_coordinate_from_logical_coordinates( + chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const; CoreCoord get_virtual_coordinate_from_physical_coordinates(chip_id_t chip_id, CoreCoord physical_coord) const; - tt_cxy_pair get_virtual_coordinate_from_logical_coordinates(tt_cxy_pair logical_coordinate, const CoreType& core_type) const; + tt_cxy_pair get_virtual_coordinate_from_logical_coordinates( + tt_cxy_pair logical_coordinate, const CoreType& core_type) const; CoreCoord get_physical_coordinate_from_logical_coordinates( chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type, bool no_warn = false) const; const std::unordered_set& get_virtual_worker_cores(chip_id_t chip_id) const; @@ -83,14 +85,15 @@ class Cluster { } //! device driver and misc apis - void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions) const; + void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions) const; - void deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const; - void assert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const; + void deassert_risc_reset_at_core(const tt_cxy_pair& physical_chip_coord) const; + void assert_risc_reset_at_core(const tt_cxy_pair& physical_chip_coord) const; - void write_dram_vec(std::vector &vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const; + void write_dram_vec( + std::vector& vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const; void read_dram_vec( - std::vector &vec, + std::vector& vec, uint32_t size_in_bytes, tt_target_dram dram, uint64_t addr, @@ -98,48 +101,52 @@ class Cluster { // Accepts physical noc coordinates void write_core( - const void *mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; + const void* mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; void read_core( - void *mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; + void* mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; void read_core( - std::vector &data, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const; + std::vector& data, + uint32_t sz_in_bytes, + tt_cxy_pair core, + uint64_t addr, + bool small_access = false) const; - std::optional> get_tlb_data(const tt_cxy_pair &target) const { - tt::umd::Cluster *device = dynamic_cast(driver_.get()); + std::optional> get_tlb_data(const tt_cxy_pair& target) const { + tt::umd::Cluster* device = dynamic_cast(driver_.get()); tt::umd::CoreCoord target_coord = get_soc_desc(target.chip).get_coord_at(target, CoordSystem::TRANSLATED); return device->get_tlb_data_from_target(target.chip, target_coord); } - std::function get_fast_pcie_static_tlb_write_callable( - int chip_id) const { + std::function get_fast_pcie_static_tlb_write_callable(int chip_id) const { chip_id_t mmio_device_id = 
device_to_mmio_device_.at(chip_id); - tt::umd::Cluster *device = dynamic_cast(driver_.get()); + tt::umd::Cluster* device = dynamic_cast(driver_.get()); return device->get_fast_pcie_static_tlb_write_callable(mmio_device_id); } // Returns a writer object which holds a pointer to a static tlb - // Allows for fast writes when targeting same device core by only doing the lookup once and avoiding repeated stack traversals + // Allows for fast writes when targeting same device core by only doing the lookup once and avoiding repeated stack + // traversals tt::Writer get_static_tlb_writer(tt_cxy_pair target) const { - tt::umd::Cluster *device = dynamic_cast(driver_.get()); + tt::umd::Cluster* device = dynamic_cast(driver_.get()); tt::umd::CoreCoord target_coord = get_soc_desc(target.chip).get_coord_at(target, CoordSystem::TRANSLATED); return device->get_static_tlb_writer(target.chip, target_coord); } std::uint32_t get_numa_node_for_device(uint32_t device_id) const { uint32_t mmio_device_id = this->get_associated_mmio_device(device_id); - tt::umd::Cluster *device = dynamic_cast(driver_.get()); + tt::umd::Cluster* device = dynamic_cast(driver_.get()); return driver_->get_numa_node_for_pcie_device(mmio_device_id); } - void write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const; - void read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr) const; + void write_reg(const std::uint32_t* mem_ptr, tt_cxy_pair target, uint64_t addr) const; + void read_reg(std::uint32_t* mem_ptr, tt_cxy_pair target, uint64_t addr) const; void write_sysmem( - const void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; + const void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; void read_sysmem( - void *mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; + void* mem_ptr, uint32_t size_in_bytes, uint64_t addr, chip_id_t src_device_id, uint16_t channel) const; - int get_device_aiclk(const chip_id_t &chip_id) const; + int get_device_aiclk(const chip_id_t& chip_id) const; void dram_barrier(chip_id_t chip_id) const; void l1_barrier(chip_id_t chip_id) const; @@ -147,7 +154,7 @@ class Cluster { uint32_t get_num_host_channels(chip_id_t device_id) const; uint32_t get_host_channel_size(chip_id_t device_id, uint32_t channel) const; // Returns address in host space - void *host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + void* host_dma_address(uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; uint64_t get_pcie_base_addr_from_device(chip_id_t chip_id) const; // Ethernet cluster api @@ -170,12 +177,11 @@ class Cluster { // get_ethernet_sockets(a, b)[0] is connected to get_ethernet_sockets(b, a)[0] std::vector get_ethernet_sockets(chip_id_t local_chip, chip_id_t remote_chip) const; // Converts logical ethernet core coord to physical ethernet core coord - CoreCoord ethernet_core_from_logical_core(chip_id_t chip_id, const CoreCoord &logical_core) const; + CoreCoord ethernet_core_from_logical_core(chip_id_t chip_id, const CoreCoord& logical_core) const; // Returns virtual eth coord from channel CoreCoord get_virtual_eth_core_from_channel(chip_id_t chip_id, int channel) const; - // Bookkeeping for mmio device tunnels uint32_t get_mmio_device_max_tunnel_depth(chip_id_t mmio_device) const; uint32_t get_mmio_device_tunnel_count(chip_id_t mmio_device) const; @@ -186,7 +192,8 @@ class Cluster { tt_cxy_pair 
get_eth_core_for_dispatch_core( tt_cxy_pair logical_dispatch_core, EthRouterMode mode, chip_id_t connected_chip_id) const; - std::tuple get_eth_tunnel_core(chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const; + std::tuple get_eth_tunnel_core( + chip_id_t upstream_chip_id, chip_id_t downstream_chip_id, EthRouterMode mode) const; // Internal routing for SD and FD enables launching user ethernet kernels and FD tunneling for all devices in the // cluster. When using multiple devices in a cluster, this should be the flow: @@ -196,14 +203,13 @@ class Cluster { // set_internal_routing_info_for_ethernet_cores(false); // CloseDevice(0) // CloseDevice(1) - void set_internal_routing_info_for_ethernet_cores(bool enable_internal_routing, const std::vector& target_mmio_devices = {}) const; - + void set_internal_routing_info_for_ethernet_cores( + bool enable_internal_routing, const std::vector& target_mmio_devices = {}) const; std::unordered_map>> - get_ethernet_connections() const { - return this->cluster_desc_->get_ethernet_connections(); - } - + get_ethernet_connections() const { + return this->cluster_desc_->get_ethernet_connections(); + } // Returns MMIO device ID (logical) that controls given `device_id`. If `device_id` is MMIO device it is returned. chip_id_t get_associated_mmio_device(chip_id_t device_id) const { @@ -215,7 +221,7 @@ class Cluster { } // Returns collection of devices that are controlled by the specified MMIO device inclusive of the MMIO device - const std::set &get_devices_controlled_by_mmio_device(chip_id_t mmio_device_id) const { + const std::set& get_devices_controlled_by_mmio_device(chip_id_t mmio_device_id) const { TT_ASSERT( this->devices_grouped_by_assoc_mmio_device_.count(mmio_device_id), "Expected device {} to be an MMIO device!", @@ -239,8 +245,8 @@ class Cluster { // Returns Wormhole chip board type. BoardType get_board_type(chip_id_t chip_id) const; - bool is_worker_core(const CoreCoord &core, chip_id_t chip_id) const; - bool is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) const; + bool is_worker_core(const CoreCoord& core, chip_id_t chip_id) const; + bool is_ethernet_core(const CoreCoord& core, chip_id_t chip_id) const; CoreCoord get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const; // These two functions should be removed in favor of direct translation. 
@@ -248,7 +254,8 @@ class Cluster { const std::unordered_map get_worker_logical_to_virtual_y(chip_id_t chip_id) const; const std::unordered_map& get_virtual_routing_to_profiler_flat_id(chip_id_t chip_id) const; - private: + +private: Cluster(); ~Cluster(); @@ -256,14 +263,13 @@ class Cluster { void generate_cluster_descriptor(); void initialize_device_drivers(); void assert_risc_reset(); - void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set &controlled_device_ids); - void open_driver( - const bool &skip_driver_allocs = false); - void start_driver(tt_device_params &device_params) const; + void assign_mem_channels_to_devices(chip_id_t mmio_device_id, const std::set& controlled_device_ids); + void open_driver(const bool& skip_driver_allocs = false); + void start_driver(tt_device_params& device_params) const; void get_metal_desc_from_tt_desc( - const std::unordered_map &input, - const std::unordered_map &per_chip_id_harvesting_masks); + const std::unordered_map& input, + const std::unordered_map& per_chip_id_harvesting_masks); void generate_virtual_to_umd_coord_mapping(); void generate_virtual_to_profiler_flat_id_mapping(); @@ -326,4 +332,4 @@ class Cluster { } // namespace tt -std::ostream &operator<<(std::ostream &os, tt_target_dram const &dram); +std::ostream& operator<<(std::ostream& os, const tt_target_dram& dram); diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp index a8b1db8196b..3d684c08996 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp @@ -12,9 +12,13 @@ #include "ttnn/operations/data_movement/slice/slice.hpp" #include "ttnn/operations/data_movement/concat/concat.hpp" +#include "tt-metalium/hal_exp.hpp" + namespace ttnn { namespace ccl { +using namespace tt::tt_metal::experimental; + void SyncModeSpec::add_signal(uint32_t sem_id, uint32_t wait_count) { this->sem_ids.push_back(sem_id); this->wait_counts.push_back(wait_count); @@ -213,8 +217,8 @@ void generate_edm_kernels_for_ring_or_linear_topology( std::vector const& counter_clockwise_edm_builders, std::optional receiver_device_id, std::optional sender_device_id) { - auto sender_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(tt::Cluster::instance().arch()); - auto receiver_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(tt::Cluster::instance().arch()); + auto sender_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(hal::get_arch()); + auto receiver_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(hal::get_arch()); uint32_t sender_socket_idx = 0; uint32_t receiver_socket_idx = 0; if (receiver_device_id == sender_device_id) { diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp index 4964b963bf1..7429ff9efa9 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp @@ -11,11 +11,14 @@ #include #include +#include "tt-metalium/hal_exp.hpp" + namespace ttnn { namespace operations { using namespace tt; using namespace tt::tt_metal; +using namespace tt::tt_metal::experimental; using namespace constants; std::tuple add_core_offset( @@ -102,7 +105,7 @@ std::tuple #include "tools/profiler/op_profiler.hpp" +#include // tt_ClusterDescriptor + namespace tt { using namespace constants; namespace operations { From ac426de3d4a9c274964843fdae6aa83ea3960a30 Mon Sep 17 00:00:00 2001 From: Stuti Raizada 
<159130512+sraizada-tt@users.noreply.github.com> Date: Thu, 13 Feb 2025 12:09:59 -0600 Subject: [PATCH 105/316] [skip ci] #0: Ipdate matmul config arg in TG Llama3 --- models/demos/llama3/tt/model_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/demos/llama3/tt/model_config.py b/models/demos/llama3/tt/model_config.py index dceb72a2ecf..d93dd3949c1 100644 --- a/models/demos/llama3/tt/model_config.py +++ b/models/demos/llama3/tt/model_config.py @@ -1402,7 +1402,7 @@ def matmul_1d_config( grid = ttnn.CoreGrid(x=grid.x, y=grid_y) per_core_m = m // tile_height - per_core_k = (self.find_largest_divisor(k // (self.tile_size * grid.num_cores)),) + per_core_k = self.find_largest_divisor(k // (self.tile_size * grid.num_cores)) per_core_n = math.ceil(n / tile_width / grid.num_cores) if is_fp32_accumulate: From 941b34cff33ce2953cf984ec8898af25dbfbfbb3 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Thu, 13 Feb 2025 18:30:42 -0500 Subject: [PATCH 106/316] Use the namespaced target for magic_enum (#17879) ### Ticket #15795 ### Problem description We always should use the namespaced target, and we MUST use the namespaced target when using an externally provided dependency. ### What's changed magic_enum -> magic_enum::magic_enum --- tests/CMakeLists.txt | 2 +- tt-train/sources/ttml/CMakeLists.txt | 2 +- tt_fabric/CMakeLists.txt | 2 +- tt_metal/CMakeLists.txt | 2 +- tt_metal/common/CMakeLists.txt | 2 +- tt_metal/llrt/CMakeLists.txt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 034ec2c7051..921e87e4ae0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,7 +8,7 @@ target_link_libraries( pthread gmock_main nlohmann_json::nlohmann_json - magic_enum + magic_enum::magic_enum fmt::fmt-header-only span small_vector diff --git a/tt-train/sources/ttml/CMakeLists.txt b/tt-train/sources/ttml/CMakeLists.txt index 14c315e6e20..39cc5f7c034 100644 --- a/tt-train/sources/ttml/CMakeLists.txt +++ b/tt-train/sources/ttml/CMakeLists.txt @@ -104,7 +104,7 @@ target_link_libraries( Metalium::TTNN Python::Python fmt::fmt-header-only - magic_enum + magic_enum::magic_enum yaml-cpp::yaml-cpp xtensor xtensor-blas diff --git a/tt_fabric/CMakeLists.txt b/tt_fabric/CMakeLists.txt index 23cd638d49d..aa32e36a7e9 100644 --- a/tt_fabric/CMakeLists.txt +++ b/tt_fabric/CMakeLists.txt @@ -23,7 +23,7 @@ target_link_libraries( Metalium::Metal::LLRT umd::device metal_common_libs - magic_enum + magic_enum::magic_enum fmt::fmt-header-only yaml-cpp::yaml-cpp ) diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index bee22b18640..1802aeeaf99 100644 --- a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -13,7 +13,7 @@ target_link_libraries( tt_metal PUBLIC umd::device - magic_enum + magic_enum::magic_enum fmt::fmt-header-only span small_vector diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt index 3a31f8e6e07..28f27de3edf 100644 --- a/tt_metal/common/CMakeLists.txt +++ b/tt_metal/common/CMakeLists.txt @@ -17,7 +17,7 @@ target_link_libraries( common PUBLIC nlohmann_json::nlohmann_json - magic_enum + magic_enum::magic_enum fmt::fmt-header-only span small_vector diff --git a/tt_metal/llrt/CMakeLists.txt b/tt_metal/llrt/CMakeLists.txt index 439492cc309..7e24a412cf3 100644 --- a/tt_metal/llrt/CMakeLists.txt +++ b/tt_metal/llrt/CMakeLists.txt @@ -107,7 +107,7 @@ target_link_libraries( nlohmann_json::nlohmann_json Reflect::Reflect yaml-cpp::yaml-cpp - magic_enum + magic_enum::magic_enum 
span common ) From 75429dc27567a6d15b4c09d425ebe8aec245f3d2 Mon Sep 17 00:00:00 2001 From: VirdhatchaniKN Date: Thu, 13 Feb 2025 11:10:27 +0000 Subject: [PATCH 107/316] #17864: QueueId support for Batch Norm --- .../unit_tests/operations/test_batch_norm.py | 40 +++++++++++++++++++ .../normalization/batch_norm/batch_norm.cpp | 3 +- .../normalization/batch_norm/batch_norm.hpp | 3 +- .../batch_norm/batch_norm_pybind.cpp | 6 +-- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_batch_norm.py b/tests/ttnn/unit_tests/operations/test_batch_norm.py index 377e32bc0af..8d0422f36ac 100644 --- a/tests/ttnn/unit_tests/operations/test_batch_norm.py +++ b/tests/ttnn/unit_tests/operations/test_batch_norm.py @@ -367,3 +367,43 @@ def test_batch_norm_program_cache_and_default(input_shapes, mem_layout, device): torch_result = torch.nn.functional.batch_norm(input=in_data, running_mean=mean_data, running_var=var_data) comp_BN_Output = compare_results_batch_norm([tt_output], [torch_result]) assert comp_BN_Output + + +@pytest.mark.parametrize( + "input_shapes", + [ + torch.Size([3, 2, 32, 32]), + ], +) +def test_batch_norm_qid_Default(input_shapes, device): + N, H, W, C = input_shapes + in_data, input_tensor = data_gen_with_range_batch_norm(input_shapes, 5, 10, device, is_input=True) + mean_data, mean_tensor = data_gen_with_range_batch_norm(input_shapes, 4, 10, device) + var_data, var_tensor = data_gen_with_range_batch_norm(input_shapes, 4, 20, device) + + tt_output_tensor_on_device = ttnn.batch_norm( + input_tensor, running_mean=mean_tensor, running_var=var_tensor, queue_id=0 + ) + tt_output = ttnn.to_torch(tt_output_tensor_on_device) + torch_result = torch.nn.functional.batch_norm(input=in_data, running_mean=mean_data, running_var=var_data) + comp_BN_Output = compare_results_batch_norm([tt_output], [torch_result]) + assert comp_BN_Output + + +@pytest.mark.parametrize( + "input_shapes", + [ + torch.Size([3, 2, 32, 32]), + ], +) +def test_batch_norm_qid(input_shapes, device): + N, H, W, C = input_shapes + in_data, input_tensor = data_gen_with_range_batch_norm(input_shapes, 2, 10, device, is_input=True) + mean_data, mean_tensor = data_gen_with_range_batch_norm(input_shapes, 2, 10, device) + var_data, var_tensor = data_gen_with_range_batch_norm(input_shapes, 2, 20, device) + + tt_output_tensor_on_device = ttnn.batch_norm(input_tensor, running_mean=mean_tensor, running_var=var_tensor) + tt_output = ttnn.to_torch(tt_output_tensor_on_device) + torch_result = torch.nn.functional.batch_norm(input=in_data, running_mean=mean_data, running_var=var_data) + comp_BN_Output = compare_results_batch_norm([tt_output], [torch_result]) + assert comp_BN_Output diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm.cpp index 013bb132d01..55a4df1f82f 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm.cpp @@ -30,7 +30,8 @@ Tensor BatchNorm::invoke( const std::optional& weight, const std::optional& bias, const std::optional& output, - const std::optional& memory_config) { + const std::optional& memory_config, + QueueId queue_id) { Tensor batch_mean = mean_NHW(input, memory_config); Tensor mean_sq = mean_NHW(ttnn::square(input, memory_config), memory_config); Tensor batch_var = ttnn::subtract(mean_sq, ttnn::square(batch_mean, memory_config), std::nullopt, memory_config); diff --git 
a/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm.hpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm.hpp index df4d0029915..09010c4bf43 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm.hpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm.hpp @@ -19,7 +19,8 @@ struct BatchNorm { const std::optional& weight = std::nullopt, const std::optional& bias = std::nullopt, const std::optional& output = std::nullopt, - const std::optional& memory_config = std::nullopt); + const std::optional& memory_config = std::nullopt, + QueueId queue_id = DefaultQueueId); }; } // namespace operations::normalization diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm_pybind.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm_pybind.cpp index 0a9250ac123..537030d1828 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/batch_norm_pybind.cpp @@ -31,6 +31,7 @@ void bind_batch_norm_operation(pybind11::module& module) { training (bool, optional): Selection between training mode and inference (evaluation) mode. Defaults to `False` (Inference mode). output (ttnn.Tensor, optional): Preallocated output tensor to store batch norm result of shape `[N, C, H, W]`. Defaults to `None`. memory_config (ttnn.MemoryConfig, optional): memory configuration for the operation. Defaults to `None`. + queue_id (int, optional): command queue id. Defaults to 0. Returns: @@ -75,8 +76,7 @@ void bind_batch_norm_operation(pybind11::module& module) { py::arg("weight") = std::nullopt, py::arg("bias") = std::nullopt, py::arg("output") = std::nullopt, - py::arg("memory_config") = std::nullopt - - }); + py::arg("memory_config") = std::nullopt, + py::arg("queue_id") = DefaultQueueId}); } } // namespace ttnn::operations::normalization::detail From 5be82c8dafefc9c4db8daa96540ee87ded9f6565 Mon Sep 17 00:00:00 2001 From: VirdhatchaniKN Date: Thu, 13 Feb 2025 12:28:06 +0000 Subject: [PATCH 108/316] #12253: Add test for optional output tensor in BN --- .../unit_tests/operations/test_batch_norm.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/ttnn/unit_tests/operations/test_batch_norm.py b/tests/ttnn/unit_tests/operations/test_batch_norm.py index 8d0422f36ac..fc2ab1abb6c 100644 --- a/tests/ttnn/unit_tests/operations/test_batch_norm.py +++ b/tests/ttnn/unit_tests/operations/test_batch_norm.py @@ -407,3 +407,23 @@ def test_batch_norm_qid(input_shapes, device): torch_result = torch.nn.functional.batch_norm(input=in_data, running_mean=mean_data, running_var=var_data) comp_BN_Output = compare_results_batch_norm([tt_output], [torch_result]) assert comp_BN_Output + + +@pytest.mark.parametrize( + "input_shapes", + [ + torch.Size([2, 3, 120, 120]), + ], +) +def test_batch_norm_output_Default(input_shapes, device): + N, H, W, C = input_shapes + _, tt_output_tensor = data_gen_with_range_batch_norm(input_shapes, 5, 10, device, is_input=True) + in_data, input_tensor = data_gen_with_range_batch_norm(input_shapes, 5, 10, device, is_input=True) + mean_data, mean_tensor = data_gen_with_range_batch_norm(input_shapes, 4, 10, device) + var_data, var_tensor = data_gen_with_range_batch_norm(input_shapes, 4, 20, device) + + ttnn.batch_norm(input_tensor, running_mean=mean_tensor, running_var=var_tensor, queue_id=0, output=tt_output_tensor) + tt_output = ttnn.to_torch(tt_output_tensor) + torch_result = torch.nn.functional.batch_norm(input=in_data, 
running_mean=mean_data, running_var=var_data) + comp_BN_Output = compare_results_batch_norm([tt_output], [torch_result]) + assert comp_BN_Output From 10258e368ed303a3c00fc6007561ec187fd39e37 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Fri, 14 Feb 2025 12:14:32 -0500 Subject: [PATCH 109/316] #0: Remove the unused mesh register functionality (#17860) ### Ticket N/A ### Problem description The `assigned_mesh_device_devices_` and `assigned_devices_` in `SystemMesh` appear unused. ### What's changed Remove the data members and the associated mesh "register" functionality. ### Checklist - [X] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13302294435) - one unrelated test failure. --- tt_metal/api/tt-metalium/system_mesh.hpp | 1 - tt_metal/distributed/mesh_device.cpp | 2 -- tt_metal/distributed/system_mesh.cpp | 19 ------------------- 3 files changed, 22 deletions(-) diff --git a/tt_metal/api/tt-metalium/system_mesh.hpp b/tt_metal/api/tt-metalium/system_mesh.hpp index 841e95691d6..64c040edf82 100644 --- a/tt_metal/api/tt-metalium/system_mesh.hpp +++ b/tt_metal/api/tt-metalium/system_mesh.hpp @@ -39,7 +39,6 @@ class SystemMesh { // Get the physical device IDs mapped to a MeshDevice std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; - void register_mesh_device(const std::shared_ptr& mesh_device, const std::vector& devices); }; } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index eb4bc712a70..04edd94373b 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -165,7 +165,6 @@ std::shared_ptr MeshDevice::create_submesh(const MeshShape& submesh_ auto submesh_devices = view_->get_devices(start_coordinate, end_coordinate); submesh->view_ = std::make_unique(submesh_devices, submesh_shape); - SystemMesh::instance().register_mesh_device(submesh, submesh_devices); submeshes_.push_back(submesh); log_trace( LogMetal, @@ -598,7 +597,6 @@ bool MeshDevice::initialize( tt::stl::Span l1_bank_remap, bool minimal) { view_ = std::make_unique(scoped_devices_->get_devices(), mesh_shape_); - SystemMesh::instance().register_mesh_device(shared_from_this(), this->get_devices()); // For MeshDevice, we support uniform sub-devices across all devices and we do not support ethernet subdevices. 
const auto& compute_grid_size = this->compute_with_storage_grid_size(); diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index e5399de7d69..c90fed6f897 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -13,9 +13,6 @@ namespace tt::tt_metal::distributed { class SystemMesh::Impl { private: - std::unordered_map> assigned_devices_; - std::unordered_map> assigned_mesh_device_devices_; - MeshShape logical_mesh_shape_; CoordinateTranslationMap logical_to_physical_coordinates_; std::unordered_map logical_to_device_id_; @@ -33,7 +30,6 @@ class SystemMesh::Impl { std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; IDevice* get_device(const chip_id_t physical_device_id) const; - void register_mesh_device(const std::shared_ptr& mesh_device, const std::vector& devices); chip_id_t get_physical_device_id(size_t logical_row_idx, size_t logical_col_idx) const; }; @@ -202,16 +198,6 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me return physical_device_ids; } -void SystemMesh::Impl::register_mesh_device( - const std::shared_ptr& mesh_device, const std::vector& devices) { - std::vector physical_device_ids; - for (auto device : devices) { - physical_device_ids.push_back(device->id()); - } - assigned_mesh_device_devices_.insert({mesh_device->id(), mesh_device}); - assigned_devices_.insert({mesh_device->id(), physical_device_ids}); -} - std::vector SystemMesh::Impl::request_available_devices(const MeshDeviceConfig& config) const { auto [requested_num_rows, requested_num_cols] = config.mesh_shape; auto [max_num_rows, max_num_cols] = logical_mesh_shape_; @@ -248,11 +234,6 @@ const MeshShape& SystemMesh::get_shape() const { return pimpl_->get_shape(); } size_t SystemMesh::get_num_devices() const { return pimpl_->get_num_devices(); } -void SystemMesh::register_mesh_device( - const std::shared_ptr& mesh_device, const std::vector& devices) { - pimpl_->register_mesh_device(mesh_device, devices); -} - std::vector SystemMesh::request_available_devices(const MeshDeviceConfig& config) const { return pimpl_->request_available_devices(config); } From 2e0816ddad49988f841d4142b2a2b8ed84088d17 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Fri, 14 Feb 2025 11:46:38 -0600 Subject: [PATCH 110/316] [skip ci] Add CMake hookup for code coverage (#17886) --- build_metal.sh | 10 ++++++++++ cmake/project_options.cmake | 1 + tt_metal/CMakeLists.txt | 6 ++++++ ttnn/CMakeLists.txt | 6 ++++++ 4 files changed, 23 insertions(+) diff --git a/build_metal.sh b/build_metal.sh index 5d962c7472c..a6be2e82d79 100755 --- a/build_metal.sh +++ b/build_metal.sh @@ -36,6 +36,7 @@ show_help() { echo " --ttnn-shared-sub-libs Use shared libraries for ttnn." echo " --toolchain-path Set path to CMake toolchain file." echo " --configure-only Only configure the project, do not build." + echo " --enable-coverage Instrument the binaries for code coverage." 
} clean() { @@ -69,6 +70,7 @@ c_compiler_path="" ttnn_shared_sub_libs="OFF" toolchain_path="cmake/x86_64-linux-clang-17-libcpp-toolchain.cmake" configure_only="OFF" +enable_coverage="OFF" declare -a cmake_args @@ -105,6 +107,7 @@ c-compiler-path: ttnn-shared-sub-libs toolchain-path: configure-only +enable-coverage " # Flatten LONGOPTIONS into a comma-separated string for getopt @@ -138,6 +141,8 @@ while true; do enable_tsan="ON";; -u|--enable-ubsan) enable_ubsan="ON";; + --enable-coverage) + enable_coverage="ON";; -b|--build-type) build_type="$2";shift;; -p|--enable-profiler) @@ -228,6 +233,7 @@ echo "INFO: Enable AddressSanitizer: $enable_asan" echo "INFO: Enable MemorySanitizer: $enable_msan" echo "INFO: Enable ThreadSanitizer: $enable_tsan" echo "INFO: Enable UndefinedBehaviorSanitizer: $enable_ubsan" +echo "INFO: Enable Coverage: $enable_coverage" echo "INFO: Build directory: $build_dir" echo "INFO: Install Prefix: $cmake_install_prefix" echo "INFO: Build tests: $build_tests" @@ -284,6 +290,10 @@ if [ "$enable_profiler" = "ON" ]; then cmake_args+=("-DENABLE_TRACY=ON") fi +if [ "$enable_coverage" = "ON" ]; then + cmake_args+=("-DENABLE_COVERAGE=ON") +fi + if [ "$export_compile_commands" = "ON" ]; then cmake_args+=("-DCMAKE_EXPORT_COMPILE_COMMANDS=ON") else diff --git a/cmake/project_options.cmake b/cmake/project_options.cmake index 3937b609500..bf39879e8c3 100644 --- a/cmake/project_options.cmake +++ b/cmake/project_options.cmake @@ -20,6 +20,7 @@ option(TT_UNITY_BUILDS "Build with Unity builds" ON) option(BUILD_TT_TRAIN "Enables build of tt-train" OFF) option(ENABLE_TTNN_SHARED_SUBLIBS "Use shared libraries for ttnn to speed up incremental builds" OFF) option(TT_ENABLE_LIGHT_METAL_TRACE "Enable Light Metal Trace" ON) +option(ENABLE_COVERAGE "Enable code coverage instrumentation" OFF) ########################################################################################### diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index 1802aeeaf99..19227774e5e 100644 --- a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -1,3 +1,9 @@ +if(ENABLE_COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + message(STATUS "Enabling code coverage flags for all tt_metal targets") + add_compile_options(--coverage) + add_link_options(--coverage) +endif() + add_library(tt_metal) add_library(Metalium::Metal ALIAS tt_metal) diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index e8a6f887a09..7eb79f85d0d 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -1,3 +1,9 @@ +if(ENABLE_COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + message(STATUS "Enabling code coverage flags for all ttnn targets") + add_compile_options(--coverage) + add_link_options(--coverage) +endif() + set(TTNN_BASE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/async_runtime.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/config.cpp From a7a2eaba8a4450e53878f1c6962e10b8f2e18e1c Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Fri, 14 Feb 2025 11:59:05 -0600 Subject: [PATCH 111/316] [skip ci] Dockerize device perf workflow (#17005) --- .github/workflows/build-artifact.yaml | 27 ++++- .github/workflows/full-new-models-suite.yaml | 4 + .../workflows/perf-device-models-impl.yaml | 106 +++++++++++++++--- .github/workflows/perf-device-models.yaml | 5 + .github/workflows/pipeline-select.yaml | 4 + tt_metal/python_env/requirements-dev.txt | 2 + 6 files changed, 128 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml index 
5d8b458c636..1d06d1afce5 100644 --- a/.github/workflows/build-artifact.yaml +++ b/.github/workflows/build-artifact.yaml @@ -54,6 +54,12 @@ on: #ci-test-docker-image: # description: "Docker tag for the CI Test Docker image for testing TT-Metalium et al" # value: ${{ jobs.build-docker-image.outputs.ci-test-tag }} + build-artifact-name: + description: "Name of the published build artifact" + value: ${{ jobs.build-artifact.outputs.build_artifact_name }} + wheel-artifact-name: + description: "Name of the published wheel artifact" + value: ${{ jobs.build-artifact.outputs.wheel_artifact_name }} workflow_dispatch: @@ -107,6 +113,9 @@ jobs: runs-on: - build - in-service + outputs: + build_artifact_name: ${{ steps.set_build_artifact_name.outputs.build_artifact_name }} + wheel_artifact_name: ${{ steps.set_wheel_artifact_name.outputs.wheel_artifact_name }} container: image: ${{ needs.build-docker-image.outputs.ci-build-tag }} env: @@ -201,11 +210,18 @@ jobs: ccache -s >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY + - name: Set wheel artifact name + id: set_wheel_artifact_name + run: | + WHEEL_ARTIFACT_NAME="eager-dist-${{ inputs.distro }}-${{ inputs.version }}-any${{ (inputs.tracy && '-profiler') || '' }}" + echo "wheel_artifact_name=$WHEEL_ARTIFACT_NAME" >> "$GITHUB_ENV" + echo "wheel_artifact_name=$WHEEL_ARTIFACT_NAME" >> "$GITHUB_OUTPUT" + - name: ☁️ Upload wheel if: ${{ inputs.build-wheel }} uses: actions/upload-artifact@v4 with: - name: eager-dist-${{ inputs.distro }}-${{ inputs.version }}-any + name: ${{ env.wheel_artifact_name }} path: /work/dist/ if-no-files-found: error @@ -213,11 +229,18 @@ jobs: if: ${{ inputs.publish-artifact }} run: tar -cvhf /work/ttm_any.tar ttnn/ttnn/*.so build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train data runtime + - name: Set build artifact name + id: set_build_artifact_name + run: | + BUILD_ARTIFACT_NAME="TTMetal_build_any${{ (inputs.tracy && '_profiler') || '' }}" + echo "build_artifact_name=$BUILD_ARTIFACT_NAME" >> "$GITHUB_ENV" + echo "build_artifact_name=$BUILD_ARTIFACT_NAME" >> "$GITHUB_OUTPUT" + - name: ☁️ Upload tarball if: ${{ inputs.publish-artifact }} uses: actions/upload-artifact@v4 with: - name: TTMetal_build_any${{ (inputs.tracy && '_profiler') || '' }} + name: ${{ env.build_artifact_name }} path: /work/ttm_any.tar if-no-files-found: error diff --git a/.github/workflows/full-new-models-suite.yaml b/.github/workflows/full-new-models-suite.yaml index 76e09f92be1..8c21f065c83 100644 --- a/.github/workflows/full-new-models-suite.yaml +++ b/.github/workflows/full-new-models-suite.yaml @@ -39,6 +39,10 @@ jobs: needs: build-artifact-profiler uses: ./.github/workflows/perf-device-models-impl.yaml secrets: inherit + with: + docker-image: ${{ needs.build-artifact-profiler.outputs.ci-build-docker-image }} + build-artifact-name: ${{ needs.build-artifact-profiler.outputs.build-artifact-name }} + wheel-artifact-name: ${{ needs.build-artifact-profiler.outputs.wheel-artifact-name }} e2e-model-perf-single-card: needs: build-artifact uses: ./.github/workflows/perf-models-impl.yaml diff --git a/.github/workflows/perf-device-models-impl.yaml b/.github/workflows/perf-device-models-impl.yaml index 43610aa2cfd..9ebf440d07f 100644 --- a/.github/workflows/perf-device-models-impl.yaml +++ b/.github/workflows/perf-device-models-impl.yaml @@ -7,6 +7,15 @@ on: required: false type: string default: "ubuntu-20.04" + docker-image: + required: true + type: string + build-artifact-name: + required: true + type: string + 
wheel-artifact-name: + required: true + type: string jobs: device-perf: @@ -16,30 +25,79 @@ jobs: fail-fast: false matrix: test-info: [ - {name: "GS", arch: grayskull, runs-on: ["perf-no-reset-grayskull", "bare-metal", "in-service"], machine-type: "bare_metal", timeout: 40}, - {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"], machine-type: "bare_metal", timeout: 40}, + {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"], machine-type: "bare_metal", timeout: 50}, ] name: "${{ matrix.test-info.name }} device perf" - env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.test-info.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib runs-on: ${{ matrix.test-info.runs-on }} + container: + image: ${{ inputs.docker-image }} + env: + TT_METAL_HOME: /work + PYTHONPATH: /work + LD_LIBRARY_PATH: /work/build/lib + ARCH_NAME: ${{ matrix.test-info.arch }} + LOGURU_LEVEL: INFO + volumes: + - ${{ github.workspace }}/docker-job:/work # Subdir to workaround https://github.com/actions/runner/issues/691 + - /dev/hugepages-1G:/dev/hugepages-1G + - /mnt/MLPerf:/mnt/MLPerf + options: "--device /dev/tenstorrent" + defaults: + run: + shell: bash + working-directory: /work # https://github.com/actions/runner/issues/878 steps: - - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - - uses: ./.github/actions/ensure-active-weka-mount - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - - uses: ./.github/actions/prepare-metal-run + - name: ⬇️ Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end + + - name: ⬇️ Download Build + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.build-artifact-name }} + path: docker-job + + - name: Extract files + shell: bash + run: tar -xvf ttm_any.tar + + - name: ⬇️ Download Wheel + uses: actions/download-artifact@v4 with: - is_profiler: 'true' - - name: ${{ matrix.test-group.name }} tests + name: ${{ inputs.wheel-artifact-name }} + path: docker-job + + - name: Install Wheel + run: | + WHEEL_FILENAME=$(ls -1 *.whl) + pip3 install $WHEEL_FILENAME + + - name: ${{ matrix.test-info.name }} tests timeout-minutes: ${{ matrix.test-info.timeout }} run: | - source python_env/bin/activate - ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type models_device_performance_${{ matrix.test-info.machine-type }} + if [[ "${{ matrix.test-info.arch }}" == "wormhole_b0" ]]; then + export MAGIC_ENV=wormhole_b0_80_arch_eth_dispatch.yaml + fi + pytest models/demos/wormhole/stable_diffusion/tests -m models_device_performance_bare_metal --timeout=600 + pytest models/demos/distilbert/tests -m models_device_performance_bare_metal + pytest models/demos/vgg/tests/ -m models_device_performance_bare_metal + pytest models/demos/convnet_mnist/tests/ -m models_device_performance_bare_metal + pytest models/demos/bert_tiny/tests/ -m models_device_performance_bare_metal + pytest models/demos/mnist/tests -m models_device_performance_bare_metal + pytest models/demos/squeezebert/tests -m models_device_performance_bare_metal + pytest models/demos/roberta/tests/ -m models_device_performance_bare_metal + WH_ARCH_YAML=$MAGIC_ENV pytest models/demos/wormhole/resnet50/tests -m models_device_performance_bare_metal + 
WH_ARCH_YAML=$MAGIC_ENV pytest models/experimental/functional_unet/tests/test_unet_perf.py -m models_device_performance_bare_metal + WH_ARCH_YAML=$MAGIC_ENV pytest models/demos/wormhole/mamba/tests -m models_device_performance_bare_metal + WH_ARCH_YAML=$MAGIC_ENV pytest models/demos/metal_BERT_large_11/tests -m models_device_performance_bare_metal + WH_ARCH_YAML=$MAGIC_ENV pytest models/demos/falcon7b_common/tests -m models_device_performance_bare_metal + WH_ARCH_YAML=$MAGIC_ENV pytest models/demos/wormhole/bert_tiny/tests -m models_device_performance_bare_metal + WH_ARCH_YAML=$MAGIC_ENV pytest models/demos/yolov4/tests -m models_device_performance_bare_metal + WH_ARCH_YAML=$MAGIC_ENV pytest models/demos/wormhole/distilbert/tests -m models_device_performance_bare_metal + python3 models/perf/merge_device_perf_results.py + - name: Check device perf report exists id: check-device-perf-report if: ${{ !cancelled() }} @@ -48,9 +106,21 @@ jobs: export DEVICE_PERF_REPORT_FILENAME=Models_Device_Perf_$(date +%Y_%m_%d).csv ls -hal $DEVICE_PERF_REPORT_FILENAME echo "device_perf_report_filename=$DEVICE_PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT" + - name: Upload device perf report if: ${{ !cancelled() && steps.check-device-perf-report.conclusion == 'success' }} uses: actions/upload-artifact@v4 with: name: device-perf-report-csv-${{ matrix.test-info.arch }}-${{ matrix.test-info.machine-type }} - path: "${{ steps.check-device-perf-report.outputs.device_perf_report_filename }}" + path: /work/${{ steps.check-device-perf-report.outputs.device_perf_report_filename }} + + - name: Cleanup + if: always() + run: | + # We are forced to checkout the repo into a subdir of the host's workdir; this pollutes the host + # with root-owned files. Be sure to clean up after ourselves in case we're on a non-ephemeral runner. 
+ echo "pre rm" + ls -al /__w/tt-metal/tt-metal + rm -rf /__w/tt-metal/tt-metal/docker-job + echo "post rm" + ls -al /__w/tt-metal/tt-metal diff --git a/.github/workflows/perf-device-models.yaml b/.github/workflows/perf-device-models.yaml index 67ef9232f81..70c1c634aeb 100644 --- a/.github/workflows/perf-device-models.yaml +++ b/.github/workflows/perf-device-models.yaml @@ -11,8 +11,13 @@ jobs: uses: ./.github/workflows/build-artifact.yaml with: tracy: true + build-wheel: true secrets: inherit device-perf: needs: build-artifact-profiler secrets: inherit uses: ./.github/workflows/perf-device-models-impl.yaml + with: + docker-image: ${{ needs.build-artifact-profiler.outputs.ci-build-docker-image }} + build-artifact-name: ${{ needs.build-artifact-profiler.outputs.build-artifact-name }} + wheel-artifact-name: ${{ needs.build-artifact-profiler.outputs.wheel-artifact-name }} diff --git a/.github/workflows/pipeline-select.yaml b/.github/workflows/pipeline-select.yaml index d3d575e1191..2da4e57b861 100644 --- a/.github/workflows/pipeline-select.yaml +++ b/.github/workflows/pipeline-select.yaml @@ -57,6 +57,10 @@ jobs: secrets: inherit uses: ./.github/workflows/perf-device-models-impl.yaml if: ${{ inputs.perf-device-models }} + with: + docker-image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} + build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }} + wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }} single-card-nightly: needs: build-artifact secrets: inherit diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt index 01aa6a8d2f7..18a5c84dbc5 100644 --- a/tt_metal/python_env/requirements-dev.txt +++ b/tt_metal/python_env/requirements-dev.txt @@ -2,6 +2,8 @@ # not be available during environment installation. We recommend setuptools # and wheel before installing this requirements.txt file. +loguru + # During dep resolution, black may install platformdirs >=4.0.0, which is # a breaking dependency for virtualenv installed by pre-commit. virtualenv # requires <4.0.0 platformdirs, so we're pinning platformdirs here From 5611cc41465ab430d28271957eb4e66448687584 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Fri, 14 Feb 2025 11:44:00 -0700 Subject: [PATCH 112/316] Use CPM_USE_LOCAL_PACKAGES to get dependencies from Docker container (#17627) ### Ticket Closes https://github.com/tenstorrent/tt-metal/issues/15795 ### Problem description Currently the following dependencies are downloaded in every single build job: - boost - nlohmann json - fmt - magic_enum - xtl - Taskflow - ranges-v3 (There are more, but these are the ones I can do something about immediately). ### What's changed We can use the CPM CMake optiong `CPM_USE_LOCAL_PACKAGES` to tell the build to check if the necessary dependencies are already installed in the system using `find_package`. To make this work, I had to update the Docker image to build these dependencies from source. 
### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13238807192) --------- Co-authored-by: Andrew Fuller --- .github/workflows/all-static-checks.yaml | 2 +- .github/workflows/build-artifact.yaml | 2 +- CMakeLists.txt | 5 +- build_metal.sh | 10 ++ dependencies/CMakeLists.txt | 37 ++++-- dockerfile/Dockerfile | 161 ++++++++++++++++++++++- tt-train/cmake/dependencies.cmake | 7 +- 7 files changed, 202 insertions(+), 22 deletions(-) diff --git a/.github/workflows/all-static-checks.yaml b/.github/workflows/all-static-checks.yaml index 7f079d23b6a..b3b45bad4b7 100644 --- a/.github/workflows/all-static-checks.yaml +++ b/.github/workflows/all-static-checks.yaml @@ -117,7 +117,7 @@ jobs: - uses: lukka/get-cmake@b516803a3c5fac40e2e922349d15cdebdba01e60 if: steps.changed-cmake-files.outputs.any_changed == 'true' with: - cmakeVersion: "~3.19.0" + cmakeVersion: "~3.24.0" - name: Check CMake version if: steps.changed-cmake-files.outputs.any_changed == 'true' run: cmake --version diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml index 1d06d1afce5..3d425cd6b08 100644 --- a/.github/workflows/build-artifact.yaml +++ b/.github/workflows/build-artifact.yaml @@ -180,7 +180,7 @@ jobs: args_fixme=$([ "${{ inputs.skip-tt-train }}" = "true" ] && echo "--build-metal-tests --build-ttnn-tests --build-programming-examples" || echo "--build-all") echo "Args: ${args_fixme}" - build_command="./build_metal.sh --build-type ${{ inputs.build-type }} --toolchain-path ${{ inputs.toolchain }} ${args_fixme} --enable-ccache --configure-only" + build_command="./build_metal.sh --build-type ${{ inputs.build-type }} --toolchain-path ${{ inputs.toolchain }} ${args_fixme} --enable-ccache --configure-only --cpm-use-local-packages" echo "Build tracy: ${{ inputs.tracy }}" if [ "${{ inputs.tracy }}" = "true" ]; then build_command="$build_command --enable-profiler" diff --git a/CMakeLists.txt b/CMakeLists.txt index a26b956890a..f289b7d1b84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.19...3.30) +cmake_minimum_required(VERSION 3.24...3.30) # Sanity check, forgetting to clone submodules is a common omission and results in a poor error message if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tt_metal/third_party/umd/CMakeLists.txt") @@ -95,6 +95,9 @@ set(CMAKE_CXX_FLAGS_CI "-O3 -DDEBUG") # We're not currently using C++20 modules, so don't bother scanning for them set(CMAKE_CXX_SCAN_FOR_MODULES FALSE) +# Promote all IMPORTED targets discovered by find_package() to a GLOBAL scope +set(CMAKE_FIND_PACKAGE_TARGETS_GLOBAL TRUE) + ############################################################################################################################ # Project Options # The following options and their defaults impact what artifacts get built diff --git a/build_metal.sh b/build_metal.sh index a6be2e82d79..827821a1996 100755 --- a/build_metal.sh +++ b/build_metal.sh @@ -33,6 +33,7 @@ show_help() { echo " --cxx-compiler-path Set path to C++ compiler." echo " --c-compiler-path Set path to C++ compiler." echo " --cpm-source-cache Set path to CPM Source Cache." + echo " --cpm-use-local-packages Attempt to use locally installed dependencies." echo " --ttnn-shared-sub-libs Use shared libraries for ttnn." echo " --toolchain-path Set path to CMake toolchain file." echo " --configure-only Only configure the project, do not build." 
@@ -66,6 +67,7 @@ light_metal_trace="ON" build_all="OFF" cxx_compiler_path="" cpm_source_cache="" +cpm_use_local_packages="OFF" c_compiler_path="" ttnn_shared_sub_libs="OFF" toolchain_path="cmake/x86_64-linux-clang-17-libcpp-toolchain.cmake" @@ -103,6 +105,7 @@ debug clean cxx-compiler-path: cpm-source-cache: +cpm-use-local-packages c-compiler-path: ttnn-shared-sub-libs toolchain-path: @@ -177,6 +180,8 @@ while true; do cxx_compiler_path="$2";shift;; --cpm-source-cache) cpm_source_cache="$2";shift;; + --cpm-use-local-packages) + cpm_use_local_packages="ON";; --c-compiler-path) c_compiler_path="$2";shift;; --toolchain-path) @@ -261,6 +266,11 @@ if [ "$cpm_source_cache" != "" ]; then cmake_args+=("-DCPM_SOURCE_CACHE=$cpm_source_cache") fi +if [ "$cpm_use_local_packages" = "ON" ]; then + echo "INFO: CPM_USE_LOCAL_PACKAGES: $cpm_use_local_packages" + cmake_args+=("-DCPM_USE_LOCAL_PACKAGES=ON") +fi + if [ "$enable_ccache" = "ON" ]; then cmake_args+=("-DCMAKE_DISABLE_PRECOMPILE_HEADERS=TRUE") cmake_args+=("-DENABLE_CCACHE=TRUE") diff --git a/dependencies/CMakeLists.txt b/dependencies/CMakeLists.txt index 793e7f8c859..b62d1306a37 100644 --- a/dependencies/CMakeLists.txt +++ b/dependencies/CMakeLists.txt @@ -8,6 +8,14 @@ set(CMAKE_CXX_CLANG_TIDY "") # Boost ############################################################################################################################ +function(ensureboosttarget boostTarget) + if(NOT TARGET Boost::${boostTarget}) + add_library(Boost::${boostTarget} INTERFACE IMPORTED GLOBAL) + target_link_libraries(Boost::${boostTarget} INTERFACE Boost::headers) + message(STATUS "Defined Boost::${boostTarget} as an INTERFACE target.") + endif() +endfunction() + CPMAddPackage( NAME Boost VERSION 1.86.0 @@ -20,8 +28,14 @@ CPMAddPackage( "BOOST_SKIP_INSTALL_RULES ON" "BUILD_SHARED_LIBS OFF" "BOOST_INCLUDE_LIBRARIES core\\\;container\\\;smart_ptr\\\;interprocess" + FIND_PACKAGE_ARGUMENTS "CONFIG REQUIRED" ) +ensureboosttarget(core) +ensureboosttarget(container) +ensureboosttarget(smart_ptr) +ensureboosttarget(interprocess) + add_library(span INTERFACE) target_link_libraries(span INTERFACE Boost::core) @@ -44,15 +58,6 @@ CPMAddPackage( "YAML_BUILD_SHARED_LIBS OFF" ) -if(yaml-cpp_ADDED) - set_target_properties( - yaml-cpp - PROPERTIES - DEBUG_POSTFIX - "" - ) -endif() - ############################################################################################################################ # googletest ############################################################################################################################ @@ -118,7 +123,13 @@ CPMAddPackage(NAME pybind11 GITHUB_REPOSITORY pybind/pybind11 GIT_TAG v2.13.6 OP # nlohmann/json : https://github.com/nlohmann/json ############################################################################################################################ -CPMAddPackage(NAME json GITHUB_REPOSITORY nlohmann/json GIT_TAG v3.11.3 OPTIONS "CMAKE_MESSAGE_LOG_LEVEL NOTICE") +CPMAddPackage( + NAME nlohmann_json + GITHUB_REPOSITORY nlohmann/json + GIT_TAG v3.11.3 + OPTIONS + "CMAKE_MESSAGE_LOG_LEVEL NOTICE" +) ############################################################################################################################ # xtensor : https://github.com/xtensor-stack/xtensor @@ -177,13 +188,15 @@ endif() ############################################################################################################################ CPMAddPackage( - NAME taskflow + NAME Taskflow GITHUB_REPOSITORY taskflow/taskflow GIT_TAG v3.7.0 
OPTIONS "CMAKE_MESSAGE_LOG_LEVEL NOTICE" # Taskflow's CMakeLists.txt is super noisy ) -add_library(Taskflow::Taskflow ALIAS Taskflow) +if(Taskflow_ADDED AND NOT TARGET Taskflow::Taskflow) + add_library(Taskflow::Taskflow ALIAS Taskflow) +endif() ############################################################################################################################ # flatbuffers : https://github.com/google/flatbuffers diff --git a/dockerfile/Dockerfile b/dockerfile/Dockerfile index e1a388d2f2b..c3f5937d1d2 100644 --- a/dockerfile/Dockerfile +++ b/dockerfile/Dockerfile @@ -23,6 +23,162 @@ RUN mkdir -p /usr/local/bin && wget -O /tmp/ccache.tar.xz https://github.com/cca tar -xf /tmp/ccache.tar.xz -C /usr/local/bin --strip-components=1 && \ rm /tmp/ccache.tar.xz +ARG BOOST_VERSION=1.86.0 +RUN mkdir -p /tmp/boost \ + && BOOST_VERSION_UNDERSCORE=$(echo ${BOOST_VERSION} | sed 's/\./_/g') \ + && wget -O /tmp/boost/boost_${BOOST_VERSION}.tar.gz "https://archives.boost.io/release/${BOOST_VERSION}/source/boost_${BOOST_VERSION_UNDERSCORE}.tar.gz" \ + && tar -xzf /tmp/boost/boost_${BOOST_VERSION}.tar.gz -C /tmp/boost --strip-components=1 \ + && cd /tmp/boost \ + && ./bootstrap.sh \ + && ./b2 install --prefix=/usr/local \ + && rm -rf /tmp/boost + +ARG FMT_VERSION=11.0.1 +RUN mkdir -p /tmp/fmt \ + && wget -O /tmp/fmt/fmt-${FMT_VERSION}.tar.gz "https://github.com/fmtlib/fmt/archive/${FMT_VERSION}.tar.gz" \ + && tar -xzf /tmp/fmt/fmt-${FMT_VERSION}.tar.gz -C /tmp/fmt --strip-components=1 \ + && cmake \ + -S /tmp/fmt \ + -B /tmp/fmt/build \ + -DCMAKE_BUILD_TYPE=Release \ + -DFMT_TEST=OFF \ + -DFMT_DOC=OFF \ + -DFMT_INSTALL=ON \ + && make -C /tmp/fmt/build -j$(nproc) \ + && make -C /tmp/fmt/build install \ + && rm -rf /tmp/fmt + +ARG PYBIND11_VERSION=2.13.6 +RUN mkdir -p /tmp/pybind11 \ + && wget -O /tmp/pybind11/pybind11-${PYBIND11_VERSION}.tar.gz "https://github.com/pybind/pybind11/archive/refs/tags/v${PYBIND11_VERSION}.tar.gz" \ + && tar -xzf /tmp/pybind11/pybind11-${PYBIND11_VERSION}.tar.gz -C /tmp/pybind11 --strip-components=1 \ + && cmake \ + -S /tmp/pybind11 \ + -B /tmp/pybind11/build \ + -DCMAKE_BUILD_TYPE=Release \ + -DPYBIND11_TEST=OFF \ + -DPYBIND11_INSTALL=ON \ + && make -C /tmp/pybind11/build -j$(nproc) \ + && make -C /tmp/pybind11/build install \ + && rm -rf /tmp/pybind11 + +ARG RANGE_V3_VERSION=0.12.0 +RUN mkdir -p /tmp/range-v3 \ + && wget -O /tmp/range-v3/range-v3-${RANGE_V3_VERSION}.tar.gz "https://github.com/ericniebler/range-v3/archive/refs/tags/${RANGE_V3_VERSION}.tar.gz" \ + && tar -xzf /tmp/range-v3/range-v3-${RANGE_V3_VERSION}.tar.gz -C /tmp/range-v3 --strip-components=1 \ + && cmake \ + -S /tmp/range-v3 \ + -B /tmp/range-v3/build \ + -DCMAKE_BUILD_TYPE=Release \ + -DRANGE_V3_TESTS=OFF \ + -DRANGE_V3_EXAMPLES=OFF \ + -DRANGE_V3_DOCS=OFF \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + && make -C /tmp/range-v3/build -j$(nproc) \ + && make -C /tmp/range-v3/build install \ + && rm -rf /tmp/range-v3 + + +# libstdc++ vs libc++ issue arises +#ARG YAML_VERSION=0.8.0 +#RUN mkdir -p /tmp/yaml \ +# && wget -O /tmp/yaml/yaml-${YAML_VERSION}.tar.gz "https://github.com/jbeder/yaml-cpp/archive/refs/tags/${YAML_VERSION}.tar.gz" \ +# && tar -xzf /tmp/yaml/yaml-${YAML_VERSION}.tar.gz -C /tmp/yaml --strip-components=1 \ +# && cmake \ +# -S /tmp/yaml \ +# -B /tmp/yaml/build \ +# -DCMAKE_BUILD_TYPE=Release \ +# -DYAML_CPP_BUILD_TESTS=OFF \ +# -DYAML_CPP_BUILD_TOOLS=OFF \ +# -DYAML_BUILD_SHARED_LIBS=OFF \ +# && make -C /tmp/yaml/build -j$(nproc) \ +# && make -C /tmp/yaml/build install \ +# && rm -rf /tmp/yaml 
+ +ARG JSON_VERSION=3.11.3 +RUN mkdir -p /tmp/json \ + && wget -O /tmp/json/json-${JSON_VERSION}.tar.gz "https://github.com/nlohmann/json/archive/refs/tags/v${JSON_VERSION}.tar.gz" \ + && tar -xzf /tmp/json/json-${JSON_VERSION}.tar.gz -C /tmp/json --strip-components=1 \ + && cmake \ + -S /tmp/json \ + -B /tmp/json/build \ + -DCMAKE_BUILD_TYPE=Release \ + -DJSON_BuildTests=OFF \ + -DJSON_Install=ON \ + && make -C /tmp/json/build -j$(nproc) \ + && make -C /tmp/json/build install \ + && rm -rf /tmp/json + +ARG MAGIC_ENUM_VERSION=0.9.7 +RUN mkdir -p /tmp/magic_enum \ + && wget -O /tmp/magic_enum/magic_enum-${MAGIC_ENUM_VERSION}.tar.gz "https://github.com/Neargye/magic_enum/archive/refs/tags/v${MAGIC_ENUM_VERSION}.tar.gz" \ + && tar -xzf /tmp/magic_enum/magic_enum-${MAGIC_ENUM_VERSION}.tar.gz -C /tmp/magic_enum --strip-components=1 \ + && cmake \ + -S /tmp/magic_enum \ + -B /tmp/magic_enum/build \ + -DCMAKE_BUILD_TYPE=Release \ + -DMAGIC_ENUM_OPT_BUILD_TESTS=OFF \ + -DMAGIC_ENUM_OPT_BUILD_EXAMPLES=OFF \ + -DMAGIC_ENUM_OPT_INSTALL=ON \ + && make -C /tmp/magic_enum/build -j$(nproc) \ + && make -C /tmp/magic_enum/build install \ + && rm -rf /tmp/magic_enum + +ARG TAKSFLOW_VERSION=3.7.0 +RUN mkdir -p /tmp/taskflow \ + && wget -O /tmp/taskflow/taskflow-${TAKSFLOW_VERSION}.tar.gz "https://github.com/taskflow/taskflow/archive/v${TAKSFLOW_VERSION}.tar.gz" \ + && tar -xzf /tmp/taskflow/taskflow-${TAKSFLOW_VERSION}.tar.gz -C /tmp/taskflow --strip-components=1 \ + && cmake \ + -S /tmp/taskflow \ + -B /tmp/taskflow/build \ + -DCMAKE_BUILD_TYPE=Release \ + -DTF_BUILD_TESTS=OFF \ + -DTF_BUILD_EXAMPLES=OFF \ + -DTF_BUILD_BENCHMARKS=OFF \ + -DTF_BUILD_CUDA=OFF \ + -DTF_BUILD_SYCL=OFF \ + && make -C /tmp/taskflow/build -j$(nproc) \ + && make -C /tmp/taskflow/build install \ + && rm -rf /tmp/taskflow + +ARG XTENSOR_XTL_VERSION=0.7.7 +RUN mkdir -p /tmp/xtensor_xtl \ + && wget -O /tmp/xtensor_xtl/xtensor_xtl-${XTENSOR_XTL_VERSION}.tar.gz "https://github.com/xtensor-stack/xtl/archive/refs/tags/${XTENSOR_XTL_VERSION}.tar.gz" \ + && tar -xzf /tmp/xtensor_xtl/xtensor_xtl-${XTENSOR_XTL_VERSION}.tar.gz -C /tmp/xtensor_xtl --strip-components=1 \ + && cmake \ + -S /tmp/xtensor_xtl \ + -B /tmp/xtensor_xtl/build \ + -DCMAKE_BUILD_TYPE=Release \ + && make -C /tmp/xtensor_xtl/build -j$(nproc) \ + && make -C /tmp/xtensor_xtl/build install \ + && rm -rf /tmp/xtensor_xtl + +# xtensor problemstic +#ARG XTENSOR_VERSION=0.25.0 +#RUN mkdir -p /tmp/xtensor \ +# && wget -O /tmp/xtensor/xtensor-${XTENSOR_VERSION}.tar.gz "https://github.com/xtensor-stack/xtensor/archive/refs/tags/${XTENSOR_VERSION}.tar.gz" \ +# && tar -xzf /tmp/xtensor/xtensor-${XTENSOR_VERSION}.tar.gz -C /tmp/xtensor --strip-components=1 \ +# && cmake \ +# -S /tmp/xtensor \ +# -B /tmp/xtensor/build \ +# -DCMAKE_BUILD_TYPE=Release \ +# && make -C /tmp/xtensor/build -j$(nproc) \ +# && make -C /tmp/xtensor/build install \ +# && rm -rf /tmp/xtensor + +# Issue arises - No blas +#ARG XTENSOR_BLAS_VERSION=0.21.0 +#RUN mkdir -p /tmp/xtensor_blas \ +# && wget -O /tmp/xtensor_blas/xtensor_blas-${XTENSOR_BLAS_VERSION}.tar.gz "https://github.com/xtensor-stack/xtensor-blas/archive/refs/tags/${XTENSOR_BLAS_VERSION}.tar.gz" \ +# && tar -xzf /tmp/xtensor_blas/xtensor_blas-${XTENSOR_BLAS_VERSION}.tar.gz -C /tmp/xtensor_blas --strip-components=1 \ +# && cmake \ +# -S /tmp/xtensor_blas \ +# -B /tmp/xtensor_blas/build \ +# -DCMAKE_BUILD_TYPE=Release \ +# && make -C /tmp/xtensor_blas/build -j$(nproc) \ +# && make -C /tmp/xtensor_blas/build install \ +# && rm -rf /tmp/xtensor_blas + ARG 
DOXYGEN_VERSION=1.9.6 RUN mkdir -p /tmp/doxygen \ && wget -O /tmp/doxygen/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz "https://www.doxygen.nl/files/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz" \ @@ -53,12 +209,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ libtbb-dev \ libcapstone-dev \ - libfmt-dev \ - libyaml-cpp-dev \ - pybind11-dev \ - nlohmann-json3-dev \ libgtest-dev \ - libboost-all-dev \ && apt-get clean && rm -rf /var/lib/apt/lists/* ENV CCACHE_TEMPDIR=/tmp/ccache diff --git a/tt-train/cmake/dependencies.cmake b/tt-train/cmake/dependencies.cmake index d9ea7849b21..c29e4a9231f 100644 --- a/tt-train/cmake/dependencies.cmake +++ b/tt-train/cmake/dependencies.cmake @@ -70,7 +70,7 @@ CPMAddPackage(NAME magic_enum GITHUB_REPOSITORY Neargye/magic_enum GIT_TAG v0.9. # nlohmann/json : https://github.com/nlohmann/json ############################################################################################################################ -CPMAddPackage(NAME json GITHUB_REPOSITORY nlohmann/json GIT_TAG v3.11.3 OPTIONS "JSON_BuildTests OFF") +CPMAddPackage(NAME nlohmann_json GITHUB_REPOSITORY nlohmann/json GIT_TAG v3.11.3 OPTIONS "JSON_BuildTests OFF") CPMAddPackage(NAME xtl GITHUB_REPOSITORY xtensor-stack/xtl GIT_TAG 0.7.7 OPTIONS "XTL_ENABLE_TESTS OFF") @@ -84,7 +84,10 @@ CPMAddPackage( "XTENSOR_ENABLE_TESTS OFF" ) -CPMAddPackage(NAME taskflow GITHUB_REPOSITORY taskflow/taskflow GIT_TAG v3.7.0 OPTIONS "TF_BUILD_TESTS OFF") +CPMAddPackage(NAME Taskflow GITHUB_REPOSITORY taskflow/taskflow GIT_TAG v3.7.0 OPTIONS "TF_BUILD_TESTS OFF") +if(Taskflow_ADDED AND NOT TARGET Taskflow::Taskflow) + add_library(Taskflow::Taskflow ALIAS Taskflow) +endif() include(${PROJECT_SOURCE_DIR}/cmake/fetch_cli11.cmake) From 703a7a0ec0b1715c4564bd0bc47dd10a930cf9ac Mon Sep 17 00:00:00 2001 From: Mark O'Connor Date: Fri, 14 Feb 2025 21:12:16 +0100 Subject: [PATCH 113/316] #0: Move models/perf/perf_report to its own repository (#17889) --- models/perf/README.md | 127 +----- models/perf/perf_report.py | 862 ------------------------------------- 2 files changed, 4 insertions(+), 985 deletions(-) delete mode 100755 models/perf/perf_report.py diff --git a/models/perf/README.md b/models/perf/README.md index 5d9e5c82bc5..35ff6a1a28e 100644 --- a/models/perf/README.md +++ b/models/perf/README.md @@ -2,130 +2,11 @@ ![Example perf report](images/example_perf_report.png) -This tool analyzes performance traces from Metal operations, providing insights into throughput, bottlenecks, and optimization opportunities. +This has been moved to [tt-perf-report](https://github.com/tenstorrent/tt-perf-report). Short instructions: -## Generating Performance Traces - -1. Build Metal with performance tracing enabled: -```bash -./build_metal -p -``` - -2. Run your test with the tracy module to capture traces: -```bash -python -m tracy -r -p -v -m pytest path/to/test.py -``` -This generates a CSV file containing operation timing data. - -## Using Tracy Signposts - -Tracy signposts mark specific sections of code for analysis. Add signposts to your Python code: - -```python -import tracy - -# Mark different sections of your code -tracy.signpost("Compilation pass") -model(input_data) - -tracy.signpost("Performance pass") -for _ in range(10): - model(input_data) -``` - -The tool uses the last signpost by default, which is typically the most relevant section for a performance test(e.g., the final iteration after compilation / warmup). 
- -Common signpost usage: -- `--signpost name`: Analyze ops after the specified signpost -- `--ignore-signposts`: Analyze the entire trace - -## Filtering Operations - -The output of the performance report is a table of operations. Each operation is assigned a unique ID starting from 1. You can re-run the tool with different IDs to focus on specific sections of the trace. - -Use `--id-range` to analyze specific sections: ```bash -# Analyze ops 5 through 10 -python perf_report.py trace.csv --id-range 5-10 - -# Analyze from op 31 onwards -python perf_report.py trace.csv --id-range 31- - -# Analyze up to op 12 -python perf_report.py trace.csv --id-range -12 +pip install tt-perf-report +tt-perf-report your_metal_op_perf_report.csv ``` -This is particularly useful for: -- Isolating decode pass in prefill+decode LLM inference -- Analyzing single transformer layers without embeddings/projections -- Focusing on specific model components - -## Output Options - -- `--min-percentage value`: Hide ops below specified % of total time (default: 0.5) -- `--color/--no-color`: Force colored/plain output -- `--csv FILENAME`: Output the table to CSV format for further analysis or inclusion into automated reporting pipelines -- `--no-advice`: Show only performance table, skip optimization advice - -## Understanding the Performance Report - -The performance report provides several key metrics for analyzing operation performance: - -### Core Metrics - -- **Device Time**: Time spent executing the operation on device (in microseconds) -- **Op-to-op Gap**: Time between operations, including host overhead and kernel dispatch (in microseconds) -- **Total %**: Percentage of total execution time spent on this operation -- **Cores**: Number of cores used by the operation (max 64 on Wormhole) - -### Performance Metrics - -- **DRAM**: Memory bandwidth achieved (in GB/s) -- **DRAM %**: Percentage of theoretical peak DRAM bandwidth (288 GB/s on Wormhole) -- **FLOPs**: Compute throughput achieved (in TFLOPs) -- **FLOPs %**: Percentage of theoretical peak compute for the given math fidelity -- **Bound**: Performance classification of the operation: - - `DRAM`: Memory bandwidth bound (>65% of peak DRAM) - - `FLOP`: Compute bound (>65% of peak FLOPs) - - `BOTH`: Both memory and compute bound - - `SLOW`: Neither memory nor compute bound - - `HOST`: Operation running on host CPU - -### Additional Fields - -- **Math Fidelity**: Precision configuration used for matrix operations: - - `HiFi4`: Highest precision (74 TFLOPs/core) - - `HiFi2`: Medium precision (148 TFLOPs/core) - - `LoFi`: Lowest precision (262 TFLOPs/core) - -The tool automatically highlights potential optimization opportunities: -- Red op-to-op times indicate high host or kernel launch overhead (>6.5μs) -- Red core counts indicate underutilization (<10 cores) -- Green metrics indicate good utilization of available resources -- Yellow metrics indicate room for optimization - -## Examples - -Typical use: - -```bash -python perf_report.py trace.csv -``` - -Build a table of all ops with no advice: - -```bash -python perf_report.py trace.csv --no-advice -``` - -View ops 100-200 with advice: - -```bash -python perf_report.py trace.csv --id-range 100-200 -``` - -Export the table of ops and columns as a CSV file: - -```bash -python perf_report.py trace.csv --csv my_report.csv -``` +Contribute changes directly to [tt-perf-report](https://github.com/tenstorrent/tt-perf-report). If you don't have access, ping Mark on slack. 
Changes made in main there will automatically be rolled out to pip after a few minutes. diff --git a/models/perf/perf_report.py b/models/perf/perf_report.py deleted file mode 100755 index 67769112bbe..00000000000 --- a/models/perf/perf_report.py +++ /dev/null @@ -1,862 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import sys -import argparse -import re -from typing import Any, Optional, Union -from collections import defaultdict -import pandas as pd - -# Global variable to store color preference -color_output = None # None means auto-detect, True forces color, False forces no color - - -def set_color_output(force_color, force_no_color): - global color_output - if force_no_color: - color_output = False - elif force_color: - color_output = True - else: - color_output = None # Auto-detect - - -def colored(text, color): - if color_output is None: - should_color = sys.stdout.isatty() - else: - should_color = color_output - - if should_color and color: - colors = { - "grey": "\033[38;5;8m", - "red": "\033[38;5;9m", - "green": "\033[38;5;10m", - "yellow": "\033[38;5;11m", - "blue": "\033[38;5;12m", - "magenta": "\033[38;5;13m", - "cyan": "\033[38;5;14m", - "white": "\033[38;5;15m", - "end": "\033[0m", - } - return f"{colors[color]}{text}{colors['end']}" - else: - return text - - -def tflops_per_core(math_fidelity): - """Source: https://tenstorrent.com/assets/one-pagers/08.01.24_Wormhole.pdf""" - if math_fidelity == "HiFi4": - return 74 / 72 - elif math_fidelity == "HiFi2": - return 148 / 72 - elif math_fidelity == "LoFi": - return 262 / 72 - else: - assert False, f"Unknown math fidelity: {math_fidelity}" - - -class Cell: - def __init__(self, value: Any, unit: Optional[str] = None, decimals=0, color=None): - self.raw_value = value - self.unit = unit - self.decimals = decimals - self.color = color - - def format(self): - if self.raw_value is None or pd.isna(self.raw_value): - return "" - - if isinstance(self.raw_value, str) and "Matmul" in self.raw_value: - parts = self.raw_value.split(maxsplit=1) - op_name = parts[0] - size = parts[1] if len(parts) > 1 else "" - formatted = f"{colored(op_name, self.color) if self.color else op_name} {colored(size, 'grey')}" - else: - try: - formatted = f"{float(self.raw_value):,.{self.decimals}f}" - except (ValueError, TypeError): - formatted = str(self.raw_value) - - if self.color: - formatted = colored(formatted, self.color) - - if self.unit: - formatted += f" {colored(self.unit, 'grey')}" - - return formatted - - def __str__(self): - return self.format() - - -def filter_by_signpost(df, signpost=None, ignore_signposts=False): - signpost_rows = df[df["OP TYPE"] == "signpost"] - - if ignore_signposts: - print(colored("Ignoring all signposts. Using the entire file for analysis.", "cyan")) - return df - - if signpost: - if signpost in signpost_rows["OP CODE"].values: - print(colored(f"Using specified signpost: {signpost}", "cyan")) - return df[df["OP CODE"].eq(signpost).cummax()].iloc[1:] - print(colored(f"Specified signpost '{signpost}' not found. Defaulting to the last signpost.", "yellow")) - - if signpost_rows.empty: - print(colored("No signposts found in the file. 
Using the entire file for analysis.", "yellow")) - return df - - last_signpost = signpost_rows.iloc[-1]["OP CODE"] - print(colored(f"Detected signposts: {', '.join(signpost_rows['OP CODE'])}", "cyan")) - print(colored(f"Using last signpost: {last_signpost} for analysis.", "cyan")) - return df[df["OP CODE"].eq(last_signpost).cummax()].iloc[1:] - - -def get_datatype_size(datatype): - match = re.search(r"\d+", datatype) - return int(match.group()) / 8 if match else 4 - - -def visible_length(s): - return len(re.sub(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])", "", s)) - - -def pad_string(string, length, align="left"): - visible_len = visible_length(string) - padding = " " * (length - visible_len) - return padding + string if align == "right" else string + padding - - -def evaluate_fidelity(input_0_datatype, input_1_datatype, output_datatype, math_fidelity): - mantissa_bits = {"BFLOAT16": 8, "BFLOAT8_B": 7, "BFLOAT4_B": 3} - in0_bits = mantissa_bits[input_0_datatype] # activations -> srcB (7 bits) - in1_bits = mantissa_bits[input_1_datatype] # weights -> srcA (5 bits) - out_bits = mantissa_bits[output_datatype] - if in0_bits == 8 and out_bits >= 7: - if math_fidelity == "HiFi4": - return ( - "sufficient", - "HiFi2 may also work, it discards the lowest bit of the activations and has 2x the throughput of HiFi4", - ) - elif math_fidelity == "HiFi2": - return "too_low", "If your matmuls are not FLOP-bound use HiFi4 with BF16 activations for full accuracy" - elif math_fidelity == "LoFi": - return "too_low", "Use HiFi2 or HiFi4 with BF16 activations for improved accuracy" - else: - assert False, f"Unknown math fidelity: {math_fidelity}" - elif in0_bits == 8 and out_bits == 3: - if math_fidelity == "HiFi4": - return ( - "too_high", - "HiFi2 is very likely to work for BFP8 output; it discards the lowest bit of the activations and has 2x the throughput of HiFi4", - ) - elif math_fidelity == "HiFi2": - return ( - "sufficient", - "LoFi might also be sufficient with BFP4 output and has almost 2x the throughput of HiFi2", - ) - elif math_fidelity == "LoFi": - return ( - "too_low", - "HiFi2 may give better accuracy for large matmuls with many intermediate accumulations", - ) - else: - assert False, f"Unknown math fidelity: {math_fidelity}" - elif in1_bits >= 7 and out_bits >= 7: - if math_fidelity == "HiFi4": - return "too_high", "HiFi2 is sufficient for BFP8 multiplication and has 2x the throughput of HiFi4" - elif math_fidelity == "HiFi2": - return "sufficient", None - elif math_fidelity == "LoFi": - return "too_low", "HiFi2 is recommended for accuracy; LoFi discards the lowest 2 bits of the weights" - else: - assert False, f"Unknown math fidelity: {math_fidelity}" - elif in1_bits >= 7 and out_bits == 3: - if math_fidelity == "HiFi4": - return "too_high", "HiFi2 is sufficient for BFP8 multiplication and has 2x the throughput of HiFi4" - elif math_fidelity == "HiFi2": - return ( - "sufficient", - "LoFi might also be sufficient with BFP4 output and has almost 2x the throughput of HiFi2", - ) - elif math_fidelity == "LoFi": - return ( - "too_low", - "HiFi2 may give slightly better accuracy for large matmuls with many intermediate accumulations", - ) - else: - assert False, f"Unknown math fidelity: {math_fidelity}" - elif in1_bits == 3: - if math_fidelity == "LoFi": - return "sufficient", None - else: - return "too_high", "LoFi is sufficient with BFP4 weights, use it for much higher throughput" - else: - print(f"Using {math_fidelity} for {input_0_datatype}/{input_1_datatype} inputs and {output_datatype} output") - 
print(f"Bits: {in0_bits}/{in1_bits}/{out_bits}") - return ( - "unknown", - f"Using {math_fidelity} for {input_0_datatype}/{input_1_datatype} inputs and {output_datatype} output", - ) - - -def analyze_matmul(row): - input_0_from_dram = "DRAM" in row["INPUT_0_MEMORY"] - input_1_from_dram = "DRAM" in row["INPUT_1_MEMORY"] - - total_data_size_bytes = 0 - if input_0_from_dram: - total_data_size_bytes += ( - row["INPUT_0_W"] - * row["INPUT_0_Y"] - * row["INPUT_0_Z"] - * row["INPUT_0_X"] - * get_datatype_size(row["INPUT_0_DATATYPE"]) - ) - if input_1_from_dram: - total_data_size_bytes += ( - row["INPUT_1_W"] - * row["INPUT_1_Y"] - * row["INPUT_1_Z"] - * row["INPUT_1_X"] - * get_datatype_size(row["INPUT_1_DATATYPE"]) - ) - - # Always include output if it's written to DRAM - if "DRAM" in row["OUTPUT_0_MEMORY"]: - total_data_size_bytes += ( - row["OUTPUT_0_W"] - * row["OUTPUT_0_Y"] - * row["OUTPUT_0_Z"] - * row["OUTPUT_0_X"] - * get_datatype_size(row["OUTPUT_0_DATATYPE"]) - ) - - duration_s = row["DEVICE KERNEL DURATION [ns]"] * 1e-9 - dram_speed_gb_s = (total_data_size_bytes / duration_s) / 1e9 if total_data_size_bytes > 0 else None - - core_count = row["CORE COUNT"] - math_fidelity = row["MATH FIDELITY"] - - # Check for DRAM-sharded program config - attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else "" - is_dram_sharded = "MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig" in attributes - - # Override core count for DRAM-sharded matmuls - if is_dram_sharded: - core_count = 12 - - peak_flops_value = tflops_per_core(math_fidelity) * 1e12 * core_count - - M, K, N = int(row["INPUT_0_Y"]), int(row["INPUT_0_X"]), int(row["INPUT_1_X"]) - W, Z = int(row["INPUT_0_W"]), int(row["INPUT_0_Z"]) - - flops = (M * K * N * W * Z * 2) / duration_s - - size = f"{M} x {K} x {N}" - memory_info = f"({row['INPUT_0_DATATYPE']} {row['INPUT_0_MEMORY'].replace('DEV_0_', '')} @ {row['INPUT_1_DATATYPE']} {row['INPUT_1_MEMORY'].replace('DEV_0_', '')} => {row['OUTPUT_0_DATATYPE']} {row['OUTPUT_0_MEMORY'].replace('DEV_0_', '')})" - - dram_percentage = (dram_speed_gb_s / 288) * 100 if dram_speed_gb_s is not None else None - flops_percentage = (flops / peak_flops_value) * 100 - - return ( - dram_speed_gb_s, - dram_percentage, - flops, - flops_percentage, - size, - memory_info, - math_fidelity, - is_dram_sharded, - core_count, # Return the potentially adjusted core count - ) - - -def analyze_op(row, prev_row): - op_code = Cell(row["OP CODE"]) - cores = Cell(int(row["CORE COUNT"]) if pd.notna(row["CORE COUNT"]) else None) - device_time = Cell( - row["DEVICE KERNEL DURATION [ns]"] / 1000 if pd.notna(row["DEVICE KERNEL DURATION [ns]"]) else None, - unit="us", - decimals=0, - ) - - if prev_row is not None and pd.notna(prev_row["OP TO OP LATENCY [ns]"]): - op_to_op_gap = Cell( - row["OP TO OP LATENCY [ns]"] / 1000 if pd.notna(row["OP TO OP LATENCY [ns]"]) else None, - unit="us", - decimals=0, - ) - else: - op_to_op_gap = Cell(None, unit="us", decimals=0) - - def get_entry(k: str) -> Union[str, None]: - return row[k] if k in row else None - - output_datatype = get_entry("OUTPUT_0_DATATYPE") - input_0_datatype = get_entry("INPUT_0_DATATYPE") - input_1_datatype = get_entry("INPUT_1_DATATYPE") - output_datatype_cell = Cell(output_datatype) - input_0_datatype_cell = Cell(input_0_datatype) - input_1_datatype_cell = Cell(input_1_datatype) - - short_name = lambda n: {"BFLOAT16": "BF16", "BFLOAT8_B": "BFP8", "BFLOAT4_B": "BFP4"}.get(n, n) - - if "Matmul" in op_code.raw_value: - ( - dram_speed, - dram_percentage, - flops, - 
flops_percentage, - size, - memory_info, - math_fidelity, - is_dram_sharded, - adjusted_core_count, # Get the potentially adjusted core count - ) = analyze_matmul(row) - op_code = Cell(f"{op_code.raw_value} {size}") - dram_speed = Cell(dram_speed, unit="GB/s", decimals=0) - dram_percentage = Cell(dram_percentage, unit="%", decimals=1) - flops = Cell(flops / 1e12 if pd.notna(flops) else None, unit="TFLOPs", decimals=1) - flops_percentage = Cell(flops_percentage, unit="%", decimals=1) - cores.raw_value = adjusted_core_count - - math_fidelity_cell = Cell( - f"{math_fidelity} {short_name(input_0_datatype)} x {short_name(input_1_datatype)} => {short_name(output_datatype)}".strip() - if math_fidelity - else None - ) - else: - dram_speed = Cell(None, unit="GB/s", decimals=0) - dram_percentage = Cell(None, unit="%", decimals=1) - flops = Cell(None, unit="TFLOPs", decimals=1) - flops_percentage = Cell(None, unit="%", decimals=1) - - math_fidelity = "" - math_fidelity += f"{short_name(input_0_datatype)}" if pd.notna(input_0_datatype) else "" - math_fidelity += f", {short_name(input_1_datatype)}" if pd.notna(input_1_datatype) else "" - math_fidelity += f" => {short_name(output_datatype)}" if pd.notna(output_datatype) else "" - math_fidelity_cell = Cell(math_fidelity.strip()) - - is_dram_sharded = False - - output = { - "ID": None, - "Bound": Cell(""), - "OP Code": op_code, - "Device Time": device_time, - "Op-to-Op Gap": op_to_op_gap, - "Cores": cores, - "DRAM": dram_speed, - "DRAM %": dram_percentage, - "FLOPs": flops, - "FLOPs %": flops_percentage, - "Math Fidelity": math_fidelity_cell, - "Output Datatype": output_datatype_cell, - "Input 0 Datatype": input_0_datatype_cell, - "Input 1 Datatype": input_1_datatype_cell, - "DRAM Sharded": Cell(is_dram_sharded), - } - - input_0_memory = Cell(row["INPUT_0_MEMORY"] if pd.notna(row["INPUT_0_MEMORY"]) else None) - - # Extract program config details - attributes = row["ATTRIBUTES"] if pd.notna(row["ATTRIBUTES"]) else "" - in0_block_w = Cell(None) - out_subblock_h = Cell(None) - out_subblock_w = Cell(None) - - if "program_config" in attributes: - match = re.search(r"in0_block_w=(\d+)", attributes) - if match: - in0_block_w = Cell(int(match.group(1))) - - match = re.search(r"out_subblock_h=(\d+)", attributes) - if match: - out_subblock_h = Cell(int(match.group(1))) - - match = re.search(r"out_subblock_w=(\d+)", attributes) - if match: - out_subblock_w = Cell(int(match.group(1))) - - output["Input 0 Memory"] = input_0_memory - output["Inner Dim Block Size"] = in0_block_w - output["Output Subblock H"] = out_subblock_h - output["Output Subblock W"] = out_subblock_w - - return output, op_to_op_gap.raw_value - - -def add_derived_columns(rows): - total_duration = sum( - op_data["Device Time"].raw_value for op_data in rows if op_data["Device Time"].raw_value is not None - ) + sum(op_data["Op-to-Op Gap"].raw_value for op_data in rows if op_data["Op-to-Op Gap"].raw_value is not None) - for op_data in rows: - device_time = op_data["Device Time"].raw_value if op_data["Device Time"].raw_value is not None else 0 - op_to_op_gap = op_data["Op-to-Op Gap"].raw_value if op_data["Op-to-Op Gap"].raw_value is not None else 0 - op_data["Total %"] = Cell(((device_time + op_to_op_gap) / total_duration) * 100, unit="%", decimals=1) - if op_data["Device Time"].raw_value is None and op_data["Op-to-Op Gap"].raw_value is None: - op_data["Total %"].raw_value = None - - if "Matmul" in op_data["OP Code"].raw_value: - dram_percentage = op_data["DRAM %"].raw_value - flops_percentage = 
op_data["FLOPs %"].raw_value - if dram_percentage and flops_percentage: - if dram_percentage >= 65 and flops_percentage >= 65: - op_data["Bound"] = Cell("BOTH") - elif dram_percentage >= 65: - op_data["Bound"] = Cell("DRAM") - elif flops_percentage >= 65: - op_data["Bound"] = Cell("FLOP") - else: - op_data["Bound"] = Cell("SLOW") - elif "(torch)" in op_data["OP Code"].raw_value: - op_data["Bound"] = Cell("HOST") - - -def print_row(row, col_widths, headers): - def format_cell(header, cell): - # Avoid thousand separators for ID column - text = colored(str(cell.raw_value), cell.color) if header == "ID" else str(cell) - return pad_string(text, col_widths[headers.index(header)], align="left" if header == "OP Code" else "right") - - print(" ".join(format_cell(header, row[header]) for header in headers)) - - -def color_row(op_data, percentage, min_percentage): - if percentage is not None and percentage < min_percentage: - for v in op_data.values(): - v.color = "grey" - else: - op_colors = { - "(torch)": "red", - "Matmul": "magenta", - "LayerNorm": "cyan", - "AllGather": "cyan", - "AllReduce": "cyan", - "ScaledDotProductAttentionDecode": "blue", - "ScaledDotProductAttentionGQADecode": "blue", - "NlpCreateHeadsDeviceOperation": "blue", - "NLPConcatHeadsDecodeDeviceOperation": "blue", - "UpdateCache": "blue", - } - for op, color in op_colors.items(): - if op in op_data["OP Code"].raw_value: - op_data["OP Code"].color = color - break - else: - op_data["OP Code"].color = "white" - - num_cores = op_data["Cores"].raw_value - if num_cores is not None: - if num_cores < 10: - op_data["Cores"].color = "red" - elif num_cores == 64: - op_data["Cores"].color = "green" - else: - op_data["Cores"].color = "grey" - - if op_data["Bound"].raw_value == "DRAM": - op_data["Bound"].color = "green" - op_data["DRAM"].color = "green" - op_data["DRAM %"].color = "green" - elif op_data["Bound"].raw_value == "FLOP": - op_data["Bound"].color = "green" - op_data["FLOPs"].color = "green" - op_data["FLOPs %"].color = "green" - elif op_data["Bound"].raw_value == "SLOW": - op_data["Bound"].color = "yellow" - dram_percentage = op_data["DRAM %"].raw_value - flops_percentage = op_data["FLOPs %"].raw_value - if dram_percentage is not None and flops_percentage is not None: - if dram_percentage > flops_percentage: - op_data["DRAM"].color = "yellow" - op_data["DRAM %"].color = "yellow" - else: - op_data["FLOPs"].color = "yellow" - op_data["FLOPs %"].color = "yellow" - elif op_data["Bound"].raw_value == "HOST": - op_data["Bound"].color = "red" - - if op_data["Op-to-Op Gap"].raw_value is not None and op_data["Op-to-Op Gap"].raw_value > 6.5: - op_data["Op-to-Op Gap"].color = "red" - - if "Matmul" in op_data["OP Code"].raw_value and op_data["Math Fidelity"].raw_value: - math_fidelity = op_data["Math Fidelity"].raw_value.split()[0] - input_0_datatype = op_data["Input 0 Datatype"].raw_value - input_1_datatype = op_data["Input 1 Datatype"].raw_value - output_datatype = op_data["Output Datatype"].raw_value - - fidelity_evaluation, _ = evaluate_fidelity( - input_0_datatype, input_1_datatype, output_datatype, math_fidelity - ) - - if fidelity_evaluation == "sufficient": - op_data["Math Fidelity"].color = "green" - elif fidelity_evaluation == "too_high": - op_data["Math Fidelity"].color = "red" - elif fidelity_evaluation == "too_low": - op_data["Math Fidelity"].color = "cyan" - else: - op_data["Math Fidelity"].color = "white" - - return op_data - - -def print_performance_table(rows, headers, col_widths, device_ops, host_ops): - print("\n🚀 
Performance Report 🚀\n========================\n") - - print(" ".join(pad_string(header, col_widths[i], align="left") for i, header in enumerate(headers))) - print("-" * sum(col_widths) + "-" * (len(headers) - 1) * 2) - - for idx, op_data in enumerate(rows): - print_row(op_data, col_widths, headers) - - print("-" * (sum(col_widths) + (len(headers) - 1) * 2)) - - total_device_time = sum( - op_data["Device Time"].raw_value for op_data in rows if op_data["Device Time"].raw_value is not None - ) - total_visible_gap = sum( - op_data["Op-to-Op Gap"].raw_value for op_data in rows if op_data["Op-to-Op Gap"].raw_value is not None - ) - total_row = { - "ID": Cell(""), - "Total %": Cell(100.0, unit="%", decimals=1), - "Bound": Cell(""), - "OP Code": Cell(f"{device_ops} device ops, {host_ops} host ops"), - "Device Time": Cell(total_device_time, unit="us", decimals=0), - "Op-to-Op Gap": Cell(total_visible_gap, unit="us", decimals=0), - } - for header in headers: - if header not in total_row: - total_row[header] = Cell("") - print_row( - {k: Cell(v.raw_value, v.unit, v.decimals, color="grey") for k, v in total_row.items()}, col_widths, headers - ) - - -def print_advice_section(rows, headers, col_widths): - print("\n💡 Advice 💡\n============\n") - - print_fallback_advice(rows, headers, col_widths) - print_op_to_op_gap_advice(rows, headers, col_widths) - print_matmul_advice(rows, headers, col_widths) - - -def print_fallback_advice(rows, headers, col_widths): - host_ops = [op_data for op_data in rows if "(torch)" in op_data["OP Code"].raw_value] - if host_ops: - print("Fallback\n--------") - for op_data in host_ops: - print_row(op_data, col_widths, headers) - print("\nThese ops should be moved to run on device.\n") - - -def print_op_to_op_gap_advice(rows, headers, col_widths): - high_gap_ops = [ - (idx + 1, op_data) - for idx, op_data in enumerate(rows) - if op_data["Op-to-Op Gap"].raw_value is not None and op_data["Op-to-Op Gap"].raw_value > 6.5 - ] - - if high_gap_ops: - print("High Op-to-Op Gap\n----------------") - for idx, op_data in high_gap_ops: - print_row(op_data, col_widths, headers) - max_gap_overhead = sum(op_data["Op-to-Op Gap"].raw_value - 6 for _, op_data in high_gap_ops) - - total_duration = sum( - op_data["Device Time"].raw_value for op_data in rows if op_data["Device Time"].raw_value is not None - ) + sum(op_data["Op-to-Op Gap"].raw_value for op_data in rows if op_data["Op-to-Op Gap"].raw_value is not None) - - percentage_saved = (max_gap_overhead / total_duration) * 100 - print( - f"\nThese ops have a >6us gap since the previous operation. Running with tracing could save {max_gap_overhead:.0f} us ({percentage_saved:.1f}% of overall time)" - ) - print( - "Alternatively ensure device is not waiting for the host and use device.enable_async(True). 
Experts can try moving runtime args in the kernels to compile-time args.\n" - ) - - -def print_matmul_advice(rows, headers, col_widths): - matmul_ops = [op_data for op_data in rows if "Matmul" in op_data["OP Code"].raw_value] - - if matmul_ops: - print("Matmul Optimization\n-------------------") - for op_data in matmul_ops: - print_row(op_data, col_widths, headers) - advice = [] - color = "grey" if op_data["OP Code"].color == "grey" else "white" - - math_fidelity = ( - op_data["Math Fidelity"].raw_value.split()[0] if op_data["Math Fidelity"].raw_value else None - ) - output_datatype = op_data["Output Datatype"].raw_value - input_0_datatype = op_data["Input 0 Datatype"].raw_value - input_1_datatype = op_data["Input 1 Datatype"].raw_value - cores = op_data["Cores"].raw_value - fidelity_evaluation, fidelity_advice = evaluate_fidelity( - input_0_datatype, input_1_datatype, output_datatype, math_fidelity - ) - - if op_data["Bound"].raw_value in ["DRAM", "BOTH"]: - if not op_data["DRAM Sharded"].raw_value: - advice.append( - "- Try a DRAM-sharded program config (MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig) to improve throughput further" - ) - if fidelity_evaluation == "too_low" and op_data["FLOPs %"].raw_value < 40: - advice.append(f"- {fidelity_advice}") - if fidelity_evaluation == "too_high": - advice.append(f"- {fidelity_advice}") - elif op_data["Bound"].raw_value in ["FLOP", "BOTH"]: - if cores < 64: - advice.append(f"- Increase grid size (currently using {cores})") - if fidelity_evaluation == "too_high": - advice.append(f"- {fidelity_advice}") - elif op_data["Bound"].raw_value == "SLOW": - input_0_memory = op_data["Input 0 Memory"].raw_value - if input_0_memory and "L1" not in input_0_memory: - advice.append(f"- If possible place input 0 in L1 (currently in {input_0_memory})") - - inner_dim_block = op_data["Inner Dim Block Size"].raw_value - out_h = op_data["Output Subblock H"].raw_value - out_w = op_data["Output Subblock W"].raw_value - - if inner_dim_block is None and out_h is None and out_w is None: - advice.append( - "- No program_config specified, try using one to override in0_block_w and out_subblock_h/w" - ) - else: - all_good = True - if inner_dim_block is not None: - if inner_dim_block < 2: - advice.append(f"- in0_block_w={inner_dim_block} is small, try in0_block_w=2 or above") - all_good = False - else: - advice.append("- No inner dim block size found") - all_good = False - - if out_h is not None and out_w is not None: - out_area = out_h * out_w - if out_area < 2: - advice.append( - f"- Output subblock {out_h}x{out_w} is small, try out_subblock_h * out_subblock_w >= 2 if possible" - ) - all_good = False - else: - advice.append("- No output subblock size found") - all_good = False - - if all_good: - advice.append( - f"- in0_block_w={inner_dim_block} and output subblock {out_h}x{out_w} look good 🤷" - ) - if fidelity_advice: - advice.append(f"- {fidelity_advice}") - - if advice: - for item in advice: - print(colored(item, color)) - else: - print(colored("✅ Optimized", color)) - print() # Add a blank line between matmuls - - -def merge_device_rows(df): - block_by_device = defaultdict(list) - - for _, row in df.iterrows(): - op_name = row["OP CODE"] - op_type = row["OP TYPE"] - - if op_type == "tt_dnn_device": - device_id = int(row["DEVICE ID"]) - block_by_device[device_id].append((op_name, row.to_dict())) - - device_ids = sorted(block_by_device.keys()) - merged_blocks = [] - - for blocks in zip(*[block_by_device[device_id] for device_id in device_ids]): - op_name = 
blocks[0][0] - - if "AllGather" in op_name or "ReduceScatter" in op_name: - # For collective ops, take the row with minimum duration - min_duration_block = min(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"]) - merged_blocks.append(min_duration_block[1]) - else: - # For non-collective ops, take the row with maximum duration - max_duration_block = max(blocks, key=lambda x: x[1]["DEVICE KERNEL DURATION [ns]"]) - merged_blocks.append(max_duration_block[1]) - - return pd.DataFrame(merged_blocks) - - -def parse_id_range(id_range_str): - if id_range_str is None: - return None - - parts = id_range_str.split("-") - if len(parts) != 2: - raise ValueError("Invalid ID range format") - - start = int(parts[0].replace(",", "")) if parts[0] else None - end = int(parts[1].replace(",", "")) if parts[1] else None - - return (start, end) - - -def filter_by_id_range(rows, id_range): - if id_range: - start, end = id_range - if start is None: - print(colored(f"Filtering rows with IDs up to {end}", "cyan")) - filtered_rows = [row for row in rows if row["ID"].raw_value <= end] - elif end is None: - print(colored(f"Filtering rows with IDs from {start} onwards", "cyan")) - filtered_rows = [row for row in rows if row["ID"].raw_value >= start] - else: - print(colored(f"Filtering rows with IDs from {start} to {end}", "cyan")) - filtered_rows = [row for row in rows if start <= row["ID"].raw_value <= end] - - # Reset the op-to-op gap for the first item in the filtered range - if filtered_rows: - filtered_rows[0]["Op-to-Op Gap"] = Cell(None, unit="us", decimals=0) - - return filtered_rows - return rows - - -def main(csv_file, signpost, ignore_signposts, min_percentage, id_range, csv_output_file, no_advice, tracing_mode): - df = pd.read_csv(csv_file, low_memory=False) - - # Add a column for original row numbers - df["ORIGINAL_ROW"] = df.index + 2 # +2 to match Excel row numbers (1-based + header) - - # Sort the DataFrame by "HOST START TS" column - # Sorting by HOST START TS is incorrect when using tracing mode since the tracing ops timestamps are the ones when captured and not executed - if "HOST START TS" in df.columns and not tracing_mode: - print(colored("Sorting CSV by 'HOST START TS' column...", "cyan")) - df = df.sort_values(by="HOST START TS") - else: - print(colored("Warning: 'HOST START TS' column not found. CSV will not be sorted.", "yellow")) - - df = filter_by_signpost(df, signpost, ignore_signposts) - - # Check if the file contains multiple devices - if "DEVICE ID" in df.columns and df["DEVICE ID"].nunique() > 1: - print(colored(f"Detected data from {df['DEVICE ID'].nunique()} devices. 
Merging device data...", "cyan")) - df = merge_device_rows(df) - - rows = [] - prev_row = None - device_ops = 0 - host_ops = 0 - for _, row in df.iterrows(): - op_data, current_gap = analyze_op(row, prev_row) - op_data["ID"] = Cell(row["ORIGINAL_ROW"]) # Use the original row number - rows.append(op_data) - prev_row = row - - # Count device and host ops - if "(torch)" in op_data["OP Code"].raw_value: - host_ops += 1 - else: - device_ops += 1 - - # Calculate total duration and add derived columns - add_derived_columns(rows) - - # Filter rows based on id_range - rows = filter_by_id_range(rows, id_range) - - # Recalculate derived columns after filtering - add_derived_columns(rows) - - rows = [color_row(op_data, op_data["Total %"].raw_value, min_percentage) for op_data in rows] - - visible_headers = [ - "ID", - "Total %", - "Bound", - "OP Code", - "Device Time", - "Op-to-Op Gap", - "Cores", - "DRAM", - "DRAM %", - "FLOPs", - "FLOPs %", - "Math Fidelity", - ] - - if csv_output_file: - all_headers = visible_headers + [ - "Output Datatype", - "Input 0 Datatype", - "Input 1 Datatype", - "DRAM Sharded", - "Input 0 Memory", - "Inner Dim Block Size", - "Output Subblock H", - "Output Subblock W", - ] - print(colored(f"Writing CSV output to {csv_output_file}", "cyan")) - with open(csv_output_file, "w") as f: - f.write(",".join(all_headers) + "\n") - for op_data in rows: - f.write(",".join(str(op_data[header].raw_value) for header in all_headers) + "\n") - else: - col_widths = [ - max(max(visible_length(str(row[header])) for row in rows), visible_length(header)) - for header in visible_headers - ] - print_performance_table(rows, visible_headers, col_widths, device_ops, host_ops) - if not no_advice: - print_advice_section(rows, visible_headers, col_widths) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="User-friendly Performance Report Analysis Tool") - parser.add_argument("csv_file", type=str, help="Path to the performance report CSV file") - parser.add_argument("--signpost", type=str, help="Specify a signpost to use for analysis", default=None) - parser.add_argument( - "--ignore-signposts", action="store_true", help="Ignore all signposts and use the entire file for analysis" - ) - parser.add_argument( - "--min-percentage", type=float, default=0.5, help="Minimum percentage for coloring (default: 0.5)" - ) - parser.add_argument( - "--id-range", type=str, help="Show only rows with IDs in the specified range (e.g., '5-10', '31-', or '-12')" - ) - parser.add_argument("--color", action="store_true", help="Force colored output even when output is redirected") - parser.add_argument("--no-color", action="store_true", help="Force output without color") - parser.add_argument("--csv", type=str, help="Output filename for CSV format", metavar="OUTPUT_FILE") - parser.add_argument("--no-advice", action="store_true", help="Only show the table section of the report") - parser.add_argument("--tracing-mode", action="store_true", help="Do not sort when in tracing mode") - args = parser.parse_args() - - # Set the global color_output variable - set_color_output(args.color, args.no_color) - - # Parse id_range - try: - id_range = parse_id_range(args.id_range) - except ValueError: - print(colored("Invalid --id-range format. 
Please use 'START-END', 'START-', or '-END'.", "red")) - exit(1) - - main( - args.csv_file, - args.signpost, - args.ignore_signposts, - args.min_percentage, - id_range, - args.csv, - args.no_advice, - args.tracing_mode, - ) From fefe4788e1dd48e80b6e4f84c4ef7217cd5d41d8 Mon Sep 17 00:00:00 2001 From: Pavle Janevski <165378935+pjanevskiTT@users.noreply.github.com> Date: Sat, 15 Feb 2025 01:27:41 +0100 Subject: [PATCH 114/316] Bump UMD to fix TTDevice mutex issue (#17887) --- tt_metal/third_party/umd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index 5de287e9c5b..ebb0f945ed8 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit 5de287e9c5b2fa3d55fbfd53e9bc59e2050f32fb +Subproject commit ebb0f945ed8d3c05e043158978201ed6fab884ec From 907fffd392cd4c8101313decfc4fea99a3fea2f8 Mon Sep 17 00:00:00 2001 From: Daiki Aminaka Date: Fri, 14 Feb 2025 21:48:49 -0800 Subject: [PATCH 115/316] Apply refactored constants. test bug fix (#17895) ### Ticket N/A ### Problem description Previous constant name refactoring was not applied to this file ### What's changed Use new constant ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../tt_fabric_traffic_gen_rx_socket.cpp | 32 +++++++++---------- .../kernels/tt_fabric_traffic_gen_tx.cpp | 26 +++++++-------- .../tt_fabric_traffic_gen_tx_socket.cpp | 26 +++++++-------- 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp index 99330aa8047..f2152656090 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp @@ -64,27 +64,27 @@ void kernel_main() { tt_fabric_init(); zero_l1_buf(test_results, test_results_size_bytes); - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; - test_results[PQ_TEST_MISC_INDEX] = 0xff000000; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_STARTED; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000000; zero_l1_buf( reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); - test_results[PQ_TEST_MISC_INDEX] = 0xff000001; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000001; zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); - test_results[PQ_TEST_MISC_INDEX] = 0xff000002; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000002; zero_l1_buf((uint32_t*)client_pull_req_buf, sizeof(chan_req_buf)); - test_results[PQ_TEST_MISC_INDEX] = 0xff000003; + test_results[TT_FABRIC_MISC_INDEX] = 
0xff000003; client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l; client_interface->gk_msg_buf_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, gk_msg_buf); client_interface->pull_req_buf_addr = xy_local_addr | client_pull_req_buf_addr; - test_results[PQ_TEST_MISC_INDEX] = 0xff000004; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000004; // make sure fabric node gatekeeper is available. fabric_endpoint_init(); socket_reader.init(data_buffer_start_addr, data_buffer_size_words); DPRINT << "Socket open on " << dest_device << ENDL(); - test_results[PQ_TEST_MISC_INDEX] = 0xff000005; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000005; fabric_socket_open( 3, // the network plane to use for this socket @@ -96,7 +96,7 @@ void kernel_main() { dest_device & 0xFFFF, 0 // fabric virtual channel. ); - test_results[PQ_TEST_MISC_INDEX] = 0xff000006; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000006; uint32_t loop_count = 0; uint32_t packet_count = 0; @@ -110,7 +110,7 @@ void kernel_main() { } if (pull_req->flags == FORWARD) { socket_reader.pull_socket_data(pull_req); - test_results[PQ_TEST_MISC_INDEX] = 0xDD000001; + test_results[TT_FABRIC_MISC_INDEX] = 0xDD000001; noc_async_read_barrier(); update_pull_request_words_cleared(pull_req); socket_reader.pull_words_in_flight = 0; @@ -119,11 +119,11 @@ void kernel_main() { if (socket_reader.packet_in_progress == 1 and socket_reader.packet_words_remaining == 0) { // wait for any pending sockat data writes to finish. - test_results[PQ_TEST_MISC_INDEX] = 0xDD000002; + test_results[TT_FABRIC_MISC_INDEX] = 0xDD000002; noc_async_write_barrier(); - test_results[PQ_TEST_MISC_INDEX] = 0xDD000003; + test_results[TT_FABRIC_MISC_INDEX] = 0xDD000003; // clear the flags field to invalidate pull request slot. // flags will be set to non-zero by next requestor. 
req_buf_advance_rdptr((chan_req_buf*)client_pull_req_buf); @@ -132,7 +132,7 @@ void kernel_main() { loop_count = 0; } } - test_results[PQ_TEST_MISC_INDEX] = 0xDD400000 | (loop_count & 0xfffff); + test_results[TT_FABRIC_MISC_INDEX] = 0xDD400000 | (loop_count & 0xfffff); loop_count++; if (packet_count > 0 and loop_count >= 0x10000) { @@ -142,13 +142,13 @@ void kernel_main() { } // write out results - set_64b_result(test_results, processed_packet_words, PQ_TEST_WORD_CNT_INDEX); + set_64b_result(test_results, processed_packet_words, TT_FABRIC_WORD_CNT_INDEX); set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT); if (async_wr_check_failed) { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_DATA_MISMATCH; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_DATA_MISMATCH; } else { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; - test_results[PQ_TEST_MISC_INDEX] = 0xff000005; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_PASS; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000005; } } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index c13ac0ea9cf..48351327002 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -403,11 +403,11 @@ void kernel_main() { rx_addr_hi = base_target_address + rx_buf_size; zero_l1_buf(test_results, test_results_size_bytes); - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; - test_results[PQ_TEST_STATUS_INDEX+1] = (uint32_t) local_pull_request; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_STARTED; + test_results[TT_FABRIC_STATUS_INDEX + 1] = (uint32_t)local_pull_request; - test_results[PQ_TEST_MISC_INDEX] = 0xff000000; - test_results[PQ_TEST_MISC_INDEX + 1] = 0xcc000000 | src_endpoint_id; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000000; + test_results[TT_FABRIC_MISC_INDEX + 1] = 0xcc000000 | src_endpoint_id; zero_l1_buf(reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t)); @@ -445,7 +445,7 @@ void kernel_main() { // all the tx workers are ready on this chip while (*(volatile tt_l1_ptr uint32_t*)signal_address == 0); - test_results[PQ_TEST_MISC_INDEX] = 0xff000001; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000001; uint64_t data_words_sent = 0; uint64_t iter = 0; @@ -520,9 +520,9 @@ void kernel_main() { uint64_t cycles_elapsed = get_timestamp() - start_timestamp; uint64_t num_packets = input_queue_state.get_num_packets(); - set_64b_result(test_results, data_words_sent, PQ_TEST_WORD_CNT_INDEX); - set_64b_result(test_results, cycles_elapsed, PQ_TEST_CYCLES_INDEX); - set_64b_result(test_results, iter, PQ_TEST_ITER_INDEX); + set_64b_result(test_results, data_words_sent, TT_FABRIC_WORD_CNT_INDEX); + set_64b_result(test_results, cycles_elapsed, TT_FABRIC_CYCLES_INDEX); + set_64b_result(test_results, iter, TT_FABRIC_ITER_INDEX); set_64b_result(test_results, total_data_words, TX_TEST_IDX_TOT_DATA_WORDS); set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT); set_64b_result(test_results, zero_data_sent_iter, TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); @@ -530,13 +530,13 @@ void kernel_main() { set_64b_result(test_results, many_data_sent_iter, TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER); if 
(test_producer.packet_corrupted) { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_BAD_HEADER; - test_results[PQ_TEST_MISC_INDEX] = packet_count; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_BAD_HEADER; + test_results[TT_FABRIC_MISC_INDEX] = packet_count; } else if (!timeout) { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; - test_results[PQ_TEST_MISC_INDEX] = packet_count; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_PASS; + test_results[TT_FABRIC_MISC_INDEX] = packet_count; } else { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_TIMEOUT; set_64b_result(test_results, words_flushed, TX_TEST_IDX_WORDS_FLUSHED); } } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp index 0fcb8ae7c38..c4518f246b7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp @@ -338,11 +338,11 @@ void kernel_main() { } zero_l1_buf(test_results, test_results_size_bytes); - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; - test_results[PQ_TEST_STATUS_INDEX + 1] = (uint32_t)local_pull_request; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_STARTED; + test_results[TT_FABRIC_STATUS_INDEX + 1] = (uint32_t)local_pull_request; - test_results[PQ_TEST_MISC_INDEX] = 0xff000000; - test_results[PQ_TEST_MISC_INDEX + 1] = 0xcc000000 | src_endpoint_id; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000000; + test_results[TT_FABRIC_MISC_INDEX + 1] = 0xcc000000 | src_endpoint_id; zero_l1_buf( reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); @@ -385,7 +385,7 @@ void kernel_main() { // once tt_fabric kernels have been launched on all the test devices. 
 while (*(volatile tt_l1_ptr uint32_t*)signal_address == 0); - test_results[PQ_TEST_MISC_INDEX] = 0xff000001; + test_results[TT_FABRIC_MISC_INDEX] = 0xff000001; uint64_t data_words_sent = 0; uint64_t iter = 0; @@ -476,9 +476,9 @@ void kernel_main() { uint64_t cycles_elapsed = get_timestamp() - start_timestamp; uint64_t num_packets = input_queue_state.get_num_packets(); - set_64b_result(test_results, data_words_sent, PQ_TEST_WORD_CNT_INDEX); - set_64b_result(test_results, cycles_elapsed, PQ_TEST_CYCLES_INDEX); - set_64b_result(test_results, iter, PQ_TEST_ITER_INDEX); + set_64b_result(test_results, data_words_sent, TT_FABRIC_WORD_CNT_INDEX); + set_64b_result(test_results, cycles_elapsed, TT_FABRIC_CYCLES_INDEX); + set_64b_result(test_results, iter, TT_FABRIC_ITER_INDEX); set_64b_result(test_results, total_data_words, TX_TEST_IDX_TOT_DATA_WORDS); set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT); set_64b_result(test_results, zero_data_sent_iter, TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); @@ -486,13 +486,13 @@ void kernel_main() { set_64b_result(test_results, many_data_sent_iter, TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER); if (test_producer.packet_corrupted) { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_BAD_HEADER; - test_results[PQ_TEST_MISC_INDEX] = packet_count; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_BAD_HEADER; + test_results[TT_FABRIC_MISC_INDEX] = packet_count; } else if (!timeout) { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; - test_results[PQ_TEST_MISC_INDEX] = packet_count; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_PASS; + test_results[TT_FABRIC_MISC_INDEX] = packet_count; } else { - test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT; + test_results[TT_FABRIC_STATUS_INDEX] = TT_FABRIC_STATUS_TIMEOUT; set_64b_result(test_results, words_flushed, TX_TEST_IDX_WORDS_FLUSHED); } } From 52c53d562387ad929c76d08f6b26834d5f4fa60e Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Sat, 15 Feb 2025 20:55:29 -0500 Subject: [PATCH 116/316] #17477: Introduce ND coordinate system for TT-distributed (#17745) ### Ticket #17477 ### Problem description Existing mesh infra assumes 2D. This assumption won't hold in the future. ### What's changed Introduce a new `SimpleMeshShape` that will gradually replace the existing `MeshShape`, after which it will be renamed to `MeshShape`. Introduce `MeshCoordinate`, `MeshCoordinateRange`, and `MeshContainer` - primitives designed to work with the new ND coordinate system. `MeshContainer` allows an efficient flat representation of metadata that matches the mesh shape, and provides iterators to make traversal easy. `MeshCoordinate`, together with the strides precomputed on `SimpleMeshShape`, allows for easy point access. The integration with `MeshBuffer` demonstrates the use case; a minimal usage sketch is also included below. Next steps: * Replace the existing `MeshShape`, `MeshOffset`, and the related aliases with the new `SimpleMeshShape` and `MeshCoordinate`. * No plans to generalize with `CoreCoord` for now. Cores are fundamentally 2D, so a more specialized system can be used for efficiency. It is also undesirable to make `CoreCoord` interoperate with `MeshCoordinate` - the two sets of coordinates represent entirely different concepts. * More functionality might be added, as we continue working on TT-distributed.
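For reviewers new to these primitives, here is a minimal usage sketch. It is not part of the patch: the stand-alone `main`, the `workload_ids` payload, and the bare `mesh_coord.hpp` include path (mirroring the unit test below) are illustrative assumptions; only `SimpleMeshShape`, `MeshCoordinate`, `MeshContainer`, and `to_linear_index` come from the header introduced in this commit.

```cpp
// Illustrative only -- exercises the ND coordinate API added by this commit.
#include <iostream>

#include "mesh_coord.hpp"  // assumed include path, as in test_mesh_coord.cpp

using namespace tt::tt_metal::distributed;

int main() {
    // A 2x3x4 mesh; strides are precomputed, so coordinate -> linear index is cheap.
    SimpleMeshShape shape(2, 3, 4);
    std::cout << "mesh size: " << shape.mesh_size() << "\n";  // prints 24

    // Flat, row-major storage addressed by ND coordinates (hypothetical payload).
    MeshContainer<int> workload_ids(shape, /*fill_value=*/-1);
    workload_ids.at(MeshCoordinate(1, 2, 3)) = 42;

    // Iteration yields a (coordinate, value) proxy usable with structured bindings.
    for (const auto& [coord, value] : workload_ids) {
        if (value != -1) {
            // For shape (2, 3, 4), coordinate (1, 2, 3) maps to linear index 23.
            std::cout << coord << " -> " << value << " (linear index "
                      << to_linear_index(shape, coord) << ")\n";
        }
    }
    return 0;
}
```

The row-major iteration order matches the storage layout, which is what lets `MeshBuffer` keep its per-device buffers in a single flat `MeshContainer` while still addressing them by `MeshCoordinate`.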
### Checklist - [X] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13347753550) - [X] New/Existing tests provide coverage for changes --- tests/tt_metal/distributed/CMakeLists.txt | 1 + .../tt_metal/distributed/test_mesh_coord.cpp | 290 ++++++++++++++ tt_metal/api/tt-metalium/mesh_buffer.hpp | 7 +- tt_metal/api/tt-metalium/mesh_coord.hpp | 370 ++++++++++++++++++ tt_metal/api/tt-metalium/mesh_device.hpp | 2 + tt_metal/api/tt-metalium/shape_base.hpp | 6 +- tt_metal/common/CMakeLists.txt | 1 + tt_metal/common/mesh_coord.cpp | 161 ++++++++ tt_metal/common/shape_base.cpp | 11 +- tt_metal/distributed/mesh_buffer.cpp | 28 +- tt_metal/distributed/mesh_device.cpp | 6 +- .../distributed_buffer_rw.cpp | 2 +- .../distributed_eltwise_add.cpp | 2 +- .../ttnn/operations/data_movement/pad/pad.cpp | 5 +- 14 files changed, 862 insertions(+), 30 deletions(-) create mode 100644 tests/tt_metal/distributed/test_mesh_coord.cpp create mode 100644 tt_metal/api/tt-metalium/mesh_coord.hpp create mode 100644 tt_metal/common/mesh_coord.cpp diff --git a/tests/tt_metal/distributed/CMakeLists.txt b/tests/tt_metal/distributed/CMakeLists.txt index 27bb9ee7b53..08fededb592 100644 --- a/tests/tt_metal/distributed/CMakeLists.txt +++ b/tests/tt_metal/distributed/CMakeLists.txt @@ -1,6 +1,7 @@ set(UNIT_TESTS_DISTRIBUTED_SRC ${CMAKE_CURRENT_SOURCE_DIR}/test_distributed.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_coord.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_workload.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_sub_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_allocator.cpp diff --git a/tests/tt_metal/distributed/test_mesh_coord.cpp b/tests/tt_metal/distributed/test_mesh_coord.cpp new file mode 100644 index 00000000000..09853a488a0 --- /dev/null +++ b/tests/tt_metal/distributed/test_mesh_coord.cpp @@ -0,0 +1,290 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "mesh_coord.hpp" + +namespace tt::tt_metal::distributed { +namespace { + +using ::testing::ElementsAre; + +TEST(SimpleMeshShapeTest, Construction) { + SimpleMeshShape shape_1d(3); + EXPECT_EQ(shape_1d.dims(), 1); + EXPECT_EQ(shape_1d[0], 3); + EXPECT_EQ(shape_1d.mesh_size(), 3); + + SimpleMeshShape shape_2d(3, 4); + EXPECT_EQ(shape_2d.dims(), 2); + EXPECT_EQ(shape_2d[0], 3); + EXPECT_EQ(shape_2d[1], 4); + EXPECT_EQ(shape_2d.mesh_size(), 12); + + SimpleMeshShape shape_3d(2, 3, 4); + EXPECT_EQ(shape_3d.dims(), 3); + EXPECT_EQ(shape_3d[0], 2); + EXPECT_EQ(shape_3d[1], 3); + EXPECT_EQ(shape_3d[2], 4); + EXPECT_EQ(shape_3d.mesh_size(), 24); + + SimpleMeshShape shape_5d({2, 3, 4, 5, 6}); + EXPECT_EQ(shape_5d.dims(), 5); + EXPECT_EQ(shape_5d[0], 2); + EXPECT_EQ(shape_5d[1], 3); + EXPECT_EQ(shape_5d[2], 4); + EXPECT_EQ(shape_5d[3], 5); + EXPECT_EQ(shape_5d[4], 6); + EXPECT_EQ(shape_5d.mesh_size(), 720); +} + +TEST(SimpleMeshShapeTest, ZeroShape) { + SimpleMeshShape shape({}); + EXPECT_EQ(shape.dims(), 0); + EXPECT_EQ(shape.mesh_size(), 0); +} + +TEST(SimpleMeshShapeTest, Strides) { + SimpleMeshShape shape(2, 3, 4); + EXPECT_EQ(shape.get_stride(0), 12); // 3 * 4 + EXPECT_EQ(shape.get_stride(1), 4); // 4 + EXPECT_EQ(shape.get_stride(2), 1); // 1 +} + +TEST(SimpleMeshShapeTest, Comparison) { + SimpleMeshShape shape(2, 3); + + EXPECT_EQ(shape, SimpleMeshShape(2, 3)); + EXPECT_NE(shape, SimpleMeshShape(3, 2)); + EXPECT_NE(shape, SimpleMeshShape(1, 2, 3)); +} + +TEST(MeshCoordinateTest, Construction) { + MeshCoordinate coord_1d(1); + EXPECT_EQ(coord_1d.dims(), 1); + EXPECT_THAT(coord_1d.coords(), ElementsAre(1)); + EXPECT_EQ(coord_1d[0], 1); + + MeshCoordinate coord_2d(1, 2); + EXPECT_EQ(coord_2d.dims(), 2); + EXPECT_THAT(coord_2d.coords(), ElementsAre(1, 2)); + EXPECT_EQ(coord_2d[0], 1); + EXPECT_EQ(coord_2d[1], 2); + + MeshCoordinate coord_3d(1, 2, 3); + EXPECT_EQ(coord_3d.dims(), 3); + EXPECT_THAT(coord_3d.coords(), ElementsAre(1, 2, 3)); + EXPECT_EQ(coord_3d[0], 1); + EXPECT_EQ(coord_3d[1], 2); + EXPECT_EQ(coord_3d[2], 3); + + std::vector values = {1, 2, 3, 4, 5}; + MeshCoordinate coord_span(values); + EXPECT_EQ(coord_span.dims(), 5); + EXPECT_THAT(coord_span.coords(), ElementsAre(1, 2, 3, 4, 5)); + EXPECT_EQ(coord_span[0], 1); + EXPECT_EQ(coord_span[1], 2); + EXPECT_EQ(coord_span[2], 3); + EXPECT_EQ(coord_span[3], 4); + EXPECT_EQ(coord_span[4], 5); +} + +TEST(MeshCoordinateTest, Comparison) { + MeshCoordinate coord1(1, 2); + + EXPECT_EQ(coord1, MeshCoordinate(1, 2)); + EXPECT_NE(coord1, MeshCoordinate(2, 1)); + EXPECT_NE(coord1, MeshCoordinate(1, 2, 1)); +} + +TEST(MeshCoordinateRangeTest, FromShape) { + SimpleMeshShape shape(2, 3); + MeshCoordinateRange range(shape); + + std::vector coords; + for (const auto& coord : range) { + coords.push_back(coord); + } + + EXPECT_THAT( + coords, + ElementsAre( + MeshCoordinate(0, 0), + MeshCoordinate(0, 1), + MeshCoordinate(0, 2), + MeshCoordinate(1, 0), + MeshCoordinate(1, 1), + MeshCoordinate(1, 2))); +} + +TEST(MeshCoordinateRangeTest, Subrange) { + MeshCoordinate start(1, 1, 1); + MeshCoordinate end(2, 1, 4); + MeshCoordinateRange range(start, end); + + std::vector coords; + for (const auto& coord : range) { + coords.push_back(coord); + } + + EXPECT_THAT( + coords, + ElementsAre( + MeshCoordinate(1, 1, 1), + MeshCoordinate(1, 1, 2), + MeshCoordinate(1, 1, 3), + MeshCoordinate(1, 1, 4), + MeshCoordinate(2, 1, 1), + MeshCoordinate(2, 1, 2), + MeshCoordinate(2, 1, 3), + 
MeshCoordinate(2, 1, 4))); +} + +TEST(MeshCoordinateRangeTest, SubrangeOneElement) { + MeshCoordinate start(1, 1, 1); + MeshCoordinate end(1, 1, 1); + MeshCoordinateRange range(start, end); + + std::vector coords; + for (const auto& coord : range) { + coords.push_back(coord); + } + + EXPECT_THAT(coords, ElementsAre(MeshCoordinate(1, 1, 1))); +} + +TEST(MeshCoordinateRangeTest, MismatchedDimensions) { + MeshCoordinate start(1, 0); + MeshCoordinate end(2, 3, 1); + EXPECT_ANY_THROW(MeshCoordinateRange(start, end)); +} + +TEST(MeshCoordinateRangeTest, InvalidRange) { + MeshCoordinate start(1, 2, 0); + MeshCoordinate end(1, 1, 1); + EXPECT_ANY_THROW(MeshCoordinateRange(start, end)); +} + +TEST(ToLinearIndexTest, Basic) { + SimpleMeshShape shape(2, 2, 3); + + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(0, 0, 0)), 0); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(0, 0, 1)), 1); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(0, 0, 2)), 2); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(0, 1, 0)), 3); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(0, 1, 1)), 4); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(0, 1, 2)), 5); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(1, 0, 0)), 6); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(1, 0, 1)), 7); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(1, 0, 2)), 8); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(1, 1, 0)), 9); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(1, 1, 1)), 10); + EXPECT_EQ(to_linear_index(shape, MeshCoordinate(1, 1, 2)), 11); +} + +TEST(ToLinearIndexTest, MismatchedDimensions) { + EXPECT_ANY_THROW(to_linear_index(SimpleMeshShape(1, 2, 3), MeshCoordinate(0, 0))); +} + +TEST(ToLinearIndexTest, OutOfBounds) { + EXPECT_ANY_THROW(to_linear_index(SimpleMeshShape(2, 3), MeshCoordinate(2, 0))); + EXPECT_ANY_THROW(to_linear_index(SimpleMeshShape(2, 3), MeshCoordinate(0, 3))); +} + +TEST(MeshContainerTest, InitialValues) { + SimpleMeshShape shape(2, 3); + MeshContainer container(shape, 3); + + std::vector initial_values; + for (const auto& [_, value] : container) { + initial_values.push_back(value); + } + EXPECT_THAT(initial_values, ElementsAre(3, 3, 3, 3, 3, 3)); +} + +TEST(MeshContainerTest, ElementAccessRowMajor) { + SimpleMeshShape shape(2, 3); + MeshContainer container(shape, 0); + + container.at(MeshCoordinate(0, 0)) = 0; + container.at(MeshCoordinate(0, 1)) = 1; + container.at(MeshCoordinate(0, 2)) = 2; + container.at(MeshCoordinate(1, 0)) = 3; + container.at(MeshCoordinate(1, 1)) = 4; + container.at(MeshCoordinate(1, 2)) = 5; + + std::vector coords; + std::vector values; + for (const auto& [coord, value] : container) { + coords.push_back(coord); + values.push_back(value); + } + EXPECT_THAT( + coords, + ElementsAre( + MeshCoordinate(0, 0), + MeshCoordinate(0, 1), + MeshCoordinate(0, 2), + MeshCoordinate(1, 0), + MeshCoordinate(1, 1), + MeshCoordinate(1, 2))); + EXPECT_THAT(values, ElementsAre(0, 1, 2, 3, 4, 5)); +} + +TEST(MeshContainerTest, ConstContainer) { + SimpleMeshShape shape(2, 3); + const MeshContainer container(shape, 0); + + std::vector coords; + std::vector values; + for (const auto& [coord, value] : container) { + coords.push_back(coord); + values.push_back(value); + } + EXPECT_THAT( + coords, + ElementsAre( + MeshCoordinate(0, 0), + MeshCoordinate(0, 1), + MeshCoordinate(0, 2), + MeshCoordinate(1, 0), + MeshCoordinate(1, 1), + MeshCoordinate(1, 2))); + EXPECT_THAT(values, ElementsAre(0, 0, 0, 0, 0, 0)); +} + +TEST(MeshContainerTest, MutateThroughProxy) { + SimpleMeshShape shape(2, 3); + 
MeshContainer container(shape, 0); + + // Proxy class provides access to the container value through the mutable reference. + int updated_value = 0; + for (auto& [_, value] : container) { + value = updated_value++; + } + + // `auto` makes a copy of the value, verify this loop is a no-op. + for (auto [_, value] : container) { + value = updated_value++; + } + + std::vector values; + for (const auto& [_, value] : container) { + values.push_back(value); + } + EXPECT_THAT(values, ElementsAre(0, 1, 2, 3, 4, 5)); +} + +TEST(MeshContainerTest, OutOfBounds) { + SimpleMeshShape shape(2, 3); + MeshContainer container(shape, 0); + + EXPECT_ANY_THROW(container.at(MeshCoordinate(2, 0))); + EXPECT_ANY_THROW(container.at(MeshCoordinate(0, 0, 0))); +} + +} // namespace +} // namespace tt::tt_metal::distributed diff --git a/tt_metal/api/tt-metalium/mesh_buffer.hpp b/tt_metal/api/tt-metalium/mesh_buffer.hpp index 0e029685b47..8656fc02e67 100644 --- a/tt_metal/api/tt-metalium/mesh_buffer.hpp +++ b/tt_metal/api/tt-metalium/mesh_buffer.hpp @@ -6,6 +6,7 @@ #include "buffer.hpp" #include "buffer_constants.hpp" +#include "mesh_coord.hpp" #include "mesh_device.hpp" #include "mesh_device_view.hpp" #include "shape2d.hpp" @@ -96,6 +97,7 @@ class MeshBuffer { const DeviceLocalBufferConfig& device_local_config() const { return device_local_config_; } std::shared_ptr get_device_buffer(const Coordinate& device_coord) const; + std::shared_ptr get_device_buffer(const MeshCoordinate& device_coord) const; uint32_t datum_size_bytes() const; Shape2D physical_shard_shape() const; std::pair replicated_dims() const; @@ -108,6 +110,7 @@ class MeshBuffer { DeviceAddr device_local_size, MeshDevice* mesh_device, std::shared_ptr backing_buffer) : + buffers_(SimpleMeshShape(mesh_device->shape()), nullptr), config_(config), device_local_config_(device_local_config), mesh_device_(mesh_device), @@ -122,6 +125,7 @@ class MeshBuffer { DeviceAddr address, DeviceAddr device_local_size, MeshDevice* mesh_device) : + buffers_(SimpleMeshShape(mesh_device->shape()), /*fill_value=*/nullptr), config_(config), device_local_config_(device_local_config), mesh_device_(mesh_device), @@ -136,8 +140,7 @@ class MeshBuffer { DeviceAddr address_ = 0; DeviceAddr device_local_size_ = 0; - // TODO: Consider optimizing with SmallVector. - std::vector>> buffers_; + MeshContainer> buffers_; // `MeshBufferState` specifies the state of the MeshBuffer. It can either be: // 1. Owned - a single device buffer is responsible for providing the address for the entire mesh buffer. diff --git a/tt_metal/api/tt-metalium/mesh_coord.hpp b/tt_metal/api/tt-metalium/mesh_coord.hpp new file mode 100644 index 00000000000..e346ce2ca83 --- /dev/null +++ b/tt_metal/api/tt-metalium/mesh_coord.hpp @@ -0,0 +1,370 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "shape_base.hpp" + +namespace tt::tt_metal::distributed { + +struct MeshShape; + +// TODO: #17477 - Rename to `MeshShape` when the legacy type is gone. +class SimpleMeshShape : public ShapeBase { +public: + using ShapeBase::ShapeBase; + using ShapeBase::operator[]; + + // Shorthands for constructing 1D, 2D and 3D shapes. + SimpleMeshShape(uint32_t x); + SimpleMeshShape(uint32_t x, uint32_t y); + SimpleMeshShape(uint32_t x, uint32_t y, uint32_t z); + + // Temporary constructor for transitioning to `SimpleMeshShape`. + SimpleMeshShape(const MeshShape& legacy_shape); + + // Returns the dimensionality of the mesh. 
+ size_t dims() const; + + // Returns the stride for the given dimension. + size_t get_stride(size_t dim) const; + + // Returns the total number of elements in the mesh. + size_t mesh_size() const; + + // Needed for reflect / fmt + static constexpr auto attribute_names = std::forward_as_tuple("value"); + auto attribute_values() const { return std::forward_as_tuple(value_); } + + friend bool operator==(const SimpleMeshShape& lhs, const SimpleMeshShape& rhs); + friend bool operator!=(const SimpleMeshShape& lhs, const SimpleMeshShape& rhs); + friend std::ostream& operator<<(std::ostream& os, const SimpleMeshShape& shape); + +private: + using ShapeBase::empty; + using ShapeBase::size; + + void compute_strides(); + tt::stl::SmallVector strides_; +}; + +class MeshCoordinate { +public: + // Shorthands for constructing 1D, 2D and 3D coordinates. + MeshCoordinate(uint32_t x); + MeshCoordinate(uint32_t x, uint32_t y); + MeshCoordinate(uint32_t x, uint32_t y, uint32_t z); + + // Constructs a generic N-dimensional coordinate. + explicit MeshCoordinate(tt::stl::Span coords); + + // Returns the dimensionality of the coordinate. + size_t dims() const; + + // Returns the coordinate values as a span. + tt::stl::Span coords() const; + + // Returns the coordinate value at the given index. + uint32_t operator[](size_t dim) const; + + // Needed for reflect / fmt + static constexpr auto attribute_names = std::forward_as_tuple("value"); + auto attribute_values() const { return std::forward_as_tuple(value_); } + + friend bool operator==(const MeshCoordinate& lhs, const MeshCoordinate& rhs); + friend bool operator!=(const MeshCoordinate& lhs, const MeshCoordinate& rhs); + friend std::ostream& operator<<(std::ostream& os, const MeshCoordinate& shape); + +private: + tt::stl::SmallVector value_; +}; + +// Converts a MeshCoordinate to a linear index. +// Throws if `coord` is out of bounds of `shape`. +size_t to_linear_index(const SimpleMeshShape& shape, const MeshCoordinate& coord); + +// Represents a range of MeshCoordinates. Requires that mesh coordinates have the same dimensionality. +class MeshCoordinateRange { +public: + // Constructs an inclusive range that iterates between `start` and `end`. + MeshCoordinateRange(const MeshCoordinate& start, const MeshCoordinate& end); + + // Constructs a range that iterates over all coordinates in the mesh. + MeshCoordinateRange(const SimpleMeshShape& shape); + + // Returns start and (inclusive) end coordinates of the range. + const MeshCoordinate& start_coord() const; + const MeshCoordinate& end_coord() const; + + class Iterator { + public: + Iterator& operator++(); + const MeshCoordinate& operator*() const; + bool operator==(const Iterator& other) const; + bool operator!=(const Iterator& other) const; + + private: + Iterator(const MeshCoordinateRange* range, const MeshCoordinate& current_coord, size_t linear_index); + friend class MeshCoordinateRange; + + const MeshCoordinateRange* range_ = nullptr; + + // For simplicity, rely on `linear_index_` for the iterator boundary check, and allow + // MeshCoordinate to wrap around the range end. 
+ MeshCoordinate current_coord_; + size_t linear_index_ = 0; + }; + + Iterator begin() const; + Iterator end() const; + + friend bool operator==(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs); + friend bool operator!=(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs); + +private: + MeshCoordinate start_; + MeshCoordinate end_; +}; + +namespace detail { + +// Proxy class that allows convenient structured binding to a pair of a coordinate and the value it points to. +// This supports iterator semantics similar to `std::map` / `std::unordered_map`. +template +class MeshCoordinateValueProxy { +public: + MeshCoordinateValueProxy(const MeshCoordinate* coord, T* value_ptr) : coord_(coord), value_ptr_(value_ptr) {} + + const MeshCoordinate& coord() const { return *coord_; } + T& value() { return *value_ptr_; } + const T& value() const { return *value_ptr_; } + + template + decltype(auto) get() & { + if constexpr (I == 0) { + return coord(); + } else if constexpr (I == 1) { + return value(); + } else { + static_assert(I < 2); + } + } + + template + decltype(auto) get() const& { + if constexpr (I == 0) { + return coord(); + } else if constexpr (I == 1) { + return value(); + } else { + static_assert(I < 2); + } + } + + // Force a copy via `auto`. + template + auto get() const&& { + return get(); + } + +private: + const MeshCoordinate* coord_ = nullptr; + T* value_ptr_ = nullptr; +}; + +} // namespace detail + +// Allows storing data in a mesh-shaped flat container, with convenient accessors and iterators. +// The iteration order and the storage memory layout is row-major. +template +class MeshContainer { +public: + MeshContainer(const SimpleMeshShape& shape, const T& fill_value); + + // Returns a shape of the container. + const SimpleMeshShape& shape() const; + + // Accessor methods. + T& at(const MeshCoordinate& coord); + const T& at(const MeshCoordinate& coord) const; + + // Allows to iterate over the container elements, returning a pair of (coordinate, value reference). + class Iterator { + public: + using ValueProxy = detail::MeshCoordinateValueProxy; + + Iterator& operator++(); + ValueProxy& operator*(); + bool operator==(const Iterator& other) const; + bool operator!=(const Iterator& other) const; + + private: + Iterator(MeshContainer* container, const MeshCoordinateRange::Iterator& coord_iter, size_t linear_index); + friend class MeshContainer; + + MeshContainer* container_ = nullptr; + MeshCoordinateRange::Iterator coord_iter_; + size_t linear_index_ = 0; + + // Provides mutable access to the container value along with the coordinate from the range iterator. + ValueProxy value_proxy_; + }; + + class ConstIterator { + public: + using ValueProxy = detail::MeshCoordinateValueProxy; + + ConstIterator& operator++(); + const ValueProxy& operator*() const; + bool operator==(const ConstIterator& other) const; + bool operator!=(const ConstIterator& other) const; + + private: + ConstIterator( + const MeshContainer* container, const MeshCoordinateRange::Iterator& coord_iter, size_t linear_index); + friend class MeshContainer; + + const MeshContainer* container_ = nullptr; + MeshCoordinateRange::Iterator coord_iter_; + size_t linear_index_ = 0; + + // Provides mutable access to the container value along with the coordinate from the range iterator. 
+ ValueProxy value_proxy_; + }; + + Iterator begin(); + Iterator end(); + ConstIterator begin() const; + ConstIterator end() const; + +private: + SimpleMeshShape shape_; + MeshCoordinateRange coord_range_; + std::vector values_; +}; + +template +MeshContainer::MeshContainer(const SimpleMeshShape& shape, const T& fill_value) : + shape_(shape), coord_range_(shape), values_(shape.mesh_size(), fill_value) {} + +template +const SimpleMeshShape& MeshContainer::shape() const { + return shape_; +} + +template +T& MeshContainer::at(const MeshCoordinate& coord) { + return values_.at(to_linear_index(shape_, coord)); +} + +template +const T& MeshContainer::at(const MeshCoordinate& coord) const { + return values_.at(to_linear_index(shape_, coord)); +} + +template +MeshContainer::Iterator::Iterator( + MeshContainer* container, const MeshCoordinateRange::Iterator& coord_iter, size_t linear_index) : + container_(container), + coord_iter_(coord_iter), + linear_index_(linear_index), + value_proxy_(&(*coord_iter_), &container_->values_[linear_index_]) {} + +template +typename MeshContainer::Iterator& MeshContainer::Iterator::operator++() { + ++linear_index_; + ++coord_iter_; + value_proxy_ = ValueProxy(&(*coord_iter_), &container_->values_[linear_index_]); + return *this; +} + +template +typename MeshContainer::Iterator::ValueProxy& MeshContainer::Iterator::operator*() { + return value_proxy_; +} + +template +MeshContainer::ConstIterator::ConstIterator( + const MeshContainer* container, const MeshCoordinateRange::Iterator& coord_iter, size_t linear_index) : + container_(container), + coord_iter_(coord_iter), + linear_index_(linear_index), + value_proxy_(&(*coord_iter_), &container_->values_[linear_index_]) {} + +template +typename MeshContainer::ConstIterator& MeshContainer::ConstIterator::operator++() { + ++linear_index_; + ++coord_iter_; + value_proxy_ = ValueProxy(&(*coord_iter_), &container_->values_[linear_index_]); + return *this; +} + +template +const typename MeshContainer::ConstIterator::ValueProxy& MeshContainer::ConstIterator::operator*() const { + return value_proxy_; +} + +template +bool MeshContainer::Iterator::operator==(const Iterator& other) const { + return container_ == other.container_ && coord_iter_ == other.coord_iter_ && linear_index_ == other.linear_index_; +} + +template +bool MeshContainer::Iterator::operator!=(const Iterator& other) const { + return !(*this == other); +} + +template +bool MeshContainer::ConstIterator::operator==(const ConstIterator& other) const { + return container_ == other.container_ && coord_iter_ == other.coord_iter_ && linear_index_ == other.linear_index_; +} + +template +bool MeshContainer::ConstIterator::operator!=(const ConstIterator& other) const { + return !(*this == other); +} + +template +typename MeshContainer::Iterator MeshContainer::begin() { + return Iterator(this, coord_range_.begin(), /* linear_index = */ 0); +} + +template +typename MeshContainer::Iterator MeshContainer::end() { + return Iterator(this, coord_range_.end(), shape_.mesh_size()); +} + +template +typename MeshContainer::ConstIterator MeshContainer::begin() const { + return ConstIterator(this, coord_range_.begin(), /* linear_index = */ 0); +} + +template +typename MeshContainer::ConstIterator MeshContainer::end() const { + return ConstIterator(this, coord_range_.end(), shape_.mesh_size()); +} + +} // namespace tt::tt_metal::distributed + +namespace std { + +template +struct tuple_size> : std::integral_constant { +}; + +template +struct tuple_element<0, 
tt::tt_metal::distributed::detail::MeshCoordinateValueProxy> { + using type = const tt::tt_metal::distributed::MeshCoordinate; +}; + +template +struct tuple_element<1, tt::tt_metal::distributed::detail::MeshCoordinateValueProxy> { + using type = T; +}; + +} // namespace std diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index 91638a57cb6..979e603a6cd 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -12,6 +12,7 @@ #include "device.hpp" #include "mesh_config.hpp" +#include "mesh_coord.hpp" #include "mesh_device_view.hpp" #include "sub_device_types.hpp" #include "span.hpp" @@ -204,6 +205,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this -#include #include "small_vector.hpp" +#include "span.hpp" namespace tt::tt_metal { @@ -24,7 +24,7 @@ class ShapeBase { explicit ShapeBase(const std::array& arr) : value_(arr.begin(), arr.end()) { init(); } - explicit ShapeBase(std::span span) : value_(span.begin(), span.end()) { init(); } + explicit ShapeBase(tt::stl::Span span) : value_(span.begin(), span.end()) { init(); } template bool operator==(const std::array& other) const { @@ -42,7 +42,7 @@ class ShapeBase { Container::const_iterator cbegin() const; Container::const_iterator cend() const; - std::span view() const; + tt::stl::Span view() const; bool empty() const; diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt index 28f27de3edf..7d43d25d5b0 100644 --- a/tt_metal/common/CMakeLists.txt +++ b/tt_metal/common/CMakeLists.txt @@ -1,6 +1,7 @@ set(COMMON_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/core_assignment.cpp ${CMAKE_CURRENT_SOURCE_DIR}/core_coord.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mesh_coord.cpp ${CMAKE_CURRENT_SOURCE_DIR}/metal_soc_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape2d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape_base.cpp diff --git a/tt_metal/common/mesh_coord.cpp b/tt_metal/common/mesh_coord.cpp new file mode 100644 index 00000000000..9a98a0ce801 --- /dev/null +++ b/tt_metal/common/mesh_coord.cpp @@ -0,0 +1,161 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include +#include +#include +#include + +namespace tt::tt_metal::distributed { +namespace { + +// Returns a zero coordinate of dimensionality `dims`. +MeshCoordinate zero_coordinate(size_t dims) { return MeshCoordinate(tt::stl::SmallVector(dims, 0)); } + +// Returns the last valid coordinate for the provided `shape`. 
+MeshCoordinate shape_back(const SimpleMeshShape& shape) { + tt::stl::SmallVector coords; + for (int i = 0; i < shape.dims(); i++) { + coords.push_back(shape[i] - 1); + } + return MeshCoordinate(coords); +} + +} // namespace + +SimpleMeshShape::SimpleMeshShape(uint32_t x) : ShapeBase({x}) { compute_strides(); } +SimpleMeshShape::SimpleMeshShape(uint32_t x, uint32_t y) : ShapeBase({x, y}) { compute_strides(); } +SimpleMeshShape::SimpleMeshShape(uint32_t x, uint32_t y, uint32_t z) : ShapeBase({x, y, z}) { compute_strides(); } + +SimpleMeshShape::SimpleMeshShape(const MeshShape& legacy_shape) : + SimpleMeshShape(legacy_shape.num_rows, legacy_shape.num_cols) {} + +void SimpleMeshShape::compute_strides() { + size_t stride = 1; + strides_.resize(dims()); + for (int dim = dims() - 1; dim >= 0; --dim) { + strides_[dim] = stride; + stride *= (*this)[dim]; + } +} + +size_t SimpleMeshShape::get_stride(size_t dim) const { return strides_[dim]; } + +size_t SimpleMeshShape::dims() const { return size(); } +size_t SimpleMeshShape::mesh_size() const { + return empty() ? 0 : std::accumulate(value_.begin(), value_.end(), 1, std::multiplies()); +} + +bool operator==(const SimpleMeshShape& lhs, const SimpleMeshShape& rhs) = default; +bool operator!=(const SimpleMeshShape& lhs, const SimpleMeshShape& rhs) = default; + +std::ostream& operator<<(std::ostream& os, const SimpleMeshShape& shape) { + os << "SimpleMeshShape(["; + for (size_t i = 0; i < shape.dims(); ++i) { + if (i > 0) { + os << ", "; + } + os << shape[i]; + } + os << "])"; + return os; +} + +MeshCoordinate::MeshCoordinate(uint32_t coord) : value_({coord}) {} +MeshCoordinate::MeshCoordinate(uint32_t x, uint32_t y) : value_({x, y}) {} +MeshCoordinate::MeshCoordinate(uint32_t x, uint32_t y, uint32_t z) : value_({x, y, z}) {} + +MeshCoordinate::MeshCoordinate(tt::stl::Span coords) : value_(coords.begin(), coords.end()) {} + +size_t MeshCoordinate::dims() const { return value_.size(); } +tt::stl::Span MeshCoordinate::coords() const { return value_; } +uint32_t MeshCoordinate::operator[](size_t dim) const { return value_[dim]; } + +bool operator==(const MeshCoordinate& lhs, const MeshCoordinate& rhs) { + return lhs.dims() == rhs.dims() && std::equal(lhs.coords().begin(), lhs.coords().end(), rhs.coords().begin()); +} +bool operator!=(const MeshCoordinate& lhs, const MeshCoordinate& rhs) { return !(lhs == rhs); } + +std::ostream& operator<<(std::ostream& os, const MeshCoordinate& coord) { + os << "MeshCoordinate(" << coord.dims() << ", ["; + for (size_t dim : coord.coords()) { + os << dim << ", "; + } + os << "])"; + return os; +} + +MeshCoordinateRange::MeshCoordinateRange(const MeshCoordinate& start, const MeshCoordinate& end) : + start_(start), end_(end) { + TT_FATAL( + start.dims() == end.dims(), + "Start and end dimensions of a coordinate range do not match: {} != {}", + start.dims(), + end.dims()); + for (size_t i = 0; i < start.dims(); ++i) { + TT_FATAL(start[i] <= end[i], "Start coordinate is greater than end coordinate: {} > {}", start, end); + } +} + +MeshCoordinateRange::MeshCoordinateRange(const SimpleMeshShape& shape) : + MeshCoordinateRange(zero_coordinate(shape.dims()), shape_back(shape)) {} + +const MeshCoordinate& MeshCoordinateRange::start_coord() const { return start_; } +const MeshCoordinate& MeshCoordinateRange::end_coord() const { return end_; } + +MeshCoordinateRange::Iterator::Iterator( + const MeshCoordinateRange* range, const MeshCoordinate& current, size_t linear_index) : + range_(range), current_coord_(current), 
linear_index_(linear_index) {} + +MeshCoordinateRange::Iterator& MeshCoordinateRange::Iterator::operator++() { + ++linear_index_; + + tt::stl::SmallVector new_coords(current_coord_.coords().begin(), current_coord_.coords().end()); + for (int i = new_coords.size() - 1; i >= 0; --i) { + auto& dimension_value = new_coords[i]; + if (++dimension_value > range_->end_coord()[i]) { + dimension_value = range_->start_coord()[i]; + } else { + break; + } + } + current_coord_ = MeshCoordinate(new_coords); + return *this; +} +const MeshCoordinate& MeshCoordinateRange::Iterator::operator*() const { return current_coord_; } +bool MeshCoordinateRange::Iterator::operator==(const Iterator& other) const { + return range_ == other.range_ && linear_index_ == other.linear_index_; +} +bool MeshCoordinateRange::Iterator::operator!=(const Iterator& other) const { return !(*this == other); } + +MeshCoordinateRange::Iterator MeshCoordinateRange::begin() const { return Iterator(this, start_, /*linear_index=*/0); } +MeshCoordinateRange::Iterator MeshCoordinateRange::end() const { + size_t range_size = 1; + for (size_t i = 0; i < start_.dims(); ++i) { + range_size *= end_[i] - start_[i] + 1; + } + // Set `start_` coordinate but `range_size` linear index as the wrap around condition. + return Iterator(this, start_, range_size); +} + +size_t to_linear_index(const SimpleMeshShape& shape, const MeshCoordinate& coord) { + TT_FATAL( + shape.dims() == coord.dims(), + "Shape and coordinate dimensions do not match: {} != {}", + shape.dims(), + coord.dims()); + + size_t linear_index = 0; + for (size_t dim = 0; dim < coord.dims(); ++dim) { + TT_FATAL(coord[dim] < shape[dim], "Coordinate {} is out of bounds for shape {}", coord, shape); + linear_index += coord[dim] * shape.get_stride(dim); + } + return linear_index; +} + +} // namespace tt::tt_metal::distributed diff --git a/tt_metal/common/shape_base.cpp b/tt_metal/common/shape_base.cpp index 57e69bb49e6..33acd941d22 100644 --- a/tt_metal/common/shape_base.cpp +++ b/tt_metal/common/shape_base.cpp @@ -4,7 +4,9 @@ #include "assert.hpp" #include "shape_base.hpp" +#include #include +#include #include "fmt/color.h" namespace tt::tt_metal { @@ -46,7 +48,14 @@ bool ShapeBase::empty() const { return original_size_ == 0; } size_t ShapeBase::size() const { return original_size_; } -std::span ShapeBase::view() const { return std::span(cbegin(), cend()); } +tt::stl::Span ShapeBase::view() const { + const auto begin = cbegin(); + const auto end = cend(); + // `Span` constructor requires a contiguous range of data. 
+ static_assert( + std::is_base_of_v::iterator_category>); + return tt::stl::Span(&*begin, std::distance(begin, end)); +} bool ShapeBase::operator==(const ShapeBase& other) const = default; diff --git a/tt_metal/distributed/mesh_buffer.cpp b/tt_metal/distributed/mesh_buffer.cpp index a0bf7b76e86..13d1fc5e6cc 100644 --- a/tt_metal/distributed/mesh_buffer.cpp +++ b/tt_metal/distributed/mesh_buffer.cpp @@ -4,6 +4,8 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include +#include #include #include @@ -110,12 +112,9 @@ std::shared_ptr MeshBuffer::create( } void MeshBuffer::initialize_device_buffers() { - buffers_ = std::vector>>( - mesh_device_->num_rows(), std::vector>(mesh_device_->num_cols())); - - auto init_device_buffer_at_address = [this](const Coordinate& coord) { + auto init_device_buffer_at_address = [this](const MeshCoordinate& coord) { std::shared_ptr buffer = Buffer::create( - mesh_device_->get_device(coord.row, coord.col), + mesh_device_->get_device(coord), address_, device_local_size_, device_local_config_.page_size, @@ -126,10 +125,8 @@ void MeshBuffer::initialize_device_buffers() { return buffer; }; - for (int row = 0; row < mesh_device_->num_rows(); row++) { - for (int col = 0; col < mesh_device_->num_cols(); col++) { - buffers_[row][col] = init_device_buffer_at_address(Coordinate{row, col}); - } + for (auto& [coord, device_buffer] : buffers_) { + device_buffer = init_device_buffer_at_address(coord); } } @@ -138,14 +135,11 @@ bool MeshBuffer::is_allocated() const { return not std::holds_alternative MeshBuffer::get_device_buffer(const Coordinate& device_coord) const { - TT_FATAL( - device_coord.row < mesh_device_->num_rows() and device_coord.col < mesh_device_->num_cols(), - "Logical coordinates must be within the bounds of the mesh: {}, {}, mesh shape: {}, {}", - device_coord.row, - device_coord.col, - mesh_device_->num_rows(), - mesh_device_->num_cols()); - return buffers_[device_coord.row][device_coord.col]; + return get_device_buffer(MeshCoordinate(device_coord.row, device_coord.col)); +} + +std::shared_ptr MeshBuffer::get_device_buffer(const MeshCoordinate& device_coord) const { + return buffers_.at(device_coord); } DeviceAddr MeshBuffer::size() const { diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 04edd94373b..603ce95212e 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -211,7 +211,11 @@ std::vector MeshDevice::get_devices() const { return view_->get_device // TODO: Remove this function once we have a proper view interface IDevice* MeshDevice::get_device(size_t row_idx, size_t col_idx) const { - return this->get_device_index(row_idx * num_cols() + col_idx); + return get_device(MeshCoordinate{row_idx, col_idx}); +} + +IDevice* MeshDevice::get_device(const MeshCoordinate& coord) const { + return this->get_device_index(to_linear_index(SimpleMeshShape(mesh_shape_), coord)); } MeshCommandQueue& MeshDevice::mesh_command_queue(std::size_t cq_id) const { diff --git a/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp b/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp index d54d6a1c6e7..a1b17cec8d5 100644 --- a/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp +++ b/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp @@ -26,7 +26,7 @@ int main(int argc, char** argv) { // We will create a distributed buffer with 8 
shards of {32, 32} and distribute it across the devices in the mesh. auto shard_shape = Shape2D{32, 32}; auto distributed_buffer_shape = Shape2D{32 * mesh_device->num_rows(), 32 * mesh_device->num_cols()}; - uint32_t tile_size_bytes = detail::TileSize(tt::DataFormat::UInt32); + uint32_t tile_size_bytes = tt::tt_metal::detail::TileSize(tt::DataFormat::UInt32); uint32_t distributed_buffer_size_bytes = 64 * 128 * tile_size_bytes; auto local_buffer_config = DeviceLocalBufferConfig{ diff --git a/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp b/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp index 73bf18ee0be..9dbf0bbbd61 100644 --- a/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp +++ b/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp @@ -92,7 +92,7 @@ int main(int argc, char** argv) { auto distributed_buffer_shape = Shape2D{shard_shape.height() * mesh_device->num_rows(), shard_shape.width() * mesh_device->num_cols()}; auto num_tiles = 1; - auto tile_size_bytes = detail::TileSize(tt::DataFormat::Float16_b); + auto tile_size_bytes = tt::tt_metal::detail::TileSize(tt::DataFormat::Float16_b); auto distributed_buffer_size_bytes = mesh_device->num_rows() * mesh_device->num_cols() * tile_size_bytes; // Configure device-local buffer settings diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp index b5232f2c464..9e4382f3d73 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp @@ -14,10 +14,7 @@ namespace ttnn::operations::data_movement { namespace { -template -bool eq_spans(const ArrayType& a, const ArrayType& b) { - return std::equal(a.begin(), a.end(), b.begin(), b.end()); -} +bool eq_spans(const auto a, const auto b) { return std::equal(a.begin(), a.end(), b.begin(), b.end()); } ttnn::Shape update_original_shape(const ttnn::Shape& padded_shape, const ttnn::Shape& input_shape) { ttnn::SmallVector updated_shape; From 7ee3c897d2d328ecc88fe3c5135f03fae96cf25b Mon Sep 17 00:00:00 2001 From: asaigal Date: Fri, 31 Jan 2025 21:27:50 +0000 Subject: [PATCH 117/316] Add TT-Mesh tests to N300 post commit --- .github/workflows/cpp-post-commit.yaml | 4 +- .../tt_metal/distributed/test_distributed.cpp | 2 +- .../distributed/test_mesh_allocator.cpp | 2 +- .../tt_metal/distributed/test_mesh_buffer.cpp | 163 +++++++++--------- .../tt_metal/distributed/test_mesh_events.cpp | 19 +- .../distributed/test_mesh_sub_device.cpp | 8 +- .../distributed/test_mesh_workload.cpp | 47 ++--- .../tt_metal/common/multi_device_fixture.hpp | 125 ++++++++++---- 8 files changed, 223 insertions(+), 147 deletions(-) diff --git a/.github/workflows/cpp-post-commit.yaml b/.github/workflows/cpp-post-commit.yaml index 93744a0bc7b..ed0c1f165e7 100644 --- a/.github/workflows/cpp-post-commit.yaml +++ b/.github/workflows/cpp-post-commit.yaml @@ -62,11 +62,9 @@ jobs: {name: eth, cmd: "./build/test/tt_metal/unit_tests_eth_${{ inputs.arch }}"}, {name: llk, cmd: "./build/test/tt_metal/unit_tests_llk"}, {name: stl, cmd: "./build/test/tt_metal/unit_tests_stl"}, - {name: distributed, cmd: "./build/test/tt_metal/distributed/distributed_unit_tests_${{ inputs.arch }} --gtest_filter=MeshDeviceSuite.*"}, - + {name: distributed, cmd: "./build/test/tt_metal/distributed/distributed_unit_tests_${{ inputs.arch }}"}, {name: lightmetal, cmd: 
"./build/test/tt_metal/unit_tests_lightmetal"}, {name: dispatch multicmd queue, cmd: "TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch_${{ inputs.arch }} --gtest_filter=MultiCommandQueue*Fixture.*"}, - {name: ttnn cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn}, {name: ttnn ccl cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_ccl}, {name: ttnn tensor cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_tensor}, diff --git a/tests/tt_metal/distributed/test_distributed.cpp b/tests/tt_metal/distributed/test_distributed.cpp index 218967b90df..bf8877879e3 100644 --- a/tests/tt_metal/distributed/test_distributed.cpp +++ b/tests/tt_metal/distributed/test_distributed.cpp @@ -9,7 +9,7 @@ namespace tt::tt_metal::distributed::test { namespace { -TEST_F(T3000MultiDeviceFixture, SimpleMeshDeviceTest) { +TEST_F(T3000MeshDeviceFixture, SimpleMeshDeviceTest) { EXPECT_EQ(mesh_device_->num_devices(), 8); EXPECT_EQ(mesh_device_->num_rows(), 2); EXPECT_EQ(mesh_device_->num_cols(), 4); diff --git a/tests/tt_metal/distributed/test_mesh_allocator.cpp b/tests/tt_metal/distributed/test_mesh_allocator.cpp index 903b3d6444c..89bda02642f 100644 --- a/tests/tt_metal/distributed/test_mesh_allocator.cpp +++ b/tests/tt_metal/distributed/test_mesh_allocator.cpp @@ -10,7 +10,7 @@ namespace tt::tt_metal::distributed::test { -using MeshAllocatorTest = T3000MultiDeviceFixture; +using MeshAllocatorTest = T3000MeshDeviceFixture; TEST_F(MeshAllocatorTest, BasicAllocationSanityCheck) { const size_t allocation_size = 1024 * 8; // 1KB diff --git a/tests/tt_metal/distributed/test_mesh_buffer.cpp b/tests/tt_metal/distributed/test_mesh_buffer.cpp index 0424f9250b4..5fdc6369a24 100644 --- a/tests/tt_metal/distributed/test_mesh_buffer.cpp +++ b/tests/tt_metal/distributed/test_mesh_buffer.cpp @@ -14,7 +14,8 @@ namespace tt::tt_metal::distributed::test { namespace { -using MeshBufferTest = T3000MultiDeviceFixture; +using MeshBufferTestT3000 = T3000MeshDeviceFixture; +using MeshBufferTestSuite = GenericMeshDeviceFixture; struct DeviceLocalShardedBufferTestConfig { Shape2D num_pages_per_core; @@ -47,36 +48,8 @@ struct DeviceLocalShardedBufferTestConfig { } }; -TEST_F(MeshBufferTest, ConfigValidation) { - const DeviceLocalBufferConfig device_local_config{ - .page_size = 1024, - .buffer_type = BufferType::DRAM, - .buffer_layout = TensorMemoryLayout::INTERLEAVED, - .bottom_up = false}; - - ASSERT_EQ(mesh_device_->num_rows(), 2); - ASSERT_EQ(mesh_device_->num_cols(), 4); - - // Unaligned shard shape - EXPECT_ANY_THROW(MeshBuffer::create( - ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {32, 120}}, - device_local_config, - mesh_device_.get())); - - // Number of shards exceeds the number of devices - EXPECT_ANY_THROW(MeshBuffer::create( - ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {16, 16}}, - device_local_config, - mesh_device_.get())); - - // 32x32 shards distributed across 2x4 mesh, resulting in 64x128 global shape. 
- auto buffer = MeshBuffer::create( - ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {32, 32}}, - device_local_config, - mesh_device_.get()); -} - -TEST_F(MeshBufferTest, ShardedBufferInitialization) { +// MeshBuffer tests on T3000 +TEST_F(MeshBufferTestT3000, ShardedBufferInitialization) { const DeviceLocalBufferConfig device_local_config{ .page_size = 1024, .buffer_type = BufferType::DRAM, @@ -93,7 +66,7 @@ TEST_F(MeshBufferTest, ShardedBufferInitialization) { EXPECT_EQ(sharded_buffer->device_local_size(), 2 << 10); } -TEST_F(MeshBufferTest, ReplicatedBufferInitialization) { +TEST_F(MeshBufferTestT3000, ReplicatedBufferInitialization) { const DeviceLocalBufferConfig device_local_config{ .page_size = 1024, .buffer_type = BufferType::DRAM, @@ -108,7 +81,7 @@ TEST_F(MeshBufferTest, ReplicatedBufferInitialization) { EXPECT_EQ(replicated_buffer->device_local_size(), 16 << 10); } -TEST_F(MeshBufferTest, Deallocation) { +TEST_F(MeshBufferTestT3000, Deallocation) { // Verify that a buffer is deallocated on the MeshDevice when it goes // out of scope on host. Create a buffer with a certain config in limited // scope. Record its address. Create another buffer with the same config @@ -149,7 +122,7 @@ TEST_F(MeshBufferTest, Deallocation) { EXPECT_FALSE(buffer_view->is_allocated()); } -TEST_F(MeshBufferTest, GetDeviceBuffer) { +TEST_F(MeshBufferTestT3000, GetDeviceBuffer) { const DeviceLocalBufferConfig device_local_config{ .page_size = 1024, .buffer_type = BufferType::DRAM, @@ -165,50 +138,8 @@ TEST_F(MeshBufferTest, GetDeviceBuffer) { EXPECT_NO_THROW(replicated_buffer->get_device_buffer(Coordinate{1, 3})); } -TEST_F(MeshBufferTest, InterleavedShardsReadWrite) { - constexpr uint32_t NUM_ITERS = 100; - uint32_t seed = tt::parse_env("TT_METAL_SEED", 0); - uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); - - for (auto buffer_type : {BufferType::L1, BufferType::DRAM}) { - DeviceLocalBufferConfig per_device_buffer_config{ - .page_size = single_tile_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::INTERLEAVED, - .bottom_up = false}; - - std::uniform_int_distribution gen_num_tiles(1, 1024); - std::mt19937 rng(seed); - for (int i = 0; i < NUM_ITERS; i++) { - uint32_t num_random_tiles = gen_num_tiles(rng); - ReplicatedBufferConfig global_buffer_config = { - .size = num_random_tiles * single_tile_size, - }; - - std::shared_ptr buf = - MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get()); - - std::vector src_vec(num_random_tiles * single_tile_size / sizeof(uint32_t), 0); - std::iota(src_vec.begin(), src_vec.end(), i); - for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { - for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { - WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, Coordinate(logical_y, logical_x)); - } - } - - for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { - for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { - std::vector dst_vec = {}; - ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, Coordinate(logical_y, logical_x)); - EXPECT_EQ(dst_vec, src_vec); - } - } - } - } -} - class DeviceLocalMeshBufferShardingTest - : public MeshBufferTest, + : public MeshBufferTestT3000, public testing::WithParamInterface< std::tuple, std::array, TensorMemoryLayout>> {}; @@ -274,7 +205,7 @@ INSTANTIATE_TEST_SUITE_P( 
::testing::Values( TensorMemoryLayout::HEIGHT_SHARDED, TensorMemoryLayout::WIDTH_SHARDED, TensorMemoryLayout::BLOCK_SHARDED))); -TEST_F(MeshBufferTest, SweepShardAndConcat) { +TEST_F(MeshBufferTestT3000, SweepShardAndConcat) { uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); DeviceLocalBufferConfig per_device_buffer_config{ @@ -312,7 +243,79 @@ TEST_F(MeshBufferTest, SweepShardAndConcat) { } } -TEST_F(MeshBufferTest, RowMajorShardingAndReplication) { +// MeshBuffer tests on N300 and T3000 +TEST_F(MeshBufferTestSuite, ConfigValidation) { + const DeviceLocalBufferConfig device_local_config{ + .page_size = 1024, + .buffer_type = BufferType::DRAM, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = false}; + + // Unaligned shard shape + EXPECT_ANY_THROW(MeshBuffer::create( + ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {32, 120}}, + device_local_config, + mesh_device_.get())); + + // Number of shards exceeds the number of devices + EXPECT_ANY_THROW(MeshBuffer::create( + ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {16, 16}}, + device_local_config, + mesh_device_.get())); + + // Buffer with a global shape of 64x128 distributed across a 2x4 or 2x1 Mesh. + auto buffer = MeshBuffer::create( + ShardedBufferConfig{ + .global_size = 16 << 10, + .global_buffer_shape = {64, 128}, + .shard_shape = {64 / mesh_device_->num_rows(), 128 / mesh_device_->num_cols()}}, + device_local_config, + mesh_device_.get()); +} + +TEST_F(MeshBufferTestSuite, InterleavedShardsReadWrite) { + constexpr uint32_t NUM_ITERS = 100; + uint32_t seed = tt::parse_env("TT_METAL_SEED", 0); + uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); + + for (auto buffer_type : {BufferType::L1, BufferType::DRAM}) { + DeviceLocalBufferConfig per_device_buffer_config{ + .page_size = single_tile_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = false}; + + std::uniform_int_distribution gen_num_tiles(1, 1024); + std::mt19937 rng(seed); + for (int i = 0; i < NUM_ITERS; i++) { + uint32_t num_random_tiles = gen_num_tiles(rng); + ReplicatedBufferConfig global_buffer_config = { + .size = num_random_tiles * single_tile_size, + }; + + std::shared_ptr buf = + MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get()); + + std::vector src_vec(num_random_tiles * single_tile_size / sizeof(uint32_t), 0); + std::iota(src_vec.begin(), src_vec.end(), i); + for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { + for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { + WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, Coordinate(logical_y, logical_x)); + } + } + + for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { + for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { + std::vector dst_vec = {}; + ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, Coordinate(logical_y, logical_x)); + EXPECT_EQ(dst_vec, src_vec); + } + } + } + } +} + +TEST_F(MeshBufferTestSuite, RowMajorShardingAndReplication) { uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); DeviceLocalBufferConfig per_device_buffer_config{ @@ -366,7 +369,7 @@ TEST_F(MeshBufferTest, RowMajorShardingAndReplication) { } } -TEST_F(MeshBufferTest, ColMajorShardingAndReplication) 
{ +TEST_F(MeshBufferTestSuite, ColMajorShardingAndReplication) { uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); DeviceLocalBufferConfig per_device_buffer_config{ diff --git a/tests/tt_metal/distributed/test_mesh_events.cpp b/tests/tt_metal/distributed/test_mesh_events.cpp index c19d3632800..336c8e8ccf1 100644 --- a/tests/tt_metal/distributed/test_mesh_events.cpp +++ b/tests/tt_metal/distributed/test_mesh_events.cpp @@ -14,9 +14,10 @@ namespace tt::tt_metal::distributed::test { namespace { -using MeshEventsTest = T3000MultiCQMultiDeviceFixture; +using MeshEventsTestT3000 = T3000MultiCQMeshDeviceFixture; +using MeshEventsTestSuite = GenericMultiCQMeshDeviceFixture; -TEST_F(MeshEventsTest, ReplicatedAsyncIO) { +TEST_F(MeshEventsTestSuite, ReplicatedAsyncIO) { uint32_t NUM_TILES = 1000; uint32_t num_iterations = 20; int32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); @@ -61,7 +62,7 @@ TEST_F(MeshEventsTest, ReplicatedAsyncIO) { } } -TEST_F(MeshEventsTest, ShardedAsyncIO) { +TEST_F(MeshEventsTestT3000, ShardedAsyncIO) { uint32_t num_iterations = 20; uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); @@ -108,7 +109,7 @@ TEST_F(MeshEventsTest, ShardedAsyncIO) { } } -TEST_F(MeshEventsTest, AsyncWorkloadAndIO) { +TEST_F(MeshEventsTestSuite, AsyncWorkloadAndIO) { uint32_t num_iters = 5; std::vector> src0_bufs = {}; std::vector> src1_bufs = {}; @@ -119,8 +120,8 @@ TEST_F(MeshEventsTest, AsyncWorkloadAndIO) { auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( mesh_device_, src0_bufs, src1_bufs, output_bufs); auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1}); + LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); + LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1}); AddProgramToMeshWorkload(mesh_workload, *programs[0], devices_0); AddProgramToMeshWorkload(mesh_workload, *programs[1], devices_1); @@ -189,7 +190,7 @@ TEST_F(MeshEventsTest, AsyncWorkloadAndIO) { } } -TEST_F(MeshEventsTest, CustomDeviceRanges) { +TEST_F(MeshEventsTestSuite, CustomDeviceRanges) { uint32_t NUM_TILES = 1000; uint32_t num_iterations = 20; int32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); @@ -209,8 +210,8 @@ TEST_F(MeshEventsTest, CustomDeviceRanges) { for (std::size_t i = 0; i < num_iterations; i++) { std::vector src_vec(NUM_TILES * single_tile_size / sizeof(uint32_t), i); std::iota(src_vec.begin(), src_vec.end(), i); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1}); + LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); + LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1}); std::vector> readback_vecs = {}; std::shared_ptr event_0 = std::make_shared(); diff --git a/tests/tt_metal/distributed/test_mesh_sub_device.cpp b/tests/tt_metal/distributed/test_mesh_sub_device.cpp index 7a21597dd59..d16bfedc48a 100644 --- a/tests/tt_metal/distributed/test_mesh_sub_device.cpp +++ b/tests/tt_metal/distributed/test_mesh_sub_device.cpp @@ -12,9 +12,9 @@ namespace tt::tt_metal::distributed::test { namespace { -using MeshSubDeviceTest = T3000MultiDeviceFixture; +using MeshSubDeviceTestSuite = 
GenericMeshDeviceFixture; -TEST_F(MeshSubDeviceTest, SyncWorkloadsOnSubDevice) { +TEST_F(MeshSubDeviceTestSuite, SyncWorkloadsOnSubDevice) { SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); @@ -43,7 +43,7 @@ TEST_F(MeshSubDeviceTest, SyncWorkloadsOnSubDevice) { Finish(mesh_device_->mesh_command_queue()); } -TEST_F(MeshSubDeviceTest, DataCopyOnSubDevices) { +TEST_F(MeshSubDeviceTestSuite, DataCopyOnSubDevices) { SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {0, 0}))}); SubDevice sub_device_2(std::array{CoreRangeSet(CoreRange({1, 1}, {1, 1}))}); SubDevice sub_device_3(std::array{CoreRangeSet(CoreRange({2, 2}, {2, 2}))}); @@ -136,7 +136,7 @@ TEST_F(MeshSubDeviceTest, DataCopyOnSubDevices) { } } -TEST_F(MeshSubDeviceTest, SubDeviceSwitching) { +TEST_F(MeshSubDeviceTestSuite, SubDeviceSwitching) { // Sub Devices for config 0 SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); diff --git a/tests/tt_metal/distributed/test_mesh_workload.cpp b/tests/tt_metal/distributed/test_mesh_workload.cpp index dcf3f9a4158..ef19ed2395c 100644 --- a/tests/tt_metal/distributed/test_mesh_workload.cpp +++ b/tests/tt_metal/distributed/test_mesh_workload.cpp @@ -389,9 +389,10 @@ void validate_sems( } } -using MeshWorkloadTest = T3000MultiDeviceFixture; +using MeshWorkloadTestT3000 = T3000MeshDeviceFixture; +using MeshWorkloadTestSuite = GenericMeshDeviceFixture; -TEST_F(MeshWorkloadTest, MeshWorkloadOnActiveEthAsserts) { +TEST_F(MeshWorkloadTestT3000, MeshWorkloadOnActiveEthAsserts) { // A MeshWorkload cannot be run on ethernet core - Runtime should assert if the // user tries this. Verify this functionality here. 
std::shared_ptr workload = std::make_shared(); @@ -403,14 +404,14 @@ TEST_F(MeshWorkloadTest, MeshWorkloadOnActiveEthAsserts) { IDevice* device = mesh_device_->get_device(logical_y, logical_x); auto programs = create_random_programs( 1, mesh_device_->compute_with_storage_grid_size(), seed, device->get_active_ethernet_cores(true)); - LogicalDeviceRange devices = {{logical_x, logical_y}, {logical_x + 1, logical_y + 1}}; + LogicalDeviceRange devices = {{logical_x, logical_y}, {logical_x, logical_y}}; AddProgramToMeshWorkload(*workload, *programs[0], devices); } } EXPECT_THROW(EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *workload, false), std::exception); } -TEST_F(MeshWorkloadTest, SimultaneousMeshWorkloads) { +TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) { uint32_t num_programs = 100; uint32_t num_heterogeneous_programs = 64; uint32_t num_iterations = 1000; @@ -490,7 +491,8 @@ TEST_F(MeshWorkloadTest, SimultaneousMeshWorkloads) { Finish(mesh_device_->mesh_command_queue()); } -TEST_F(MeshWorkloadTest, RandomizedMeshWorkload) { +// MeshWorkload tests on N300 and T3000 +TEST_F(MeshWorkloadTestSuite, RandomizedMeshWorkload) { uint32_t num_programs = 60; uint32_t num_iterations = 1500; auto random_seed = 10; @@ -500,8 +502,8 @@ TEST_F(MeshWorkloadTest, RandomizedMeshWorkload) { log_info("Create {} MeshWorkloads", num_programs); auto programs = create_random_programs(num_programs, mesh_device_->compute_with_storage_grid_size(), seed); std::mt19937 rng(seed); - std::uniform_int_distribution gen_x(1, 4); - std::uniform_int_distribution gen_y(1, 2); + std::uniform_int_distribution gen_x(1, mesh_device_->num_cols()); + std::uniform_int_distribution gen_y(1, mesh_device_->num_rows()); std::vector> mesh_workloads = {}; // Create multiple mesh workloads on grids of random sizes. 
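Illustrative sketch, not part of this patch: the hunks above replace hardcoded T3000 grid bounds such as {3, 0} with ranges derived from the connected mesh, so the same workload tests can run on smaller topologies. A minimal fragment of that pattern, meant to live inside a test body like the ones above and assuming only the num_cols()/num_rows() accessors and the {col, row} coordinate order already used in this diff:

// Illustration only: device ranges derived from the attached mesh rather than hardcoded.
LogicalDeviceRange first_row({0, 0}, {mesh_device_->num_cols() - 1, 0});
LogicalDeviceRange last_row(
    {0, mesh_device_->num_rows() - 1},
    {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1});
LogicalDeviceRange full_mesh(
    {0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1});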
@@ -527,7 +529,7 @@ TEST_F(MeshWorkloadTest, RandomizedMeshWorkload) { Finish(mesh_device_->mesh_command_queue()); } -TEST_F(MeshWorkloadTest, EltwiseBinaryMeshWorkload) { +TEST_F(MeshWorkloadTestSuite, EltwiseBinaryMeshWorkload) { std::vector> src0_bufs = {}; std::vector> src1_bufs = {}; std::vector> output_bufs = {}; @@ -537,8 +539,9 @@ TEST_F(MeshWorkloadTest, EltwiseBinaryMeshWorkload) { auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( mesh_device_, src0_bufs, src1_bufs, output_bufs); auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1}); + LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); + LogicalDeviceRange devices_1 = LogicalDeviceRange( + {0, mesh_device_->num_rows() - 1}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); AddProgramToMeshWorkload(mesh_workload, *programs[0], devices_0); AddProgramToMeshWorkload(mesh_workload, *programs[1], devices_1); std::vector src0_vec = create_constant_vector_of_bfloat16(src0_bufs[0]->size(), 2); @@ -583,7 +586,7 @@ TEST_F(MeshWorkloadTest, EltwiseBinaryMeshWorkload) { } } -TEST_F(MeshWorkloadTest, MeshWorkloadSanity) { +TEST_F(MeshWorkloadTestSuite, MeshWorkloadSanity) { CoreCoord worker_grid_size = mesh_device_->compute_with_storage_grid_size(); uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::Float16_b); @@ -648,8 +651,9 @@ TEST_F(MeshWorkloadTest, MeshWorkloadSanity) { } auto program_1 = initialize_dummy_program(worker_grid_size); auto mesh_workload = MeshWorkload(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1}); + LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); + LogicalDeviceRange devices_1 = LogicalDeviceRange( + {0, mesh_device_->num_rows() - 1}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); AddProgramToMeshWorkload(mesh_workload, program, devices_0); AddProgramToMeshWorkload(mesh_workload, *program_1, devices_1); @@ -698,7 +702,7 @@ TEST_F(MeshWorkloadTest, MeshWorkloadSanity) { } } -TEST_F(MeshWorkloadTest, MeshWorkloadCBUpdate) { +TEST_F(MeshWorkloadTestSuite, MeshWorkloadCBUpdate) { std::shared_ptr program = std::make_shared(); CoreCoord worker_grid_size = mesh_device_->compute_with_storage_grid_size(); CoreRange cr = CoreRange({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); @@ -714,7 +718,8 @@ TEST_F(MeshWorkloadTest, MeshWorkloadCBUpdate) { initialize_dummy_kernels(*program, cr_set); auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices = LogicalDeviceRange({0, 0}, {3, 1}); + LogicalDeviceRange devices = + LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); AddProgramToMeshWorkload(mesh_workload, *program, devices); EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload, false); @@ -733,7 +738,7 @@ TEST_F(MeshWorkloadTest, MeshWorkloadCBUpdate) { verify_cb_config(mesh_device_, mesh_workload, updated_cb_config_vector, cr_set); } -TEST_F(MeshWorkloadTest, MeshWorkloadSemaphoreSanity) { +TEST_F(MeshWorkloadTestSuite, MeshWorkloadSemaphoreSanity) { auto worker_grid_size = mesh_device_->compute_with_storage_grid_size(); auto full_grid = CoreRange({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); Program program; @@ -744,7 +749,8 @@ TEST_F(MeshWorkloadTest, 
MeshWorkloadSemaphoreSanity) { expected_semaphore_values.push_back(sem); } auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices = LogicalDeviceRange({0, 0}, {3, 1}); + LogicalDeviceRange devices = + LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); AddProgramToMeshWorkload(mesh_workload, program, devices); EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload, false); Finish(mesh_device_->mesh_command_queue()); @@ -754,7 +760,7 @@ TEST_F(MeshWorkloadTest, MeshWorkloadSemaphoreSanity) { } } -TEST_F(MeshWorkloadTest, MeshWorkloadSemaphoreDifferentPrograms) { +TEST_F(MeshWorkloadTestSuite, MeshWorkloadSemaphoreDifferentPrograms) { auto worker_grid_size = mesh_device_->compute_with_storage_grid_size(); auto full_grid = CoreRange({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); Program program0; @@ -770,8 +776,9 @@ TEST_F(MeshWorkloadTest, MeshWorkloadSemaphoreDifferentPrograms) { expected_semaphore_values_1.push_back(sem + 1); } auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1}); + LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); + LogicalDeviceRange devices_1 = LogicalDeviceRange( + {0, mesh_device_->num_rows() - 1}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); AddProgramToMeshWorkload(mesh_workload, program0, devices_0); AddProgramToMeshWorkload(mesh_workload, program1, devices_1); diff --git a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp index 1fa6f2443c9..752ada9b376 100644 --- a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp @@ -50,55 +50,122 @@ class N300DeviceFixture : public MultiDeviceFixture { } }; -class T3000MultiDeviceFixture : public ::testing::Test { +class MeshDeviceFixtureBase : public ::testing::Test { protected: - virtual void SetUp() override { - using tt::tt_metal::distributed::MeshDevice; - using tt::tt_metal::distributed::MeshDeviceConfig; - using tt::tt_metal::distributed::MeshShape; + using MeshDevice = ::tt::tt_metal::distributed::MeshDevice; + using MeshDeviceConfig = ::tt::tt_metal::distributed::MeshDeviceConfig; + using MeshShape = ::tt::tt_metal::distributed::MeshShape; + enum class MeshDeviceType { + N300, + T3000, + }; + + struct Config { + // If unset, the mesh device type will be deduced automatically based on the connected devices. + // The associated test will be run if the connected cluster corresponds to a supported topology. 
+ std::optional mesh_device_type; + int num_cqs = 1; + }; + + MeshDeviceFixtureBase(const Config& fixture_config) : config_(fixture_config) {} + + void SetUp() override { auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - const auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); if (slow_dispatch) { - GTEST_SKIP() << "Skipping Multi-Device test suite, since it can only be run in Fast Dispatch Mode."; + GTEST_SKIP() << "Skipping Mesh-Device test suite, since it can only be run in Fast Dispatch Mode."; } - if (num_devices < 8 or arch != tt::ARCH::WORMHOLE_B0) { - GTEST_SKIP() << "Skipping T3K Multi-Device test suite on non T3K machine."; + + const auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + if (arch != tt::ARCH::WORMHOLE_B0) { + GTEST_SKIP() << "Skipping MeshDevice test suite on a non-wormhole machine."; + } + + const auto num_devices = tt::tt_metal::GetNumAvailableDevices(); + const auto mesh_device_type = derive_mesh_device_type(num_devices); + if (!mesh_device_type) { + GTEST_SKIP() << fmt::format( + "Skipping MeshDevice test suite on a machine with an unsupported number of devices {}.", num_devices); } - create_mesh_device(); + + if (config_.mesh_device_type.has_value() && *config_.mesh_device_type != *mesh_device_type) { + GTEST_SKIP() << fmt::format( + "Skipping MeshDevice test suite on a {} machine that does not match the configured mesh device type {}", + magic_enum::enum_name(*mesh_device_type), + magic_enum::enum_name(*config_.mesh_device_type)); + } + + // Use ethernet dispatch for more than 1 CQ on T3K/N300 + DispatchCoreType core_type = (config_.num_cqs >= 2) ? DispatchCoreType::ETH : DispatchCoreType::WORKER; + mesh_device_ = MeshDevice::create( + MeshDeviceConfig{.mesh_shape = get_mesh_shape(*mesh_device_type)}, 0, 0, config_.num_cqs, core_type); } void TearDown() override { if (!mesh_device_) { return; } - mesh_device_->close(); mesh_device_.reset(); } -protected: - virtual void create_mesh_device() { - using tt::tt_metal::distributed::MeshDevice; - using tt::tt_metal::distributed::MeshDeviceConfig; - using tt::tt_metal::distributed::MeshShape; + std::shared_ptr mesh_device_; - mesh_device_ = MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape{2, 4}}); +private: + // Returns the mesh shape for a given mesh device type. + MeshShape get_mesh_shape(MeshDeviceType mesh_device_type) { + switch (mesh_device_type) { + case MeshDeviceType::N300: return MeshShape(2, 1); + case MeshDeviceType::T3000: return MeshShape(2, 4); + default: TT_FATAL(false, "Querying shape for unspecified Mesh Type."); + } } - std::shared_ptr mesh_device_; + // Determines the mesh device type based on the number of devices. + std::optional derive_mesh_device_type(size_t num_devices) { + switch (num_devices) { + case 2: return MeshDeviceType::N300; + case 8: return MeshDeviceType::T3000; + default: return std::nullopt; + } + } + + Config config_; }; -class T3000MultiCQMultiDeviceFixture : public T3000MultiDeviceFixture { +// Fixtures that determine the mesh device type automatically. +// The associated test will be run if the topology is supported. 
+class GenericMeshDeviceFixture : public MeshDeviceFixtureBase { protected: - // Override only the mesh device creation logic - void create_mesh_device() override { - using tt::tt_metal::distributed::MeshDevice; - using tt::tt_metal::distributed::MeshDeviceConfig; - using tt::tt_metal::distributed::MeshShape; - - mesh_device_ = - MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape{2, 4}}, 0, 0, 2, DispatchCoreType::ETH); - } + GenericMeshDeviceFixture() : MeshDeviceFixtureBase(Config{.num_cqs = 1}) {} +}; + +class GenericMultiCQMeshDeviceFixture : public MeshDeviceFixtureBase { +protected: + GenericMultiCQMeshDeviceFixture() : MeshDeviceFixtureBase(Config{.num_cqs = 2}) {} +}; + +// Fixtures that specify the mesh device type explicitly. +// The associated test will be run if the cluster topology matches +// what is specified. +class N300MeshDeviceFixture : public MeshDeviceFixtureBase { +protected: + N300MeshDeviceFixture() : MeshDeviceFixtureBase(Config{.mesh_device_type = MeshDeviceType::N300}) {} +}; + +class T3000MeshDeviceFixture : public MeshDeviceFixtureBase { +protected: + T3000MeshDeviceFixture() : MeshDeviceFixtureBase(Config{.mesh_device_type = MeshDeviceType::T3000}) {} +}; + +class N300MultiCQMeshDeviceFixture : public MeshDeviceFixtureBase { +protected: + N300MultiCQMeshDeviceFixture() : + MeshDeviceFixtureBase(Config{.mesh_device_type = MeshDeviceType::N300, .num_cqs = 2}) {} +}; + +class T3000MultiCQMeshDeviceFixture : public MeshDeviceFixtureBase { +protected: + T3000MultiCQMeshDeviceFixture() : + MeshDeviceFixtureBase(Config{.mesh_device_type = MeshDeviceType::T3000, .num_cqs = 2}) {} }; From 44d31ebeb14d3b8b1c94cb251e2265436a9d3cb1 Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Fri, 7 Feb 2025 09:37:44 +0000 Subject: [PATCH 118/316] #17679: Remove conv tt eager tests These tests are not relevant anymore, as we have functional equivalents in TTNN tests.
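Usage sketch for the fixture hierarchy introduced in the previous patch (illustrative only, not part of either patch; it assumes only the fixture names from multi_device_fixture.hpp, the protected mesh_device_ member, and standard gtest macros):

// A topology-agnostic test: runs on any supported mesh (N300 or T3000), single CQ.
using ExampleMeshSuite = GenericMeshDeviceFixture;
TEST_F(ExampleMeshSuite, RunsOnAnySupportedTopology) {
    // The fixture deduces the mesh shape from the connected cluster, or skips the test.
    EXPECT_GE(mesh_device_->num_rows() * mesh_device_->num_cols(), 2u);
}

// A T3000-only, multi-CQ test: skipped unless an 8-device 2x4 mesh is attached.
using ExampleT3000MultiCQSuite = T3000MultiCQMeshDeviceFixture;
TEST_F(ExampleT3000MultiCQSuite, RunsOnlyOnT3000) {
    EXPECT_EQ(mesh_device_->num_rows() * mesh_device_->num_cols(), 8u);
}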
--- tests/scripts/run_tt_eager.py | 1 - tests/tt_eager/CMakeLists.txt | 1 - .../test_conv_prepare_weights_and_biases.cpp | 482 ------------------ .../conv/conv_op_trace_config.py | 143 ------ .../conv/conv_unit_test_utils.py | 86 ---- .../python_api_testing/conv/conv_utils.py | 25 - .../conv/generate_mm_tb_using_conv_tb.py | 36 -- .../conv/generated_mm_tb.yaml | 316 ------------ .../conv/pytorch_conv_tb.py | 138 ----- .../sweep_tests/generation_funcs.py | 14 - .../test_sweep_conv_with_address_map.py | 192 ------- .../fallback_ops/test_conv2d_op.py | 284 ----------- .../unit_testing/misc/test_downsample.py | 208 -------- ...est_resnet50_first_conv_folding_on_host.py | 101 ---- 14 files changed, 2027 deletions(-) delete mode 100644 tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp delete mode 100644 tests/tt_eager/python_api_testing/conv/conv_op_trace_config.py delete mode 100644 tests/tt_eager/python_api_testing/conv/conv_unit_test_utils.py delete mode 100644 tests/tt_eager/python_api_testing/conv/conv_utils.py delete mode 100644 tests/tt_eager/python_api_testing/conv/generate_mm_tb_using_conv_tb.py delete mode 100644 tests/tt_eager/python_api_testing/conv/generated_mm_tb.yaml delete mode 100644 tests/tt_eager/python_api_testing/conv/pytorch_conv_tb.py delete mode 100644 tests/tt_eager/python_api_testing/sweep_tests/pytests/test_sweep_conv_with_address_map.py delete mode 100644 tests/tt_eager/python_api_testing/unit_testing/fallback_ops/test_conv2d_op.py delete mode 100644 tests/tt_eager/python_api_testing/unit_testing/misc/test_downsample.py delete mode 100644 tests/tt_eager/python_api_testing/unit_testing/misc/test_resnet50_first_conv_folding_on_host.py diff --git a/tests/scripts/run_tt_eager.py b/tests/scripts/run_tt_eager.py index faae14c4ffd..af0999114be 100644 --- a/tests/scripts/run_tt_eager.py +++ b/tests/scripts/run_tt_eager.py @@ -33,7 +33,6 @@ TestEntry("tt_eager/tests/ops/test_eltwise_binary_op", "ops/test_eltwise_binary_op"), TestEntry("tt_eager/tests/ops/test_bcast_op", "ops/test_bcast_op"), TestEntry("tt_eager/tests/ops/test_sliding_window_ops", "ops/test_sliding_window_ops"), - TestEntry("tt_eager/tests/ops/test_conv_prepare_weights_and_biases", "ops/test_conv_prepare_weights_and_biases"), TestEntry("tt_eager/tests/ops/test_bmm_op", "ops/test_bmm_op"), void_for_bh(void_for_whb0(TestEntry("tt_eager/tests/ops/test_eltwise_unary_op", "ops/test_eltwise_unary_op"))), TestEntry("tt_eager/tests/ops/test_layernorm_op", "ops/test_layernorm_op"), diff --git a/tests/tt_eager/CMakeLists.txt b/tests/tt_eager/CMakeLists.txt index 7c236dde39d..0d3cec67b9a 100644 --- a/tests/tt_eager/CMakeLists.txt +++ b/tests/tt_eager/CMakeLists.txt @@ -13,7 +13,6 @@ set(TT_EAGER_TESTS_OPS ops/test_sfpu.cpp ops/test_sliding_window_ops.cpp ops/test_fold_op.cpp - ops/test_conv_prepare_weights_and_biases.cpp ) set(TT_EAGER_TESTS_TENSORS diff --git a/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp b/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp deleted file mode 100644 index 8dc88558494..00000000000 --- a/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp +++ /dev/null @@ -1,482 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include "ttnn/cpp/ttnn/tensor/host_buffer/functions.hpp" -#include "ttnn/cpp/ttnn/tensor/types.hpp" -#include "ttnn/tensor/host_buffer/functions.hpp" -#include "ttnn/tensor/host_buffer/types.hpp" -#include "ttnn/tensor/tensor.hpp" -#include "ttnn/tensor/tensor.hpp" -#include "ttnn/operations/creation.hpp" -#include "ttnn/operations/functions.hpp" -#include "ttnn/tensor/types.hpp" -#include "ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp" - -static std::vector> ref_weight_in = { - { - 16140, 16151, 16183, 16216, 16154, 16219, 16139, 16216, 16088, 16159, 16165, 16068, 16096, 16024, 16228, 15720, - 16246, 16011, 16068, 16116, 16202, 16207, 16135, 16117, 16145, 16073, 16236, 16214, 15761, 16044, 15794, 16165, - 15525, 16060, 16213, 16245, 16199, 15887, 16222, 16222, 16250, 16114, 16204, 16205, 16108, 16133, 16199, 16173, - 15858, 16184, 16163, 16148, 15890, 16137, 16241, 16194, 16133, 15832, 16084, 16114, 16007, 15934, 16198, 16188, - 16105, 15965, 16145, 15882, 15513, 16037, 16158, 15897, 16156, 15971, 16157, 16069, 16241, 16231, 16174, 16102, - 16056, 16156, 16095, 16231, 16178, 15819, 15734, 16248, 16170, 16167, 16171, 15919, 15959, 16055, 15876, 16192, - 16033, 16155, 16058, 16038, 16145, 15645, 16096, 16162, 16253, 16245, 15824, 16167, 15957, 16162, 15909, 16254, - 16167, 16148, 16001, 16084, 16110, 16115, 15994, 16159, 15906, 16045, 15842, 16172, 16168, 16034, 15885, 16199, - 15945, 16243, 16060, 16169, 16210, 15454, 15814, 16159, 16214, 16172, 15812, 16248, 16249, 16224, 16111, 16130, - 16250, 15716, 16154, 16102, 16189, 15523, 15648, 16098, 16016, 16250, 15862, 16056, 16023, 16118, 15859, 16176, - 16034, 16225, 16084, 16235, 15747, 15966, 16177, 16144, 16145, 16221, 16007, 16130, 16133, 16234, 15808, 16235, - 16147, 15786, 16237, 16014, 16035, 15385, 16170, 16215, 15878, 16165, 16183, 16215, 16020, 16007, 15931, 16075, - 16150, 16141, 15524, 15912, 16212, 16061, 15257, 15893, 16173, 16145, 16010, 16180, 16188, 16019, 16246, 16093, - 15998, 16193, 16147, 16074, 16151, 16229, 16146, 16163, 15972, 16228, 16243, 16174, 16100, 16101, 16216, 16250, - 16179, 15853, 16024, 16196, 16208, 16082, 16075, 16172, 16225, 15999, 16148, 16032, 16225, 16247, 16177, 16150, - 16185, 16168, 16128, 16136, 16244, 15980, 16164, 16074, 16089, 16158, 16155, 16115, 15517, 16112, 16026, 16183, - 16169, 16019, 16020, 16068, 16158, 16191, 16091, 16224, 15882, 15826, 16024, 15805, 16145, 16053, 16151, 16141, - 16147, 15625, 16167, 16248, 16166, 16036, 16092, 15970, 16229, 15888, 16060, 15815, 16095, 16251, 16228, 16005, - 16206, 16137, 16180, 16101, 15821, 15819, 16235, 16052, 16182, 16112, 16255, 16215, 15897, 16231, 16222, 15641, - 15910, 16130, 16157, 15914, 15869, 16199, 16217, 16221, 16206, 16082, 16145, 15887, 16080, 15624, 15757, 16251, - 16178, 16063, 16104, 16087, 16184, 15695, 16221, 16059, 16249, 15496, 16219, 15980, 15423, 16195, 16056, 16241, - 16186, 16191, 15919, 16045, 16133, 16122, 15710, 16045, 15948, 15927, 15511, 15919, 16203, 16109, 15973, 16223, - 16048, 16241, 16237, 16155, 16180, 16152, 15618, 16200, 15912, 16128, 16159, 15694, 16147, 16178, 15987, 16254, - 16239, 16008, 16157, 16173, 16137, 16221, 16151, 16192, 16186, 16246, 16031, 16141, 16075, 15961, 15958, 15971, - 15934, 15967, 16241, 16145, 16189, 16103, 16123, 16248, 15976, 16174, 16002, 15790, 15725, 15719, 16094, 16121, - 16031, 16225, 16178, 16249, 16065, 16158, 15927, 16138, 15562, 16218, 15753, 16190, 16173, 16117, 16104, 16173, - 16137, 16155, 
16229, 16182, 16253, 16112, 15966, 16105, 16169, 16232, 16006, 15884, 15529, 15978, 16194, 16225, - 16035, 16231, 16068, 16165, 16150, 16038, 16212, 16133, 16161, 14440, 16223, 16031, 16012, 16089, 16204, 16226, - 15934, 16174, 16243, 16105, 16175, 16119, 15964, 16201, 16242, 15978, 16187, 16225, 16002, 16032, 15962, 16245, - 16132, 16113, 15570, 16182, 15956, 15901, 16089, 16186, 16063, 16165, 16109, 15964, 16014, 15934, 16150, 16206, - 16221, 16191, 15856, 16172, 16132, 16013, 15879, 15923, 16183, 16180, 16074, 16109, 16144, 16215, 15931, 15953, - 15892, 15912, 16121, 15871, 16054, 16184, 16240, 15609, 16195, 16191, 16191, 15805, 16231, 15966, 15786, 16191, - 16141, 16187, 16149, 15674, 16246, 15958, 16021, 16018, 15990, 16173, 15821, 15745, 15494, 16142, 16237, 15383, - 16171, 16213, 16200, 16251, 16016, 16180, 16150, 15929, 15746, 16131, 16120, 16148, 16250, 16201, 16224, 16155, - 16045, 15967, 16246, 16105, 15981, 16224, 16243, 16124, 16240, 16183, 16204, 16120, 16161, 16181, 16223, 16127, - 16022, 16216, 16217, 15943, 16158, 16197, 15448, 16249, 16049, 16220, 15895, 16199, 16251, 16252, 16116, 16192, - }, - - { - 16140, 16151, 16183, 16216, 16154, 16219, 16139, 16216, 16088, 16159, 16165, 16068, 16096, 16024, 16228, 15720, - 16246, 16011, 16068, 16116, 16202, 16207, 16135, 16117, 16145, 16073, 16236, 16214, 15761, 16044, 15794, 16165, - 15525, 16060, 16213, 16245, 16199, 15887, 16222, 16222, 16250, 16114, 16204, 16205, 16108, 16133, 16199, 16173, - 15858, 16184, 16163, 16148, 15890, 16137, 16241, 16194, 16133, 15832, 16084, 16114, 16007, 15934, 16198, 16188, - 16105, 15965, 16145, 15882, 15513, 16037, 16158, 15897, 16156, 15971, 16157, 16069, 16241, 16231, 16174, 16102, - 16056, 16156, 16095, 16231, 16178, 15819, 15734, 16248, 16170, 16167, 16171, 15919, 15959, 16055, 15876, 16192, - 16033, 16155, 16058, 16038, 16145, 15645, 16096, 16162, 16253, 16245, 15824, 16167, 15957, 16162, 15909, 16254, - 16167, 16148, 16001, 16084, 16110, 16115, 15994, 16159, 15906, 16045, 15842, 16172, 16168, 16034, 15885, 16199, - 15945, 16243, 16060, 16169, 16210, 15454, 15814, 16159, 16214, 16172, 15812, 16248, 16249, 16224, 16111, 16130, - 16250, 15716, 16154, 16102, 16189, 15523, 15648, 16098, 16016, 16250, 15862, 16056, 16023, 16118, 15859, 16176, - 16034, 16225, 16084, 16235, 15747, 15966, 16177, 16144, 16145, 16221, 16007, 16130, 16133, 16234, 15808, 16235, - 16147, 15786, 16237, 16014, 16035, 15385, 16170, 16215, 15878, 16165, 16183, 16215, 16020, 16007, 15931, 16075, - 16150, 16141, 15524, 15912, 16212, 16061, 15257, 15893, 16173, 16145, 16010, 16180, 16188, 16019, 16246, 16093, - 15998, 16193, 16147, 16074, 16151, 16229, 16146, 16163, 15972, 16228, 16243, 16174, 16100, 16101, 16216, 16250, - 16179, 15853, 16024, 16196, 16208, 16082, 16075, 16172, 16225, 15999, 16148, 16032, 16225, 16247, 16177, 16150, - 16185, 16168, 16128, 16136, 16244, 15980, 16164, 16074, 16089, 16158, 16155, 16115, 15517, 16112, 16026, 16183, - 16169, 16019, 16020, 16068, 16158, 16191, 16091, 16224, 15882, 15826, 16024, 15805, 16145, 16053, 16151, 16141, - 16147, 15625, 16167, 16248, 16166, 16036, 16092, 15970, 16229, 15888, 16060, 15815, 16095, 16251, 16228, 16005, - 16206, 16137, 16180, 16101, 15821, 15819, 16235, 16052, 16182, 16112, 16255, 16215, 15897, 16231, 16222, 15641, - 15910, 16130, 16157, 15914, 15869, 16199, 16217, 16221, 16206, 16082, 16145, 15887, 16080, 15624, 15757, 16251, - 16178, 16063, 16104, 16087, 16184, 15695, 16221, 16059, 16249, 15496, 16219, 15980, 15423, 16195, 16056, 16241, - 16186, 16191, 15919, 
16045, 16133, 16122, 15710, 16045, 15948, 15927, 15511, 15919, 16203, 16109, 15973, 16223, - 16048, 16241, 16237, 16155, 16180, 16152, 15618, 16200, 15912, 16128, 16159, 15694, 16147, 16178, 15987, 16254, - 16239, 16008, 16157, 16173, 16137, 16221, 16151, 16192, 16186, 16246, 16031, 16141, 16075, 15961, 15958, 15971, - 15934, 15967, 16241, 16145, 16189, 16103, 16123, 16248, 15976, 16174, 16002, 15790, 15725, 15719, 16094, 16121, - 16031, 16225, 16178, 16249, 16065, 16158, 15927, 16138, 15562, 16218, 15753, 16190, 16173, 16117, 16104, 16173, - 16137, 16155, 16229, 16182, 16253, 16112, 15966, 16105, 16169, 16232, 16006, 15884, 15529, 15978, 16194, 16225, - 16035, 16231, 16068, 16165, 16150, 16038, 16212, 16133, 16161, 14440, 16223, 16031, 16012, 16089, 16204, 16226, - 15934, 16174, 16243, 16105, 16175, 16119, 15964, 16201, 16242, 15978, 16187, 16225, 16002, 16032, 15962, 16245, - 16132, 16113, 15570, 16182, 15956, 15901, 16089, 16186, 16063, 16165, 16109, 15964, 16014, 15934, 16150, 16206, - 16221, 16191, 15856, 16172, 16132, 16013, 15879, 15923, 16183, 16180, 16074, 16109, 16144, 16215, 15931, 15953, - 15892, 15912, 16121, 15871, 16054, 16184, 16240, 15609, 16195, 16191, 16191, 15805, 16231, 15966, 15786, 16191, - 16141, 16187, 16149, 15674, 16246, 15958, 16021, 16018, 15990, 16173, 15821, 15745, 15494, 16142, 16237, 15383, - 16171, 16213, 16200, 16251, 16016, 16180, 16150, 15929, 15746, 16131, 16120, 16148, 16250, 16201, 16224, 16155, - 16045, 15967, 16246, 16105, 15981, 16224, 16243, 16124, 16240, 16183, 16204, 16120, 16161, 16181, 16223, 16127, - 16022, 16216, 16217, 15943, 16158, 16197, 15448, 16249, 16049, 16220, 15895, 16199, 16251, 16252, 16116, 16192, - 16126, 15236, 16163, 16009, 16060, 16082, 15884, 16091, 16210, 16024, 15938, 16077, 16130, 15863, 15973, 16251, - 15816, 16079, 16220, 16145, 16249, 16047, 16245, 16201, 16232, 16082, 16198, 16055, 16042, 16076, 15782, 16026, - 16080, 16198, 15981, 16237, 15879, 16038, 15706, 16243, 16185, 15460, 15419, 16136, 16197, 16027, 15894, 16226, - 15778, 16000, 15799, 16173, 16172, 16207, 15995, 16093, 16087, 16192, 16142, 16212, 16220, 16066, 16186, 15813, - 16010, 16003, 15878, 16151, 15714, 16115, 16026, 16121, 16006, 16106, 16105, 16134, 16174, 16098, 16178, 16218, - 16017, 16093, 16066, 16211, 15929, 16130, 16201, 15792, 15720, 16168, 16178, 15955, 16199, 16216, 16199, 16174, - 16004, 15926, 16063, 15759, 16150, 15390, 16011, 16228, 16061, 15880, 15945, 16199, 16107, 16236, 15670, 16183, - 16204, 16123, 15773, 16112, 16132, 16225, 16029, 16122, 16147, 16084, 16245, 15922, 16165, 16115, 15632, 16200, - 16092, 16142, 16130, 15907, 16137, 15891, 16174, 16166, 16014, 16138, 15875, 16038, 16073, 15894, 16244, 15907, - 15935, 15876, 16231, 16148, 16139, 15804, 16105, 16233, 16225, 15785, 16106, 16204, 16185, 16224, 16076, 15807, - 16231, 16090, 16176, 16114, 16179, 16148, 16039, 16183, 16193, 15581, 16162, 16187, 15989, 16196, 15908, 15392, - 16203, 16029, 16245, 15982, 16106, 16128, 16151, 16244, 16219, 16142, 16106, 15815, 16243, 16159, 16147, 16220, - 16210, 15905, 16232, 16254, 16208, 15790, 15907, 15809, 16160, 16162, 16075, 16243, 15744, 16239, 16089, 16101, - 16004, 16186, 16217, 16190, 15624, 16029, 16245, 15861, 16053, 16099, 16054, 16072, 15493, 16136, 15933, 16216, - 16077, 16137, 16237, 16174, 15820, 16155, 16241, 15817, 16222, 15804, 16104, 15717, 16039, 15793, 15982, 15986, - 16157, 16214, 15623, 16133, 15487, 16131, 16091, 16166, 15755, 16139, 16000, 15620, 15970, 16148, 16001, 16197, - 15878, 16064, 15429, 16123, 15852, 16251, 
16158, 15994, 16249, 16063, 16253, 15675, 16081, 16030, 15910, 16212, - 16163, 16206, 16123, 16163, 16253, 16060, 15749, 16032, 16200, 16205, 16019, 15760, 15991, 16174, 16169, 16066, - 15995, 16162, 16170, 16237, 16132, 16218, 16089, 16126, 16142, 16091, 16018, 16210, 16180, 16188, 16084, 16100, - 16056, 16248, 16212, 16057, 16236, 16075, 15676, 16189, 15982, 16101, 16050, 16239, 16208, 16003, 16252, 16067, - 16248, 16178, 16231, 16229, - }, - - { - 16140, 16151, 16183, 16216, 16154, 16219, 16139, 16216, 16088, 16159, 16165, 16068, 16096, 16024, 16228, 15720, - 16246, 16011, 16068, 16116, 16202, 16207, 16135, 16117, 16145, 16073, 16236, 16214, 15761, 16044, 15794, 16165, - 15525, 16060, 16213, 16245, 16199, 15887, 16222, 16222, 16250, 16114, 16204, 16205, 16108, 16133, 16199, 16173, - 15858, 16184, 16163, 16148, 15890, 16137, 16241, 16194, 16133, 15832, 16084, 16114, 16007, 15934, 16198, 16188, - 16105, 15965, 16145, 15882, 15513, 16037, 16158, 15897, 16156, 15971, 16157, 16069, 16241, 16231, 16174, 16102, - 16056, 16156, 16095, 16231, 16178, 15819, 15734, 16248, 16170, 16167, 16171, 15919, 15959, 16055, 15876, 16192, - 16033, 16155, 16058, 16038, 16145, 15645, 16096, 16162, 16253, 16245, 15824, 16167, 15957, 16162, 15909, 16254, - 16167, 16148, 16001, 16084, 16110, 16115, 15994, 16159, 15906, 16045, 15842, 16172, 16168, 16034, 15885, 16199, - 15945, 16243, 16060, 16169, 16210, 15454, 15814, 16159, 16214, 16172, 15812, 16248, 16249, 16224, 16111, 16130, - 16250, 15716, 16154, 16102, 16189, 15523, 15648, 16098, 16016, 16250, 15862, 16056, 16023, 16118, 15859, 16176, - 16034, 16225, 16084, 16235, 15747, 15966, 16177, 16144, 16145, 16221, 16007, 16130, 16133, 16234, 15808, 16235, - 16147, 15786, 16237, 16014, 16035, 15385, 16170, 16215, 15878, 16165, 16183, 16215, 16020, 16007, 15931, 16075, - 16150, 16141, 15524, 15912, 16212, 16061, 15257, 15893, 16173, 16145, 16010, 16180, 16188, 16019, 16246, 16093, - 15998, 16193, 16147, 16074, 16151, 16229, 16146, 16163, 15972, 16228, 16243, 16174, 16100, 16101, 16216, 16250, - 16179, 15853, 16024, 16196, 16208, 16082, 16075, 16172, 16225, 15999, 16148, 16032, 16225, 16247, 16177, 16150, - 16185, 16168, 16128, 16136, 16244, 15980, 16164, 16074, 16089, 16158, 16155, 16115, 15517, 16112, 16026, 16183, - 16169, 16019, 16020, 16068, 16158, 16191, 16091, 16224, 15882, 15826, 16024, 15805, 16145, 16053, 16151, 16141, - 16147, 15625, 16167, 16248, 16166, 16036, 16092, 15970, 16229, 15888, 16060, 15815, 16095, 16251, 16228, 16005, - 16206, 16137, 16180, 16101, 15821, 15819, 16235, 16052, 16182, 16112, 16255, 16215, 15897, 16231, 16222, 15641, - 15910, 16130, 16157, 15914, 15869, 16199, 16217, 16221, 16206, 16082, 16145, 15887, 16080, 15624, 15757, 16251, - 16178, 16063, 16104, 16087, 16184, 15695, 16221, 16059, 16249, 15496, 16219, 15980, 15423, 16195, 16056, 16241, - 16186, 16191, 15919, 16045, 16133, 16122, 15710, 16045, 15948, 15927, 15511, 15919, 16203, 16109, 15973, 16223, - 16048, 16241, 16237, 16155, 16180, 16152, 15618, 16200, 15912, 16128, 16159, 15694, 16147, 16178, 15987, 16254, - 16239, 16008, 16157, 16173, 16137, 16221, 16151, 16192, 16186, 16246, 16031, 16141, 16075, 15961, 15958, 15971, - 15934, 15967, 16241, 16145, 16189, 16103, 16123, 16248, 15976, 16174, 16002, 15790, 15725, 15719, 16094, 16121, - 16031, 16225, 16178, 16249, 16065, 16158, 15927, 16138, 15562, 16218, 15753, 16190, 16173, 16117, 16104, 16173, - 16137, 16155, 16229, 16182, 16253, 16112, 15966, 16105, 16169, 16232, 16006, 15884, 15529, 15978, 16194, 16225, - 16035, 16231, 16068, 
16165, 16150, 16038, 16212, 16133, 16161, 14440, 16223, 16031, 16012, 16089, 16204, 16226, - 15934, 16174, 16243, 16105, 16175, 16119, 15964, 16201, 16242, 15978, 16187, 16225, 16002, 16032, 15962, 16245, - 16132, 16113, 15570, 16182, 15956, 15901, 16089, 16186, 16063, 16165, 16109, 15964, 16014, 15934, 16150, 16206, - 16221, 16191, 15856, 16172, 16132, 16013, 15879, 15923, 16183, 16180, 16074, 16109, 16144, 16215, 15931, 15953, - 15892, 15912, 16121, 15871, 16054, 16184, 16240, 15609, 16195, 16191, 16191, 15805, 16231, 15966, 15786, 16191, - 16141, 16187, 16149, 15674, 16246, 15958, 16021, 16018, 15990, 16173, 15821, 15745, 15494, 16142, 16237, 15383, - 16171, 16213, 16200, 16251, 16016, 16180, 16150, 15929, 15746, 16131, 16120, 16148, 16250, 16201, 16224, 16155, - 16045, 15967, 16246, 16105, 15981, 16224, 16243, 16124, 16240, 16183, 16204, 16120, 16161, 16181, 16223, 16127, - 16022, 16216, 16217, 15943, 16158, 16197, 15448, 16249, 16049, 16220, 15895, 16199, 16251, 16252, 16116, 16192, - 16126, 15236, 16163, 16009, 16060, 16082, 15884, 16091, 16210, 16024, 15938, 16077, 16130, 15863, 15973, 16251, - 15816, 16079, 16220, 16145, 16249, 16047, 16245, 16201, 16232, 16082, 16198, 16055, 16042, 16076, 15782, 16026, - 16080, 16198, 15981, 16237, 15879, 16038, 15706, 16243, 16185, 15460, 15419, 16136, 16197, 16027, 15894, 16226, - 15778, 16000, 15799, 16173, 16172, 16207, 15995, 16093, 16087, 16192, 16142, 16212, 16220, 16066, 16186, 15813, - 16010, 16003, 15878, 16151, 15714, 16115, 16026, 16121, 16006, 16106, 16105, 16134, 16174, 16098, 16178, 16218, - 16017, 16093, 16066, 16211, 15929, 16130, 16201, 15792, 15720, 16168, 16178, 15955, 16199, 16216, 16199, 16174, - 16004, 15926, 16063, 15759, 16150, 15390, 16011, 16228, 16061, 15880, 15945, 16199, 16107, 16236, 15670, 16183, - 16204, 16123, 15773, 16112, 16132, 16225, 16029, 16122, 16147, 16084, 16245, 15922, 16165, 16115, 15632, 16200, - 16092, 16142, 16130, 15907, 16137, 15891, 16174, 16166, 16014, 16138, 15875, 16038, 16073, 15894, 16244, 15907, - 15935, 15876, 16231, 16148, 16139, 15804, 16105, 16233, 16225, 15785, 16106, 16204, 16185, 16224, 16076, 15807, - 16231, 16090, 16176, 16114, 16179, 16148, 16039, 16183, 16193, 15581, 16162, 16187, 15989, 16196, 15908, 15392, - 16203, 16029, 16245, 15982, 16106, 16128, 16151, 16244, 16219, 16142, 16106, 15815, 16243, 16159, 16147, 16220, - 16210, 15905, 16232, 16254, 16208, 15790, 15907, 15809, 16160, 16162, 16075, 16243, 15744, 16239, 16089, 16101, - 16004, 16186, 16217, 16190, 15624, 16029, 16245, 15861, 16053, 16099, 16054, 16072, 15493, 16136, 15933, 16216, - 16077, 16137, 16237, 16174, 15820, 16155, 16241, 15817, 16222, 15804, 16104, 15717, 16039, 15793, 15982, 15986, - 16157, 16214, 15623, 16133, 15487, 16131, 16091, 16166, 15755, 16139, 16000, 15620, 15970, 16148, 16001, 16197, - 15878, 16064, 15429, 16123, 15852, 16251, 16158, 15994, 16249, 16063, 16253, 15675, 16081, 16030, 15910, 16212, - 16163, 16206, 16123, 16163, 16253, 16060, 15749, 16032, 16200, 16205, 16019, 15760, 15991, 16174, 16169, 16066, - }, - { - 16140, 16151, 16183, 16216, 16154, 16219, 16139, 16216, 16088, 16159, 16165, 16068, 16096, 16024, 16228, 15720, - 16246, 16011, 16068, 16116, 16202, 16207, 16135, 16117, 16145, 16073, 16236, 16214, 15761, 16044, 15794, 16165, - 15525, 16060, 16213, 16245, 16199, 15887, 16222, 16222, 16250, 16114, 16204, 16205, 16108, 16133, 16199, 16173, - 15858, 16184, 16163, 16148, 15890, 16137, 16241, 16194, 16133, 15832, 16084, 16114, 16007, 15934, 16198, 16188, - 16105, 15965, 16145, 15882, 
15513, 16037, 16158, 15897, 16156, 15971, 16157, 16069, 16241, 16231, 16174, 16102, - 16056, 16156, 16095, 16231, 16178, 15819, 15734, 16248, 16170, 16167, 16171, 15919, 15959, 16055, 15876, 16192, - 16033, 16155, 16058, 16038, 16145, 15645, 16096, 16162, 16253, 16245, 15824, 16167, 15957, 16162, 15909, 16254, - 16167, 16148, 16001, 16084, 16110, 16115, 15994, 16159, 15906, 16045, 15842, 16172, 16168, 16034, 15885, 16199, - 15945, 16243, 16060, 16169, 16210, 15454, 15814, 16159, 16214, 16172, 15812, 16248, 16249, 16224, 16111, 16130, - 16250, 15716, 16154, 16102, 16189, 15523, 15648, 16098, 16016, 16250, 15862, 16056, 16023, 16118, 15859, 16176, - 16034, 16225, 16084, 16235, 15747, 15966, 16177, 16144, 16145, 16221, 16007, 16130, 16133, 16234, 15808, 16235, - 16147, 15786, 16237, 16014, 16035, 15385, 16170, 16215, 15878, 16165, 16183, 16215, 16020, 16007, 15931, 16075, - 16150, 16141, 15524, 15912, 16212, 16061, 15257, 15893, 16173, 16145, 16010, 16180, 16188, 16019, 16246, 16093, - 15998, 16193, 16147, 16074, 16151, 16229, 16146, 16163, 15972, 16228, 16243, 16174, 16100, 16101, 16216, 16250, - 16179, 15853, 16024, 16196, 16208, 16082, 16075, 16172, 16225, 15999, 16148, 16032, 16225, 16247, 16177, 16150, - 16185, 16168, 16128, 16136, 16244, 15980, 16164, 16074, 16089, 16158, 16155, 16115, 15517, 16112, 16026, 16183, - 16169, 16019, 16020, 16068, 16158, 16191, 16091, 16224, 15882, 15826, 16024, 15805, 16145, 16053, 16151, 16141, - 16147, 15625, 16167, 16248, 16166, 16036, 16092, 15970, 16229, 15888, 16060, 15815, 16095, 16251, 16228, 16005, - 16206, 16137, 16180, 16101, 15821, 15819, 16235, 16052, 16182, 16112, 16255, 16215, 15897, 16231, 16222, 15641, - 15910, 16130, 16157, 15914, 15869, 16199, 16217, 16221, 16206, 16082, 16145, 15887, 16080, 15624, 15757, 16251, - 16178, 16063, 16104, 16087, 16184, 15695, 16221, 16059, 16249, 15496, 16219, 15980, 15423, 16195, 16056, 16241, - 16186, 16191, 15919, 16045, 16133, 16122, 15710, 16045, 15948, 15927, 15511, 15919, 16203, 16109, 15973, 16223, - 16048, 16241, 16237, 16155, 16180, 16152, 15618, 16200, 15912, 16128, 16159, 15694, 16147, 16178, 15987, 16254, - 16239, 16008, 16157, 16173, 16137, 16221, 16151, 16192, 16186, 16246, 16031, 16141, 16075, 15961, 15958, 15971, - 15934, 15967, 16241, 16145, 16189, 16103, 16123, 16248, 15976, 16174, 16002, 15790, 15725, 15719, 16094, 16121, - 16031, 16225, 16178, 16249, 16065, 16158, 15927, 16138, 15562, 16218, 15753, 16190, 16173, 16117, 16104, 16173, - 16137, 16155, 16229, 16182, 16253, 16112, 15966, 16105, 16169, 16232, 16006, 15884, 15529, 15978, 16194, 16225, - 16035, 16231, 16068, 16165, 16150, 16038, 16212, 16133, 16161, 14440, 16223, 16031, 16012, 16089, 16204, 16226, - 15934, 16174, 16243, 16105, 16175, 16119, 15964, 16201, 16242, 15978, 16187, 16225, 16002, 16032, 15962, 16245, - 16132, 16113, 15570, 16182, 15956, 15901, 16089, 16186, 16063, 16165, 16109, 15964, 16014, 15934, 16150, 16206, - 16221, 16191, 15856, 16172, 16132, 16013, 15879, 15923, 16183, 16180, 16074, 16109, 16144, 16215, 15931, 15953, - 15892, 15912, 16121, 15871, 16054, 16184, 16240, 15609, 16195, 16191, 16191, 15805, 16231, 15966, 15786, 16191, - 16141, 16187, 16149, 15674, 16246, 15958, 16021, 16018, 15990, 16173, 15821, 15745, 15494, 16142, 16237, 15383, - 16171, 16213, 16200, 16251, 16016, 16180, 16150, 15929, 15746, 16131, 16120, 16148, 16250, 16201, 16224, 16155, - 16045, 15967, 16246, 16105, 15981, 16224, 16243, 16124, 16240, 16183, 16204, 16120, 16161, 16181, 16223, 16127, - 16022, 16216, 16217, 15943, 16158, 16197, 15448, 
16249, 16049, 16220, 15895, 16199, 16251, 16252, 16116, 16192, - 16126, 15236, 16163, 16009, 16060, 16082, 15884, 16091, 16210, 16024, 15938, 16077, 16130, 15863, 15973, 16251, - 15816, 16079, 16220, 16145, 16249, 16047, 16245, 16201, 16232, 16082, 16198, 16055, 16042, 16076, 15782, 16026, - 16080, 16198, 15981, 16237, 15879, 16038, 15706, 16243, 16185, 15460, 15419, 16136, 16197, 16027, 15894, 16226, - 15778, 16000, 15799, 16173, 16172, 16207, 15995, 16093, 16087, 16192, 16142, 16212, 16220, 16066, 16186, 15813, - 16010, 16003, 15878, 16151, 15714, 16115, 16026, 16121, 16006, 16106, 16105, 16134, 16174, 16098, 16178, 16218, - 16017, 16093, 16066, 16211, 15929, 16130, 16201, 15792, 15720, 16168, 16178, 15955, 16199, 16216, 16199, 16174, - 16004, 15926, 16063, 15759, 16150, 15390, 16011, 16228, 16061, 15880, 15945, 16199, 16107, 16236, 15670, 16183, - 16204, 16123, 15773, 16112, 16132, 16225, 16029, 16122, 16147, 16084, 16245, 15922, 16165, 16115, 15632, 16200, - 16092, 16142, 16130, 15907, 16137, 15891, 16174, 16166, 16014, 16138, 15875, 16038, 16073, 15894, 16244, 15907, - 15935, 15876, 16231, 16148, 16139, 15804, 16105, 16233, 16225, 15785, 16106, 16204, 16185, 16224, 16076, 15807, - 16231, 16090, 16176, 16114, 16179, 16148, 16039, 16183, 16193, 15581, 16162, 16187, 15989, 16196, 15908, 15392, - 16203, 16029, 16245, 15982, 16106, 16128, 16151, 16244, 16219, 16142, 16106, 15815, 16243, 16159, 16147, 16220, - 16210, 15905, 16232, 16254, 16208, 15790, 15907, 15809, 16160, 16162, 16075, 16243, 15744, 16239, 16089, 16101, - 16004, 16186, 16217, 16190, 15624, 16029, 16245, 15861, 16053, 16099, 16054, 16072, 15493, 16136, 15933, 16216, - 16077, 16137, 16237, 16174, 15820, 16155, 16241, 15817, 16222, 15804, 16104, 15717, 16039, 15793, 15982, 15986, - 16157, 16214, 15623, 16133, 15487, 16131, 16091, 16166, 15755, 16139, 16000, 15620, 15970, 16148, 16001, 16197, - 15878, 16064, 15429, 16123, 15852, 16251, 16158, 15994, 16249, 16063, 16253, 15675, 16081, 16030, 15910, 16212, - 16163, 16206, 16123, 16163, 16253, 16060, 15749, 16032, 16200, 16205, 16019, 15760, 15991, 16174, 16169, 16066, - 15995, 16162, 16170, 16237, 16132, 16218, 16089, 16126, 16142, 16091, 16018, 16210, 16180, 16188, 16084, 16100, - 16056, 16248, 16212, 16057, 16236, 16075, 15676, 16189, 15982, 16101, 16050, 16239, 16208, 16003, 16252, 16067, - 16248, 16178, 16231, 16229, 16023, 15863, 16253, 15991, 15999, 15977, 15832, 16122, 16243, 16228, 15983, 16055, - 16176, 16069, 15727, 16234, 16187, 15849, 16225, 16161, 16011, 15880, 16066, 16063, 16063, 16038, 16191, 16174, - 15987, 16203, 15919, 16129, 16102, 16023, 16027, 16226, 16214, 16052, 15987, 16189, 16128, 16142, 16241, 15950, - 16162, 16140, 16222, 16133, 16240, 16050, 16192, 15561, 16179, 15896, 16247, 15879, 16254, 16181, 16103, 16181, - 15761, 16156, 16021, 16172, 15900, 16101, 16085, 16178, 15878, 16065, 16154, 15820, 16067, 16245, 16229, 15764, - 16247, 15518, 16140, 16250, 16012, 15896, 16151, 16004, 16229, 15964, 16080, 16148, 16141, 16249, 16011, 16011, - 16105, 16248, 16077, 15568, 15998, 16227, 16129, 16181, 16030, 16014, 16062, 16229, 16134, 15577, 16192, 16160, - 16042, 16040, 16236, 16247, 16220, 15916, 15687, 16230, 16001, 16040, 16100, 16227, 15830, 16131, 16050, 16130, - 16189, 16070, 16174, 16135, 16159, 16241, 16181, 16228, 15953, 16173, 16046, 16163, 16173, 16140, 16225, 16011, - 16139, 15895, 16016, 16219, 15607, 16162, 16181, 16025, 15361, 16107, 16062, 15560, 16135, 16142, 16236, 16056, - 15799, 16128, 16079, 15901, 15559, 16089, 16047, 16231, 16159, 15371, 
16014, 16248, 15958, 16176, 15852, 15819, - 16147, 16020, 16177, 16138, 16172, 16185, 16242, 16071, - } - -}; -static std::vector> ref_weight_out = { - {16140, 16151, 16183, 16216, 16154, 16219, 16139, 16216, 16088, 16156, 15971, 16157, 16069, 16241, 16231, 16174, - 16102, 16056, 16250, 15716, 16154, 16102, 16189, 15523, 15648, 16098, 16016, 15972, 16228, 16243, 16174, 16100, - 16101, 16216, 16250, 16179, 16206, 16137, 16180, 16101, 15821, 15819, 16235, 16052, 16182, 15912, 16128, 16159, - 15694, 16147, 16178, 15987, 16254, 16239, 16035, 16231, 16068, 16165, 16150, 16038, 16212, 16133, 16161, 16195, - 16191, 16191, 15805, 16231, 15966, 15786, 16191, 16141, 16159, 16165, 16068, 16096, 16024, 16228, 15720, 16246, - 16011, 16156, 16095, 16231, 16178, 15819, 15734, 16248, 16170, 16167, 16250, 15862, 16056, 16023, 16118, 15859, - 16176, 16034, 16225, 15853, 16024, 16196, 16208, 16082, 16075, 16172, 16225, 15999, 16112, 16255, 16215, 15897, - 16231, 16222, 15641, 15910, 16130, 16008, 16157, 16173, 16137, 16221, 16151, 16192, 16186, 16246, 14440, 16223, - 16031, 16012, 16089, 16204, 16226, 15934, 16174, 16187, 16149, 15674, 16246, 15958, 16021, 16018, 15990, 16173, - 16068, 16116, 16202, 16207, 16135, 16117, 16145, 16073, 16236, 16171, 15919, 15959, 16055, 15876, 16192, 16033, - 16155, 16058, 16084, 16235, 15747, 15966, 16177, 16144, 16145, 16221, 16007, 16148, 16032, 16225, 16247, 16177, - 16150, 16185, 16168, 16128, 16157, 15914, 15869, 16199, 16217, 16221, 16206, 16082, 16145, 16031, 16141, 16075, - 15961, 15958, 15971, 15934, 15967, 16241, 16243, 16105, 16175, 16119, 15964, 16201, 16242, 15978, 16187, 15821, - 15745, 15494, 16142, 16237, 15383, 16171, 16213, 16200, 16214, 15761, 16044, 15794, 16165, 15525, 16060, 16213, - 16245, 16038, 16145, 15645, 16096, 16162, 16253, 16245, 15824, 16167, 16130, 16133, 16234, 15808, 16235, 16147, - 15786, 16237, 16014, 16136, 16244, 15980, 16164, 16074, 16089, 16158, 16155, 16115, 15887, 16080, 15624, 15757, - 16251, 16178, 16063, 16104, 16087, 16145, 16189, 16103, 16123, 16248, 15976, 16174, 16002, 15790, 16225, 16002, - 16032, 15962, 16245, 16132, 16113, 15570, 16182, 16251, 16016, 16180, 16150, 15929, 15746, 16131, 16120, 16148, - 16199, 15887, 16222, 16222, 16250, 16114, 16204, 16205, 16108, 15957, 16162, 15909, 16254, 16167, 16148, 16001, - 16084, 16110, 16035, 15385, 16170, 16215, 15878, 16165, 16183, 16215, 16020, 15517, 16112, 16026, 16183, 16169, - 16019, 16020, 16068, 16158, 16184, 15695, 16221, 16059, 16249, 15496, 16219, 15980, 15423, 15725, 15719, 16094, - 16121, 16031, 16225, 16178, 16249, 16065, 15956, 15901, 16089, 16186, 16063, 16165, 16109, 15964, 16014, 16250, - 16201, 16224, 16155, 16045, 15967, 16246, 16105, 15981, 16133, 16199, 16173, 15858, 16184, 16163, 16148, 15890, - 16137, 16115, 15994, 16159, 15906, 16045, 15842, 16172, 16168, 16034, 16007, 15931, 16075, 16150, 16141, 15524, - 15912, 16212, 16061, 16191, 16091, 16224, 15882, 15826, 16024, 15805, 16145, 16053, 16195, 16056, 16241, 16186, - 16191, 15919, 16045, 16133, 16122, 16158, 15927, 16138, 15562, 16218, 15753, 16190, 16173, 16117, 15934, 16150, - 16206, 16221, 16191, 15856, 16172, 16132, 16013, 16224, 16243, 16124, 16240, 16183, 16204, 16120, 16161, 16181, - 16241, 16194, 16133, 15832, 16084, 16114, 16007, 15934, 16198, 15885, 16199, 15945, 16243, 16060, 16169, 16210, - 15454, 15814, 15257, 15893, 16173, 16145, 16010, 16180, 16188, 16019, 16246, 16151, 16141, 16147, 15625, 16167, - 16248, 16166, 16036, 16092, 15710, 16045, 15948, 15927, 15511, 15919, 16203, 16109, 15973, 
16104, 16173, 16137, - 16155, 16229, 16182, 16253, 16112, 15966, 15879, 15923, 16183, 16180, 16074, 16109, 16144, 16215, 15931, 16223, - 16127, 16022, 16216, 16217, 15943, 16158, 16197, 15448, 16188, 16105, 15965, 16145, 15882, 15513, 16037, 16158, - 15897, 16159, 16214, 16172, 15812, 16248, 16249, 16224, 16111, 16130, 16093, 15998, 16193, 16147, 16074, 16151, - 16229, 16146, 16163, 15970, 16229, 15888, 16060, 15815, 16095, 16251, 16228, 16005, 16223, 16048, 16241, 16237, - 16155, 16180, 16152, 15618, 16200, 16105, 16169, 16232, 16006, 15884, 15529, 15978, 16194, 16225, 15953, 15892, - 15912, 16121, 15871, 16054, 16184, 16240, 15609, 16249, 16049, 16220, 15895, 16199, 16251, 16252, 16116, 16192}, - { - 16140, 16171, 16035, 16159, 16038, 16007, 16068, 15957, 15257, 16151, 15919, 15385, 16165, 16145, 15931, 16116, - 16162, 15893, 16183, 15959, 16170, 16068, 15645, 16075, 16202, 15909, 16173, 16216, 16055, 16215, 16096, 16096, - 16150, 16207, 16254, 16145, 16154, 15876, 15878, 16024, 16162, 16141, 16135, 16167, 16010, 16219, 16192, 16165, - 16228, 16253, 15524, 16117, 16148, 16180, 16139, 16033, 16183, 15720, 16245, 15912, 16145, 16001, 16188, 16216, - 16155, 16215, 16246, 15824, 16212, 16073, 16084, 16019, 16088, 16058, 16020, 16011, 16167, 16061, 16236, 16110, - 16246, 16151, 15912, 16243, 15970, 16008, 16225, 16206, 16031, 15956, 16141, 16128, 16105, 16229, 16157, 16002, - 16137, 16141, 15901, 16147, 16159, 16175, 15888, 16173, 16032, 16180, 16075, 16089, 15625, 15694, 16119, 16060, - 16137, 15962, 16101, 15961, 16186, 16167, 16147, 15964, 15815, 16221, 16245, 15821, 15958, 16063, 16248, 16178, - 16201, 16095, 16151, 16132, 15819, 15971, 16165, 16166, 15987, 16242, 16251, 16192, 16113, 16235, 15934, 16109, - 16036, 16254, 15978, 16228, 16186, 15570, 16052, 15967, 15964, 16092, 16239, 16187, 16005, 16246, 16182, 16182, - 16241, 16014, 16250, 15995, 15935, 16224, 15813, 15785, 16223, 16006, 16176, 16201, 16093, 15876, 16243, 16010, - 16106, 16127, 16106, 16114, 16224, 16087, 16231, 16124, 16003, 16204, 16022, 16105, 16179, 16155, 16192, 16148, - 16240, 15878, 16185, 16216, 16134, 16148, 16045, 16142, 16139, 16183, 16151, 16224, 16217, 16174, 16039, 15967, - 16212, 15804, 16204, 15714, 16076, 15943, 16098, 16183, 16246, 16220, 16105, 16120, 16115, 15807, 16158, 16178, - 16193, 16105, 16066, 16233, 16161, 16026, 16231, 16197, 16218, 15581, 15981, 16186, 16225, 16181, 16121, 16090, - 15448, 16017, 16162, 16214, 16115, 16093, 16199, 15885, 15972, 16133, 16159, 15853, 15761, 15994, 15998, 15887, - 16199, 16228, 16199, 16214, 16024, 16044, 16159, 16193, 16222, 15945, 16243, 16173, 16172, 16196, 15794, 15906, - 16147, 16222, 16243, 16174, 15858, 15812, 16208, 16165, 16045, 16074, 16250, 16060, 16100, 16184, 16248, 16082, - 15525, 15842, 16151, 16114, 16169, 16101, 16163, 16249, 16075, 16060, 16172, 16229, 16204, 16210, 16216, 16148, - 16224, 16172, 16213, 16168, 16146, 16205, 15454, 16250, 15890, 16111, 16225, 16245, 16034, 16163, 16108, 15814, - 16179, 16137, 16130, 15999, 16112, 16145, 15934, 16157, 15725, 15879, 15887, 16158, 15953, 16255, 16189, 16150, - 15914, 15719, 15923, 16080, 15927, 15892, 16215, 16103, 16206, 15869, 16094, 16183, 15624, 16138, 15912, 15897, - 16123, 16221, 16199, 16121, 16180, 15757, 15562, 16121, 16231, 16248, 16191, 16217, 16031, 16074, 16251, 16218, - 15871, 16222, 15976, 15856, 16221, 16225, 16109, 16178, 15753, 16054, 15641, 16174, 16172, 16206, 16178, 16144, - 16063, 16190, 16184, 15910, 16002, 16132, 16082, 16249, 16215, 16104, 16173, 16240, 16130, 15790, 16013, 
16145, - 16065, 15931, 16087, 16117, 15609, 16249, 16093, 16187, 16126, 16178, 16106, 16024, 15759, 16159, 16049, 16066, - 15989, 15236, 15955, 16128, 15938, 16150, 16147, 16220, 16211, 16196, 16163, 16199, 16151, 16077, 15390, 16220, - 15895, 15929, 15908, 16009, 16216, 16244, 16130, 16011, 16210, 16199, 16130, 15392, 16060, 16199, 16219, 15863, - 16228, 15905, 16251, 16201, 16203, 16082, 16174, 16142, 15973, 16061, 16232, 16252, 15792, 16029, 15884, 16004, - 16106, 16251, 15880, 16254, 16116, 15720, 16245, 16091, 15926, 15815, 15816, 15945, 16208, 16192, 16168, 15982, - 16210, 16063, 16243, 16079, 16199, 15790, 16241, 16250, 16148, 16188, 16250, 16136, 16156, 16084, 15517, 16194, - 15716, 16032, 16105, 15862, 16244, 15971, 16235, 16112, 16133, 16154, 16225, 15965, 16056, 15980, 16157, 15747, - 16026, 15832, 16102, 16247, 16145, 16023, 16164, 16069, 15966, 16183, 16084, 16189, 16177, 15882, 16118, 16074, - 16241, 16177, 16169, 16114, 15523, 16150, 15513, 15859, 16089, 16231, 16144, 16019, 16007, 15648, 16185, 16037, - 16176, 16158, 16174, 16145, 16020, 15934, 16098, 16168, 16158, 16034, 16155, 16102, 16221, 16068, 16198, 16016, - 16128, 15897, 16225, 16115, 16056, 16007, 16158, 16184, 16104, 16195, 16195, 16105, 16187, 15710, 16035, 15821, - 15695, 16173, 16191, 16056, 16169, 16149, 16045, 16231, 15745, 16221, 16137, 16191, 16241, 16232, 15674, 15948, - 16068, 15494, 16059, 16155, 15805, 16186, 16006, 16246, 15927, 16165, 16142, 16249, 16229, 16231, 16191, 15884, - 15958, 15511, 16150, 16237, 15496, 16182, 15966, 15919, 15529, 16021, 15919, 16038, 15383, 16219, 16253, 15786, - 16045, 15978, 16018, 16203, 16212, 16171, 15980, 16112, 16191, 16133, 16194, 15990, 16109, 16133, 16213, 15423, - 15966, 16141, 16122, 16225, 16173, 15973, 16161, 16200, 16220, 16107, 15907, 16055, 16225, 16101, 15879, 15632, - 16053, 16145, 16236, 15809, 16042, 16029, 16004, 16038, 16200, 16099, 16249, 15670, 16160, 16076, 16122, 16186, - 15706, 16092, 16054, 16047, 16183, 16162, 15782, 16147, 16217, 16243, 16142, 16072, 16245, 16204, 16075, 16026, - 16084, 16190, 16185, 16130, 15493, 16201, 16123, 16243, 16080, 16245, 15624, 15460, 15907, 16136, 16232, 15773, - 15744, 16198, 15922, 16029, 15419, 16137, 15933, 16082, 16112, 16239, 15981, 16165, 16245, 16136, 15891, 16216, - 16198, 16132, 16089, 16237, 16115, 15861, 16197, 16174, 16077, - }, - { - 16140, 16156, 16151, 15971, 16183, 16157, 16216, 16069, 16154, 16241, 16219, 16231, 16139, 16174, 16216, 16102, - 16088, 16056, 16250, 15972, 15716, 16228, 16154, 16243, 16102, 16174, 16189, 16100, 15523, 16101, 15648, 16216, - 16098, 16250, 16016, 16179, 16206, 15912, 16137, 16128, 16180, 16159, 16101, 15694, 15821, 16147, 15819, 16178, - 16235, 15987, 16052, 16254, 16182, 16239, 16035, 16195, 16231, 16191, 16068, 16191, 16165, 15805, 16150, 16231, - 16038, 15966, 16212, 15786, 16133, 16191, 16161, 16141, 16126, 16006, 15236, 16106, 16163, 16105, 16009, 16134, - 16060, 16174, 16082, 16098, 15884, 16178, 16091, 16218, 16210, 16017, 16159, 16156, 16165, 16095, 16068, 16231, - 16096, 16178, 16024, 15819, 16228, 15734, 15720, 16248, 16246, 16170, 16011, 16167, 16250, 15853, 15862, 16024, - 16056, 16196, 16023, 16208, 16118, 16082, 15859, 16075, 16176, 16172, 16034, 16225, 16225, 15999, 16112, 16008, - 16255, 16157, 16215, 16173, 15897, 16137, 16231, 16221, 16222, 16151, 15641, 16192, 15910, 16186, 16130, 16246, - 14440, 16187, 16223, 16149, 16031, 15674, 16012, 16246, 16089, 15958, 16204, 16021, 16226, 16018, 15934, 15990, - 16174, 16173, 16024, 16093, 15938, 16066, 16077, 
16211, 16130, 15929, 15863, 16130, 15973, 16201, 16251, 15792, - 15816, 15720, 16079, 16168, 16068, 16171, 16116, 15919, 16202, 15959, 16207, 16055, 16135, 15876, 16117, 16192, - 16145, 16033, 16073, 16155, 16236, 16058, 16084, 16148, 16235, 16032, 15747, 16225, 15966, 16247, 16177, 16177, - 16144, 16150, 16145, 16185, 16221, 16168, 16007, 16128, 16157, 16031, 15914, 16141, 15869, 16075, 16199, 15961, - 16217, 15958, 16221, 15971, 16206, 15934, 16082, 15967, 16145, 16241, 16243, 15821, 16105, 15745, 16175, 15494, - 16119, 16142, 15964, 16237, 16201, 15383, 16242, 16171, 15978, 16213, 16187, 16200, 16220, 16178, 16145, 15955, - 16249, 16199, 16047, 16216, 16245, 16199, 16201, 16174, 16232, 16004, 16082, 15926, 16198, 16063, 16214, 16038, - 15761, 16145, 16044, 15645, 15794, 16096, 16165, 16162, 15525, 16253, 16060, 16245, 16213, 15824, 16245, 16167, - 16130, 16136, 16133, 16244, 16234, 15980, 15808, 16164, 16235, 16074, 16147, 16089, 15786, 16158, 16237, 16155, - 16014, 16115, 15887, 16145, 16080, 16189, 15624, 16103, 15757, 16123, 16251, 16248, 16178, 15976, 16063, 16174, - 16104, 16002, 16087, 15790, 16225, 16251, 16002, 16016, 16032, 16180, 15962, 16150, 16245, 15929, 16132, 15746, - 16113, 16131, 15570, 16120, 16182, 16148, 16055, 15759, 16042, 16150, 16076, 15390, 15782, 16011, 16026, 16228, - 16080, 16061, 16198, 15880, 15981, 15945, 16237, 16199, 16199, 15957, 15887, 16162, 16222, 15909, 16222, 16254, - 16250, 16167, 16114, 16148, 16204, 16001, 16205, 16084, 16108, 16110, 16035, 15517, 15385, 16112, 16170, 16026, - 16215, 16183, 15878, 16169, 16165, 16019, 16183, 16020, 16215, 16068, 16020, 16158, 16184, 15725, 15695, 15719, - 16221, 16094, 16059, 16121, 16249, 16031, 15496, 16225, 16219, 16178, 15980, 16249, 15423, 16065, 15956, 16250, - 15901, 16201, 16089, 16224, 16186, 16155, 16063, 16045, 16165, 15967, 16109, 16246, 15964, 16105, 16014, 15981, - 15879, 16107, 16038, 16236, 15706, 15670, 16243, 16183, 16185, 16204, 15460, 16123, 15419, 15773, 16136, 16112, - 16197, 16132, - }, - { - 16140, 16159, 16159, 16250, 16068, 16250, 16151, 16214, 16165, 15716, 16116, 15862, 16183, 16172, 16068, 16154, - 16202, 16056, 16216, 15812, 16096, 16102, 16207, 16023, 16154, 16248, 16024, 16189, 16135, 16118, 16219, 16249, - 16228, 15523, 16117, 15859, 16139, 16224, 15720, 15648, 16145, 16176, 16216, 16111, 16246, 16098, 16073, 16034, - 16088, 16130, 16011, 16016, 16236, 16225, 16151, 16158, 15970, 16104, 16206, 16105, 16141, 15927, 16229, 16173, - 16137, 16169, 16147, 16138, 15888, 16137, 16180, 16232, 15625, 15562, 16060, 16155, 16101, 16006, 16167, 16218, - 15815, 16229, 15821, 15884, 16248, 15753, 16095, 16182, 15819, 15529, 16166, 16190, 16251, 16253, 16235, 15978, - 16036, 16173, 16228, 16112, 16052, 16194, 16092, 16117, 16005, 15966, 16182, 16225, 16250, 15759, 16224, 16107, - 16223, 16225, 16201, 16150, 16243, 16236, 16127, 16029, 16224, 15390, 16124, 15670, 16022, 16122, 16155, 16011, - 16240, 16183, 16216, 16147, 16045, 16228, 16183, 16204, 16217, 16084, 15967, 16061, 16204, 16123, 15943, 16245, - 16246, 15880, 16120, 15773, 16158, 15922, 16105, 15945, 16161, 16112, 16197, 16165, 15981, 16199, 16181, 16132, - 15448, 16115, 16104, 16140, 16133, 16247, 15970, 16172, 15717, 16222, 15487, 15879, 16148, 15900, 16039, 16133, - 16131, 16254, 16001, 16101, 15793, 16240, 16091, 16181, 16197, 16085, 15982, 16050, 16166, 16103, 15878, 16178, - 15986, 16192, 15755, 16181, 16064, 15878, 16157, 15561, 16139, 15761, 15429, 16065, 16214, 16179, 16000, 16156, - 16123, 16154, 15623, 15896, 15620, 16021, 
15852, 15820, 16214, 16084, 16199, 16130, 16133, 16035, 15761, 16235, - 15887, 16133, 16199, 15385, 16044, 15747, 16222, 16234, 16173, 16170, 15794, 15966, 16222, 15808, 15858, 16215, - 16165, 16177, 16250, 16235, 16184, 15878, 15525, 16144, 16114, 16147, 16163, 16165, 16060, 16145, 16204, 15786, - 16148, 16183, 16213, 16221, 16205, 16237, 15890, 16215, 16245, 16007, 16108, 16014, 16137, 16020, 16112, 16035, - 16157, 14440, 15887, 16243, 16255, 16231, 15914, 16223, 16080, 16105, 16215, 16068, 15869, 16031, 15624, 16175, - 15897, 16165, 16199, 16012, 15757, 16119, 16231, 16150, 16217, 16089, 16251, 15964, 16222, 16038, 16221, 16204, - 16178, 16201, 15641, 16212, 16206, 16226, 16063, 16242, 15910, 16133, 16082, 15934, 16104, 15978, 16130, 16161, - 16145, 16174, 16087, 16187, 16249, 15632, 16126, 16166, 16024, 15935, 16049, 16200, 15236, 16014, 15938, 15876, - 16220, 16092, 16163, 16138, 16077, 16231, 15895, 16142, 16009, 15875, 16130, 16148, 16199, 16130, 16060, 16038, - 15863, 16139, 16251, 15907, 16082, 16073, 15973, 15804, 16252, 16137, 15884, 15894, 16251, 16105, 16116, 15891, - 16091, 16244, 15816, 16233, 16192, 16174, 16210, 15907, 16079, 16225, 16251, 16067, 15910, 15896, 16032, 16011, - 16158, 16245, 16212, 16151, 16200, 16011, 15994, 16229, 16163, 16004, 16205, 16105, 16249, 15764, 16206, 16229, - 16019, 16248, 16063, 16247, 16123, 15964, 15760, 16077, 16253, 15518, 16163, 16080, 15991, 15568, 15675, 16140, - 16253, 16148, 16174, 15998, 16081, 16250, 16060, 16141, 16169, 16227, 16030, 16012, 15749, 16249, 16066, 16129, - 16241, 16007, 16188, 15257, 16156, 16093, 16194, 15931, 16105, 15893, 15971, 15998, 16133, 16075, 15965, 16173, - 16157, 16193, 15832, 16150, 16145, 16145, 16069, 16147, 16084, 16141, 15882, 16010, 16241, 16074, 16114, 15524, - 15513, 16180, 16231, 16151, 16007, 15912, 16037, 16188, 16174, 16229, 15934, 16212, 16158, 16019, 16102, 16146, - 16198, 16061, 15897, 16246, 16056, 16163, 16184, 16225, 16195, 15956, 15710, 15934, 15695, 16002, 16056, 15901, - 16045, 16150, 16221, 16032, 16241, 16089, 15948, 16206, 16059, 15962, 16186, 16186, 15927, 16221, 16249, 16245, - 16191, 16063, 15511, 16191, 15496, 16132, 15919, 16165, 15919, 15856, 16219, 16113, 16045, 16109, 16203, 16172, - 15980, 15570, 16133, 15964, 16109, 16132, 15423, 16182, 16122, 16014, 15973, 16013, 16220, 15785, 16055, 16176, - 15879, 16187, 16145, 16106, 16042, 16114, 16038, 15989, 16249, 16204, 16076, 16179, 15706, 16196, 16047, 16185, - 15782, 16148, 16243, 15908, 16245, 16224, 16026, 16039, 16185, 15392, 16201, 16076, 16080, 16183, 15460, 16203, - 16232, 15807, 16198, 16193, 15419, 16029, 16082, 16231, 15981, 15581, 16136, 16245, 16198, 16090, 16237, 16162, - 16197, 15982, 15995, 16181, 16091, 16042, 16212, 16040, 16162, 16030, 16018, 16040, 16057, 16100, 16170, 16014, - 16210, 16236, 16236, 16227, 16237, 16062, 16180, 16247, 16075, 15830, 16132, 16229, 16188, 16220, 15676, 16131, - 16218, 16134, 16084, 15916, 16189, 16050, 16089, 15577, 16100, 15687, 15982, 16130, 16126, 16192, 16056, 16230, - 16101, 16189, 16142, 16160, 16248, 16001, 16050, 16070, 16156, 15972, 16171, 15853, 16038, 16148, 16095, 16228, - 15919, 16024, 16145, 16032, 16231, 16243, 15959, 16196, 15645, 16225, 16178, 16174, 16055, 16208, 16096, 16247, - 15819, 16100, 15876, 16082, 16162, 16177, 15734, 16101, 16192, 16075, 16253, 16150, 16248, 16216, 16033, 16172, - 16245, 16185, 16170, 16250, 16155, 16225, 15824, 16168, 16167, 16179, 16058, 15999, 16167, 16128, 16223, 15879, - 15912, 15953, 16008, 16195, 16048, 15923, 16128, 15892, 16157, 
16191, 16241, 16183, 16159, 15912, 16173, 16191, - 16237, 16180, 15694, 16121, 16137, 15805, 16155, 16074, 16147, 15871, 16221, 16231, 16180, 16109, 16178, 16054, - 16151, 15966, 16152, 16144, 15987, 16184, 16192, 15786, 15618, 16215, 16254, 16240, 16186, 16191, 16200, 15931, - 16239, 15609, 16246, 16141, 16027, 16106, 15995, 16159, 15813, 15907, 15894, 16128, 16093, 16147, 16010, 15809, - 16226, 16151, 16087, 16220, 16003, 16160, 15778, 16244, 16192, 16210, 15878, 16162, 16000, 16219, 16142, 15905, - 16151, 16075, 15799, 16142, 16212, 16232, 15714, 16243, 16173, 16106, 16220, 16254, 16115, 15744, 16172, 15815, - 16066, 16208, 16026, 16239, 16207, 16243, 16186, 15790, 16121, 16089, 16239, 16174, 16023, 16163, 16228, 15607, - 16208, 16135, 15863, 16173, 15983, 16162, 16003, 16159, 16253, 16140, 16055, 16181, 16252, 16241, 15991, 16225, - 16176, 16025, 16067, 16181, 15999, 16011, 16069, 15361, 16248, 16228, 15977, 16139, 15727, 16107, 16178, 15953, - 15832, 15895, 16234, 16062, 16231, 16173, 16122, 16016, 16187, 15560, 16229, 16046, 16243, 16219, 15849, 16135, - }}; - -static std::vector weight_tensor_shape = { - {8, 8, 3, 3}, {10, 10, 3, 3}, {12, 8, 3, 3}, {8, 15, 3, 3}}; -static std::vector bias_tensor_shape = { - {1, 1, 1, 32}, {1, 1, 1, 60}, {12, 1, 1, 320}, {8, 1, 1, 48}}; -static std::vector shards = {8, 3, 5, 4}; - -template -static uint32_t compare_out_with_ref(const owned_buffer::Buffer& out_buf, T& ref) { - uint32_t diff = 0, j = 0; - for (uint32_t i = 0; i < out_buf.size(); i++) { - if (out_buf[i] == 0) { - continue; - } - if (out_buf[i] != ref[j]) { - log_info( - tt::LogTest, - "Error at i = {}, Golden = {}, Calculated = {}", - i, - out_buf[i].to_float(), - ref[j].to_float()); - diff++; - } - j++; - } - return diff; -} - -static void test_convert_conv_weight_tensor_to_tiled_layout_block_sharded() { - tt::log_info(tt::LogTest, "Running {}", __func__); - for (auto i = 0; i < weight_tensor_shape.size(); i++) { - auto input_tensor = ttnn::zeros(ttnn::Shape(weight_tensor_shape[i])); - auto input_buffer = owned_buffer::get_as(input_tensor); - for (auto j = 0; j < input_buffer.size(); j++) { - input_buffer[j] = ref_weight_in[i][j]; - } - auto output_tensor = ttnn::operations::conv::convert_conv_weight_tensor_to_tiled_layout_block_sharded( - input_tensor, shards[i], DataType::BFLOAT16); - auto out_buffer = owned_buffer::get_as(output_tensor); - - TT_FATAL(compare_out_with_ref(out_buffer, ref_weight_out[i]) == 0, "Error"); - } -} - -static void test_convert_conv_bias_tensor_to_tiled_layout_block_sharded() { - tt::log_info(tt::LogTest, "Running {}", __func__); - for (auto i = 0; i < bias_tensor_shape.size(); i++) { - auto input_tensor = - ttnn::random::random(Shape(bias_tensor_shape[i]), DataType::BFLOAT16).to_layout(Layout::ROW_MAJOR).cpu(); - auto input_buffer = owned_buffer::get_as(input_tensor); - auto output_tensor = ttnn::operations::conv::convert_conv_bias_tensor_to_tiled_layout_block_sharded( - input_tensor, shards[i], DataType::BFLOAT16); - auto out_buffer = owned_buffer::get_as(output_tensor); - /* Expected output should be same as input buffer except some padding*/ - TT_FATAL(compare_out_with_ref(out_buffer, input_buffer) == 0, "Error"); - } -} - -int main() { - tt::log_info(tt::LogTest, "Tests for Tensor utils starts"); - test_convert_conv_weight_tensor_to_tiled_layout_block_sharded(); - test_convert_conv_bias_tensor_to_tiled_layout_block_sharded(); - tt::log_info(tt::LogTest, "Tests for Tensor utils ends"); - return 0; -} diff --git 
a/tests/tt_eager/python_api_testing/conv/conv_op_trace_config.py b/tests/tt_eager/python_api_testing/conv/conv_op_trace_config.py deleted file mode 100644 index 30881b2bd64..00000000000 --- a/tests/tt_eager/python_api_testing/conv/conv_op_trace_config.py +++ /dev/null @@ -1,143 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import torch -import numpy -from loguru import logger -from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_allclose_and_pcc - - -def trace_conv_to_generate_data_top_left_indices_and_pad_metadata(conv_params, input_nchw_shape): - assert len(conv_params) == 10 - output_channels, input_channels, filter_h, filter_w, stride_h, stride_w, pad_h, pad_w, dilation, groups = [ - conv_params[i] for i in range(10) - ] - assert dilation == 1 and groups == 1 - assert len(input_nchw_shape) == 4 - input_n, input_c, input_h, input_w = [input_nchw_shape[i] for i in range(4)] - # image 1 data - # 1 2 3 4 5 6 7 8 - # 9 10 11 12 13 14 15 16 - # 17 18 19 20 21 22 23 24 - # 25 26 27 28 29 30 31 32 - # image 2 data - # 33 34 35 36 37 38 39 40 - # 41 42 43 44 45 46 47 48 - # 49 50 51 52 53 54 55 56 - # 57 58 59 60 61 62 63 64 - - # Concatenated image data from above - # Inserted padding above and between and on the sides of the images (pad = 1) - # 0 0 0 0 0 0 0 0 0 0 - # 0 1 2 3 4 5 6 7 8 0 - # 0 9 10 11 12 13 14 15 16 0 - # 0 17 18 19 20 21 22 23 24 0 - # 0 25 26 27 28 29 30 31 32 0 - # 0 0 0 0 0 0 0 0 0 0 - # 0 0 0 0 0 0 0 0 0 0 - # 0 33 34 35 36 37 38 39 40 0 - # 0 41 42 43 44 45 46 47 48 0 - # 0 49 50 51 52 53 54 55 56 0 - # 0 57 58 59 60 61 62 63 64 0 - # 0 0 0 0 0 0 0 0 0 0 - - # We encode above shown padded tensor into pad_metadata (list of boolean - true if padding location) - # pad_meta_data: [true, true, ..., false, ...] 
- - padded_input_h = input_h + (2 * pad_h) - padded_input_w = input_w + (2 * pad_w) - pad_metadata = [] - for n in range(input_n): - for h in range(padded_input_h): - for w in range(padded_input_w): - if h < pad_h or h >= (input_h + pad_h) or w < pad_w or w >= (input_w + pad_w): - pad_metadata.append(True) - else: - pad_metadata.append(False) - - # TODO: add support for dilation > 1 - output_h = ((int)(padded_input_h - filter_h / stride_h)) + 1 - output_w = ((int)(padded_input_w - filter_w / stride_w)) + 1 - # generate a list of input indices corresponding to the top left position of sliding window - # the index refers to the location in the padded tensor - data_top_left_indices = [] - for n in range(input_n): - for oh in range(output_h): - for ow in range(output_w): - ih = oh * stride_h - iw = ow * stride_w - channel_idx = (n * padded_input_h * padded_input_w) + (ih * padded_input_w) + iw - data_top_left_indices.append(channel_idx) - - return pad_metadata, data_top_left_indices - - -def traced_conv_reference(pad_metadata, data_top_left_indices, conv_params, input_nchw_shape): - assert len(conv_params) == 10 - output_channels, input_channels, filter_h, filter_w, stride_h, stride_w, pad_h, pad_w, dilation, groups = [ - conv_params[i] for i in range(10) - ] - # unpadded tensor - input_tensor = [] - assert len(input_nchw_shape) == 4 - input_n, input_c, input_h, input_w = input_nchw_shape - assert input_c == 1 # Ref done for channel size = 1 - input_volume = numpy.prod(input_nchw_shape) - - # Initialize tensor with data - # Inserting sequential integer data - for val in range(1, input_volume + 1): - input_tensor.append(val) - input_pyt_tensor = torch.tensor(input_tensor) - input_pyt_tensor = torch.reshape(input_pyt_tensor, input_nchw_shape) - - # Construct the padded tensor using pad_metadata - input_padded_tensor = [] - input_padded_width = input_w + (2 * pad_w) - input_padded_height = input_h + (2 * pad_h) - input_padded_volume = input_n * input_padded_height * input_padded_width - input_tensor_idx = 0 - assert len(pad_metadata) == input_padded_volume - for i in range(input_padded_volume): - if pad_metadata[i]: - input_padded_tensor.append(0) - else: - input_padded_tensor.append(input_tensor[input_tensor_idx]) - input_tensor_idx += 1 - - assert len(input_padded_tensor) == input_padded_volume - input_padded_pyt_tensor = torch.tensor(input_padded_tensor).reshape( - [1, input_n * input_padded_height, input_padded_width] - ) - filter_volume = filter_h * filter_w - # Initializing filters with all 1s - filter_pyt_tensor = torch.full((1, 1, filter_h, filter_w), 1) - - output_tensor = [] - # run conv over padded tensor using data_top_left_indices - for i in data_top_left_indices: - i_bh = (int)(i / input_padded_width) - i_w = (int)(i % input_padded_width) - output_tensor.append( - torch.dot( - input_padded_pyt_tensor[:, i_bh : i_bh + filter_h, i_w : i_w + filter_w].reshape(-1), - filter_pyt_tensor.reshape(-1), - ) - ) - - output_pyt_tensor = torch.tensor(output_tensor) - # run conv pytorch - out_golden_pyt_tensor = torch.nn.functional.conv2d( - input_pyt_tensor, filter_pyt_tensor, stride=(stride_h, stride_w), padding=(pad_h, pad_w) - ) - assert numpy.prod(output_pyt_tensor.size()) == numpy.prod(out_golden_pyt_tensor.size()) - output_pyt_tensor = torch.reshape(output_pyt_tensor, out_golden_pyt_tensor.size()) - - # compare to pytorch - passing_pcc, output_pcc = comp_equal(out_golden_pyt_tensor, output_pyt_tensor) - logger.debug(f"Passing={passing_pcc}") - logger.debug(f"Output pcc={output_pcc}") - 
assert passing_pcc - - return diff --git a/tests/tt_eager/python_api_testing/conv/conv_unit_test_utils.py b/tests/tt_eager/python_api_testing/conv/conv_unit_test_utils.py deleted file mode 100644 index 714b260b9d8..00000000000 --- a/tests/tt_eager/python_api_testing/conv/conv_unit_test_utils.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import torch -import ttnn -from tt_lib.utils import _nearest_32, _nearest_y - - -def create_conv_act_tensor_special(torch_tensor, N, C, H, W, pad_h=0, pad_w=0, extra_pad_w_right=0): - # Convert NCHW to NHWC shape - torch_tensor = torch.permute(torch_tensor, (0, 2, 3, 1)) - # Padded input shape - act_shape_height_width_channel_padded = [N, H + (2 * pad_h), W + (2 * pad_w) + extra_pad_w_right, _nearest_y(C, 4)] - tt_tensor = ttnn.Tensor(torch_tensor, ttnn.bfloat16) - h_start = pad_h if pad_h > 0 else 0 - w_start = pad_w if pad_w > 0 else 0 - tt_tensor = tt_tensor.pad(act_shape_height_width_channel_padded, (0, h_start, w_start, 0), 0.0) - return tt_tensor - - -def create_conv_act_tensor(torch_tensor, N, C, H, W, pad_h=0, pad_w=0, extra_pad_w_right=0): - # Convert NCHW to NHWC shape - torch_tensor = torch.permute(torch_tensor, (0, 2, 3, 1)) - # Padded input shape - act_shape_height_width_channel_padded = [N, H + (2 * pad_h), W + (2 * pad_w) + extra_pad_w_right, _nearest_y(C, 16)] - tt_tensor = ttnn.Tensor(torch_tensor, ttnn.bfloat16) - h_start = pad_h if pad_h > 0 else 0 - w_start = pad_w if pad_w > 0 else 0 - tt_tensor = tt_tensor.pad(act_shape_height_width_channel_padded, (0, h_start, w_start, 0), 0.0) - return tt_tensor - - -def create_conv_bias_tensor(torch_tensor, N, K, padded_K, pad=0): - # Padded input shape - bias_shape = [N, 1, 1, K] - bias_padded_shape = [N, 1, 1, padded_K] - # bias_shape_padded = [N, 1, 1, _nearest_y(C, 16)] - tt_tensor = ttnn.Tensor(torch.flatten(torch_tensor).tolist(), bias_shape, ttnn.bfloat16, ttnn.ROW_MAJOR_LAYOUT).pad( - bias_padded_shape, (0, 0, 0, 0), 0.0 - ) - tt_tensor = tt_tensor.pad_to_tile(pad).to(ttnn.TILE_LAYOUT) - print(f"tt_tensor shape: {tt_tensor.padded_shape}") - return tt_tensor - - -def create_conv_weight_tensor(torch_tensor, K, C, R, S, in1_block_h, in1_block_w): - weights_shape = [K, C, R, S] - weights_channels_padded_shape = [_nearest_32(K), _nearest_y(C, 16), R, S] - B_ = ttnn.Tensor(torch.flatten(torch_tensor).tolist(), weights_shape, ttnn.bfloat16, ttnn.ROW_MAJOR_LAYOUT).pad( - weights_channels_padded_shape, (0, 0, 0, 0), 0.0 - ) - B_tiled_host = ttnn.operations.conv2d.convert_conv_weight_tensor_to_tiled_layout(B_, in1_block_h, in1_block_w) - return B_tiled_host - - -def create_conv_weight_tensor_special_special(torch_tensor, K, C, R, S, in1_block_h, in1_block_w, padded_S=0): - if padded_S == 0: - padded_S = S - else: - assert padded_S > S - weights_shape = [K, C, R, S] - weights_channels_padded_shape = [_nearest_32(K), _nearest_y(C, 4), R, padded_S] - B_ = ttnn.Tensor(torch.flatten(torch_tensor).tolist(), weights_shape, ttnn.bfloat16, ttnn.ROW_MAJOR_LAYOUT).pad( - weights_channels_padded_shape, (0, 0, 0, 0), 0.0 - ) - B_tiled_host = ttnn.operations.conv2d.convert_conv_weight_tensor_to_special_padding_tiled_layout( - B_, in1_block_h, in1_block_w - ) - return B_tiled_host - - -def create_conv_weight_tensor_special_padding(torch_tensor, K, C, R, S, in1_block_h, in1_block_w, padded_S=0): - if padded_S == 0: - padded_S = S - else: - assert padded_S > S - weights_shape = [K, C, R, S] - weights_channels_padded_shape = 
[_nearest_32(K), _nearest_y(C, 16), R, padded_S] - B_ = ttnn.Tensor(torch.flatten(torch_tensor).tolist(), weights_shape, ttnn.bfloat16, ttnn.ROW_MAJOR_LAYOUT).pad( - weights_channels_padded_shape, (0, 0, 0, 0), 0.0 - ) - B_tiled_host = ttnn.operations.conv2d.convert_conv_weight_tensor_to_special_padding_tiled_layout( - B_, in1_block_h, in1_block_w - ) - return B_tiled_host diff --git a/tests/tt_eager/python_api_testing/conv/conv_utils.py b/tests/tt_eager/python_api_testing/conv/conv_utils.py deleted file mode 100644 index 1f779788575..00000000000 --- a/tests/tt_eager/python_api_testing/conv/conv_utils.py +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import torch -import ttnn -from tt_lib.utils import _nearest_32 - - -def create_conv_act_tensor(torch_tensor, N, C, H, W): - torch_tensor = torch.permute(torch_tensor, (0, 2, 3, 1)) - act_shape_channel_padded = [N, H, W, _nearest_32(C)] - tt_tensor = ttnn.Tensor(torch_tensor, ttnn.bfloat16) - tt_tensor = tt_tensor.pad(act_shape_channel_padded, (0, 0, 0, 0), 0.0) - return tt_tensor - - -def create_conv_weight_tensor(torch_tensor, K, C, R, S, in1_block_h, in1_block_w): - weights_shape = [K, C, R, S] - weights_channels_padded_shape = [_nearest_32(K), _nearest_32(C), R, S] - B_ = ttnn.Tensor(torch.flatten(torch_tensor).tolist(), weights_shape, ttnn.bfloat16, ttnn.ROW_MAJOR_LAYOUT).pad( - weights_channels_padded_shape, (0, 0, 0, 0), 0.0 - ) - B_tiled_host = ttnn.operations.conv2d.convert_conv_weight_tensor_to_tiled_layout(B_, in1_block_h, in1_block_w) - return B_tiled_host diff --git a/tests/tt_eager/python_api_testing/conv/generate_mm_tb_using_conv_tb.py b/tests/tt_eager/python_api_testing/conv/generate_mm_tb_using_conv_tb.py deleted file mode 100644 index f8b27c9a035..00000000000 --- a/tests/tt_eager/python_api_testing/conv/generate_mm_tb_using_conv_tb.py +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
- -# SPDX-License-Identifier: Apache-2.0 - -import os -import yaml -from tests.tt_eager.python_api_testing.conv.pytorch_conv_tb import ConvTestParameters, generate_conv_tb -from tt_lib.utils import _nearest_32 - - -def generate_mm_tb_using_conv_tb(): - print("Sweeping over convolution sizes and parameters in conv_tb.yaml.") - print("Generating MM test bench with conv sweep parameters.") - mm_tb_list = [] - conv_test_bench = generate_conv_tb() - - for ctp_ in conv_test_bench: - ctp = ctp_.conv_params - conv_out_h = ((int)((ctp.act_shape[2] - ctp.weight_shape[2] + 2 * ctp.pad_h) / ctp.stride_h)) + 1 - conv_out_w = ((int)((ctp.act_shape[3] - ctp.weight_shape[3] + 2 * ctp.pad_w) / ctp.stride_w)) + 1 - M = conv_out_h * conv_out_w - K = ctp.weight_shape[1] * ctp.weight_shape[2] * ctp.weight_shape[3] - N = ctp.weight_shape[0] - # pad M, K, N to nearest multiple of 32 - mm_test_params = [_nearest_32(M), _nearest_32(K), _nearest_32(N)] - if mm_test_params not in mm_tb_list: - mm_tb_list.append(mm_test_params) - - mm_tb_yaml_dict = [{"MM test params [M,K,N]": mm_tb_list}] - # Dump test bench to yaml file for viewing - with open( - os.path.join(os.environ["TT_METAL_HOME"], "tests/python_api_testing/conv/generated_mm_tb.yaml"), "w" - ) as file: - mm_yaml = yaml.dump(mm_tb_yaml_dict, file) - print("Total number of MM tests generated - " + str(len(mm_tb_list))) - return mm_tb_list diff --git a/tests/tt_eager/python_api_testing/conv/generated_mm_tb.yaml b/tests/tt_eager/python_api_testing/conv/generated_mm_tb.yaml deleted file mode 100644 index 9ce6b201cb1..00000000000 --- a/tests/tt_eager/python_api_testing/conv/generated_mm_tb.yaml +++ /dev/null @@ -1,316 +0,0 @@ -- MM test params [M,K,N]: - - - 32 - - 32 - - 32 - - - 64 - - 32 - - 32 - - - 128 - - 32 - - 32 - - - 32 - - 288 - - 32 - - - 96 - - 288 - - 32 - - - 32 - - 800 - - 32 - - - 64 - - 800 - - 32 - - - 32 - - 1568 - - 32 - - - 32 - - 32 - - 64 - - - 64 - - 32 - - 64 - - - 128 - - 32 - - 64 - - - 32 - - 288 - - 64 - - - 96 - - 288 - - 64 - - - 32 - - 800 - - 64 - - - 64 - - 800 - - 64 - - - 32 - - 1568 - - 64 - - - 32 - - 32 - - 128 - - - 64 - - 32 - - 128 - - - 128 - - 32 - - 128 - - - 32 - - 288 - - 128 - - - 96 - - 288 - - 128 - - - 32 - - 64 - - 32 - - - 64 - - 64 - - 32 - - - 128 - - 64 - - 32 - - - 32 - - 576 - - 32 - - - 96 - - 576 - - 32 - # - - 32 - # - 1600 - # - 32 - - - 64 - - 1600 - - 32 - - - 32 - - 3136 - - 32 - - - 32 - - 64 - - 64 - - - 64 - - 64 - - 64 - - - 128 - - 64 - - 64 - # - - 32 - # - 576 - # - 64 - - - 96 - - 576 - - 64 - - - 32 - - 1600 - - 64 - - - 64 - - 1600 - - 64 - - - 32 - - 3136 - - 64 - - - 32 - - 64 - - 128 - - - 64 - - 64 - - 128 - - - 128 - - 64 - - 128 - - - 32 - - 576 - - 128 - - - 96 - - 576 - - 128 - - - 160 - - 64 - - 32 - # - - 64 - # - 576 - # - 32 - - - 128 - - 576 - - 32 - - - 64 - - 3136 - - 32 - - - 160 - - 64 - - 64 - # - - 64 - # - 576 - # - 64 - - - 128 - - 576 - - 64 - - - 64 - - 3136 - - 64 - - - 160 - - 64 - - 128 - - - 64 - - 576 - - 128 - - - 128 - - 576 - - 128 - - - 96 - - 64 - - 32 - - - 192 - - 64 - - 32 - - - 96 - - 1600 - - 32 - - - 96 - - 64 - - 64 - - - 192 - - 64 - - 64 - - - 96 - - 1600 - - 64 - - - 96 - - 64 - - 128 - - - 192 - - 64 - - 128 - - - 224 - - 64 - - 32 - - - 160 - - 576 - - 32 - - - 128 - - 1600 - - 32 - - - 224 - - 64 - - 64 - - - 160 - - 576 - - 64 - - - 128 - - 1600 - - 64 - - - 224 - - 64 - - 128 - - - 160 - - 576 - - 128 - - - 256 - - 64 - - 32 - - - 192 - - 576 - - 32 - - - 96 - - 3136 - - 32 - - - 256 - - 64 - - 64 - - - 192 - - 576 - - 64 - - - 96 - - 3136 - - 
64 - - - 256 - - 64 - - 128 - - - 192 - - 576 - - 128 - - - 160 - - 32 - - 32 - - - 256 - - 32 - - 32 - - - 64 - - 288 - - 32 - - - 128 - - 288 - - 32 - - - 224 - - 288 - - 32 - - - 160 - - 800 - - 32 - - - 64 - - 1568 - - 32 - - - 128 - - 1568 - - 32 - - - 160 - - 32 - - 64 - - - 256 - - 32 - - 64 - - - 64 - - 288 - - 64 - - - 128 - - 288 - - 64 - - - 224 - - 288 - - 64 - - - 160 - - 800 - - 64 - - - 64 - - 1568 - - 64 - - - 128 - - 1568 - - 64 - - - 160 - - 32 - - 128 - - - 256 - - 32 - - 128 - - - 64 - - 288 - - 128 - - - 128 - - 288 - - 128 - - - 224 - - 288 - - 128 - - - 224 - - 576 - - 32 - - - 160 - - 1600 - - 32 - - - 128 - - 3136 - - 32 - - - 224 - - 576 - - 64 - - - 160 - - 1600 - - 64 - - - 128 - - 3136 - - 64 - - - 224 - - 576 - - 128 diff --git a/tests/tt_eager/python_api_testing/conv/pytorch_conv_tb.py b/tests/tt_eager/python_api_testing/conv/pytorch_conv_tb.py deleted file mode 100644 index d4b444826eb..00000000000 --- a/tests/tt_eager/python_api_testing/conv/pytorch_conv_tb.py +++ /dev/null @@ -1,138 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import os -import numpy as np -import torch -import yaml - -from enum import Enum - - -class TestLevel(Enum): - INPUT_TENSOR_CREATE = 1 - OP_OUTPUT_TENSOR_CREATE = 2 - OP_PROGRAM_CREATE = 3 - OP_FULL_COMPUTE = 4 - - -# Moved from conv_sweep_params.yaml -# Remove when the issue is fixed https://github.com/tenstorrent/tt-metal/issues/11257 - -CONV_TB = { - # activation - [[N,C,H,W]] - "activation_shapes": [ - [1, 32, 5, 5], - [1, 64, 5, 5], - [1, 64, 6, 6], - [1, 64, 7, 7], - [1, 64, 8, 8], - [1, 64, 9, 9], - [1, 32, 10, 10], - [1, 64, 10, 10], - ], - # kernel sizes - [[K,R,S]] - "kernel_sizes": [ - [32, 1, 1], - [32, 3, 3], - [32, 5, 5], - [32, 7, 7], - [64, 1, 1], - [64, 3, 3], - [64, 5, 5], - [64, 7, 7], - [128, 1, 1], - [128, 3, 3], - ], - # stride = [stride_h, stride_w] - "strides": [[1, 1], [2, 2]], - # padding = [[pad_h, pad_w]] - "paddings": [[0, 0], [1, 1], [3, 3]], -} - - -class ConvOpTestParameters: - def __init__(self, conv_params, test_level): - self.conv_params = conv_params - self.test_level = test_level - - def to_string(self): - cp = self.conv_params - line = "Act_shape=" + str(cp.act_shape) + ", Weight_shape=" + str(cp.weight_shape) - line += ", Stride_h=" + str(cp.stride_h) + ", Stride_w=" + str(cp.stride_w) - line += ", Pad_h=" + str(cp.pad_h) + ", Pad_w=" + str(cp.pad_w) - line += ", TestLevel=" + str(TestLevel(self.test_level).name) - return line - - def print(self, d): - print(d + self.to_string()) - - -class ConvTestParameters: - def __init__(self, activation_shape, weight_shape, stride_h, stride_w, pad_h, pad_w): - assert len(activation_shape) == 4 - assert len(weight_shape) == 4 - self.act_shape = activation_shape - self.weight_shape = weight_shape - self.stride_h = stride_h - self.stride_w = stride_w - self.pad_h = pad_h - self.pad_w = pad_w - - -def generate_pytorch_golden(conv_test_params): - ctp = conv_test_params - A = torch.randn(ctp.act_shape, dtype=torch.bfloat16).float() - B = torch.randn(ctp.weight_shape, dtype=torch.bfloat16).float() - C = torch.nn.functional.conv2d(A, B, stride=(ctp.stride_h, ctp.stride_w), padding=(ctp.pad_h, ctp.pad_w)) - return (A, B, C) - - -def generate_conv_tb(): - # sweep over activation sizes, kernel sizes, stride, padding specified in test bench yaml - conv_op_test_bench = [] - for act_shape in CONV_TB["activation_shapes"]: - for kernel_size in CONV_TB["kernel_sizes"]: - for stride in CONV_TB["strides"]: - for pad 
in CONV_TB["paddings"]: - H = act_shape[2] - W = act_shape[3] - R = kernel_size[1] - S = kernel_size[2] - # check if its a valid test - if (H - R + 2 * pad[0]) < 1 or (W - S + 2 * pad[1]) < 1: - # invalid parameters - continue - # weight shape - [K,C,R,S] - weight_shape = [kernel_size[0], act_shape[1], kernel_size[1], kernel_size[2]] - conv_test_params = ConvTestParameters(act_shape, weight_shape, stride[0], stride[1], pad[0], pad[1]) - op_full_compute = (R == S) and (pad[0] == pad[1]) and (H == W) - # if(H >= 5 and act_shape[1] == 64): - # op_full_compute = False - if op_full_compute: - conv_op_test_params = ConvOpTestParameters(conv_test_params, TestLevel.OP_FULL_COMPUTE) - else: - conv_op_test_params = ConvOpTestParameters(conv_test_params, TestLevel.INPUT_TENSOR_CREATE) - - conv_op_test_bench.append(conv_op_test_params) - - # Dump test bench to yaml file for viewing - - # with open(os.path.join(os.environ['TT_METAL_HOME'], 'tests/python_api_testing/conv/generated_conv_tb.yaml'), 'w') as file: - # mm_yaml = yaml.dump(mm_tb_yaml_dict, file) - # print("Total number of MM tests generated - " + str(len(mm_tb_list))) - return conv_op_test_bench - - -def generate_conv_tb_with_pytorch_golden(conv_test_bench): - test_bench_with_pytorch_golden = {} - # Generate pytorch golden result for each test in testbench - for conv_op_test_params in conv_test_bench: - conv_test_params = conv_op_test_params.conv_params - # print("Test with following parameters - ") - # conv_op_test_params.print(" ") - # generate_pytorch_golden returns input, weight and golden output tensors - pytorch_golden_test = generate_pytorch_golden(conv_test_params) - test_bench_with_pytorch_golden[conv_op_test_params] = pytorch_golden_test - return test_bench_with_pytorch_golden diff --git a/tests/tt_eager/python_api_testing/sweep_tests/generation_funcs.py b/tests/tt_eager/python_api_testing/sweep_tests/generation_funcs.py index 9390fee7df8..3d10704b4bb 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/generation_funcs.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/generation_funcs.py @@ -982,20 +982,6 @@ def gen_scalar_args( yield input_info -def gen_conv2d_args( - input_shapes, - dtypes, - layouts, - mem_configs, - do_sanitize_args=True, - coregrid=[], -): - for input_info in gen_conv_scalar_args( - input_shapes, dtypes, layouts, mem_configs, "conv_params", torch.int, do_sanitize_args=do_sanitize_args - ): - yield input_info - - def gen_conv_scalar_args( input_shapes, supported_dtypes, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/test_sweep_conv_with_address_map.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/test_sweep_conv_with_address_map.py deleted file mode 100644 index a21d0413e66..00000000000 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/test_sweep_conv_with_address_map.py +++ /dev/null @@ -1,192 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
- -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from loguru import logger -import ttnn -import numpy as np -from tt_lib.utils import _nearest_32, _nearest_y -from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc -from tests.tt_eager.python_api_testing.conv.pytorch_conv_tb import ( - TestLevel, - generate_conv_tb_with_pytorch_golden, - generate_conv_tb, -) -from tests.tt_eager.python_api_testing.conv.conv_utils import ( - create_conv_act_tensor, - create_conv_weight_tensor, -) - -import torch -from time import sleep - - -def run_conv_as_large_matmul(conv_op_test_params, pytorch_inputs_and_golden, device): - print("Testing convolution with following parameters - ") - conv_op_test_params.print(" ") - ctp = conv_op_test_params.conv_params - N = ctp.act_shape[0] - C = ctp.act_shape[1] - H = ctp.act_shape[2] - W = ctp.act_shape[3] - K = ctp.weight_shape[0] - assert ctp.weight_shape[1] == C - R = ctp.weight_shape[2] - S = ctp.weight_shape[3] - stride_h = ctp.stride_h - stride_w = ctp.stride_w - pad_h = ctp.pad_h - pad_w = ctp.pad_w - - # torch.manual_seed(0) - - A_pyt = pytorch_inputs_and_golden[0] - B_pyt = pytorch_inputs_and_golden[1] - - # Parameters to define block dims - act_block_h = 4 - act_block_w = 4 - weight_block_h = act_block_w - weight_block_w = 4 - out_subblock_h = 4 - out_subblock_w = 2 - - OH = ((int)((H - R + 2 * pad_h) / stride_h)) + 1 - OW = ((int)((W - S + 2 * pad_w) / stride_w)) + 1 - conv_output_shape = [1, OH, OW, K] - - # Prepare activations - A_cl_host = create_conv_act_tensor(A_pyt, 1, C, H, W) - A = A_cl_host.to(device, ttnn.MemoryConfig(ttnn.TensorMemoryLayout.SINGLE_BANK)) - - # Prepare weights - B_tiled_host = create_conv_weight_tensor(B_pyt, K, C, R, S, weight_block_h, weight_block_w) - B_tiled = B_tiled_host.to(device, ttnn.MemoryConfig(ttnn.TensorMemoryLayout.SINGLE_BANK)) - - if conv_op_test_params.test_level == TestLevel.INPUT_TENSOR_CREATE: - print("Ran test till tensor creation only. 
Did not run full op compute.") - return True - - assert conv_op_test_params.test_level == TestLevel.OP_FULL_COMPUTE - - # Run TT metal OP - out = ttnn.experimental.tensor.conv_with_address_map( - A, - B_tiled, - None, - [R, S, stride_h, stride_w, pad_h, pad_w], - act_block_h, - act_block_w, - weight_block_w, - out_subblock_h, - out_subblock_w, - K, - ) - out = out.cpu() - assert out.padded_shape == conv_output_shape - assert out.get_layout() == ttnn.ROW_MAJOR_LAYOUT - - # Copy output to host and convert tt tensor to pytorch tensor - out_result = torch.tensor(out.to_torch()) - out_result = torch.transpose(out_result, 2, 3) - out_result = torch.transpose(out_result, 1, 2) - - # Compare against pytorch golden result - out_golden = pytorch_inputs_and_golden[2] - assert out_result.shape == out_golden.shape - passing_pcc, output_pcc = comp_pcc(out_golden, out_result, 0.99) - logger.debug(f"Passing={passing_pcc}") - logger.debug(f"Output pcc={output_pcc}") - return passing_pcc - - -@pytest.mark.skip(reason="Test is not ready to run") -def test_sweep_conv_tt(device): - test_bench = generate_conv_tb() - pytorch_conv_golden_tb = generate_conv_tb_with_pytorch_golden(test_bench) - passing = True - full_op_compute_passing_tests = [] - input_tensor_only_passing_tests = [] - input_tensor_only_failing_tests = [] - input_tensor_only_failing_tests_exception = [] - full_op_compute_failing_tests = [] - full_op_compute_failing_tests_with_exception = [] - input_tensor_only_tests = 0 - full_op_compute_tests = 0 - for ( - conv_op_test_params, - pytorch_inputs_and_golden, - ) in pytorch_conv_golden_tb.items(): - passing_tests = full_op_compute_passing_tests - failing_tests = full_op_compute_failing_tests - failing_tests_with_exception = full_op_compute_failing_tests_with_exception - if conv_op_test_params.test_level == TestLevel.INPUT_TENSOR_CREATE: - passing_tests = input_tensor_only_passing_tests - failing_tests = input_tensor_only_failing_tests - failing_tests_with_exception = input_tensor_only_failing_tests_exception - input_tensor_only_tests += 1 - else: - assert conv_op_test_params.test_level == TestLevel.OP_FULL_COMPUTE - full_op_compute_tests += 1 - try: - passing_ = run_conv_as_large_matmul(conv_op_test_params, pytorch_inputs_and_golden, device) - if passing_: - passing_tests.append(conv_op_test_params) - else: - failing_tests.append(conv_op_test_params) - print("Failed test - ") - conv_op_test_params.print(" ") - except Exception as e: - print("Exception error: " + str(e)) - failing_tests_with_exception.append(conv_op_test_params) - passing_ = False - passing &= passing_ - print("Following tests that create only input tensors passed - ") - for conv_op_test_params in input_tensor_only_passing_tests: - conv_op_test_params.print(" ") - print("Following tests that create only input tensors failed with exception/error - ") - for conv_op_test_params in input_tensor_only_failing_tests_exception: - conv_op_test_params.print(" ") - print("Following tests that ran full op compute passed - ") - for conv_op_test_params in full_op_compute_passing_tests: - conv_op_test_params.print(" ") - print("Following tests that ran full op compute failed with incorrect mismatch - ") - for conv_op_test_params in full_op_compute_failing_tests: - conv_op_test_params.print(" ") - print("Following tests that ran full op compute failed with exception/error - ") - for conv_op_test_params in full_op_compute_failing_tests_with_exception: - conv_op_test_params.print(" ") - - print( - str(len(input_tensor_only_passing_tests)) - + " out of " 
- + str(input_tensor_only_tests) - + ' "INPUT TENSORS CREATION" tests PASSED.' - ) - print( - str(len(input_tensor_only_failing_tests_exception)) - + " out of " - + str(input_tensor_only_tests) - + ' "INPUT TENSORS CREATION" tests FAILED with exception.' - ) - - print( - str(len(full_op_compute_passing_tests)) - + " out of " - + str(full_op_compute_tests) - + ' "FULL OP COMPUTE" tests PASSED.' - ) - print( - str(len(full_op_compute_failing_tests)) - + " out of " - + str(full_op_compute_tests) - + ' "FULL OP COMPUTE" tests FAILED due to mismatch with golden output.' - ) - print( - str(len(full_op_compute_failing_tests_with_exception)) - + " out of " - + str(full_op_compute_tests) - + ' "FULL OP COMPUTE" tests FAILED with exception/error.' - ) - assert passing diff --git a/tests/tt_eager/python_api_testing/unit_testing/fallback_ops/test_conv2d_op.py b/tests/tt_eager/python_api_testing/unit_testing/fallback_ops/test_conv2d_op.py deleted file mode 100644 index 9e1eb0f64ab..00000000000 --- a/tests/tt_eager/python_api_testing/unit_testing/fallback_ops/test_conv2d_op.py +++ /dev/null @@ -1,284 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import torch -import ttnn -import tt_lib.fallback_ops as fallback_ops - -from models.utility_functions import ( - comp_allclose_and_pcc, - comp_pcc, -) -from loguru import logger -import pytest - - -@pytest.mark.parametrize( - "input_shape, weight_shape, bias_shape, stride, padding, dilation, groups, on_device", - ( - ( - torch.Size([1, 3, 6, 4]), - torch.Size([3, 3, 6, 4]), - torch.Size([1, 1, 1, 3]), - 1, - 0, - 1, - 1, - False, - ), - ( - torch.Size([1, 4, 32, 16]), - torch.Size([4, 1, 32, 16]), - torch.Size([1, 1, 1, 4]), - 1, - 0, - 1, - 4, - True, - ), - ( - torch.Size([1, 3, 6, 4]), - torch.Size([3, 3, 6, 4]), - None, - 1, - 0, - 1, - 1, - False, - ), - ( - torch.Size([1, 4, 32, 16]), - torch.Size([4, 1, 32, 16]), - None, - 1, - 0, - 1, - 4, - True, - ), - ), -) -def test_conv2d_fallback( - input_shape, - weight_shape, - bias_shape, - stride, - padding, - dilation, - groups, - on_device, - device, -): - torch.manual_seed(1234) - - x = torch.randn(input_shape).bfloat16().float() - w = torch.randn(weight_shape).bfloat16().float() - b = torch.randn(bias_shape).bfloat16().float() if bias_shape is not None else bias_shape - pt_out = torch.conv2d( - x, - w, - torch.reshape(b, (b.shape[-1],)) if b is not None else b, - stride, - padding, - dilation, - groups, - ) - - # Test on host RM - t0 = ttnn.Tensor( - x.reshape(-1).tolist(), - x.shape, - ttnn.bfloat16, - ttnn.ROW_MAJOR_LAYOUT, - ) - if on_device: - t0 = t0.to(device) - - w0 = ttnn.Tensor( - w.reshape(-1).tolist(), - w.shape, - ttnn.bfloat16, - ttnn.ROW_MAJOR_LAYOUT, - ) - if on_device: - w0 = w0.to(device) - - if b is not None: - b0 = ttnn.Tensor( - b.reshape(-1).tolist(), - b.shape, - ttnn.bfloat16, - ttnn.ROW_MAJOR_LAYOUT, - ) - if on_device: - b0 = b0.to(device) - else: - b0 = b - - t1 = fallback_ops.conv2d(t0, w0, b0, stride, padding, dilation, groups) - - output = t1.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch() - comp_pass, _ = comp_pcc(pt_out, output, 0.9999) - _, comp_out = comp_allclose_and_pcc(pt_out, output) - logger.debug(comp_out) - - -@pytest.mark.parametrize( - "input_shape, weight_shape, bias_shape, in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode,on_device", - ( - ( - torch.Size([1, 3, 6, 4]), - torch.Size([3, 3, 6, 4]), - torch.Size([1, 1, 1, 3]), - 3, - 3, - 1, - 1, - 0, - 1, - 1, - True, - 
"zeros", - False, - ), - ( - torch.Size([1, 4, 6, 4]), - torch.Size([4, 1, 6, 4]), - torch.Size([1, 1, 1, 4]), - 4, - 4, - 1, - 1, - 0, - 1, - 4, - True, - "zeros", - True, - ), - ( - torch.Size([1, 3, 6, 4]), - torch.Size([3, 3, 6, 4]), - None, - 3, - 3, - 1, - 1, - 0, - 1, - 1, - False, - "zeros", - False, - ), - ( - torch.Size([1, 4, 6, 4]), - torch.Size([4, 1, 6, 4]), - None, - 4, - 4, - 1, - 1, - 0, - 1, - 4, - False, - "zeros", - True, - ), - ), -) -def test_Conv2d_fallback( - input_shape, - weight_shape, - bias_shape, - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - padding_mode, - on_device, - device, -): - torch.manual_seed(1234) - - x = torch.randn(input_shape).bfloat16().float() - w = torch.randn(weight_shape).bfloat16().float() - b = torch.randn(bias_shape).bfloat16().float() if bias_shape is not None else bias_shape - pt_nn = torch.nn.Conv2d( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - padding_mode, - ) - - pt_nn.weight = torch.nn.Parameter(w) - if not bias and bias_shape is not None: - logger.warning("Bias set to false but trying to set a bias tensor, Ignoring specified bias tensor") - if bias: - pt_nn.bias = torch.nn.Parameter(b.reshape((b.shape[-1]))) if b is not None else b - - pt_out = pt_nn(x) - - # Test on host RM - t0 = ttnn.Tensor( - x.reshape(-1).tolist(), - x.shape, - ttnn.bfloat16, - ttnn.ROW_MAJOR_LAYOUT, - ) - if on_device: - t0 = t0.to(device) - - w0 = ttnn.Tensor( - w.reshape(-1).tolist(), - w.shape, - ttnn.bfloat16, - ttnn.ROW_MAJOR_LAYOUT, - ) - if on_device: - w0 = w0.to(device) - - if b is not None: - b0 = ttnn.Tensor( - b.reshape(-1).tolist(), - b.shape, - ttnn.bfloat16, - ttnn.ROW_MAJOR_LAYOUT, - ) - if on_device: - b0 = b0.to(device) - else: - b0 = None - - tt_nn = fallback_ops.Conv2d( - w0, - b0 if bias else None, - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - padding_mode, - ) - - t1 = tt_nn(t0) - - output = t1.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch() - comp_pass, _ = comp_pcc(pt_out, output, 0.9999) - _, comp_out = comp_allclose_and_pcc(pt_out, output) - logger.debug(comp_out) - assert comp_pass diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_downsample.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_downsample.py deleted file mode 100644 index 871c933d0fa..00000000000 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_downsample.py +++ /dev/null @@ -1,208 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
- -# SPDX-License-Identifier: Apache-2.0 - -import pytest -import math -from loguru import logger - -import ttnn -from tt_lib.utils import ( - tilize_to_list, - tilize, - untilize, - _nearest_32, - _nearest_y, - convert_weights_2d_matrix, -) -from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_allclose_and_pcc -from tests.tt_eager.python_api_testing.conv.conv_unit_test_utils import ( - create_conv_act_tensor, - create_conv_weight_tensor, - create_conv_bias_tensor, - create_conv_weight_tensor_special_padding, -) -from models.utility_functions import skip_for_blackhole -import torch - - -@skip_for_blackhole("Mismatching on BH, see #12349") -@pytest.mark.parametrize("device_params", [{"l1_small_size": 8192}], indirect=True) -@pytest.mark.parametrize( - "batch_size, output_channels, input_channels, input_height, input_width, stride_h, stride_w, num_cores, grid_size, height_sharded", - ( - # (10, 64, 64, 16, 16, 2, 2, 20, (10,2), False), - # (10, 64, 64, 16, 16, 1, 1, 20, (10,2), False), - # (8, 64, 64, 56, 56, 1, 1, 98, (12,9), True), - (8, 256, 256, 56, 56, 2, 2, 98, (12, 9), True), - (8, 512, 512, 28, 28, 2, 2, 80, (10, 8), False), - (8, 1024, 1024, 14, 14, 2, 2, 56, (7, 8), False), - (16, 256, 256, 56, 56, 2, 2, 98, (12, 9), True), - (16, 512, 512, 28, 28, 2, 2, 80, (11, 8), False), - (16, 1024, 1024, 14, 14, 2, 2, 56, (9, 8), False), - ), -) -@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.bfloat8_b]) -def test_run_downsample( - device, - use_program_cache, - batch_size, - output_channels, - input_channels, - input_height, - input_width, - stride_h, - stride_w, - num_cores, - grid_size, - height_sharded, - dtype, -): - if batch_size > 8 and dtype != ttnn.bfloat8_b: - pytest.skip("Batch > 8 must be run fully bfp8") - compute_grid_size = device.compute_with_storage_grid_size() - if grid_size[0] > compute_grid_size.x or grid_size[1] > compute_grid_size.y: - pytest.skip(f"Need {grid_size} grid size to run this test but core grid is {compute_grid_size}") - - assert input_channels % 32 == 0 - assert output_channels % 32 == 0 - assert stride_h == stride_w - - torch.set_printoptions(precision=3, sci_mode=False, linewidth=500, threshold=10000, edgeitems=32) - - torch.manual_seed(0) - a_activation_shape = [batch_size, input_channels, input_height, input_width] - A_pyt = torch.normal(mean=0, std=0.1, size=a_activation_shape).bfloat16() - - b_weights_shape = [output_channels, input_channels, 1, 1] - B_pyt = torch.normal(mean=0, std=0.1, size=b_weights_shape).bfloat16() - - output_height = math.ceil(input_height / stride_h) - output_width = math.ceil(input_width / stride_w) - - conv_output_shape = [batch_size, output_height, output_width, output_channels] - - # Convert NCHW to NHWC shape - A_pyt_nhwc = torch.permute(A_pyt, (0, 2, 3, 1)) - A_pyt_nhwc = A_pyt_nhwc.reshape(1, 1, batch_size * input_height * input_width, input_channels) - # for i in range(2): - # for j in range(32): - # logger.info(f"A_pyt_nhwc_2d[{i}][{j}]={A_pyt_nhwc[0][0][i][j]}") - # logger.info("A_pyt_nhwc_2d[32][0]=", A_pyt_nhwc[0][0][32][0]) - a_activation_shape_nhwc = [batch_size, input_height, input_width, input_channels] - A_cl_host = ttnn.Tensor(A_pyt_nhwc, dtype).reshape(1, 1, batch_size * input_height * input_width, input_channels) - num_cores_height_slices = num_cores if height_sharded else grid_size[0] - input_shape = [1, 1, _nearest_y(batch_size * input_height * input_width, 32), input_channels] - A_cl_host = A_cl_host.pad(input_shape, (0, 0, 0, 0), 0.0) - A_interleaved = 
A_cl_host.to(ttnn.TILE_LAYOUT).to( - device, - ttnn.L1_MEMORY_CONFIG, - ) - assert A_interleaved.padded_shape[0] == 1 and A_interleaved.padded_shape[1] == 1 - - # image flattened params - input_2d_height = A_interleaved.padded_shape[2] - input_2d_width = A_interleaved.padded_shape[3] - input_2d_height_padded = _nearest_y(input_2d_height, num_cores_height_slices * 32) - input_shard_height = (int)(input_2d_height_padded / num_cores_height_slices) - output_2d_height_padded = _nearest_y(batch_size * output_height * output_width, num_cores_height_slices * 32) - output_shard_height = (int)(output_2d_height_padded / num_cores_height_slices) - logger.debug(f"input_2d_height={input_2d_height}") - logger.debug(f"input_2d_width={input_2d_width}") - sharded_memory_layout = ( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharded else ttnn.TensorMemoryLayout.BLOCK_SHARDED - ) - sharded_memory_orientation = ttnn.ShardOrientation.ROW_MAJOR if height_sharded else ttnn.ShardOrientation.COL_MAJOR - input_shard_width = input_2d_width if height_sharded else ((int)(input_2d_width / grid_size[1])) - logger.debug(f"grid_size={grid_size}") - logger.debug(f"shard_memory_layout={sharded_memory_layout}") - logger.debug(f"input_shard_height={input_shard_height}, input_shard_width={input_shard_width}") - - A_sharded = ttnn.interleaved_to_sharded( - A_interleaved, - grid_size, - [input_shard_height, input_shard_width], - sharded_memory_layout, - sharded_memory_orientation, - ) - # Prepare weights for simple matmul - B_tiled_host = create_conv_weight_tensor(B_pyt, output_channels, input_channels, 1, 1, 1, 1) - B_tiled = B_tiled_host.to(device) - - # downsample golden output using maxpool - out_golden = torch.nn.functional.max_pool2d(A_pyt, 1, stride=stride_h) - out_golden_2d_nhwc = torch.permute(out_golden, (0, 2, 3, 1)).reshape( - 1, 1, batch_size * output_height * output_width, input_channels - ) - - downsample_params = [batch_size, input_height, input_width, stride_h, stride_w] - sharded_memory_config = ttnn.MemoryConfig(ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1) - # Run downsample op - A_downampled_sharded = ttnn.downsample(A_sharded, downsample_params, dtype=dtype) - A_downsampled = ttnn.sharded_to_interleaved( - A_downampled_sharded, - ttnn.L1_MEMORY_CONFIG, - ) - out = A_downsampled - out_shape = [1, 1, _nearest_y(batch_size * output_height * output_width, 32), input_channels] - assert out_shape == list(out.padded_shape) - out_shape_unpadded = [1, 1, batch_size * output_height * output_width, input_channels] - assert out_shape_unpadded == list(out.shape) - out = ttnn.format_output_tensor(out, out.shape, device, ttnn.ROW_MAJOR_LAYOUT) - out = out.cpu() - - out_debug = out - out_debug = out_debug.to_torch().float() - - # DEBUG - # for i in range(16): - # for j in range(input_2d_width): - # logger.debug(f"out_golden_2d_nhwc[{i}][{j}]={out_golden_2d_nhwc[0][0][i][j]}") - - # for i in range(16): - # for j in range(input_2d_width): - # logger.debug(f"out_result_2d_nhwc[{i}][{j}]={out_debug[0][0][i][j]}") - - num_errors = 0 - core_idx = 0 - start_i = core_idx * output_shard_height - end_i = start_i + output_shard_height - for i in range(start_i, end_i): - for j in range(input_shard_width): - calculated = out_golden_2d_nhwc[0][0][i][j] - golden = out_debug[0][0][i][j] - atol_delta = torch.abs(golden - calculated).item() - rtol_delta = torch.abs(golden - calculated) / torch.abs(calculated) - if dtype == ttnn.bfloat8_b: - fail = atol_delta > 0.1 - else: - fail = atol_delta > 0.1 or rtol_delta > 0.1 - 
if fail: - if num_errors < 10: - logger.debug( - f"Bad value at {i} (sharded index {i - start_i}), {j} with ATOL={atol_delta} and RTOL={rtol_delta}" - ) - logger.debug(f" result={calculated}, golden={golden}") - num_errors += 1 - # if (num_errors >= 10): - # assert False - logger.debug(f"Num errors: {num_errors}") - - out = out.reshape(batch_size, output_height, output_width, input_channels) - assert out.get_layout() == ttnn.ROW_MAJOR_LAYOUT - - # Copy output to host and convert tt tensor to pytorch tensor - out_result = out.to_torch().float() - out_result = torch.transpose(out_result, 2, 3) - out_result = torch.transpose(out_result, 1, 2) - - # logger.debug (f'OUTPUT: {out_result}') - # logger.debug (f'GOLDEN: {out_golden}') - - if dtype == ttnn.bfloat8_b: - passing, output_info = comp_allclose_and_pcc( - out_golden, out_result, rtol=0, atol=4e-3, pcc=0.9999 - ) # For LowFi we need 0.99976 - else: - passing, output_info = comp_equal(out_golden, out_result) - assert passing diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_resnet50_first_conv_folding_on_host.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_resnet50_first_conv_folding_on_host.py deleted file mode 100644 index a760d9566be..00000000000 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_resnet50_first_conv_folding_on_host.py +++ /dev/null @@ -1,101 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from loguru import logger - -import numpy as np - -from tt_lib.utils import ( - tilize_to_list, - tilize, - untilize, - _nearest_32, - _nearest_y, - convert_weights_2d_matrix, -) -from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import ( - comp_allclose_and_pcc, - comp_pcc, -) -from models.utility_functions import ( - pad_and_fold_conv_activation_for_unity_stride, - pad_and_fold_conv_filters_for_unity_stride, -) -from tests.tt_eager.python_api_testing.conv.conv_unit_test_utils import ( - create_conv_act_tensor, - create_conv_act_tensor_special, - create_conv_weight_tensor, - create_conv_weight_tensor_special_special, - create_conv_bias_tensor, -) -import torch - - -@pytest.mark.parametrize("has_bias", (True,)) -@pytest.mark.parametrize("fuse_relu", (True,)) -@pytest.mark.parametrize( - "N", - (8,), -) -def test_resnet50_first_conv( - device, - use_program_cache, - N, - has_bias, - fuse_relu, -): - compute_grid_size = device.compute_with_storage_grid_size() - is_e75_grid_size = (compute_grid_size.x * compute_grid_size.y) == 88 - if N == 8 and is_e75_grid_size: - pytest.skip( - f"Skipping batch 8 on E75 because expected grid size is 12x9 but E75 grid size is {compute_grid_size}" - ) - if N != 8: - pytest.skip("Skipping non-batch 8 tests due to potential non-determinism") - - (K, C, padded_C, H, W, R, S, padded_S, stride_h, stride_w, pad_h, pad_w) = ( - 64, - 3, - 4, - 224, - 224, - 7, - 7, - 8, - 2, - 2, - 3, - 3, - ) - - torch.manual_seed(0) - a_activation_shape = [N, C, H, W] - A_pyt = torch.randn(a_activation_shape, dtype=torch.bfloat16).float() - b_weights_shape = [K, C, R, S] - B_pyt = torch.randn(b_weights_shape, dtype=torch.bfloat16).float() - bias_shape = [K] - bias_pyt = torch.randn(bias_shape) - - # Calculate conv result with golden result. 
Run Pytorch conv - out_golden = torch.nn.functional.conv2d( - A_pyt, B_pyt, bias=bias_pyt, stride=(stride_h, stride_w), padding=(pad_h, pad_w) - ) - if fuse_relu: - out_golden = torch.nn.ReLU()(out_golden) - A_pyt_padded_folded = pad_and_fold_conv_activation_for_unity_stride(A_pyt, pad_h, pad_w, stride_h, stride_w) - B_pyt_padded_folded = pad_and_fold_conv_filters_for_unity_stride(B_pyt, stride_h, stride_w) - - # Calculate conv result with folded conv. Run Pytorch conv with unity stride and no padding. - out_result = torch.nn.functional.conv2d(A_pyt_padded_folded, B_pyt_padded_folded, bias=bias_pyt) - if fuse_relu: - out_result = torch.nn.ReLU()(out_result) - - # Compare against golden - golden_pcc = 0.9999999999999847 - - passing_pcc, output_pcc = comp_pcc(out_golden, out_result, golden_pcc) - logger.debug(f"Passing={passing_pcc}") - logger.debug(f"Output pcc={output_pcc}") - assert passing_pcc From df3e71af58f53f17de4837bfc3bb76b6e4dfe41e Mon Sep 17 00:00:00 2001 From: pjosipovic Date: Sun, 9 Feb 2025 15:59:20 +0000 Subject: [PATCH 119/316] Fix I2S aligment issue on BH Running conv2d sweeps on BH exposed ~150 pcc issues. TT_METAL_WATCHER exposed unaligned noc transaction in ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reader_unary_stick_layout_sharded_blocks_interleaved_start_id.cpp in these test cases. block_width_bytes wasn't aligned to 16B in these cases. For some reason BH codepath was setting unaligned size in this case. --- tests/ttnn/unit_tests/test_to_layout.py | 23 +++++++++++++++++++ ...interleaved_to_sharded_program_factory.cpp | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/ttnn/unit_tests/test_to_layout.py b/tests/ttnn/unit_tests/test_to_layout.py index 4e0fe5c29bc..436ce03f0d6 100644 --- a/tests/ttnn/unit_tests/test_to_layout.py +++ b/tests/ttnn/unit_tests/test_to_layout.py @@ -339,3 +339,26 @@ def test_untilize_w4(shape, input_layout, output_layout, device): output_tensor = ttnn.to_torch(output_tensor) assert_with_pcc(input_a[:, :, :1, :10912], output_tensor) + + +def test_interleaved_to_sharded_block_shareded_unaligned_width(device): + torch_input_shape = [1, 1, 196, 92] + torch_input = torch.randn(torch_input_shape, dtype=torch.bfloat16).bfloat16() + + sharded_memory_config = ttnn.create_sharded_memory_config( + [32, 32], + core_grid=ttnn.CoreGrid( + x=7, + y=3, + ), + strategy=ttnn.ShardStrategy.BLOCK, + orientation=ttnn.ShardOrientation.COL_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + ttnn_input = ttnn.from_torch(torch_input, device=device, layout=ttnn.ROW_MAJOR_LAYOUT) + ttnn_output = ttnn.to_memory_config(ttnn_input, sharded_memory_config) + + output_torch = ttnn.to_torch(ttnn_output) + + passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_input, output_torch) + assert passing, pcc_msg diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/device/interleaved_to_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/device/interleaved_to_sharded_program_factory.cpp index 913dc4cc97b..748d10d20a9 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/device/interleaved_to_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/device/interleaved_to_sharded_program_factory.cpp @@ -290,7 +290,7 @@ operation::ProgramWithCallbacks interleaved_to_sharded_multi_core( num_units_per_row, shard_height, shard_width, - (is_blackhole) ? 
shard_width : padded_offset_bytes, + padded_offset_bytes, static_cast(aligned), aligned_width_offset, aligned_shard_width, From 1b266bbe0ae97fa7b08ea306954cc8a6f1f6b1af Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Thu, 6 Feb 2025 12:57:32 +0000 Subject: [PATCH 120/316] Allow shallow conv channel aligment to 8 --- .../sweeps/conv2d/short/conv2d_short_sweep.py | 2 -- .../misc/test_conv_op_trace_config.py | 32 ------------------- .../unit_tests/operations/test_new_conv2d.py | 7 ++-- .../operations/conv/conv2d/conv2d_utils.cpp | 9 +++--- .../conv2d_op_sharded_program_factory.cpp | 5 +-- 5 files changed, 9 insertions(+), 46 deletions(-) delete mode 100644 tests/tt_eager/python_api_testing/unit_testing/misc/test_conv_op_trace_config.py diff --git a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py index aca2764aa59..f1589328a94 100644 --- a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py +++ b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py @@ -1622,8 +1622,6 @@ def test_conv2d_localrun(device, input_spec): [1, 768, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, 1, 1, False], # 1460 [1, 768, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, 1, 1, True], # 1461 [1, 768, 3, 384, 512, 32, 32, 32, 32, 0, 0, 1, 1, 1, True], # 1464 - [1, 64, 3, 720, 1280, 7, 7, 2, 2, 3, 3, 1, 1, 1, False], # 1471 - [1, 64, 3, 800, 1088, 7, 7, 2, 2, 3, 3, 1, 1, 1, False], # 1472 [1, 1, 64, 480, 640, 3, 3, 1, 1, 1, 1, 1, 1, 1, True], # 1495 [1, 64, 64, 480, 640, 3, 3, 1, 1, 1, 1, 1, 1, 1, True], # 1496 ] diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_conv_op_trace_config.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_conv_op_trace_config.py deleted file mode 100644 index 327025907ff..00000000000 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_conv_op_trace_config.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
- -# SPDX-License-Identifier: Apache-2.0 - -import pytest -import torch -import numpy -from loguru import logger -from tests.tt_eager.python_api_testing.conv.conv_op_trace_config import ( - trace_conv_to_generate_data_top_left_indices_and_pad_metadata, - traced_conv_reference, -) -from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_allclose_and_pcc - - -# conv params - output_channels, input_channels, filter_h, filter_w, stride_h, stride_w, pad_h, pad_w, dilation, groups -@pytest.mark.parametrize( - "conv_params, input_nchw_shape", - ( - ((1, 1, 2, 2, 1, 1, 0, 0, 1, 1), (8, 1, 8, 8)), - ((1, 1, 2, 2, 1, 1, 1, 1, 1, 1), (8, 1, 8, 8)), - ((1, 1, 4, 4, 1, 1, 0, 0, 1, 1), (8, 1, 115, 115)), - ), -) -def test_run_op_trace_config(conv_params, input_nchw_shape): - pad_metadata, data_top_left_indices = trace_conv_to_generate_data_top_left_indices_and_pad_metadata( - conv_params, input_nchw_shape - ) - logger.trace(f"Data top left indices - {data_top_left_indices}") - logger.trace(f"Pad meta data - {pad_metadata}") - # run trace conv reference - traced_conv_reference(pad_metadata, data_top_left_indices, conv_params, input_nchw_shape) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 7627f60e285..7c49616a514 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -127,9 +127,7 @@ def run_conv( dtype=activations_dtype, weights_dtype=weights_dtype, shard_layout=shard_layout if not auto_shard else None, - input_channels_alignment=( - 16 if use_shallow_conv_variant or (input_channels == 16 and input_height == 115) else 32 - ), + input_channels_alignment=8 if use_shallow_conv_variant and not auto_shard else 32, deallocate_activation=deallocate_activation, enable_act_double_buffer=False, enable_split_reader=False, @@ -258,7 +256,6 @@ def run_conv_with_split( dtype=activations_dtype, weights_dtype=weights_dtype, shard_layout=shard_layout if not auto_shard else None, - # input_channels_alignment=(16 if use_shallow_conv_variant else 32), ) compute_config = ttnn.init_device_compute_kernel_config( device.arch(), @@ -1689,7 +1686,7 @@ def test_unet_conv_wh( ) @pytest.mark.parametrize( "activations_dtype", - [ttnn.bfloat8_b], + [ttnn.bfloat16], ) @pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi]) @pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT]) diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp index 426f6e52151..32fa50b9b63 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp @@ -772,15 +772,14 @@ Conv2dConfig determine_conv_config_for_auto_shard( Conv2dConfig conv_config = conv_config_in; conv_config.shard_layout = shard_layout; if (conv_config.act_block_h_override == 0) { - if (in_channels <= constants::TILE_WIDTH / 2 && - conv_config.input_channels_alignment == constants::TILE_WIDTH && !is_mm_conv && - conv_config.shard_layout == TensorMemoryLayout::HEIGHT_SHARDED && + if (in_channels < constants::TILE_WIDTH && conv_config.input_channels_alignment == constants::TILE_WIDTH && + !is_mm_conv && conv_config.shard_layout == TensorMemoryLayout::HEIGHT_SHARDED && input_tensor_layout == Layout::ROW_MAJOR) { log_debug(LogOp, "Auto shard, enable shallow conv"); - // height sharded, non matmul conv, with input channels <= 16, and default 
setting for + // height sharded, non matmul conv, with input channels < 32, and default setting for // input_channels_alignment // Currently data-movement ops have too many restrictions to support shallow convs with tiled input. - conv_config.input_channels_alignment = constants::TILE_WIDTH / 2; + conv_config.input_channels_alignment = 8; } else if (conv_config.shard_layout != TensorMemoryLayout::HEIGHT_SHARDED) { conv_config.input_channels_alignment = constants::TILE_WIDTH; } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index abab4fc1fac..a70d7093bf3 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -562,9 +562,10 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( TT_FATAL(input_channels_padded >= ashape[3], "Incorrect padding of input channels!"); // check is for 16-byte alignment TT_FATAL( - input_channels_padded % 16 == 0, + // Since fp16 is smalleset data format used for halo output, 8 input_channels is enough for 16 byte alignment + input_channels_padded % 8 == 0, "Expected input channels to be padded for 16 byte alignment in L1 ({} % 16 != 0)", - input_channels_padded); // TODO: For bfp16, check if its divisible by 8 not 16. + input_channels_padded); // Always use split reader for first conv in resnet which has input channels = 16 // TODO: Expose option to split readers for 1D convs to python? // bool split_reader = use_shallow_conv_variant; From 74766915bcd877baba2809a323701469c60d53ab Mon Sep 17 00:00:00 2001 From: Kalaivani Baskar <156762498+KalaivaniMCW@users.noreply.github.com> Date: Mon, 17 Feb 2025 15:12:03 +0530 Subject: [PATCH 121/316] #17871: skip mixed dtype case for ttnn.experimental.pow (#17884) ### Ticket Link to Github Issue #17871 ### Problem description A specific combination of input datatype in `tests/ttnn/unit_tests/operations/eltwise/test_pow.py::test_binary_sfpu_pow_bug ` fails once or twice when run in a loop `pytest --count=20 tests/ttnn/unit_tests/operations/eltwise/test_pow.py::test_binary_sfpu_pow_bug` ### What's changed Skipping the test case until we debug Tracked in #17883 ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13365555065 - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- tests/ttnn/unit_tests/operations/eltwise/test_pow.py | 4 ++-- ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_pow.py b/tests/ttnn/unit_tests/operations/eltwise/test_pow.py index c2574a0a870..fa9ed63450d 100644 --- 
a/tests/ttnn/unit_tests/operations/eltwise/test_pow.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_pow.py @@ -263,8 +263,8 @@ def test_binary_pow(device, dtype_a, dtype_b, ttnn_function): ], ) def test_binary_sfpu_pow_bug(device, input_shapes, dtype_a, dtype_b, ttnn_function): - if (ttnn_function == ttnn.pow) and (dtype_a != dtype_b): - pytest.skip("Mixed datatypes not supported in ttnn.pow") + if dtype_a != dtype_b: + pytest.skip("Mixed datatypes not supported in ttnn.pow or ttnn.experimental.pow") torch.manual_seed(0) torch_dtype_a = getattr(torch, dtype_a) ttnn_dtype_a = getattr(ttnn, dtype_a) diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp index cbda641693b..2f70f722368 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp @@ -1825,7 +1825,9 @@ void py_module(py::module& module) { R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_power( - module, ttnn::pow, R"doc(When :attr:`exponent` is a Tensor, supported dtypes are: BFLOAT16, FLOAT32)doc"); + module, + ttnn::pow, + R"doc(When :attr:`exponent` is a Tensor, supported dtypes are: BFLOAT16, FLOAT32. Both input tensors should be of same dtype.)doc"); } } // namespace binary From 16419fe127899c3813a4c294267968767aa59781 Mon Sep 17 00:00:00 2001 From: Mohamed Bahnas <116673264+mbahnasTT@users.noreply.github.com> Date: Mon, 17 Feb 2025 14:47:47 -0800 Subject: [PATCH 122/316] #0: updating codeowners for vit-segformer-yolov4 (#17903) Co-authored-by: Dalar Vartanians --- CODEOWNERS | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index 6e2fffa151b..f50e3bb6075 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -176,10 +176,12 @@ models/demos/t3000/mixtral8x7b @yieldthought @mtairum @uaydonat models/demos/tg/llama3_70b @cglagovichTT @uaydonat @johanna-rock-tt @djordje-tt @kpaigwar models/demos/tg/falcon7b @skhorasganiTT @djordje-tt @uaydonat models/demos/grayskull @uaydonat -models/demos/yolov4 @dvartaniansTT @tenstorrent/metalium-developers-convolutions -models/demos/wormhole/yolov4 @dvartaniansTT @tenstorrent/metalium-developers-convolutions +models/demos/yolov4 @dvartaniansTT @mbahnasTT @tenstorrent/metalium-developers-convolutions +models/demos/wormhole/yolov4 @dvartaniansTT @mbahnasTT @tenstorrent/metalium-developers-convolutions models/demos/**/*resnet* @tt-aho @tenstorrent/metalium-developers-convolutions models/experimental/functional_unet @esmalTT @uaydonat @tenstorrent/metalium-developers-convolutions +models/experimental/functional_vit @mbahnasTT @uaydonat +models/demos/segformer @mbahnasTT @uaydonat @tenstorrent/metalium-developers-convolutions models/perf/ @uaydonat models/perf/perf_report.py @yieldthought @uaydonat models/perf/benchmarking_utils.py @skhorasganiTT From 0ef76c0b7aaf9ddb7205713d0083d11000662a20 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Mon, 17 Feb 2025 20:38:07 -0500 Subject: [PATCH 123/316] First package (TT-Metalium runtime) (#17694) ### Ticket A step towards #7915 ### Problem description We don't have any .deb packages. ### What's changed Added a `tt-metalium` package. This is the runtime files for the TT-Metalium layer. tt-metalium-jit is split into its own package for technical (dbgsym) reasons. 
### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13380833931) --- .github/workflows/build-artifact.yaml | 32 +++++++-- CMakeLists.txt | 22 +++++-- cmake/packaging.cmake | 57 +++++++++++++++- cmake/version.cmake | 5 ++ dockerfile/Dockerfile | 1 + tt_metal/CMakeLists.txt | 76 +++++++++++++++++++++ tt_metal/hostdevcommon/CMakeLists.txt | 36 +++++++--- tt_metal/hw/CMakeLists.txt | 95 +++++++++++++++++++++++++++ tt_metal/hw/firmware/CMakeLists.txt | 41 ++++++++++++ 9 files changed, 345 insertions(+), 20 deletions(-) create mode 100644 tt_metal/hw/firmware/CMakeLists.txt diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml index 3d425cd6b08..d5a210887e9 100644 --- a/.github/workflows/build-artifact.yaml +++ b/.github/workflows/build-artifact.yaml @@ -54,6 +54,9 @@ on: #ci-test-docker-image: # description: "Docker tag for the CI Test Docker image for testing TT-Metalium et al" # value: ${{ jobs.build-docker-image.outputs.ci-test-tag }} + packages-artifact-name: + description: "Name to give download-artifact to get the packages" + value: ${{ jobs.build-artifact.outputs.packages-artifact-name }} build-artifact-name: description: "Name of the published build artifact" value: ${{ jobs.build-artifact.outputs.build_artifact_name }} @@ -61,7 +64,6 @@ on: description: "Name of the published wheel artifact" value: ${{ jobs.build-artifact.outputs.wheel_artifact_name }} - workflow_dispatch: inputs: build-type: @@ -114,6 +116,7 @@ jobs: - build - in-service outputs: + packages-artifact-name: ${{ steps.set-artifact-name.outputs.name }} build_artifact_name: ${{ steps.set_build_artifact_name.outputs.build_artifact_name }} wheel_artifact_name: ${{ steps.set_wheel_artifact_name.outputs.wheel_artifact_name }} container: @@ -149,11 +152,22 @@ jobs: exit 1 fi + - name: Set artifact name + id: set-artifact-name + run: | + TOOLCHAIN="${{ inputs.toolchain }}" + TOOLCHAIN_CLEANED=$(echo "$TOOLCHAIN" | sed -E 's/^cmake\///; s/-toolchain\.cmake$//') + ARTIFACT_NAME="packages-${{ inputs.distro }}-${{ inputs.version }}-${{ inputs.architecture }}-${{ inputs.build-type }}-${TOOLCHAIN_CLEANED}${{ (inputs.tracy && '_profiler') || '' }}" + + echo "name=$ARTIFACT_NAME" >> "$GITHUB_OUTPUT" + echo "ARTIFACT_NAME=$ARTIFACT_NAME" >> "$GITHUB_ENV" + - name: ⬇️ Checkout uses: actions/checkout@v4 with: - fetch-depth: 0 submodules: recursive + fetch-depth: 500 # Need enough history for `git describe` + fetch-tags: true # Need tags for `git describe` path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end - name: Sanity check @@ -194,9 +208,9 @@ jobs: nice -19 cmake --build build --target install - name: 📦 Package - if: false # Packaging coming later run: | - nice -19 cmake --build $build_dir --target package + nice -n 19 cmake --build build --target package + ls -1sh build/*.deb build/*.ddeb - name: 🐍 Build wheel if: ${{ inputs.build-wheel }} @@ -210,6 +224,16 @@ jobs: ccache -s >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY + - name: ☁️ Upload packages + uses: actions/upload-artifact@v4 + with: + name: ${{ env.ARTIFACT_NAME }} + path: | + /work/build/*.deb + /work/build/*.ddeb + compression-level: 0 + if-no-files-found: error + - name: Set wheel artifact name id: set_wheel_artifact_name run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 
f289b7d1b84..21ffe59c943 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,7 @@ project( DESCRIPTION "Tenstorrent Metalium" HOMEPAGE_URL "https://github.com/tenstorrent/tt-metal" LANGUAGES + C # Some of the jit-build files are plain C CXX ) message(STATUS "Metalium version: ${PROJECT_VERSION}") @@ -242,6 +243,11 @@ else() add_compile_definitions(TT_ENABLE_LIGHT_METAL_TRACE=0) endif() +include(GNUInstallDirs) +# GNUInstallDirs takes PROJECT_DIR verbatim, but directories should always be lowercase +string(TOLOWER ${PROJECT_NAME} PROJECT_NAME_LOWER) +string(REPLACE ${PROJECT_NAME} ${PROJECT_NAME_LOWER} CMAKE_INSTALL_DOCDIR ${CMAKE_INSTALL_DOCDIR}) + if(ENABLE_CODE_TIMERS) add_compile_definitions(TT_ENABLE_CODE_TIMERS) endif() @@ -265,7 +271,6 @@ endif() # For top level install: cmake --build build --target install or make/ninja install -C build ############################################################################################################################ # Install for build artifacts that will upload build/lib -include(GNUInstallDirs) install( TARGETS @@ -274,7 +279,7 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT dev + COMPONENT tar ) install( TARGETS @@ -283,7 +288,7 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT dev + COMPONENT tar ) install( TARGETS @@ -292,7 +297,7 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT dev + COMPONENT tar ) if(WITH_PYTHON_BINDINGS) # Install .so into src files for pybinds implementation @@ -306,6 +311,15 @@ if(WITH_PYTHON_BINDINGS) ) endif() +# FIXME(17578): figure out what bits we actually need to ship and omit the rest +install( + DIRECTORY + runtime + DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/tt-metalium + USE_SOURCE_PERMISSIONS + COMPONENT jit-build +) + # Custom clean target for `built` folder for when new kernel changes are pulled add_custom_target( clean-built diff --git a/cmake/packaging.cmake b/cmake/packaging.cmake index 61700a4fc8f..20e06931c5d 100644 --- a/cmake/packaging.cmake +++ b/cmake/packaging.cmake @@ -1,5 +1,56 @@ -set(CPACK_GENERATOR "DEB") -set(CPACK_DEBIAN_PACKAGE_MAINTAINER "support@tenstorrent.com") -#set(CPACK_DEBIAN_PACKAGE_DEPENDS "") +set(CPACK_GENERATOR DEB) +set(CPACK_PACKAGE_CONTACT "support@tenstorrent.com") +set(CMAKE_PROJECT_HOMEPAGE_URL "https://tenstorrent.com") +set(CPACK_PACKAGE_NAME tt) + +set(CPACK_COMPONENT_METALIUM_DESCRIPTION "TT-Metalium runtime library") +set(CPACK_DEBIAN_METALIUM_PACKAGE_SECTION "libs") + +set(CPACK_DEB_COMPONENT_INSTALL YES) +set(CPACK_DEBIAN_PACKAGE_VERSION "${VERSION_DEB}") +set(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT) + +set(CPACK_DEBIAN_PACKAGE_CONTROL_STRICT_PERMISSION TRUE) +# set(CPACK_DEBIAN_DEBUGINFO_PACKAGE TRUE) +set(CPACK_DEBIAN_METALIUM_DEBUGINFO_PACKAGE TRUE) +set(CPACK_DEBIAN_JIT-BUILD_DEBUGINFO_PACKAGE FALSE) # Some binaries don't have a Build ID; we cannot split dbgsyms + +set(CPACK_INSTALL_DEFAULT_DIRECTORY_PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE +) + +set(CPACK_DEBIAN_ENABLE_COMPONENT_DEPENDS TRUE) +set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS FALSE) + +get_cmake_property(CPACK_COMPONENTS_ALL COMPONENTS) +list( + REMOVE_ITEM + CPACK_COMPONENTS_ALL + umd-dev # FIXME: -dev packages will come later + tt_pybinds # Wow this one is big! 
+ tar # TODO: Remove that tarball entirely + # Deps that define install targets that we can't (or haven't) disabled + msgpack-cxx + Headers + Library + Unspecified # TODO: audit if there's anything we need to ship here +) + +# Logically we should ship jit-build with metalium-runtime, but jit-build fails to split dbgsyms for now (lacking a Build ID on the binaries) +cpack_add_component(jit-build GROUP metalium-jit) + +cpack_add_component(metalium-runtime GROUP metalium) +cpack_add_component(umd-runtime GROUP metalium) +cpack_add_component(dev GROUP metalium) # FIXME: delete this line when we bump UMD submodule +cpack_add_component_group(metalium) + +cpack_add_component(gtest GROUP metalium-validation) +cpack_add_component_group(metalium-validation) include(CPack) diff --git a/cmake/version.cmake b/cmake/version.cmake index 1af7f36dfc0..f98317d0985 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -74,20 +74,25 @@ function(ParseGitDescribe) endif() set(VERSION_FULL "${VERSION_NUMERIC}") + set(VERSION_DEB "${VERSION_NUMERIC}") if(VERSION_STATUS) string(APPEND VERSION_FULL "-${VERSION_STATUS}") + string(APPEND VERSION_DEB "~${VERSION_STATUS}") # Debian versioning uses a ~ for "less than blank" endif() if(VERSION_COMMIT_COUNT) string(APPEND VERSION_FULL "+${VERSION_COMMIT_COUNT}.${VERSION_HASH}") + string(APPEND VERSION_DEB "+${VERSION_COMMIT_COUNT}.${VERSION_HASH}") endif() if(VERSION_DIRTY) string(APPEND VERSION_FULL "+m") + string(APPEND VERSION_DEB "+m") endif() message(STATUS "Version: ${VERSION_FULL}") # Output variables set(VERSION_FULL "${VERSION_FULL}" PARENT_SCOPE) + set(VERSION_DEB "${VERSION_DEB}" PARENT_SCOPE) set(VERSION_NUMERIC "${VERSION_NUMERIC}" PARENT_SCOPE) set(VERSION_HASH "${VERSION_HASH}" PARENT_SCOPE) endfunction() diff --git a/dockerfile/Dockerfile b/dockerfile/Dockerfile index c3f5937d1d2..cc060d7f775 100644 --- a/dockerfile/Dockerfile +++ b/dockerfile/Dockerfile @@ -202,6 +202,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ clang-tidy-17 \ curl \ dialog \ + file \ graphviz \ jq \ pandoc \ diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index 19227774e5e..44f80bb4ec0 100644 --- a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -43,6 +43,68 @@ target_link_libraries( FlatBuffers::FlatBuffers ) +# TODO(afuller): this should be self-describing modules. +# For now just cherry-pick all the files I discovered empirally by trying to run a test. 
+if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23) + target_sources( + tt_metal + PUBLIC + FILE_SET jit_api + TYPE HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + FILES + api/tt-metalium/dev_msgs.h + api/tt-metalium/tt_log.h + api/tt-metalium/circular_buffer_constants.h + api/tt-metalium/cq_commands.hpp + soc_descriptors/grayskull_120_arch.yaml + soc_descriptors/wormhole_b0_80_arch.yaml + soc_descriptors/blackhole_140_arch.yaml + core_descriptors/grayskull_120_arch.yaml + core_descriptors/wormhole_b0_80_arch.yaml + core_descriptors/blackhole_140_arch.yaml + third_party/tt_llk_blackhole/common/inc/ckernel.h + third_party/tt_llk_blackhole/common/inc/ckernel_include.h + third_party/tt_llk_blackhole/common/inc/ckernel_defs.h + third_party/tt_llk_blackhole/common/inc/ckernel_instr_params.h + third_party/tt_llk_blackhole/common/inc/ckernel_addrmod.h + third_party/tt_llk_blackhole/common/inc/ckernel_gpr_map.h + third_party/tt_llk_blackhole/common/inc/ckernel_structs.h + third_party/tt_llk_blackhole/common/inc/ckernel_ops.h + third_party/tt_llk_blackhole/common/inc/ckernel_globals.h + third_party/tt_llk_blackhole/llk_lib/llk_defs.h + third_party/tt_llk_wormhole_b0/common/inc/ckernel.h + third_party/tt_llk_wormhole_b0/common/inc/ckernel_include.h + third_party/tt_llk_wormhole_b0/common/inc/ckernel_defs.h + third_party/tt_llk_wormhole_b0/common/inc/ckernel_instr_params.h + third_party/tt_llk_wormhole_b0/common/inc/ckernel_addrmod.h + third_party/tt_llk_wormhole_b0/common/inc/ckernel_gpr_map.h + third_party/tt_llk_wormhole_b0/common/inc/ckernel_structs.h + third_party/tt_llk_wormhole_b0/common/inc/ckernel_ops.h + third_party/tt_llk_wormhole_b0/common/inc/ckernel_globals.h + third_party/tt_llk_wormhole_b0/llk_lib/llk_defs.h + third_party/tt_llk_grayskull/common/inc/ckernel.h + third_party/tt_llk_grayskull/common/inc/ckernel_include.h + third_party/tt_llk_grayskull/common/inc/ckernel_defs.h + third_party/tt_llk_grayskull/common/inc/ckernel_instr_params.h + third_party/tt_llk_grayskull/common/inc/ckernel_addrmod.h + third_party/tt_llk_grayskull/common/inc/ckernel_gpr_map.h + third_party/tt_llk_grayskull/common/inc/ckernel_structs.h + third_party/tt_llk_grayskull/common/inc/ckernel_ops.h + third_party/tt_llk_grayskull/common/inc/ckernel_globals.h + third_party/tt_llk_grayskull/llk_lib/llk_defs.h + tools/profiler/kernel_profiler.hpp + impl/dispatch/kernels/cq_common.hpp + impl/dispatch/kernels/cq_helpers.hpp + impl/dispatch/kernels/packet_queue_ctrl.hpp + impl/dispatch/kernels/packet_queue.hpp + # Kernel sources + impl/dispatch/kernels/cq_dispatch_slave.cpp + impl/dispatch/kernels/cq_dispatch.cpp + impl/dispatch/kernels/cq_prefetch.cpp + ) +endif() + target_precompile_headers( tt_metal PRIVATE @@ -95,3 +157,17 @@ add_subdirectory(impl) add_subdirectory(detail) add_subdirectory(distributed) add_subdirectory(tt_stl) + +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23) + install( + TARGETS + tt_metal + LIBRARY + COMPONENT metalium-runtime + FILE_SET + jit_api + DESTINATION + ${CMAKE_INSTALL_LIBEXECDIR}/tt-metalium/tt_metal # FIXME: fix the include paths for jit_build + COMPONENT metalium-runtime + ) +endif() diff --git a/tt_metal/hostdevcommon/CMakeLists.txt b/tt_metal/hostdevcommon/CMakeLists.txt index 58c361264e7..3c8bfb5d249 100644 --- a/tt_metal/hostdevcommon/CMakeLists.txt +++ b/tt_metal/hostdevcommon/CMakeLists.txt @@ -1,14 +1,32 @@ add_library(ttmetalium_hostdevcommon INTERFACE) add_library(TT::Metalium::HostDevCommon ALIAS ttmetalium_hostdevcommon) -target_sources( - ttmetalium_hostdevcommon - INTERFACE - 
api/hostdevcommon/common_runtime_address_map.h - api/hostdevcommon/common_values.hpp - api/hostdevcommon/dprint_common.h - api/hostdevcommon/kernel_structs.h - api/hostdevcommon/profiler_common.h -) +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23) + target_sources( + ttmetalium_hostdevcommon + PUBLIC + FILE_SET jit_api + TYPE HEADERS + BASE_DIRS api + FILES + api/hostdevcommon/common_runtime_address_map.h + api/hostdevcommon/common_values.hpp + api/hostdevcommon/dprint_common.h + api/hostdevcommon/kernel_structs.h + api/hostdevcommon/profiler_common.h + ) +endif() target_include_directories(ttmetalium_hostdevcommon INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/api) + +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23) + install( + TARGETS + ttmetalium_hostdevcommon + FILE_SET + jit_api + DESTINATION + ${CMAKE_INSTALL_LIBEXECDIR}/tt-metalium/tt_metal/hostdevcommon/api # FIXME: fix the include paths for jit_build + COMPONENT metalium-runtime + ) +endif() diff --git a/tt_metal/hw/CMakeLists.txt b/tt_metal/hw/CMakeLists.txt index bd487cb2ab7..25387208487 100644 --- a/tt_metal/hw/CMakeLists.txt +++ b/tt_metal/hw/CMakeLists.txt @@ -206,4 +206,99 @@ add_library(Metalium::Metal::Hardware ALIAS hw) target_include_directories(hw INTERFACE inc) +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23) + target_sources( + hw + PUBLIC + FILE_SET jit_api + TYPE HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + FILES + inc/blackhole/noc/noc_parameters.h + inc/blackhole/noc/noc_overlay_parameters.h + inc/blackhole/noc/noc.h + inc/blackhole/eth_l1_address_map.h + inc/blackhole/tensix.h + inc/blackhole/tensix_types.h + inc/blackhole/cfg_defines.h + inc/blackhole/stream_io_map.h + inc/blackhole/noc_nonblocking_api.h + inc/blackhole/core_config.h + inc/blackhole/dev_mem_map.h + inc/blackhole/c_tensix_core.h + inc/blackhole/tdma_xmov.h + inc/grayskull/noc/noc_parameters.h + inc/grayskull/noc/noc_overlay_parameters.h + inc/grayskull/noc/noc.h + inc/grayskull/eth_l1_address_map.h + inc/grayskull/tensix.h + inc/grayskull/tensix_types.h + inc/grayskull/cfg_defines.h + inc/grayskull/stream_io_map.h + inc/grayskull/noc_nonblocking_api.h + inc/grayskull/core_config.h + inc/grayskull/dev_mem_map.h + inc/grayskull/c_tensix_core.h + inc/grayskull/tdma_xmov.h + inc/wormhole/noc/noc_parameters.h + inc/wormhole/noc/noc_overlay_parameters.h + inc/wormhole/noc/noc.h + inc/wormhole/eth_l1_address_map.h + inc/wormhole/stream_io_map.h + inc/wormhole/noc_nonblocking_api.h + inc/wormhole/core_config.h + inc/wormhole/dev_mem_map.h + inc/wormhole/c_tensix_core.h + inc/wormhole/tdma_xmov.h + inc/atomic_rwptr.h + inc/bit_utils.h + inc/circular_buffer_init.h + inc/circular_buffer.h + inc/cmd_defs.h + inc/compile_time_args.h + inc/dataflow_api.h + inc/dataflow_cmd_bufs.h + inc/dataflow_internal.h + inc/firmware_common.h + inc/mod_div_lib.h + inc/remote_circular_buffer_api.h + inc/risc_attribs.h + inc/risc_common.h + inc/tensix_functions.h + inc/vptr_uint.h + inc/debug/assert.h + inc/debug/fw_debug.h + inc/debug/dprint.h + inc/debug/dprint_buffer.h + inc/debug/dprint_tile.h + inc/debug/noc_logging.h + inc/debug/ring_buffer.h + inc/debug/sanitize_noc.h + inc/debug/stack_usage.h + inc/debug/waypoint.h + inc/debug/watcher_common.h + inc/ethernet/erisc.h + inc/utils/utils.h + inc/ethernet/dataflow_api.h + inc/ethernet/tt_eth_api.h + inc/ethernet/tunneling.h + inc/ethernet/tt_eth_ss_regs.h + ckernels/blackhole/metal/llk_io/llk_io.h + ) +endif() + target_link_libraries(hw INTERFACE TT::Metalium::HostDevCommon) + +add_subdirectory(firmware) + +if(CMAKE_VERSION 
VERSION_GREATER_EQUAL 3.23) + install( + TARGETS + hw + FILE_SET + jit_api + DESTINATION + ${CMAKE_INSTALL_LIBEXECDIR}/tt-metalium/tt_metal/hw # FIXME: fix the include paths for jit_build + COMPONENT metalium-runtime + ) +endif() diff --git a/tt_metal/hw/firmware/CMakeLists.txt b/tt_metal/hw/firmware/CMakeLists.txt new file mode 100644 index 00000000000..8de5e412bb6 --- /dev/null +++ b/tt_metal/hw/firmware/CMakeLists.txt @@ -0,0 +1,41 @@ +add_library(metalium_firmware INTERFACE) +add_library(TT::Metalium::Firmware ALIAS metalium_firmware) + +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23) + target_sources( + metalium_firmware + INTERFACE + FILE_SET jit_api + TYPE HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + FILES + src/blackhole/noc.c + src/grayskull/noc.c + src/wormhole/noc.c + src/active_erisc.cc + src/active_erisck.cc + src/brisc.cc + src/brisck.cc + src/erisc.cc + src/erisck.cc + src/idle_erisc.cc + src/idle_erisck.cc + src/ncrisc.cc + src/ncrisck.cc + src/slave_idle_erisc.cc + src/tdma_xmov.c + src/trisc.cc + src/trisck.cc + src/tt_eth_api.cpp + ) + + install( + TARGETS + metalium_firmware + FILE_SET + jit_api + DESTINATION + ${CMAKE_INSTALL_LIBEXECDIR}/tt-metalium/tt_metal/hw/firmware # FIXME: fix the include paths for jit_build + COMPONENT metalium-runtime + ) +endif() From 64e4badc6951d2bde32f4b69ebed4814af47e63d Mon Sep 17 00:00:00 2001 From: Pavle Milenkovic Date: Tue, 18 Feb 2025 13:48:43 +0100 Subject: [PATCH 124/316] #16174: Support for int32 subtraction for WHB0 and BH (#17359) ### Ticket #16174 ### Problem description Subtraction of int32 dtype was not supported on WHB0 and BH. ### What's changed Added necessary APIs, LLKs, and modified codepaths to include sub int32 operation. This operation was done through SFPU. ### Checklist - [x] Post commit CI passes - [x] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [x] New/Existing tests provide coverage for changes --- .../operations/eltwise/test_binary_fp32.py | 20 ++++++++ .../llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h | 22 +++++++++ .../llk_math_eltwise_binary_sfpu_sub_int32.h | 27 +++++++++++ .../llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h | 22 +++++++++ .../llk_math_eltwise_binary_sfpu_sub_int32.h | 27 +++++++++++ .../compute_kernel_api/sub_int32_sfpu.h | 47 +++++++++++++++++++ tt_metal/third_party/tt_llk_blackhole | 2 +- tt_metal/third_party/tt_llk_wormhole_b0 | 2 +- .../eltwise/binary/common/binary_op_utils.cpp | 9 +++- .../binary/device/binary_device_operation.cpp | 3 +- .../compute/eltwise_binary_sfpu_kernel.cpp | 4 ++ 11 files changed, 180 insertions(+), 5 deletions(-) create mode 100644 tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h create mode 100644 tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_sub_int32.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_sub_int32.h create mode 100644 tt_metal/include/compute_kernel_api/sub_int32_sfpu.h diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_fp32.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_fp32.py index 6c3c37fc7d5..eb73010e54f 100644 --- 
a/tests/ttnn/unit_tests/operations/eltwise/test_binary_fp32.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_fp32.py @@ -93,6 +93,26 @@ def test_add_int32(device, ttnn_function): assert status +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.sub, + ], +) +def test_sub_int32(device, ttnn_function): + x_torch = torch.tensor([[11, 23, 0, -23, -1, -100]], dtype=torch.int32) + y_torch = torch.tensor([[78, 99, 34, -33, -1, 100]], dtype=torch.int32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_sub = ttnn.sub(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_sub) + assert torch.allclose(z_torch, tt_out, atol=1e-10, rtol=1e-5, equal_nan=False) + + @skip_for_grayskull("Unsupported dtype for Grayskull") @pytest.mark.parametrize( "ttnn_function", diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h new file mode 100644 index 00000000000..154cf20122e --- /dev/null +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + +template +inline void calculate_sub_int32(const uint dst_offset) { + _sub_int32_(dst_offset); +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_sub_int32.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_sub_int32.h new file mode 100644 index 00000000000..4efe45a1c23 --- /dev/null +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_sub_int32.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_binary_sfpu_init.h" +#include "llk_math_eltwise_binary_sfpu_params.h" +#include "ckernel_sfpu_sub_int32.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_binary_sfpu_sub_int32_init() { + llk_math_eltwise_binary_sfpu_init(); +} + +template +inline void llk_math_eltwise_binary_sfpu_sub_int32( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_sub_int32, dst_index0, dst_index1, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h new file mode 100644 index 00000000000..154cf20122e --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_sub_int32.h @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + +template +inline void calculate_sub_int32(const uint dst_offset) { + _sub_int32_(dst_offset); +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_sub_int32.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_sub_int32.h new file mode 100644 index 00000000000..4efe45a1c23 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_sub_int32.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_binary_sfpu_init.h" +#include "llk_math_eltwise_binary_sfpu_params.h" +#include "ckernel_sfpu_sub_int32.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_binary_sfpu_sub_int32_init() { + llk_math_eltwise_binary_sfpu_init(); +} + +template +inline void llk_math_eltwise_binary_sfpu_sub_int32( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_sub_int32, dst_index0, dst_index1, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/sub_int32_sfpu.h b/tt_metal/include/compute_kernel_api/sub_int32_sfpu.h new file mode 100644 index 00000000000..ee3c9b998c7 --- /dev/null +++ b/tt_metal/include/compute_kernel_api/sub_int32_sfpu.h @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "compute_kernel_api/common_globals.h" +#ifdef TRISC_MATH +#include "llk_math_eltwise_binary_sfpu_sub_int32.h" +#define MAIN math_main() +#define MATH(x) x +#else +#define MATH(x) +#endif + +namespace ckernel { + +// clang-format off +/** + * Performs an elementwise sub operation with the two integer inputs: y = sub(x0,x1) + * Output overwrites first operand in DST. + * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. + * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, + * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. + * + * Return value: None + * + * | Argument | Description | Type | Valid Range | Required | + * |-----------------------|-----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst0 | The index of the tile in DST register buffer to use as first operand | uint32_t | Must be less than the size of the DST register buffer | True | + * | idst1 | The index of the tile in DST register buffer to use as second operand | uint32_t | Must be less than the size of the DST register buffer | True | + * | sign_magnitude_format | Whether the Int32 values are in sign-magnitude format (not 2's complement) | bool | | False | + */ +// clang-format on +template +ALWI void sub_int32_tile(uint32_t idst0, uint32_t idst1) { + MATH((llk_math_eltwise_binary_sfpu_sub_int32(idst0, idst1))); +} + +/** + * Please refer to documentation for any_init. 
+ */ +ALWI void sub_int32_tile_init() { MATH((llk_math_eltwise_binary_sfpu_sub_int32_init())); } + +} // namespace ckernel diff --git a/tt_metal/third_party/tt_llk_blackhole b/tt_metal/third_party/tt_llk_blackhole index 9fd3e2d93d1..76b5357a75b 160000 --- a/tt_metal/third_party/tt_llk_blackhole +++ b/tt_metal/third_party/tt_llk_blackhole @@ -1 +1 @@ -Subproject commit 9fd3e2d93d1532373f52e11e963de40c1cdf9a55 +Subproject commit 76b5357a75bfed7dac22a7b0417bb5589c2e0c5b diff --git a/tt_metal/third_party/tt_llk_wormhole_b0 b/tt_metal/third_party/tt_llk_wormhole_b0 index 0ec3177bfc2..a34e1966683 160000 --- a/tt_metal/third_party/tt_llk_wormhole_b0 +++ b/tt_metal/third_party/tt_llk_wormhole_b0 @@ -1 +1 @@ -Subproject commit 0ec3177bfc262f7edf6cfc19531ecb8f669895d2 +Subproject commit a34e1966683c478d575d5ea79413004955c8a57f diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.cpp index 153c99488ba..1b2d48bf618 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.cpp @@ -191,8 +191,13 @@ std::map get_defines_fp32( } break; case BinaryOpType::SUB: - new_defines.insert({"BINOP_INIT", fmt::format("sub_binary_tile_init();")}); - op_name = "sub_binary_tile"; + if (input_a_dtype == DataType::INT32 && input_b_dtype == DataType::INT32) { + new_defines.insert({"SUB_INT32_INIT", "sub_int32_tile_init();"}); + op_name = "sub_int32_tile"; + } else { + new_defines.insert({"BINOP_INIT", "sub_binary_tile_init();"}); + op_name = "sub_binary_tile"; + } break; case BinaryOpType::MUL: new_defines.insert({"BINOP_INIT", fmt::format("mul_binary_tile_init();")}); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp index a3c7d86cc81..094d5d2a0cc 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp @@ -17,8 +17,9 @@ namespace ttnn::operations::binary { namespace utils { bool is_binary_sfpu_op(BinaryOpType val, DataType a, DataType b) { switch (val) { - case BinaryOpType::ADD: return ((a == DataType::FLOAT32 && b == DataType::FLOAT32) || (a == DataType::INT32 && b == DataType::INT32)); + case BinaryOpType::ADD: case BinaryOpType::SUB: + return ((a == DataType::FLOAT32 && b == DataType::FLOAT32) || (a == DataType::INT32 && b == DataType::INT32)); case BinaryOpType::MUL: case BinaryOpType::DIV_FAST: case BinaryOpType::RSUB: diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_sfpu_kernel.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_sfpu_kernel.cpp index c083a354fae..032118851f7 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_sfpu_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_sfpu_kernel.cpp @@ -13,6 +13,7 @@ #include "compute_kernel_api/binary_bitwise_sfpu.h" #include "compute_kernel_api/binary_shift.h" #include "compute_kernel_api/add_int32_sfpu.h" +#include "compute_kernel_api/sub_int32_sfpu.h" #define PRE_SCALE defined SFPU_OP_INIT_PRE_IN0_0 || defined SFPU_OP_INIT_PRE_IN1_0 @@ -113,6 +114,9 @@ void MAIN { #ifdef ADD_INT32_INIT ADD_INT32_INIT #endif +#ifdef SUB_INT32_INIT + SUB_INT32_INIT +#endif #ifdef BITWISE_INIT BITWISE_INIT #endif From 
d6e71128c6384017d30d6cf0d04fbf4bbba1b95f Mon Sep 17 00:00:00 2001 From: Rashid Kaleem Date: Tue, 18 Feb 2025 08:29:56 -0600 Subject: [PATCH 125/316] Fix matrix shard config. (#17893) ### Ticket `google/gemma-2-2b-it` fails to load correctly on N300. ### Problem description The `wo` matrix was using an incorrect memory config causing some models to fail in sharding. ### What's changed Fixed the dimension of the matrix to use `n_heads*head_dim` instead of model dim. Testing with `google/gemma-2-2b-it` on N300 verifies that the model loading succeeds. Note - `google/gemma-2-2b-it` produces incorrect output due to unsupported ops which are being worked on. This PR unblocks work on those operations. Fix thanks to @yieldthought ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes Signed-off-by: Rashid Kaleem --- models/demos/llama3/tt/llama_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/demos/llama3/tt/llama_attention.py b/models/demos/llama3/tt/llama_attention.py index a8c8581dc98..d1c1bee93b0 100644 --- a/models/demos/llama3/tt/llama_attention.py +++ b/models/demos/llama3/tt/llama_attention.py @@ -206,7 +206,7 @@ def __init__( pt_wo = self.state_dict[f"{wo_str}.weight"].transpose(-1, -2).unsqueeze(0).unsqueeze(0) wo_mem_config = configuration.create_dram_sharded_mem_config( - configuration.dim // configuration.num_devices, configuration.dim + (configuration.n_heads * configuration.head_dim) // configuration.num_devices, configuration.dim ) self.wo = ttnn.as_tensor( From d277980875bf0b8ba2e60113c9009a236622b564 Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Tue, 18 Feb 2025 10:30:18 -0500 Subject: [PATCH 126/316] Apply EDM Fabric Optimizations - Up to 13.5 GB/s bidir unicast and 10.5 GB/s bidir mcast @4k packet size (#17930) Numerous EDM Fabric (1D Fabric) optimizations that take the EDM fabric to the following approximate performance with 4K packet size: - 13.5 GB/s in neighbour exchange test - 10.5 GB/s in 4chip mcast test Measured ~ 1 GB/s higher when compiling with -O3 but that is currently not enabled in this PR The optimizations in this PR include: - Add optimized power-of-2 queue pointer handling and enable power-of-2 buffer slot counts - Add optimized power-of-2 transaction ID handling and use power-of-2 transaction IDs on write - Mild cleanup/optimizations of volatile pointer usage - Optimize main top level control loop of EDM fabric - Reduce the frequency of context switch/teardown checks - Nest main control loop in a tight loop - Partially unroll sender state execution steps (one for each channel) instead of using a sender channel ID to alternate through them --- .../gtests/ccl/kernels/edm_fabric_writer.cpp | 13 +- ...erisc_data_mover_loopback_with_workers.cpp | 11 ++ .../ccl/erisc_datamover_builder.cpp | 27 ++-
.../ccl/erisc_datamover_builder.hpp | 2 + .../edm_fabric_flow_control_helpers.hpp | 162 +++++++++++++++++ .../edm_fabric/edm_fabric_worker_adapters.hpp | 93 +++++++--- .../fabric_edm_packet_transmission.hpp | 17 +- .../edm_fabric/fabric_erisc_datamover.cpp | 163 +++++++++++------- .../fabric_erisc_datamover_channels.hpp | 147 +--------------- 9 files changed, 381 insertions(+), 254 deletions(-) create mode 100644 ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_flow_control_helpers.hpp diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp index 952a4963104..91fe40d181e 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp @@ -139,13 +139,9 @@ void kernel_main() { safe_get_noc_addr(static_cast(dest_noc_x), static_cast(dest_noc_y), dest_bank_addr); noc_async_write(source_l1_buffer_address, dest_addr, packet_payload_size_bytes); if (fabric_connection.has_forward_connection()) { - DeviceZoneScopedN("WR-FWD"); mcast_fwd_packet_header->to_noc_unicast_write( NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); - { - DeviceZoneScopedN("WR-FWD-WAIT"); - fabric_connection.get_forward_connection().wait_for_empty_write_slot(); - } + fabric_connection.get_forward_connection().wait_for_empty_write_slot(); print_pkt_header(mcast_fwd_packet_header); fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); @@ -154,13 +150,9 @@ void kernel_main() { } if (fabric_connection.has_backward_connection()) { - DeviceZoneScopedN("WR-BWD"); mcast_bwd_packet_header->to_noc_unicast_write( NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); - { - DeviceZoneScopedN("WR-BWD-WAIT"); - fabric_connection.get_backward_connection().wait_for_empty_write_slot(); - } + fabric_connection.get_backward_connection().wait_for_empty_write_slot(); print_pkt_header(mcast_bwd_packet_header); fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); @@ -176,7 +168,6 @@ void kernel_main() { for (size_t i = 0; i < num_unicasts; i++) { auto noc0_dest_addr = safe_get_noc_addr(static_cast(dest_noc_x), static_cast(dest_noc_y), dest_bank_addr, 0); - DeviceZoneScopedN("UNICAST-WRITE"); auto& fabric_conn = unicast_is_fwd ? 
fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index 4f9eadf730c..1ab121ffec7 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -3590,6 +3590,17 @@ TEST(EdmFabric, BasicMcastThroughputTest_2) { RunWriteThroughputStabilityTestWithPersistentFabric(num_mcasts, num_unicasts, num_links, num_op_invocations); } +TEST(EdmFabric, BasicMcastThroughputTest_3_SingleLink) { + const size_t num_mcasts = 200000; + const size_t num_unicasts = 0; + const size_t num_links = 1; + const size_t num_op_invocations = 1; + const bool line_sync = true; + WriteThroughputStabilityTestWithPersistentFabricParams params; + params.line_sync = line_sync; + RunWriteThroughputStabilityTestWithPersistentFabric( + num_mcasts, num_unicasts, num_links, num_op_invocations, params); +} TEST(EdmFabric, BasicMcastThroughputTest_3) { const size_t num_mcasts = 200000; const size_t num_unicasts = 2; diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp index 8be28978f47..2f505f41586 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp @@ -75,24 +75,43 @@ FabricEriscDatamoverConfig::FabricEriscDatamoverConfig( TT_FATAL(sender_channel_1_buffer_index_address != sender_channel_0_buffer_index_address, "FabricEriscDatamoverConfig was constructed with illegal buffer index address"); const size_t min_buffer_size = sizeof(tt::fabric::PacketHeader) + 2 * FabricEriscDatamoverConfig::eth_channel_sync_size; TT_FATAL(channel_buffer_size_bytes >= min_buffer_size, "FabricEriscDatamoverConfig was constructed with `channel_buffer_size_bytes` argument set smaller than minimum size of {}", min_buffer_size); + + constexpr size_t default_pow2_num_sender_buffer_slots = 8; + constexpr size_t default_pow2_num_receiver_buffer_slots = 16; + const std::size_t channel_buffer_size_with_channel_sync = channel_buffer_size_bytes + sizeof(tt::fabric::PacketHeader); // + 16 // sizeof(tt::fabric::PacketHeader); - this->channel_buffer_size_bytes = channel_buffer_size_bytes; + const size_t next_lowest_power_of_2_buffer_slot_count = + + this->channel_buffer_size_bytes = channel_buffer_size_bytes; this->channel_buffer_size_bytes_with_channel_sync = channel_buffer_size_with_channel_sync; const std::size_t total_ratio_count = 2 * sender_ratio_size + receiver_ratio_size; + this->sender_0_channel_size_bytes = tt::round_down( (available_channel_buffering_space / total_ratio_count) * sender_ratio_size, channel_buffer_size_with_channel_sync); - this->sender_0_num_buffers = this->sender_0_channel_size_bytes / channel_buffer_size_with_channel_sync; + if constexpr (FabricEriscDatamoverConfig::constrain_to_power_of_2_buffer_slot_counts) { + this->sender_0_num_buffers = default_pow2_num_sender_buffer_slots; + } else { + this->sender_0_num_buffers = this->sender_0_channel_size_bytes / channel_buffer_size_with_channel_sync; + } this->sender_1_channel_size_bytes = tt::round_down( (available_channel_buffering_space / total_ratio_count) * 
sender_ratio_size, channel_buffer_size_with_channel_sync); - this->sender_1_num_buffers = this->sender_1_channel_size_bytes / channel_buffer_size_with_channel_sync; + if constexpr (FabricEriscDatamoverConfig::constrain_to_power_of_2_buffer_slot_counts) { + this->sender_1_num_buffers = default_pow2_num_sender_buffer_slots; + } else { + this->sender_1_num_buffers = this->sender_1_channel_size_bytes / channel_buffer_size_with_channel_sync; + } this->receiver_channel_size_bytes = tt::round_down( (available_channel_buffering_space / total_ratio_count) * receiver_ratio_size, channel_buffer_size_with_channel_sync); - this->receiver_num_buffers = this->receiver_channel_size_bytes / channel_buffer_size_with_channel_sync; + if constexpr (FabricEriscDatamoverConfig::constrain_to_power_of_2_buffer_slot_counts) { + this->receiver_num_buffers = default_pow2_num_receiver_buffer_slots; + } else { + this->receiver_num_buffers = this->receiver_channel_size_bytes / channel_buffer_size_with_channel_sync; + } this->sender_0_channel_base_address = buffer_region_start; this->sender_1_channel_base_address = this->sender_0_channel_base_address + this->sender_0_channel_size_bytes; diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp index 1d32db7f8c3..a9d1a076ba6 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp @@ -30,6 +30,8 @@ namespace ccl { struct FabricEriscDatamoverConfig { + static constexpr bool constrain_to_power_of_2_buffer_slot_counts = true; + static constexpr std::size_t field_size = 16; static constexpr std::size_t buffer_alignment = 32; static constexpr std::size_t eth_word_l1_alignment = 16; diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_flow_control_helpers.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_flow_control_helpers.hpp new file mode 100644 index 00000000000..63bf9bad9f3 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_flow_control_helpers.hpp @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "tt_metal/hw/inc/utils/utils.h" +#include "risc_attribs.h" + +namespace tt::fabric { + +template +class NamedType { +public: + FORCE_INLINE explicit NamedType(const T& value) : value_(value) {} + FORCE_INLINE explicit NamedType(T&& value) : value_(std::move(value)) {} + FORCE_INLINE NamedType& operator=(const NamedType& rhs) = default; + FORCE_INLINE T& get() { return value_; } + FORCE_INLINE const T& get() const { return value_; } + FORCE_INLINE operator T() const { return value_; } + FORCE_INLINE operator T&() { return value_; } + +private: + T value_; +}; + +using BufferIndex = NamedType; +using BufferPtr = NamedType; + +// Increments val and wraps to 0 if it reaches limit +template +FORCE_INLINE auto wrap_increment(T val) -> T { + constexpr bool is_pow2 = LIMIT != 0 && is_power_of_2(LIMIT); + if constexpr (LIMIT == 1) { + return val; + } else if constexpr (LIMIT == 2) { + return 1 - val; + } else if constexpr (is_pow2) { + return (val + 1) & (static_cast(LIMIT - 1)); + } else { + return (val == static_cast(LIMIT - 1)) ? 
static_cast(0) : static_cast(val + 1); + } +} +template +FORCE_INLINE auto wrap_increment_n(T val, uint8_t increment) -> T { + constexpr bool is_pow2 = LIMIT != 0 && is_power_of_2(LIMIT); + if constexpr (LIMIT == 1) { + return val; + } else if constexpr (LIMIT == 2) { + return 1 - val; + } else if constexpr (is_pow2) { + return (val + increment) & (LIMIT - 1); + } else { + T new_unadjusted_val = val + increment; + bool wraps = new_unadjusted_val >= LIMIT; + return wraps ? static_cast(new_unadjusted_val - LIMIT) : static_cast(new_unadjusted_val); + } +} + +FORCE_INLINE +auto normalize_ptr(BufferPtr ptr, uint8_t num_buffers) -> BufferIndex { + // note it may make sense to calculate this only when we increment + // which will save calculations overall (but may add register pressure) + // and introduce undesirable loads + bool normalize = ptr >= num_buffers; + uint8_t normalized_ptr = ptr.get() - static_cast(normalize * num_buffers); + ASSERT(normalized_ptr < num_buffers); + return BufferIndex{normalized_ptr}; +} +template +FORCE_INLINE auto normalize_ptr(BufferPtr ptr) -> BufferIndex { + static_assert(NUM_BUFFERS != 0, "normalize_ptr called with NUM_BUFFERS of 0; it must be greater than 0"); + constexpr bool is_size_pow2 = NUM_BUFFERS != 0 && (NUM_BUFFERS & (NUM_BUFFERS - 1)) == 0; + constexpr bool is_size_2 = NUM_BUFFERS == 2; + constexpr bool is_size_1 = NUM_BUFFERS == 1; + constexpr uint8_t wrap_mask = NUM_BUFFERS - 1; + if constexpr (is_size_pow2) { + return BufferIndex{static_cast(ptr.get() & wrap_mask)}; + } else if constexpr (is_size_2) { + return BufferIndex{(uint8_t)1 - ptr.get()}; + } else if constexpr (is_size_1) { + return BufferIndex{0}; + } else { + // note it may make sense to calculate this only when we increment + // which will save calculations overall (but may add register pressure) + // and introduce undesirable loads + return normalize_ptr(ptr, NUM_BUFFERS); + } +} + +FORCE_INLINE uint8_t +distance_behind(const BufferPtr& trailing_ptr, const BufferPtr& leading_ptr, uint8_t ptr_wrap_size) { + bool leading_gte_trailing_ptr = leading_ptr >= trailing_ptr; + return leading_gte_trailing_ptr ? 
leading_ptr - trailing_ptr : ptr_wrap_size - (trailing_ptr - leading_ptr); +} +template +FORCE_INLINE uint8_t distance_behind(const BufferPtr& trailing_ptr, const BufferPtr& leading_ptr) { + static_assert(NUM_BUFFERS != 0, "distance_behind called with NUM_BUFFERS of 0; it must be greater than 0"); + constexpr bool is_size_pow2 = is_power_of_2(NUM_BUFFERS); + constexpr uint8_t ptr_wrap_mask = (2 * NUM_BUFFERS) - 1; + constexpr uint8_t ptr_wrap_size = 2 * NUM_BUFFERS; + bool leading_gte_trailing_ptr = leading_ptr >= trailing_ptr; + if constexpr (is_size_pow2) { + return (leading_ptr - trailing_ptr) & ptr_wrap_mask; + } else { + return distance_behind(trailing_ptr, leading_ptr, ptr_wrap_size); + } +} + +template +class ChannelBufferPointer { + static_assert( + NUM_BUFFERS <= std::numeric_limits::max() / 2, + "NUM_BUFFERS must be less than or half of std::numeric_limits::max() due to the internal " + "implementation"); + +public: + static constexpr bool is_size_pow2 = (NUM_BUFFERS & (NUM_BUFFERS - 1)) == 0; + static constexpr bool is_size_2 = NUM_BUFFERS == 2; + static constexpr bool is_size_1 = NUM_BUFFERS == 1; + static constexpr uint8_t ptr_wrap_size = 2 * NUM_BUFFERS; + + // Only to use if is_size_pow2 + static constexpr uint8_t ptr_wrap_mask = (2 * NUM_BUFFERS) - 1; + static constexpr uint8_t buffer_wrap_mask = NUM_BUFFERS - 1; + ChannelBufferPointer() : ptr(0) {} + /* + * Returns the "raw" pointer - not usable to index the buffer channel + */ + FORCE_INLINE BufferPtr get_ptr() const { return this->ptr; } + + FORCE_INLINE bool is_caught_up_to(const ChannelBufferPointer& leading_ptr) const { + return this->is_caught_up_to(leading_ptr.get_ptr()); + } + FORCE_INLINE uint8_t distance_behind(const ChannelBufferPointer& leading_ptr) const { + return this->distance_behind(leading_ptr.get_ptr()); + } + + /* + * Returns the buffer index pointer which is usable to index into the buffer memory + */ + FORCE_INLINE BufferIndex get_buffer_index() const { return BufferIndex{normalize_ptr(this->ptr)}; } + + FORCE_INLINE void increment_n(uint8_t n) { + this->ptr = BufferPtr{wrap_increment_n<2 * NUM_BUFFERS>(this->ptr.get(), n)}; + } + FORCE_INLINE void increment() { this->ptr = BufferPtr{wrap_increment<2 * NUM_BUFFERS>(this->ptr.get())}; } + +private: + // Make these private to make sure caller doesn't accidentally mix two pointers pointing to + // different sized channels + FORCE_INLINE bool is_caught_up_to(const BufferPtr& leading_ptr) const { return this->get_ptr() == leading_ptr; } + FORCE_INLINE uint8_t distance_behind(const BufferPtr& leading_ptr) const { + return tt::fabric::distance_behind(this->ptr, leading_ptr); + } + BufferPtr ptr = BufferPtr{0}; +}; + +} // namespace tt::fabric diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp index e6b2253c277..4864cea0b29 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp @@ -10,6 +10,8 @@ #include "cpp/ttnn/operations/ccl/kernel_common/worker_edm_utils.hpp" #include "cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp" #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_types.hpp" +#include "cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_flow_control_helpers.hpp" +#include "tt_metal/hw/inc/utils/utils.h" #include "debug/assert.h" #include "debug/dprint.h" 
#include @@ -17,7 +19,7 @@ namespace tt::fabric { /* - * The WorkerToFabricEdmSender acts as an adapter between the worker and the EDM, it hides details + * The WorkerToFabricEdmSenderImpl acts as an adapter between the worker and the EDM, it hides details * of the communication between worker and EDM to provide flexibility for the implementation to change * over time without kernel updates. Additionally, details for adapter setup w.r.t runtime args is also hidden. * The main functionality provided is: @@ -34,15 +36,20 @@ namespace tt::fabric { * As the adapter writes into the EDM, it updates the local wrptr. As the EDM reads from its local L1 channel buffer, * it will notify the worker/adapter (here) by updating the worker remote_rdptr to carry the value of the EDM rdptr. */ -struct WorkerToFabricEdmSender { +template +struct WorkerToFabricEdmSenderImpl { + static constexpr bool USER_DEFINED_NUM_BUFFER_SLOTS = EDM_NUM_BUFFER_SLOTS != 0; + static constexpr bool IS_POW2_NUM_BUFFERS = USER_DEFINED_NUM_BUFFER_SLOTS && is_power_of_2(EDM_NUM_BUFFER_SLOTS); + static constexpr size_t BUFFER_SLOT_PTR_WRAP = EDM_NUM_BUFFER_SLOTS * 2; + static constexpr size_t LAST_BUFFER_SLOT_PTR_BEFORE_WRAP = BUFFER_SLOT_PTR_WRAP - 1; static constexpr uint32_t unused_connection_value = 0; static constexpr uint32_t open_connection_value = 1; static constexpr uint32_t close_connection_request_value = 2; - WorkerToFabricEdmSender() : from_remote_buffer_slot_rdptr_ptr(nullptr) {} + WorkerToFabricEdmSenderImpl() : from_remote_buffer_slot_rdptr_ptr(nullptr) {} template - static WorkerToFabricEdmSender build_from_args(std::size_t& arg_idx) { + static WorkerToFabricEdmSenderImpl build_from_args(std::size_t& arg_idx) { bool is_persistent_fabric = get_arg_val(arg_idx++); WorkerXY const edm_worker_xy = WorkerXY::from_uint32(get_arg_val(arg_idx++)); auto const edm_buffer_base_addr = get_arg_val(arg_idx++); @@ -64,7 +71,7 @@ struct WorkerToFabricEdmSender { (my_core_type == ProgrammableCoreType::TENSIX && (uint32_t)writer_send_sem_addr < 1499136) || (my_core_type == ProgrammableCoreType::ACTIVE_ETH && (uint32_t)writer_send_sem_addr < 262144)); ASSERT(edm_buffer_index_addr < 262144); - return WorkerToFabricEdmSender( + return WorkerToFabricEdmSenderImpl( is_persistent_fabric, edm_worker_xy.x, edm_worker_xy.y, @@ -80,7 +87,7 @@ struct WorkerToFabricEdmSender { worker_buffer_index_semaphore_addr); } - WorkerToFabricEdmSender( + WorkerToFabricEdmSenderImpl( bool connected_to_persistent_fabric, uint8_t edm_worker_x, uint8_t edm_worker_y, @@ -116,18 +123,45 @@ struct WorkerToFabricEdmSender { edm_noc_x(edm_worker_x), edm_noc_y(edm_worker_y) { ASSERT(buffer_size_bytes > 0); + if constexpr (USER_DEFINED_NUM_BUFFER_SLOTS) { + ASSERT(num_buffers_per_channel == EDM_NUM_BUFFER_SLOTS); + } } FORCE_INLINE bool edm_has_space_for_packet() const { - auto const wrptr = *this->buffer_slot_wrptr_ptr; - auto const rdptr = *this->from_remote_buffer_slot_rdptr_ptr; - bool wrptr_ge_rptr = wrptr >= rdptr; - uint8_t slots_used = wrptr_ge_rptr ? 
(wrptr - rdptr) : ((2 * this->num_buffers_per_channel) - rdptr) + wrptr; - return slots_used < this->num_buffers_per_channel; + using namespace tt::fabric; + if constexpr (USER_DEFINED_NUM_BUFFER_SLOTS) { + auto slots_used = distance_behind( + BufferPtr{static_cast(*this->from_remote_buffer_slot_rdptr_ptr)}, + BufferPtr{static_cast(*this->buffer_slot_wrptr_ptr)}); + return slots_used < this->num_buffers_per_channel; + } else { + auto const rdptr = *this->from_remote_buffer_slot_rdptr_ptr; + auto const wrptr = *this->buffer_slot_wrptr_ptr; + auto buffer_ptr_wrap = 2 * this->num_buffers_per_channel; + auto slots_used = distance_behind( + BufferPtr{static_cast(rdptr)}, + BufferPtr{static_cast(wrptr)}, + buffer_ptr_wrap); + return slots_used < this->num_buffers_per_channel; + } } FORCE_INLINE void wait_for_empty_write_slot() const { - while (!this->edm_has_space_for_packet()); + using namespace tt::fabric; + if constexpr (USER_DEFINED_NUM_BUFFER_SLOTS) { + while (distance_behind(BufferPtr{static_cast(*this->from_remote_buffer_slot_rdptr_ptr)}, BufferPtr{static_cast(*this->buffer_slot_wrptr_ptr)}) < this->num_buffers_per_channel); + } else { + auto const first_rdptr = *this->from_remote_buffer_slot_rdptr_ptr; + auto buffer_ptr_wrap = 2 * this->num_buffers_per_channel; + bool has_space = distance_behind( + BufferPtr{static_cast(first_rdptr)}, + BufferPtr{static_cast(*this->buffer_slot_wrptr_ptr)}, + buffer_ptr_wrap) < this->num_buffers_per_channel; + if (!has_space) { + while (first_rdptr == *this->from_remote_buffer_slot_rdptr_ptr); + } + } } FORCE_INLINE void send_payload_blocking(uint32_t cb_id, uint32_t num_pages, uint32_t page_size) { @@ -192,6 +226,8 @@ struct WorkerToFabricEdmSender { const uint64_t edm_connection_handshake_noc_addr = dest_noc_addr_coord_only | edm_connection_handshake_l1_addr; noc_inline_dw_write(edm_connection_handshake_noc_addr, open_connection_value); noc_async_read_barrier(); + + this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * (this->buffer_size_bytes + sizeof(eth_channel_sync_t))); ASSERT(*this->buffer_slot_wrptr_ptr < 20); } @@ -249,25 +285,27 @@ struct WorkerToFabricEdmSender { noc_inline_dw_write(noc_sem_addr, *this->buffer_slot_wrptr_ptr); } - FORCE_INLINE void advance_buffer_slot_wrptr() { - // TODO: smarter addition if we are working with pow2 - uint8_t wrptr = *this->buffer_slot_wrptr_ptr; - *this->buffer_slot_wrptr_ptr = - !(wrptr == ((this->num_buffers_per_channel * 2) - 1)) ? wrptr + 1 : 0; - } - FORCE_INLINE uint8_t get_buffer_slot_index() const { - auto const wrptr = *this->buffer_slot_wrptr_ptr; - bool normalize = wrptr >= this->num_buffers_per_channel; - return wrptr - (normalize * this->num_buffers_per_channel); + if constexpr (USER_DEFINED_NUM_BUFFER_SLOTS) { + return normalize_ptr(BufferPtr{static_cast(*this->buffer_slot_wrptr_ptr)}); + } else { + return normalize_ptr(BufferPtr{static_cast(*this->buffer_slot_wrptr_ptr)}, this->num_buffers_per_channel); + } } - FORCE_INLINE uint32_t compute_dest_buffer_slot_bank_address() const { - return this->edm_buffer_addr + (this->get_buffer_slot_index() * (this->buffer_size_bytes + sizeof(eth_channel_sync_t))); + FORCE_INLINE void advance_buffer_slot_wrptr() { + if constexpr (USER_DEFINED_NUM_BUFFER_SLOTS) { + *this->buffer_slot_wrptr_ptr = wrap_increment(*this->buffer_slot_wrptr_ptr); + } else { + uint8_t wrptr = *this->buffer_slot_wrptr_ptr; + *this->buffer_slot_wrptr_ptr = + !(wrptr == ((this->num_buffers_per_channel * 2) - 1)) ? 
wrptr + 1 : 0; + } + this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * (this->buffer_size_bytes + sizeof(eth_channel_sync_t))); } FORCE_INLINE uint64_t compute_dest_buffer_slot_noc_addr() const { - return get_noc_addr(this->edm_noc_x, this->edm_noc_y, this->compute_dest_buffer_slot_bank_address()); + return get_noc_addr(this->edm_noc_x, this->edm_noc_y, this->edm_buffer_addr); } FORCE_INLINE void post_send_payload_increment_pointers() { @@ -319,4 +357,9 @@ struct WorkerToFabricEdmSender { } }; +using WorkerToFabricEdmSender = WorkerToFabricEdmSenderImpl<0>; + +template +using EdmToEdmSender = WorkerToFabricEdmSenderImpl; + } // namespace tt::fabric diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp index 35533d4d26e..85553bf6dab 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp @@ -16,7 +16,7 @@ static constexpr size_t DESTINATION_HOP_COUNT = 1; // TODO: make 0 and the associated field to num mcast destinations static constexpr size_t LAST_MCAST_DESTINATION = 1; -void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader *const packet_start) { +FORCE_INLINE void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader *const packet_start) { #ifdef DEBUG_PRINT_ENABLED switch (packet_start->chip_send_type) { case tt::fabric::CHIP_UNICAST: { @@ -32,7 +32,7 @@ void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader *const packe #endif } -void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet_start) { +FORCE_INLINE void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet_start) { #ifdef DEBUG_PRINT_ENABLED switch (packet_start->noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { @@ -50,7 +50,7 @@ void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet #endif } -void print_pkt_header(volatile tt::fabric::PacketHeader *const packet_start) { +FORCE_INLINE void print_pkt_header(volatile tt::fabric::PacketHeader *const packet_start) { #ifdef DEBUG_PRINT_ENABLED auto const& header = *packet_start; DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type << @@ -64,12 +64,12 @@ void print_pkt_header(volatile tt::fabric::PacketHeader *const packet_start) { // Since we unicast to local, we must omit the packet header -FORCE_INLINE void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const packet_start, uint32_t transaction_id) { +FORCE_INLINE void execute_chip_unicast_to_local_chip( + volatile tt::fabric::PacketHeader *const packet_start, uint16_t payload_size_bytes, uint32_t transaction_id) { auto const& header = *packet_start; uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(tt::fabric::PacketHeader); tt::fabric::NocSendType noc_send_type = packet_start->noc_send_type; - auto const payload_size_bytes = header.payload_size_bytes; switch (noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { auto const dest_address = header.command_fields.unicast_write.noc_address; @@ -125,13 +125,14 @@ FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::PacketH // !!!WARNING!!! * do NOT call before determining if the packet should be consumed locally or forwarded // !!!WARNING!!! 
* ENSURE DOWNSTREAM EDM HAS SPACE FOR PACKET BEFORE CALLING // !!!WARNING!!! +template FORCE_INLINE void forward_payload_to_downstream_edm( volatile tt::fabric::PacketHeader *packet_header, + uint16_t payload_size_bytes, tt::fabric::RoutingFields cached_routing_fields, - tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, + tt::fabric::EdmToEdmSender &downstream_edm_interface, uint8_t transaction_id ) { - DPRINT << "Fwding pkt to downstream\n"; // TODO: PERF - this should already be getting checked by the caller so this should be redundant make it an ASSERT ASSERT(downstream_edm_interface.edm_has_space_for_packet()); // best effort check @@ -140,6 +141,6 @@ FORCE_INLINE void forward_payload_to_downstream_edm( update_packet_header_for_next_hop(packet_header, cached_routing_fields); downstream_edm_interface.send_payload_non_blocking_from_address_with_trid( reinterpret_cast(packet_header), - packet_header->get_payload_size_including_header(), + payload_size_bytes + sizeof(tt::fabric::PacketHeader), transaction_id); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index b0c732ee00b..4f7b82b5ce7 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -14,6 +14,7 @@ #include "cpp/ttnn/operations/ccl/shared_with_host/hetergeneous_data_structs.hpp" #include "noc_overlay_parameters.h" +#include "tt_metal/hw/inc/utils/utils.h" #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_counters.hpp" @@ -23,7 +24,7 @@ using ttnn::ccl::WorkerXY; -static constexpr bool enable_first_level_ack = true; +static constexpr bool enable_first_level_ack = false; static constexpr bool fuse_receiver_flush_and_completion_ptr = true; /* @@ -110,8 +111,8 @@ by the worker (the EDM is a slave in this protocol). *NOTE*: Additionally, if a worker pushes packets to a channel it isn't connected to, behaviour is undefined. *NOTE*: Undefined == likely hang -The `WorkerToFabricEdmSender` from `ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp` -provides an implementation of the connection protocol. `WorkerToFabricEdmSender` also acts as a wrapper around that +The `EdmToEdmSender` from `ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp` +provides an implementation of the connection protocol. `EdmToEdmSender` also acts as a wrapper around that protocol so workers can simply call `open()` to execute the connection protocol without having to manually reimplement for each kernel. 
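For readers unfamiliar with the free-running pointer scheme referenced in the adapter header above (local wrptr advanced by the sender, remote rdptr echoed back by the EDM), the following stand-alone sketch shows the 2*N wrap convention those pointers rely on. The names and the channel depth of 8 are illustrative assumptions, not values taken from this patch.

```cpp
// Sketch only: pointers run over [0, 2*N) so that "full" (N slots in flight)
// and "empty" (0 slots in flight) are distinguishable without an extra flag.
#include <cassert>
#include <cstdint>

constexpr uint8_t NUM_SLOTS = 8;             // assumed channel depth
constexpr uint8_t PTR_WRAP = 2 * NUM_SLOTS;  // pointer wrap point

// How many slots the writer is ahead of the reader, modulo the wrap point.
uint8_t slots_used(uint8_t rdptr, uint8_t wrptr) {
    return wrptr >= rdptr ? wrptr - rdptr : (PTR_WRAP - rdptr) + wrptr;
}

bool has_space_for_packet(uint8_t rdptr, uint8_t wrptr) {
    return slots_used(rdptr, wrptr) < NUM_SLOTS;
}

int main() {
    assert(has_space_for_packet(0, 0));           // empty channel
    assert(!has_space_for_packet(0, NUM_SLOTS));  // full: N packets in flight
    assert(has_space_for_packet(15, 2));          // wrapped pointers still compare correctly
    return 0;
}
```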
@@ -265,40 +266,64 @@ struct TransactionIdCounter { template struct WriteTransactionIdTracker { static constexpr uint8_t INVALID_TRID = MAX_TRANSACTION_IDS; + static constexpr bool N_TRIDS_IS_POW2 = is_power_of_2(MAX_TRANSACTION_IDS); + static constexpr bool N_CHANS_IS_POW2 = is_power_of_2(NUM_CHANNELS); + static constexpr uint8_t TRID_POW2_MASK = MAX_TRANSACTION_IDS - 1; + static constexpr bool BOTH_PARAMS_ARE_POW2 = N_TRIDS_IS_POW2 && N_CHANS_IS_POW2; + WriteTransactionIdTracker() { for (size_t i = 0; i < NUM_CHANNELS; i++) { this->buffer_slot_trids[i] = INVALID_TRID; } } FORCE_INLINE void set_buffer_slot_trid(uint8_t trid, tt::fabric::BufferIndex buffer_index) { - this->buffer_slot_trids[buffer_index] = trid; - } - - FORCE_INLINE void advance_trid_counter() { - this->trid_counter.increment(); + if constexpr (!BOTH_PARAMS_ARE_POW2) { + this->buffer_slot_trids[buffer_index] = trid; + } } FORCE_INLINE uint8_t update_buffer_slot_to_next_trid_and_advance_trid_counter(tt::fabric::BufferIndex buffer_index) { - uint8_t next_trid = this->trid_counter.get(); - this->buffer_slot_trids[buffer_index] = next_trid; - this->trid_counter.increment(); - return next_trid; + if constexpr (BOTH_PARAMS_ARE_POW2) { + uint8_t next_trid = buffer_index & TRID_POW2_MASK; + this->trid_counter.increment(); + return next_trid; + } else { + uint8_t next_trid = this->trid_counter.get(); + this->buffer_slot_trids[buffer_index] = next_trid; + this->trid_counter.increment(); + return next_trid; + } } FORCE_INLINE void clear_trid_at_buffer_slot(tt::fabric::BufferIndex buffer_index) { - this->buffer_slot_trids[buffer_index] = INVALID_TRID; + if constexpr (!BOTH_PARAMS_ARE_POW2) { + this->buffer_slot_trids[buffer_index] = INVALID_TRID; + } } FORCE_INLINE uint8_t get_buffer_slot_trid(tt::fabric::BufferIndex buffer_index) const { - return this->buffer_slot_trids[buffer_index]; + if constexpr (BOTH_PARAMS_ARE_POW2) { + return buffer_index & TRID_POW2_MASK; + } else { + return this->buffer_slot_trids[buffer_index]; + } } FORCE_INLINE bool transaction_flushed(tt::fabric::BufferIndex buffer_index) const { - auto trid = this->get_buffer_slot_trid(buffer_index); - return trid == INVALID_TRID || ncrisc_noc_nonposted_write_with_transaction_id_flushed(noc_index, trid); + if constexpr (BOTH_PARAMS_ARE_POW2) { + auto trid = this->get_buffer_slot_trid(buffer_index); + return ncrisc_noc_nonposted_write_with_transaction_id_flushed(noc_index, trid); + } else { + // TODO: should be able to remove compare against INVALID_TRID + auto trid = this->get_buffer_slot_trid(buffer_index); + return trid == INVALID_TRID || ncrisc_noc_nonposted_write_with_transaction_id_flushed(noc_index, trid); + } } private: std::array buffer_slot_trids; TransactionIdCounter trid_counter; + + // TODO: cleanup - only used for when both params are pow2, else above are used. 
+ uint8_t next_trid = 0; }; static constexpr uint32_t DEFAULT_ETH_TXQ = 0; @@ -366,6 +391,8 @@ constexpr std::array to_sender_packets_completed_streams = {{ */ template struct OutboundReceiverChannelPointers { + static constexpr bool is_pow2 = is_power_of_2(RECEIVER_NUM_BUFFERS); + tt::fabric::ChannelBufferPointer wrptr; tt::fabric::ChannelBufferPointer ack_ptr; tt::fabric::ChannelBufferPointer completion_ptr; @@ -571,11 +598,10 @@ FORCE_INLINE void receiver_send_completion_ack( remote_sender_completion_ptr.increment(); } - +template FORCE_INLINE bool can_forward_packet_completely( - const volatile tt::fabric::PacketHeader* packet_header, tt::fabric::RoutingFields cached_routing_fields, - tt::fabric::WorkerToFabricEdmSender& downstream_edm_interface) { + tt::fabric::EdmToEdmSender& downstream_edm_interface) { // We always check if it is the terminal mcast packet value. We can do this because all unicast packets have the // mcast terminal value masked in to the routing field. This simplifies the check here to a single compare. bool deliver_locally_only = cached_routing_fields.value == tt::fabric::RoutingFields::LAST_MCAST_VAL; @@ -583,20 +609,22 @@ FORCE_INLINE bool can_forward_packet_completely( } // !!!WARNING!!! - MAKE SURE CONSUMER HAS SPACE BEFORE CALLING +template FORCE_INLINE void receiver_forward_packet( // TODO: have a separate cached copy of the packet header to save some additional L1 loads volatile tt::fabric::PacketHeader *packet_start, tt::fabric::RoutingFields cached_routing_fields, - tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, + tt::fabric::EdmToEdmSender &downstream_edm_interface, uint8_t transaction_id) { bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; + uint16_t payload_size_bytes = packet_start->payload_size_bytes; if (start_distance_is_terminal_value) { - execute_chip_unicast_to_local_chip(packet_start, transaction_id); + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); } bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; if (not_last_destination_device) { - forward_payload_to_downstream_edm(packet_start, cached_routing_fields, downstream_edm_interface, transaction_id); + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); } } @@ -633,7 +661,6 @@ FORCE_INLINE bool run_sender_channel_step( tt::fabric::validate(*packet_header); packet_header_recorder.record_packet_header(packet_header); } - print_pkt_header(packet_header); send_next_data( local_sender_channel, local_sender_channel_worker_interface, @@ -710,17 +737,16 @@ FORCE_INLINE bool run_sender_channel_step( return did_something; }; -template +template FORCE_INLINE void run_receiver_channel_step( tt::fabric::EthChannelBuffer &local_receiver_channel, std::array, NUM_SENDER_CHANNELS> &remote_sender_channnels, - tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, + tt::fabric::EdmToEdmSender &downstream_edm_interface, volatile tt::fabric::EdmFabricReceiverChannelCounters *receiver_channel_counters_ptr, std::array, NUM_SENDER_CHANNELS> &remote_eth_sender_wrptrs, ReceiverChannelPointers &receiver_channel_pointers, PacketHeaderRecorder &packet_header_recorder, - WriteTransactionIdTracker &receiver_channel_trid_tracker, - ReceiverState *const receiver_state_out) { + WriteTransactionIdTracker 
&receiver_channel_trid_tracker) { auto &ack_ptr = receiver_channel_pointers.ack_ptr; auto pkts_received_since_last_check = get_ptr_val(); @@ -750,12 +776,11 @@ FORCE_INLINE void run_receiver_channel_step( volatile auto packet_header = local_receiver_channel.get_packet_header(receiver_buffer_index); tt::fabric::RoutingFields cached_routing_fields = const_cast(packet_header)->routing_fields; - print_pkt_header(packet_header); bool can_send_to_all_local_chip_receivers = - can_forward_packet_completely(packet_header, cached_routing_fields, downstream_edm_interface); + can_forward_packet_completely( + cached_routing_fields, downstream_edm_interface); bool trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); if (can_send_to_all_local_chip_receivers && trid_flushed) { - // DeviceZoneScopedN("EDMR-Send-Impl"); uint8_t trid = receiver_channel_trid_tracker.update_buffer_slot_to_next_trid_and_advance_trid_counter(receiver_buffer_index); receiver_forward_packet(packet_header, cached_routing_fields, downstream_edm_interface, trid); wr_sent_ptr.increment(); @@ -822,7 +847,7 @@ FORCE_INLINE bool got_termination_signal(volatile tt::fabric::TerminationSignal got_graceful_termination_signal(termination_signal_ptr); } -template +template bool all_channels_drained(tt::fabric::EthChannelBuffer &local_receiver_channel, std::array, NUM_SENDER_CHANNELS> &local_sender_channels, std::array, NUM_SENDER_CHANNELS> &local_sender_channel_worker_interfaces, @@ -849,12 +874,12 @@ bool all_channels_drained(tt::fabric::EthChannelBuffer &lo * Every loop iteration visit a sender channel and the receiver channel. Switch between sender * channels every iteration unless it is unsafe/undesirable to do so (e.g. for performance reasons). */ -template +template void run_fabric_edm_main_loop( tt::fabric::EthChannelBuffer &local_receiver_channel, std::array, NUM_SENDER_CHANNELS> &local_sender_channels, std::array, NUM_SENDER_CHANNELS> &local_sender_channel_worker_interfaces, - tt::fabric::WorkerToFabricEdmSender &downstream_edm_noc_interface, + tt::fabric::EdmToEdmSender &downstream_edm_noc_interface, std::array, NUM_SENDER_CHANNELS> &remote_sender_channels, tt::fabric::EthChannelBuffer &remote_receiver_channel, volatile tt::fabric::TerminationSignal *termination_signal_ptr, @@ -864,7 +889,6 @@ void run_fabric_edm_main_loop( std::array &sender_channel_packet_recorders) { std::array sender_states = { SenderState::SENDER_WAIT_WORKER_HANDSHAKE, SenderState::SENDER_WAIT_WORKER_HANDSHAKE}; - ReceiverState receiver_state = ReceiverState::RECEIVER_WAITING_FOR_ETH; size_t sender_channel_index = 0; size_t did_nothing_count = 0; *termination_signal_ptr = tt::fabric::TerminationSignal::KEEP_RUNNING; @@ -883,6 +907,11 @@ void run_fabric_edm_main_loop( WriteTransactionIdTracker receiver_channel_trid_tracker; + // This value defines the number of loop iterations we perform of the main control sequence before exiting + // to check for termination and context switch. Removing the these checks from the inner loop can drastically + // improve performance. The value of 32 was chosen somewhat empirically and then raised up slightly. 
+ constexpr uint32_t DEFAULT_ITERATIONS_BETWEEN_CTX_SWITCH_AND_TEARDOWN_CHECKS = 32; + while (!got_immediate_termination_signal(termination_signal_ptr)) { bool got_graceful_termination = got_graceful_termination_signal(termination_signal_ptr); if (got_graceful_termination) { @@ -894,33 +923,41 @@ void run_fabric_edm_main_loop( return; } } - - // Capture these to see if we made progress - auto old_recv_state = receiver_state; - - // There are some cases, mainly for performance, where we don't want to switch between sender channels - // so we interoduce this to provide finer grain control over when we disable the automatic switching - bool did_something_sender = run_sender_channel_step( - local_sender_channels[sender_channel_index], - local_sender_channel_worker_interfaces[sender_channel_index], - outbound_to_receiver_channel_pointers, - remote_receiver_channel, - sender_channel_counters_ptrs[sender_channel_index], - sender_channel_packet_recorders[sender_channel_index], - channel_connection_established[sender_channel_index], - sender_channel_index); - - sender_channel_index = 1 - sender_channel_index; - - run_receiver_channel_step( - local_receiver_channel, remote_sender_channels, downstream_edm_noc_interface, receiver_channel_counters_ptr, - remote_eth_sender_wrptrs, - receiver_channel_pointers, - receiver_channel_packet_recorder, - receiver_channel_trid_tracker, - &receiver_state); - - bool did_something = did_something_sender || old_recv_state != receiver_state; + bool did_something = false; + for (size_t i = 0; i < DEFAULT_ITERATIONS_BETWEEN_CTX_SWITCH_AND_TEARDOWN_CHECKS; i++) { + // Capture these to see if we made progress + + // There are some cases, mainly for performance, where we don't want to switch between sender channels + // so we interoduce this to provide finer grain control over when we disable the automatic switching + bool did_something_sender = run_sender_channel_step( + local_sender_channels[0], + local_sender_channel_worker_interfaces[0], + outbound_to_receiver_channel_pointers, + remote_receiver_channel, + sender_channel_counters_ptrs[0], + sender_channel_packet_recorders[0], + channel_connection_established[0], + 0); + + run_receiver_channel_step( + local_receiver_channel, remote_sender_channels, downstream_edm_noc_interface, receiver_channel_counters_ptr, + remote_eth_sender_wrptrs, + receiver_channel_pointers, + receiver_channel_packet_recorder, + receiver_channel_trid_tracker); + + bool did_something_sender2 = run_sender_channel_step( + local_sender_channels[1], + local_sender_channel_worker_interfaces[1], + outbound_to_receiver_channel_pointers, + remote_receiver_channel, + sender_channel_counters_ptrs[1], + sender_channel_packet_recorders[1], + channel_connection_established[1], + 1); + + did_something = did_something || did_something_sender || did_something_sender2; + } if (did_something) { did_nothing_count = 0; @@ -1113,7 +1150,7 @@ void kernel_main() { } auto downstream_edm_noc_interface = has_downstream_edm_buffer_connection - ? tt::fabric::WorkerToFabricEdmSender( + ? 
tt::fabric::EdmToEdmSender( //persistent_mode -> hardcode to false because for EDM -> EDM // connections we must always use semaphore lookup false, @@ -1129,7 +1166,7 @@ void kernel_main() { reinterpret_cast(edm_forwarding_semaphore_address), reinterpret_cast(edm_teardown_semaphore_address), downstream_noc_interface_buffer_index_local_addr) - : tt::fabric::WorkerToFabricEdmSender(); + : tt::fabric::EdmToEdmSender(); auto local_receiver_channel = tt::fabric::EthChannelBuffer( local_receiver_channel_buffer_address, diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp index 2285a6c42cb..369c4f57f33 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp @@ -17,148 +17,9 @@ #include "cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_types.hpp" #include "cpp/ttnn/operations/ccl/shared_with_host/hetergeneous_data_structs.hpp" #include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp" - +#include "cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_flow_control_helpers.hpp" namespace tt::fabric { -template -class NamedType -{ -public: - FORCE_INLINE explicit NamedType(T const& value) : value_(value) {} - FORCE_INLINE explicit NamedType(T&& value) : value_(std::move(value)) {} - FORCE_INLINE NamedType &operator=(NamedType const& rhs) = default; - FORCE_INLINE T& get() { return value_; } - FORCE_INLINE T const& get() const {return value_; } - FORCE_INLINE operator T() const { return value_; } - FORCE_INLINE operator T&() { return value_; } -private: - T value_; -}; - -using BufferIndex = NamedType; -using BufferPtr = NamedType; - - -// Increments val and wraps to 0 if it reaches limit -template -FORCE_INLINE -auto wrap_increment(T val) -> T { - static_assert(LIMIT != 0, "wrap_increment called with limit of 0; it must be greater than 0"); - constexpr bool is_pow2 = is_power_of_2(LIMIT); - if constexpr (LIMIT == 1) { - return val; - } else if constexpr (LIMIT == 2) { - return 1 - val; - } else if constexpr (is_pow2) { - return (val + 1) & (LIMIT - 1); - } else { - return (val == static_cast(LIMIT - 1)) ? static_cast(0) : static_cast(val + 1); - } -} -template -FORCE_INLINE -auto wrap_increment_n(T val, uint8_t increment) -> T { - static_assert(LIMIT != 0, "wrap_increment called with limit of 0; it must be greater than 0"); - constexpr bool is_pow2 = is_power_of_2(LIMIT); - if constexpr (LIMIT == 1) { - return val; - } else if constexpr (LIMIT == 2) { - return 1 - val; - } else if constexpr (is_pow2) { - return (val + increment) & (LIMIT - 1); - } else { - T new_unadjusted_val = val + increment; - bool wraps = new_unadjusted_val >= LIMIT; - return wraps ? 
static_cast(new_unadjusted_val - LIMIT) : static_cast(new_unadjusted_val); - } -} - -template -FORCE_INLINE -auto normalize_ptr(BufferPtr ptr) -> BufferIndex { - static_assert(NUM_BUFFERS != 0, "normalize_ptr called with NUM_BUFFERS of 0; it must be greater than 0"); - constexpr bool is_size_pow2 = (NUM_BUFFERS & (NUM_BUFFERS - 1)) == 0; - constexpr bool is_size_2 = NUM_BUFFERS == 2; - constexpr bool is_size_1 = NUM_BUFFERS == 1; - constexpr uint8_t wrap_mask = NUM_BUFFERS - 1; - if constexpr (is_size_pow2) { - return BufferIndex{ptr & wrap_mask}; - } else if constexpr (is_size_2) { - return BufferIndex{(uint8_t)1 - ptr}; - } else if constexpr (is_size_1) { - return BufferIndex{0}; - } else { - // note it may make sense to calculate this only when we increment - // which will save calculations overall (but may add register pressure) - // and introduce undesirable loads - bool normalize = ptr >= NUM_BUFFERS; - uint8_t normalized_ptr = ptr.get() - static_cast(normalize * NUM_BUFFERS); - ASSERT(normalized_ptr < NUM_BUFFERS); - return BufferIndex{normalized_ptr}; - } -} - - -template -class ChannelBufferPointer { - static_assert(NUM_BUFFERS <= std::numeric_limits::max() / 2, "NUM_BUFFERS must be less than or half of std::numeric_limits::max() due to the internal implementation"); - public: - static constexpr bool is_size_pow2 = (NUM_BUFFERS & (NUM_BUFFERS - 1)) == 0; - static constexpr bool is_size_2 = NUM_BUFFERS == 2; - static constexpr bool is_size_1 = NUM_BUFFERS == 1; - static constexpr uint8_t ptr_wrap_size = 2 * NUM_BUFFERS; - - // Only to use if is_size_pow2 - static constexpr uint8_t ptr_wrap_mask = (2 * NUM_BUFFERS) - 1; - static constexpr uint8_t buffer_wrap_mask = NUM_BUFFERS - 1; - ChannelBufferPointer() : ptr(0) {} - /* - * Returns the "raw" pointer - not usable to index the buffer channel - */ - FORCE_INLINE BufferPtr get_ptr() const { - return this->ptr; - } - - FORCE_INLINE bool is_caught_up_to(ChannelBufferPointer const& leading_ptr) const { - return this->is_caught_up_to(leading_ptr.get_ptr()); - } - FORCE_INLINE uint8_t distance_behind(ChannelBufferPointer const& leading_ptr) const { - return this->distance_behind(leading_ptr.get_ptr()); - } - - /* - * Returns the buffer index pointer which is usable to index into the buffer memory - */ - FORCE_INLINE BufferIndex get_buffer_index() const { - return BufferIndex{normalize_ptr(this->ptr)}; - } - - FORCE_INLINE void increment_n(uint8_t n) { - this->ptr = BufferPtr{wrap_increment_n<2*NUM_BUFFERS>(this->ptr.get(), n)}; - } - FORCE_INLINE void increment() { - this->ptr = wrap_increment<2*NUM_BUFFERS>(this->ptr); - } - - private: - // Make these private to make sure caller doesn't accidentally mix two pointers pointing to - // different sized channels - FORCE_INLINE bool is_caught_up_to(BufferPtr const& leading_ptr) const { - return this->get_ptr() == leading_ptr; - } - FORCE_INLINE uint8_t distance_behind(BufferPtr const& leading_ptr) const { - bool leading_gte_trailing_ptr = leading_ptr >= this->ptr; - if constexpr (is_size_pow2) { - return (leading_ptr - this->ptr) & ptr_wrap_mask; - } else { - return leading_gte_trailing_ptr ? 
- leading_ptr - this->ptr : - ptr_wrap_size - (this->ptr - leading_ptr); - } - } - BufferPtr ptr = BufferPtr{0}; -}; - template FORCE_INLINE auto wrap_increment(T val, size_t max) { @@ -310,7 +171,7 @@ struct EdmChannelWorkerInterface { (uint32_t)worker_info.worker_xy.x, (uint32_t)worker_info.worker_xy.y, worker_info.worker_teardown_semaphore_address); // Set connection to unused so it's available for next worker - *this->connection_live_semaphore = tt::fabric::WorkerToFabricEdmSender::unused_connection_value; + *this->connection_live_semaphore = tt::fabric::EdmToEdmSender<0>::unused_connection_value; *reinterpret_cast(&(worker_location_info_ptr->edm_rdptr)) = last_edm_rdptr_value; @@ -329,8 +190,8 @@ struct EdmChannelWorkerInterface { worker_location_info_ptr->edm_rdptr = local_ackptr.get_ptr(); } - [[nodiscard]] FORCE_INLINE bool has_worker_teardown_request() const { return *connection_live_semaphore == tt::fabric::WorkerToFabricEdmSender::close_connection_request_value; } - [[nodiscard]] FORCE_INLINE bool connection_is_live() const { return *connection_live_semaphore == tt::fabric::WorkerToFabricEdmSender::open_connection_value; } + [[nodiscard]] FORCE_INLINE bool has_worker_teardown_request() const { return *connection_live_semaphore == tt::fabric::EdmToEdmSender<0>::close_connection_request_value; } + [[nodiscard]] FORCE_INLINE bool connection_is_live() const { return *connection_live_semaphore == tt::fabric::EdmToEdmSender<0>::open_connection_value; } volatile EDMChannelWorkerLocationInfo *worker_location_info_ptr; volatile tt_l1_ptr uint32_t *const remote_producer_wrptr; From 2958cac744e213b1816e1565b92b71c19786f07e Mon Sep 17 00:00:00 2001 From: Nour Ardo Date: Tue, 18 Feb 2025 10:38:18 -0500 Subject: [PATCH 127/316] Fix shape in outer (#17492) ### Ticket Link to Github Issue https://github.com/tenstorrent/tt-metal/issues/16882 ### Problem description ttnn::outer fails after tilizing the inputs ### What's changed outer op is checking the padded size of the inputs which is causing the error. This PR changes the shape used in outer ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13167635235 - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- .../eltwise/binary/device/binary_composite_op.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp index a4dac8812f1..7a9cbc4be60 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp @@ -476,8 +476,8 @@ Tensor _scatter(const Tensor& input_a, const Tensor& input_b, const std::optiona * by running reshape. 
*/ Tensor _outer(const Tensor& input_a, const Tensor& input_b, const std::optional& output_mem_config) { - const ttnn::Shape s_a = input_a.padded_shape(); - const ttnn::Shape s_b = input_b.padded_shape(); + const ttnn::Shape s_a = input_a.get_logical_shape(); + const ttnn::Shape s_b = input_b.get_logical_shape(); auto num_ones = [](const ttnn::Shape& s) -> uint32_t { uint32_t num1s = 0; for (uint32_t idx = 0; idx < 4; idx++) { @@ -497,10 +497,12 @@ Tensor _outer(const Tensor& input_a, const Tensor& input_b, const std::optional< Tensor b_slim = input_b; if (!skip_reshape_a) { - a_slim = ttnn::reshape(input_a, ttnn::Shape{std::array{1, 1, input_a.volume(), 1}}); + uint32_t a_volume = s_a[0] * s_a[1] * s_a[2] * s_a[3]; + a_slim = ttnn::reshape(input_a, ttnn::Shape{std::array{1, 1, a_volume, 1}}); } if (!skip_reshape_b) { - b_slim = ttnn::reshape(input_b, ttnn::Shape{std::array{1, 1, 1, input_b.volume()}}); + uint32_t b_volume = s_b[0] * s_b[1] * s_b[2] * s_b[3]; + b_slim = ttnn::reshape(input_b, ttnn::Shape{std::array{1, 1, 1, b_volume}}); } a_slim = ttnn::to_layout(a_slim, ttnn::TILE_LAYOUT, std::nullopt, std::nullopt, (IDevice*)nullptr); b_slim = ttnn::to_layout(b_slim, ttnn::TILE_LAYOUT, std::nullopt, std::nullopt, (IDevice*)nullptr); From be555b1d3d9c165f24c2f1019be3aca179e59b1c Mon Sep 17 00:00:00 2001 From: Nicholas Smith Date: Fri, 14 Feb 2025 15:12:10 -0600 Subject: [PATCH 128/316] Install RPATH ORIGIN Add ORIGIN to both ttnn and tt_metal library RPATH's to simplify wheel installation for upstream consumers. --- tt_metal/CMakeLists.txt | 2 +- ttnn/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index 44f80bb4ec0..11c36177fa9 100644 --- a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -131,7 +131,7 @@ set_target_properties( tt_metal PROPERTIES INSTALL_RPATH - "${PROJECT_BINARY_DIR}/lib" + "${PROJECT_BINARY_DIR}/lib;$ORIGIN" ADDITIONAL_CLEAN_FILES "${PROJECT_BINARY_DIR}/lib;${PROJECT_BINARY_DIR}/obj" ) diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 7eb79f85d0d..eb63d038eda 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -861,6 +861,7 @@ TT_ENABLE_UNITY_BUILD(ttnn) set(TTNN_INSTALL_RPATH "${PROJECT_BINARY_DIR}/lib" "$ORIGIN/build/lib" + "$ORIGIN" ) #Make sure library built is _ttnn.so and that it can find all it's linked libraries From ed210e7dae8dafba91a5434d6fbb50dc7dce8932 Mon Sep 17 00:00:00 2001 From: Atul Krishnadas Date: Tue, 18 Feb 2025 08:36:59 -0800 Subject: [PATCH 129/316] #17094: fill implicit pad sharded using the new shardedAddrGen (#17692) --- .../unit_tests/operations/test_fill_pad.py | 153 +++++++++++++++++- .../fill_pad/device/fill_pad_op.cpp | 6 - .../device/fill_pad_program_factory.cpp | 13 +- .../kernels/dataflow/fill_pad_writer.cpp | 28 +++- 4 files changed, 187 insertions(+), 13 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_fill_pad.py b/tests/ttnn/unit_tests/operations/test_fill_pad.py index 48dff554b6c..489cb371325 100644 --- a/tests/ttnn/unit_tests/operations/test_fill_pad.py +++ b/tests/ttnn/unit_tests/operations/test_fill_pad.py @@ -5,6 +5,7 @@ import pytest import torch import ttnn +import math from tests.ttnn.utils_for_testing import assert_with_pcc from models.utility_functions import torch_random, run_for_wormhole_b0 @@ -52,12 +53,12 @@ def create_nd_padded_tiled_tensor(shape, tile_size, fill_value, dtype): ttnn.bfloat16: torch.float32, } +# torch.set_printoptions(threshold=10000) + -# @pytest.mark.parametrize("shape", 
[(2, 32, 300, 256)]) @pytest.mark.parametrize( "shape", [ - # 2D shapes with edge cases for fill_pad (1, 16), (16, 1), (1, 17), @@ -67,6 +68,7 @@ def create_nd_padded_tiled_tensor(shape, tile_size, fill_value, dtype): (31, 31), (33, 33), (65, 65), + (97, 97), (1, 2, 3, 2, 1, 2, 97, 97), ], ) @@ -96,3 +98,150 @@ def test_fill_pad( padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor) + + +@pytest.mark.parametrize("fill_value", [1]) +@pytest.mark.parametrize( + "shape", + [ + (1, 16), + (97, 97), + ], +) +@pytest.mark.parametrize( + "shard_scheme", + [ + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + ttnn.TensorMemoryLayout.BLOCK_SHARDED, + ], +) +@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.uint32]) +def test_fill_pad_complex_sharding(device, fill_value, shape, shard_scheme, dtype): + torch.manual_seed(1234) + torch_input_tensor, padded_torch_tensor = create_nd_padded_tiled_tensor( + shape, 32, fill_value, ttnn_dtype_to_torch_dtype[dtype] + ) + num_cores_xblock = 2 + num_cores_yblock = 4 + num_cores = num_cores_xblock * num_cores_yblock + + # Add complex shard grid with 2 X 4 = 8 cores + shard_grid = ttnn.CoreRangeSet( + [ + ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 1)), + ttnn.CoreRange(ttnn.CoreCoord(2, 0), ttnn.CoreCoord(3, 1)), + ttnn.CoreRange(ttnn.CoreCoord(0, 4), ttnn.CoreCoord(0, 5)), + ] + ) + + tiles_per_2d = padded_torch_tensor.shape[-2] * padded_torch_tensor.shape[-1] / (32 * 32) + dims_b4_last_dim = 1 + for i in range(len(padded_torch_tensor.shape) - 1): + dims_b4_last_dim *= padded_torch_tensor.shape[i] + + shard_shape = [32, 32] + if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED: + shard_shape = (dims_b4_last_dim, 32 * math.ceil((math.ceil(padded_torch_tensor.shape[-1] / 32) / num_cores))) + elif shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED: + tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores) + shard_shape = (32 * tile_widths_per_core, padded_torch_tensor.shape[-1]) + elif shard_scheme == ttnn.TensorMemoryLayout.BLOCK_SHARDED: + tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores_xblock) + shard_shape = ( + 32 * tile_widths_per_core, + 32 * math.ceil((math.ceil(padded_torch_tensor.shape[-1] / 32) / num_cores_yblock)), + ) + else: + shard_shape = (math.ceil(math.sqrt(tiles_per_core)), math.ceil(math.sqrt(tiles_per_core))) + + shard_spec = ttnn.ShardSpec(shard_grid, shard_shape, ttnn.ShardOrientation.ROW_MAJOR) + output_mem_config = ttnn.MemoryConfig( + shard_scheme, + ttnn.BufferType.L1, + shard_spec, + ) + + input_tensor = ttnn.to_device( + ttnn.from_torch(torch_input_tensor, dtype=dtype, layout=ttnn.TILE_LAYOUT), + device, + memory_config=output_mem_config, + ) + + output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=ttnn.DRAM_MEMORY_CONFIG) + padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() + + assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor, 0.99) + + +@pytest.mark.parametrize("fill_value", [1]) +@pytest.mark.parametrize( + "shape", + [ + (1, 16), + (16, 1), + (17, 17), + (17, 1), + (16, 16), + (17, 17), + (31, 31), + (33, 33), + (97, 97), + ], +) +@pytest.mark.parametrize( + "shard_scheme", + [ + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + ttnn.TensorMemoryLayout.BLOCK_SHARDED, + ], +) +@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.uint32]) +def 
test_fill_pad_sharded(device, fill_value, shape, shard_scheme, dtype): + torch.manual_seed(1234) + torch_input_tensor, padded_torch_tensor = create_nd_padded_tiled_tensor( + shape, 32, fill_value, ttnn_dtype_to_torch_dtype[dtype] + ) + + num_cores_x = 8 + num_cores_y = 7 + num_cores = num_cores_x * num_cores_y + shard_grid = ttnn.CoreRangeSet( + [ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(num_cores_x - 1, num_cores_y - 1))] + ) + + tiles_per_2d = padded_torch_tensor.shape[-2] * padded_torch_tensor.shape[-1] / (32 * 32) + dims_b4_last_dim = 1 + for i in range(len(padded_torch_tensor.shape) - 1): + dims_b4_last_dim *= padded_torch_tensor.shape[i] + + shard_shape = [32, 32] + if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED: + shard_shape = (dims_b4_last_dim, 32 * math.ceil((math.ceil(padded_torch_tensor.shape[-1] / 32) / num_cores))) + elif shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED: + tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores) + shard_shape = (32 * tile_widths_per_core, padded_torch_tensor.shape[-1]) + elif shard_scheme == ttnn.TensorMemoryLayout.BLOCK_SHARDED: + tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores_x) + shard_shape = (32 * tile_widths_per_core, 32 * math.ceil((padded_torch_tensor.shape[-1] / 32 / num_cores_y))) + else: + shard_shape = (math.ceil(math.sqrt(tiles_per_core)), math.ceil(math.sqrt(tiles_per_core))) + + shard_spec = ttnn.ShardSpec(shard_grid, shard_shape, ttnn.ShardOrientation.ROW_MAJOR) + output_mem_config = ttnn.MemoryConfig( + shard_scheme, + ttnn.BufferType.L1, + shard_spec, + ) + + input_tensor = ttnn.to_device( + ttnn.from_torch(torch_input_tensor, dtype=dtype, layout=ttnn.TILE_LAYOUT), + device, + memory_config=output_mem_config, + ) + + output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=ttnn.DRAM_MEMORY_CONFIG) + padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() + + assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor, 0.99) diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp index 78c13267c69..3de81f581ff 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp @@ -14,12 +14,6 @@ namespace ttnn::operations::data_movement { void FillPad::validate(const std::vector& input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); TT_FATAL(input_tensor_a.get_layout() == TILE_LAYOUT, "FillPad should only be used for tile layout"); - TT_FATAL( - input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::INTERLEAVED, - "FillPad does not currently support sharding"); - TT_FATAL( - this->output_mem_config.memory_layout == TensorMemoryLayout::INTERLEAVED, - "FillPad does not currently support sharding"); } std::vector FillPad::compute_output_specs(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp index e798d9f0c3f..b07c6e65bf0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp @@ -9,6 +9,7 @@ #include #include #include +#include "ttnn/operations/ccl/sharding_addrgen_helper.hpp" bool 
is_power_of_two_at_least_32(uint32_t value) { return value >= 32 && (value & (value - 1)) == 0; } @@ -68,6 +69,8 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, padded_height / tt::constants::TILE_HEIGHT * padded_width / tt::constants::TILE_HEIGHT; uint32_t tiles_per_tile_row = padded_width / tt::constants::TILE_HEIGHT; + bool sharded = input_tensor.memory_config().memory_layout != TensorMemoryLayout::INTERLEAVED; + // create kernel // reader compile time args std::vector writer_compile_time_args = { @@ -82,7 +85,12 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, (std::uint32_t)tiles_per_2d_tensor, (std::uint32_t)tiles_per_tile_row, (std::uint32_t)tt::constants::TILE_HEIGHT, - (std::uint32_t)tt::constants::FACE_HEIGHT}; + (std::uint32_t)tt::constants::FACE_HEIGHT, + (std::uint32_t)sharded}; + + if (sharded) { + shard_builder::extend_sharding_compile_time_args(input_tensor, writer_compile_time_args); + } tt::tt_metal::KernelHandle writer_kernel_id = tt::tt_metal::CreateKernel( program, @@ -102,6 +110,9 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, { writer_runtime_args[2] = tile_offset; writer_runtime_args[3] = local_num_2d_tensors; + if (sharded) { + shard_builder::extend_sharding_run_time_args(input_tensor, writer_runtime_args); + } tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, writer_runtime_args); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp index a94aa7fdea0..91d166e9510 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include "dataflow_api.h" +#include "cpp/ttnn/operations/ccl/shared_with_host/sharded_tensor_addr_gen.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/sharding_addrgen.hpp" void kernel_main() { constexpr uint32_t cb_id_0 = get_compile_time_arg_val(0); @@ -19,20 +21,38 @@ void kernel_main() { constexpr uint32_t tile_size = get_compile_time_arg_val(10); constexpr uint32_t tile_hw = tile_size * tile_size; constexpr uint32_t face_size = get_compile_time_arg_val(11); +#define SHARDED get_compile_time_arg_val(12) == 1 constexpr uint32_t face_hw = face_size * face_size; constexpr uint32_t alignment_adjustor = 16; - uint32_t dst_addr = get_arg_val(0); - uint32_t cb_page_size = get_arg_val(1); - uint32_t starting_tile_offset = get_arg_val(2); - uint32_t num_2d_tensors = get_arg_val(3); + uint32_t rt_arg_ind = 0; + uint32_t dst_addr = get_arg_val(rt_arg_ind++); + uint32_t cb_page_size = get_arg_val(rt_arg_ind++); + uint32_t starting_tile_offset = get_arg_val(rt_arg_ind++); + uint32_t num_2d_tensors = get_arg_val(rt_arg_ind++); +#if (SHARDED) + typedef ShardedInfo< + get_compile_time_arg_val(13), + get_compile_time_arg_val(14), + get_compile_time_arg_val(15), + get_compile_time_arg_val(16), + get_compile_time_arg_val(17), + get_compile_time_arg_val(18), + get_compile_time_arg_val(19)> + tensor_shard_info; + + const auto [mapping_table, rt_increment] = + experimental::shard_addr_gen_utils::get_shard_map(get_arg_addr(rt_arg_ind)); + experimental::ShardedAddrGen s0 = {.bank_base_address = dst_addr, .shard_array = mapping_table}; +#else const DataFormat data_format = get_dataformat(cb_id_0); const 
InterleavedAddrGenFast s0 = { .bank_base_address = dst_addr, .page_size = tile_hw * element_size_bytes, .data_format = data_format // page_size needs to be tile_size_bytes }; +#endif // Reserve and push the fill value into the circular buffer cb_reserve_back(cb_id_0, 1); From 6e257a5c5fdbbd7d4b1bd6944936c82ece768460 Mon Sep 17 00:00:00 2001 From: William Ly Date: Tue, 18 Feb 2025 12:24:08 -0500 Subject: [PATCH 130/316] [skip ci] #0: Fix produce_data bug "jq: error: writing output failed: Broken pipe" (#17953) ### Ticket ### Problem description Recent produce_data workflows started bugging out on a line that checks github API for artifacts starting with "test_reports_*" with `jq: error: writing output failed: Broken pipe` https://github.com/tenstorrent/tt-metal/actions/runs/13382103493/job/37372300588#step:7:9 ### What's changed Store all output from gh api into var, and then `grep -q` after. ### Checklist - [x] New/Existing tests provide coverage for changes Same failing workflow, rerun on branch with fix: https://github.com/tenstorrent/tt-metal/actions/runs/13396159663 --- .../github/download_cicd_logs_and_artifacts.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/infra/data_collection/github/download_cicd_logs_and_artifacts.sh b/infra/data_collection/github/download_cicd_logs_and_artifacts.sh index 1c5d3852a8d..48e265c6f61 100755 --- a/infra/data_collection/github/download_cicd_logs_and_artifacts.sh +++ b/infra/data_collection/github/download_cicd_logs_and_artifacts.sh @@ -17,7 +17,9 @@ download_artifacts() { local repo=$1 local workflow_run_id=$2 - if gh api --paginate /repos/$repo/actions/runs/$workflow_run_id/artifacts | jq '.artifacts[] | .name' | grep -q "test_reports_"; then + echo "[info] Downloading test reports for workflow run $workflow_run_id" + api_output=$(gh api --paginate /repos/$repo/actions/runs/$workflow_run_id/artifacts | jq -r '.artifacts[] | .name') + if echo "$api_output" | grep -q "test_reports_"; then gh run download --repo $repo -D generated/cicd/$workflow_run_id/artifacts --pattern test_reports_* $workflow_run_id else echo "[Warning] Test reports not found for workflow run $workflow_run_id" From d08245ef3c03197bab2b199a49e6fd5d99f3b195 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Tue, 18 Feb 2025 12:37:44 -0500 Subject: [PATCH 131/316] #0: Include in xtensor conversion utils (#17948) ### Ticket N/A ### Problem description `tt::stl::SmallVector` removed a dependency on c++20 std::span, which was transitively included in this header. This [breaks](https://github.com/tenstorrent/tt-mlir/actions/runs/13384256221/job/37378049606?pr=2194) tt-mlir. ### What's changed Include ``. ### Checklist - Compilation tested locally, @brataTT confirmed the fix works for tt-mlir. 
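For context, the failure mode being fixed is the usual transitive-include trap: a header that names `std::span` only compiles while some other header happens to pull `<span>` in first, and breaks as soon as that dependency is dropped. A minimal sketch of the fix pattern (hypothetical file and function names, not the real `conversion_utils.hpp`):

```cpp
// Illustrative only: a header that uses std::span in its interface should
// include <span> itself rather than rely on another header to provide it.
#pragma once

#include <cstdint>
#include <span>

inline uint64_t sum_bytes(std::span<const uint8_t> bytes) {
    uint64_t total = 0;
    for (uint8_t b : bytes) {
        total += b;
    }
    return total;
}
```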
--- ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.hpp b/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.hpp index df97212e648..fa7b15c6ee4 100644 --- a/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.hpp +++ b/ttnn/cpp/ttnn/tensor/xtensor/conversion_utils.hpp @@ -4,6 +4,7 @@ #pragma once +#include #include #include "ttnn/tensor/tensor.hpp" From 6573fa85d63b8f2041076cabe33afdb3c3ef9643 Mon Sep 17 00:00:00 2001 From: aagarwalTT Date: Tue, 18 Feb 2025 11:41:28 -0600 Subject: [PATCH 132/316] Remove gatekeeper kernel from fabric launch --- .../kernels/tt_fabric_traffic_gen_tx.cpp | 13 +- .../routing/kernels/tt_fabric_tx_ubench.cpp | 12 +- .../routing/test_tt_fabric_sanity.cpp | 151 ++++-------------- tt_fabric/hw/inc/tt_fabric.h | 2 +- tt_fabric/hw/inc/tt_fabric_api.h | 36 +---- tt_fabric/impl/kernels/tt_fabric_router.cpp | 69 +++++--- 6 files changed, 93 insertions(+), 190 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index 48351327002..2dac3ffaebe 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -83,10 +83,6 @@ packet_header_t packet_header __attribute__((aligned(16))); uint32_t target_address; uint32_t noc_offset; uint32_t rx_addr_hi; - -uint32_t gk_interface_addr_l; -uint32_t gk_interface_addr_h; - uint32_t controller_noc_offset; // flag to check if need to zero out notification addr @@ -389,11 +385,9 @@ void kernel_main() { src_endpoint_id = get_arg_val(increment_arg_idx(rt_args_idx)); noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); controller_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t routing_plane = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t outbound_eth_chan = get_arg_val(increment_arg_idx(rt_args_idx)); dest_device = get_arg_val(increment_arg_idx(rt_args_idx)); uint32_t rx_buf_size = get_arg_val(increment_arg_idx(rt_args_idx)); - gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); - gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); if constexpr (ASYNC_WR & test_command) { base_target_address = get_arg_val(increment_arg_idx(rt_args_idx)); @@ -462,9 +456,8 @@ void kernel_main() { uint32_t packet_count = 0; // initalize client - fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); - routing_table = reinterpret_cast( - client_interface->routing_tables_l1_offset + sizeof(fabric_router_l1_config_t) * routing_plane); + fabric_endpoint_init(client_interface_addr, outbound_eth_chan); + routing_table = reinterpret_cast(client_interface->routing_tables_l1_offset); while (true) { iter++; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp index d9991ed8b67..ae1bebc19de 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp @@ -68,8 +68,6 @@ volatile fabric_client_interface_t* client_interface; uint64_t xy_local_addr; uint32_t target_address; uint32_t noc_offset; -uint32_t gk_interface_addr_l; -uint32_t gk_interface_addr_h; 
uint32_t controller_noc_offset; uint32_t time_seed; @@ -94,11 +92,9 @@ void kernel_main() { src_endpoint_id = get_arg_val(increment_arg_idx(rt_args_idx)); noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); controller_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t routing_plane = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t outbound_eth_chan = get_arg_val(increment_arg_idx(rt_args_idx)); dest_device = get_arg_val(increment_arg_idx(rt_args_idx)); uint32_t rx_buf_size = get_arg_val(increment_arg_idx(rt_args_idx)); - gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); - gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); if constexpr (ASYNC_WR & test_command) { base_target_address = get_arg_val(increment_arg_idx(rt_args_idx)); @@ -140,7 +136,7 @@ void kernel_main() { } // initalize client - fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + fabric_endpoint_init(client_interface_addr, outbound_eth_chan); // notify the controller kernel that this worker is ready to proceed notify_traffic_controller(); @@ -161,7 +157,7 @@ void kernel_main() { client_interface->local_pull_request.pull_request.words_read = 0; if constexpr (mcast_data) { fabric_async_write_multicast( - routing_plane, // the network plane to use for this transaction + 0, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory dest_device >> 16, dest_device & 0xFFFF, @@ -173,7 +169,7 @@ void kernel_main() { s_depth); } else { fabric_async_write( - routing_plane, // the network plane to use for this transaction + 0, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory dest_device >> 16, dest_device & 0xFFFF, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index a0e91bd4dc2..f9ff6e03670 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -34,15 +34,7 @@ uint32_t time_seed; // decides if the tx puts the data directly on eth or if a noc hop is allowed as well bool allow_1st_noc_hop = false; -// Gatekeeper kernel coordinates -uint32_t gk_x, gk_y; - -// Check if gatekeeper runs on tensix worker or idle ethernet based on the board type -bool run_gk_on_idle_ethernet; - uint32_t routing_table_addr; -uint32_t gk_interface_addr; -uint32_t socket_info_addr; // if the traffic b/w any pair of chips is bi-directional bool bidirectional_traffic; @@ -54,7 +46,6 @@ uint32_t tx_signal_address; uint32_t host_signal_address; // kernels -const std::string gatekeeper_kernel_src = "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp"; const std::string router_kernel_src = "tt_fabric/impl/kernels/tt_fabric_router.cpp"; const std::string traffic_controller_src = "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_controller.cpp"; @@ -149,11 +140,6 @@ typedef struct test_board { } else { physical_chip_ids = available_chip_ids; } - - // gatekeeper - run on idle ethernet for n300/T3K - if (("n300" == board_type_) || ("t3k" == board_type_)) { - run_gk_on_idle_ethernet = true; - } } void _init_galaxy_board(uint32_t num_chips, bool all_pcie = false) { @@ -468,13 +454,11 @@ typedef struct test_device { std::vector router_virtual_cores; CoreCoord core_range_start_virtual; CoreCoord 
core_range_end_virtual; - CoreCoord gk_logical_core; - CoreCoord gk_phys_core; mesh_id_t mesh_id; chip_id_t logical_chip_id; + uint32_t master_router_idx; uint32_t mesh_chip_id = 0; uint32_t router_mask = 0; - uint32_t gk_noc_offset; metal_SocDescriptor soc_desc; std::unordered_map>> router_worker_map; // router chan to worker logical cores @@ -519,20 +503,7 @@ typedef struct test_device { _generate_router_worker_map(); } - // gatekeeper - if (run_gk_on_idle_ethernet) { - auto idle_eth_cores = device_handle->get_inactive_ethernet_cores(); - if (idle_eth_cores.size() == 0) { - throw std::runtime_error("No idle ethernet cores found on the device"); - } - - gk_logical_core = *idle_eth_cores.begin(); - gk_phys_core = device_handle->ethernet_core_from_logical_core(gk_logical_core); - } else { - gk_logical_core = {gk_x, gk_y}; - gk_phys_core = device_handle->worker_core_from_logical_core(gk_logical_core); - } - gk_noc_offset = tt_metal::hal.noc_xy_encoding(gk_phys_core.x, gk_phys_core.y); + master_router_idx = 0; } void create_router_kernels(std::vector& compile_args, std::map& defines) { @@ -540,14 +511,21 @@ typedef struct test_device { std::vector zero_buf(1, 0); for (auto i = 0; i < num_routers; i++) { + std::vector router_compile_args = compile_args; // setup run time args std::vector runtime_args = { - num_routers, // 0: number of active fabric routers - router_mask, // 1: active fabric router mask - gk_interface_addr, // 2: gk_message_addr_l - gk_noc_offset, // 3: gk_message_addr_h + num_routers, // 0: number of active fabric routers + router_mask, // 1: active fabric router mask + router_logical_cores[master_router_idx].y // 2: master router eth chan }; + // pass is_master flag as compile arg, index 0 is master + if (master_router_idx == i) { + router_compile_args.push_back(1); + } else { + router_compile_args.push_back(0); + } + // initialize the semaphore tt::llrt::write_hex_vec_to_core( device_handle->id(), router_virtual_cores[i], zero_buf, FABRIC_ROUTER_SYNC_SEM); @@ -557,70 +535,25 @@ typedef struct test_device { router_kernel_src, router_logical_cores[i], tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, .compile_args = compile_args, .defines = defines}); + .noc = tt_metal::NOC::NOC_0, .compile_args = router_compile_args, .defines = defines}); tt_metal::SetRuntimeArgs(program_handle, kernel, router_logical_cores[i], runtime_args); } } - void create_gatekeeper_kernel(std::vector& compile_args, std::map& defines) { - uint32_t num_routers = router_logical_cores.size(); - std::vector zero_buf(12, 0); - - std::vector runtime_args = { - num_routers, // 0: number of active fabric routers - router_mask, // 1: active fabric router mask - }; - - // initialize the semaphore - tt::llrt::write_hex_vec_to_core(device_handle->id(), gk_phys_core, zero_buf, gk_interface_addr); - - KernelHandle kernel; - - if (run_gk_on_idle_ethernet) { - kernel = tt_metal::CreateKernel( - program_handle, - gatekeeper_kernel_src, - {gk_logical_core}, - tt_metal::EthernetConfig{ - .eth_mode = Eth::IDLE, - .noc = tt_metal::NOC::NOC_0, - .compile_args = compile_args, - .defines = defines}); - } else { - kernel = tt_metal::CreateKernel( - program_handle, - gatekeeper_kernel_src, - {gk_logical_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = compile_args, - .defines = defines}); - } - - tt_metal::SetRuntimeArgs(program_handle, kernel, gk_logical_core, runtime_args); - } - - void wait_for_gatekeeper_sync() { - 
uint32_t gk_status = 0; - uint32_t num_routers = router_logical_cores.size(); - uint32_t sync_addr = gk_interface_addr + offsetof(gatekeeper_info_t, router_sync) + offsetof(sync_word_t, val); - while (num_routers != gk_status) { - gk_status = tt::llrt::read_hex_vec_from_core(device_handle->id(), gk_phys_core, sync_addr, 4)[0]; + void wait_for_router_sync() { + uint32_t master_router_status = 0; + uint32_t expected_val = router_logical_cores.size(); + while (expected_val != master_router_status) { + master_router_status = tt::llrt::read_hex_vec_from_core( + device_handle->id(), router_virtual_cores[master_router_idx], FABRIC_ROUTER_SYNC_SEM, 4)[0]; } } void terminate_router_kernels() { std::vector zero_buf(1, 0); - for (auto& core : router_virtual_cores) { - tt::llrt::write_hex_vec_to_core(device_handle->id(), core, zero_buf, FABRIC_ROUTER_SYNC_SEM); - } - } - - void terminate_gatekeeper_kernel() { - std::vector zero_buf(12, 0); - tt::llrt::write_hex_vec_to_core(device_handle->id(), gk_phys_core, zero_buf, gk_interface_addr); + tt::llrt::write_hex_vec_to_core( + device_handle->id(), router_virtual_cores[master_router_idx], zero_buf, FABRIC_ROUTER_SYNC_SEM); } std::vector select_random_worker_cores(uint32_t count) { @@ -951,11 +884,9 @@ typedef struct test_traffic { tx_device->get_endpoint_id(tx_core), // 1: src_endpoint_id rx_devices[0]->get_noc_offset(rx_core), // 2: dest_noc_offset tx_device->get_noc_offset(controller_logical_core), // 3: controller noc offset - routing_plane, // 4: routing plane to use + eth_chan, // 4: outbound eth chan mesh_chip_id, // 5: mesh and chip id rx_buf_size, // 6: space in rx's L1 - gk_interface_addr, // 7: gk_message_addr_l - tx_device->gk_noc_offset, // 8: gk_message_addr_h }; if (ASYNC_WR & fabric_command) { @@ -968,13 +899,14 @@ typedef struct test_traffic { log_info( LogTest, - "[Device: Phys: {}, Logical: {}] TX kernel running on: logical: x={},y={}; virtual: x={},y={}", + "[Device: Phys: {}, Logical: {}] TX running on: logical: x={},y={}; virtual: x={},y={}, Eth chan: {}", tx_device->physical_chip_id, (uint32_t)tx_device->logical_chip_id, tx_core.x, tx_core.y, tx_virtual_cores[i].x, - tx_virtual_cores[i].y); + tx_virtual_cores[i].y, + (uint32_t)eth_chan); auto kernel = tt_metal::CreateKernel( tx_device->program_handle, tx_kernel_src, @@ -1262,8 +1194,6 @@ int main(int argc, char **argv) { constexpr uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; constexpr uint32_t default_rx_y = 3; - constexpr uint32_t default_gk_x = 0; - constexpr uint32_t default_gk_y = 9; constexpr uint32_t default_mux_x = 0; constexpr uint32_t default_mux_y = 1; @@ -1379,8 +1309,6 @@ int main(int argc, char **argv) { uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); - gk_x = test_args::get_command_option_uint32(input_args, "--gk_x", default_gk_x); - gk_y = test_args::get_command_option_uint32(input_args, "--gk_y", default_gk_y); uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); uint32_t data_kb_per_tx = test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); @@ -1618,14 +1546,6 @@ int main(int argc, char **argv) { uint32_t worker_unreserved_base_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); - if 
(run_gk_on_idle_ethernet) { - routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); - } else { - routing_table_addr = worker_unreserved_base_addr; - } - gk_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; - socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); - // create router kernels std::vector router_compile_args = { (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words @@ -1637,19 +1557,6 @@ int main(int argc, char **argv) { test_device->create_router_kernels(router_compile_args, defines); } - // create gatekeeper kernel - std::vector gatekeeper_compile_args = { - gk_interface_addr, // 0: gk info addr - socket_info_addr, // 1: - routing_table_addr, // 2: - test_results_addr, // 3: test_results_addr - test_results_size, // 4: test_results_size - 0, // 5: timeout_cycles - }; - for (auto& [chip_id, test_device] : test_devices) { - test_device->create_gatekeeper_kernel(gatekeeper_compile_args, defines); - } - if (check_txrx_timeout) { defines["CHECK_TIMEOUT"] = ""; } @@ -1719,9 +1626,9 @@ int main(int argc, char **argv) { tt_metal::detail::LaunchProgram(test_device->device_handle, test_device->program_handle, false); } - // wait for all routers to handshake with their gatekeepers + // wait for all routers to handshake with master router for (auto& [chip_id, test_device] : test_devices) { - test_device->wait_for_gatekeeper_sync(); + test_device->wait_for_router_sync(); } // notify tx controller to signal the tx workers @@ -1735,7 +1642,7 @@ int main(int argc, char **argv) { } // terminate fabric routers for (auto& [chip_id, test_device] : test_devices) { - test_device->terminate_gatekeeper_kernel(); + test_device->terminate_router_kernels(); } // wait for programs to exit diff --git a/tt_fabric/hw/inc/tt_fabric.h b/tt_fabric/hw/inc/tt_fabric.h index 04fa643b82c..6065f927953 100644 --- a/tt_fabric/hw/inc/tt_fabric.h +++ b/tt_fabric/hw/inc/tt_fabric.h @@ -23,7 +23,7 @@ const uint32_t SYNC_BUF_PTR_MASK = ((SYNC_BUF_SIZE << 1) - 1); extern uint64_t xy_local_addr; extern volatile local_pull_request_t* local_pull_request; -extern volatile fabric_router_l1_config_t* routing_table; +extern volatile tt_l1_ptr fabric_router_l1_config_t* routing_table; extern chan_payload_ptr inbound_rdptr_ack; extern volatile chan_payload_ptr remote_rdptr; diff --git a/tt_fabric/hw/inc/tt_fabric_api.h b/tt_fabric/hw/inc/tt_fabric_api.h index 5b66fa860d1..fd96de1a1bd 100644 --- a/tt_fabric/hw/inc/tt_fabric_api.h +++ b/tt_fabric/hw/inc/tt_fabric_api.h @@ -245,43 +245,19 @@ inline void fabric_socket_connect(socket_handle_t* socket_handle) { while (((volatile socket_handle_t*)socket_handle)->socket_state != SocketState::ACTIVE); } -inline void fabric_endpoint_init(uint32_t base_address, uint32_t gk_interface_addr_l, uint32_t gk_interface_addr_h) { +inline void fabric_endpoint_init(uint32_t base_address, uint32_t outbound_eth_chan) { tt_fabric_init(); client_interface = (volatile fabric_client_interface_t*)base_address; uint32_t routing_tables_offset = base_address + sizeof(fabric_client_interface_t); zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); - client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l; - client_interface->gk_msg_buf_addr = - (((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf); client_interface->routing_tables_l1_offset = routing_tables_offset; + client_interface->num_routing_planes = 
1; - // make sure fabric node gatekeeper is available. - uint64_t noc_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, ep_sync); - client_interface->return_status[0] = 0; - while (1) { - noc_async_read_one_packet(noc_addr, (uint32_t)&client_interface->return_status[0], 4); - noc_async_read_barrier(); - if (client_interface->return_status[0] != 0) { - break; - } - } - - // read the gk info first at routing table addr and later override with routing tables - noc_async_read_one_packet( - client_interface->gk_interface_addr, client_interface->routing_tables_l1_offset, sizeof(gatekeeper_info_t)); - noc_async_read_barrier(); - - client_interface->num_routing_planes = ((gatekeeper_info_t*)routing_tables_offset)->routing_planes; - - // read routing tables - uint64_t gk_rt_noc_addr = client_interface->gk_interface_addr - sizeof(fabric_router_l1_config_t) * 4; - uint32_t table_offset; - for (uint32_t i = 0; i < client_interface->num_routing_planes; i++) { - table_offset = sizeof(fabric_router_l1_config_t) * i; - noc_async_read_one_packet( - gk_rt_noc_addr + table_offset, routing_tables_offset + table_offset, sizeof(fabric_router_l1_config_t)); - } + // read routing table + uint64_t dest_addr = get_noc_addr_helper( + eth_chan_to_noc_xy[noc_index][outbound_eth_chan], eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); + noc_async_read_one_packet(dest_addr, routing_tables_offset, sizeof(fabric_router_l1_config_t)); noc_async_read_barrier(); } diff --git a/tt_fabric/impl/kernels/tt_fabric_router.cpp b/tt_fabric/impl/kernels/tt_fabric_router.cpp index 0eeb7879f9d..9cd08cbe2d8 100644 --- a/tt_fabric/impl/kernels/tt_fabric_router.cpp +++ b/tt_fabric/impl/kernels/tt_fabric_router.cpp @@ -24,10 +24,12 @@ constexpr uint32_t fvc_data_buf_size_bytes = fvc_data_buf_size_words * PACKET_WO constexpr uint32_t kernel_status_buf_addr_arg = get_compile_time_arg_val(1); constexpr uint32_t kernel_status_buf_size_bytes = get_compile_time_arg_val(2); constexpr uint32_t timeout_cycles = get_compile_time_arg_val(3); +constexpr bool is_master = get_compile_time_arg_val(4); uint32_t sync_val; uint32_t router_mask; -uint32_t gk_message_addr_l; -uint32_t gk_message_addr_h; +uint32_t master_router_chan; +uint64_t xy_local_addr; +bool terminated_slave_routers = false; // careful, may be null tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast(kernel_status_buf_addr_arg); @@ -35,16 +37,23 @@ tt_l1_ptr volatile chan_req_buf* fvc_consumer_req_buf = reinterpret_cast(FABRIC_ROUTER_REQ_QUEUE_START); volatile tt_l1_ptr fabric_router_l1_config_t* routing_table = reinterpret_cast(eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); -uint64_t xy_local_addr; + +volatile uint32_t* sync_sem_addr = (volatile uint32_t*)FABRIC_ROUTER_SYNC_SEM; #define SWITCH_THRESHOLD 0x3FFF -inline void notify_gatekeeper() { - // send semaphore increment to gatekeeper on this device. +inline void wait_for_sem(uint32_t value) { + while (*sync_sem_addr != value) { + // context switch while waiting to allow slow dispatch traffic to go through + internal_::risc_context_switch(); + } +} + +inline void notify_master_router() { + // send semaphore increment to master router on this device. // semaphore notifies all other routers that this router has completed // startup handshake with its ethernet peer. 
- uint64_t dest_addr = - (((uint64_t)gk_message_addr_h << 32) | gk_message_addr_l) + offsetof(gatekeeper_info_t, router_sync); + uint64_t dest_addr = get_noc_addr_helper(eth_chan_to_noc_xy[noc_index][master_router_chan], FABRIC_ROUTER_SYNC_SEM); noc_fast_atomic_increment( noc_index, NCRISC_AT_CMD_BUF, @@ -55,27 +64,31 @@ inline void notify_gatekeeper() { false, false, MEM_NOC_ATOMIC_RET_VAL_ADDR); +} - volatile uint32_t* sync_sem_addr = (volatile uint32_t*)FABRIC_ROUTER_SYNC_SEM; - // wait for all device routers to have incremented the sync semaphore. - // sync_val is equal to number of tt-fabric routers running on a device. - while (*sync_sem_addr != sync_val) { - // context switch while waiting to allow slow dispatch traffic to go through - internal_::risc_context_switch(); +inline void notify_slave_routers(uint32_t notification) { + uint32_t remaining_cores = router_mask; + for (uint32_t i = 0; i < 16; i++) { + if (remaining_cores == 0) { + break; + } + if ((remaining_cores & (0x1 << i)) && (master_router_chan != i)) { + uint64_t dest_addr = get_noc_addr_helper(eth_chan_to_noc_xy[noc_index][i], FABRIC_ROUTER_SYNC_SEM); + noc_inline_dw_write(dest_addr, notification); + remaining_cores &= ~(0x1 << i); + } } } void kernel_main() { + tt_fabric_init(); fvc_producer_state_t fvc_producer_state; rtos_context_switch_ptr = (void (*)())RtosTable[0]; uint32_t rt_args_idx = 0; sync_val = get_arg_val(rt_args_idx++); router_mask = get_arg_val(rt_args_idx++); - gk_message_addr_l = get_arg_val(rt_args_idx++); - gk_message_addr_h = get_arg_val(rt_args_idx++); - - tt_fabric_init(); + master_router_chan = get_arg_val(rt_args_idx++); write_kernel_status(kernel_status, TT_FABRIC_STATUS_INDEX, TT_FABRIC_STATUS_STARTED); write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff000000); @@ -112,7 +125,19 @@ void kernel_main() { return; } - notify_gatekeeper(); + if constexpr (is_master) { + // wait for all device routers to have incremented the sync semaphore. + // sync_val is equal to number of tt-fabric routers running on a device. + wait_for_sem(sync_val - 1); + notify_slave_routers(sync_val); + // increment the sync sem to signal host that handshake is complete + *sync_sem_addr += 1; + } else { + notify_master_router(); + // wait for the signal from the master router + wait_for_sem(sync_val); + } + uint64_t start_timestamp = get_timestamp(); write_kernel_status(kernel_status, TT_FABRIC_MISC_INDEX, 0xff000001); @@ -176,7 +201,13 @@ void kernel_main() { internal_::risc_context_switch(); } if (*(volatile uint32_t*)FABRIC_ROUTER_SYNC_SEM == 0) { - // terminate signal from host sw. 
+ // terminate signal from host sw + if constexpr (is_master) { + if (!terminated_slave_routers) { + notify_slave_routers(0); + terminated_slave_routers = true; + } + } if (loop_count >= 0x1000) { break; } From 2d4f9945fbb70a8bc4fe1525ef645d99ff6247c3 Mon Sep 17 00:00:00 2001 From: Brian Liu Date: Wed, 12 Feb 2025 09:36:41 -0800 Subject: [PATCH 133/316] #0: Clean up ShardSpecBuffer - Rename tensor2d_shape() to tensor2d_shape_in_pages() - Rename size() to num_pages() - Flip height/width in shape_in_pages() - Remove DEBUG_PRINT_SHARD --- .../tt_metal/distributed/test_mesh_buffer.cpp | 10 +++-- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 40 +++++++++++-------- tt_metal/api/tt-metalium/buffer.hpp | 26 ++++++------ tt_metal/api/tt-metalium/tt_metal.hpp | 2 +- tt_metal/impl/buffers/buffer.cpp | 20 +++++----- tt_metal/impl/buffers/dispatch.cpp | 10 +++-- tt_metal/tt_metal.cpp | 5 --- .../multi_core/all_gather_op_multi_core.cpp | 8 ++-- .../ccl/sharding_addrgen_helper.cpp | 7 ++-- .../operations/experimental/reshape/view.cpp | 2 +- ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp | 4 +- ttnn/cpp/ttnn/tensor/tensor.cpp | 4 +- 12 files changed, 73 insertions(+), 65 deletions(-) diff --git a/tests/tt_metal/distributed/test_mesh_buffer.cpp b/tests/tt_metal/distributed/test_mesh_buffer.cpp index 5fdc6369a24..f85f57a329b 100644 --- a/tests/tt_metal/distributed/test_mesh_buffer.cpp +++ b/tests/tt_metal/distributed/test_mesh_buffer.cpp @@ -25,11 +25,11 @@ struct DeviceLocalShardedBufferTestConfig { TensorMemoryLayout mem_config = TensorMemoryLayout::HEIGHT_SHARDED; ShardOrientation shard_orientation = ShardOrientation::ROW_MAJOR; - Shape2D tensor2d_shape() { + Shape2D tensor2d_shape_in_pages() { return {num_pages_per_core.height() * num_cores.height(), num_pages_per_core.width() * num_cores.width()}; } - uint32_t num_pages() { return tensor2d_shape().height() * tensor2d_shape().width(); } + uint32_t num_pages() { return tensor2d_shape_in_pages().height() * tensor2d_shape_in_pages().width(); } std::array shard_shape() { return {num_pages_per_core.height() * page_shape.height(), num_pages_per_core.width() * page_shape.width()}; @@ -44,7 +44,11 @@ struct DeviceLocalShardedBufferTestConfig { ShardSpecBuffer shard_parameters() { return ShardSpecBuffer( - this->shard_grid(), this->shard_shape(), this->shard_orientation, this->page_shape, this->tensor2d_shape()); + this->shard_grid(), + this->shard_shape(), + this->shard_orientation, + this->page_shape, + this->tensor2d_shape_in_pages()); } }; diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 4b5b1826c97..77a870d07f3 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -56,11 +56,11 @@ class BufferStressTestConfigSharded { this->num_cores = cores; } - std::array tensor2d_shape() { + std::array tensor2d_shape_in_pages() { return {num_pages_per_core[0] * num_cores[0], num_pages_per_core[1] * num_cores[1]}; } - uint32_t num_pages() { return tensor2d_shape()[0] * tensor2d_shape()[1]; } + uint32_t num_pages() { return tensor2d_shape_in_pages()[0] * tensor2d_shape_in_pages()[1]; } std::array shard_shape() { return {num_pages_per_core[0] * page_shape[0], num_pages_per_core[1] * page_shape[1]}; @@ -73,7 +73,11 @@ class 
BufferStressTestConfigSharded { ShardSpecBuffer shard_parameters() { return ShardSpecBuffer( - this->shard_grid(), this->shard_shape(), this->shard_orientation, this->page_shape, this->tensor2d_shape()); + this->shard_grid(), + this->shard_shape(), + this->shard_orientation, + this->page_shape, + this->tensor2d_shape_in_pages()); } uint32_t page_size() { return page_shape[0] * page_shape[1] * element_size; } @@ -87,7 +91,7 @@ struct ShardedSubBufferStressTestConfig { CoreRangeSet cores; Shape2D shard_shape; Shape2D page_shape; - Shape2D tensor2d_shape; + Shape2D tensor2d_shape_in_pages; TensorMemoryLayout layout; ShardOrientation orientation; }; @@ -133,11 +137,12 @@ vector generate_sharded_sub_buffer_test_config uint32_t page_shape_width_div_factor = 1; while (page_shape_width_div_factor <= num_pages_per_shard) { if (page_shape_width_div_factor * page_shape_height_div_factor == num_pages_per_shard) { - uint32_t tensor2d_shape_height = page_shape_height_div_factor; - while (tensor2d_shape_height <= num_pages) { - uint32_t tensor2d_shape_width = page_shape_width_div_factor; - while (tensor2d_shape_width <= num_pages) { - if (tensor2d_shape_height * tensor2d_shape_width == num_pages) { + uint32_t tensor2d_shape_in_pages_height = page_shape_height_div_factor; + while (tensor2d_shape_in_pages_height <= num_pages) { + uint32_t tensor2d_shape_in_pages_width = page_shape_width_div_factor; + while (tensor2d_shape_in_pages_width <= num_pages) { + if (tensor2d_shape_in_pages_height * tensor2d_shape_in_pages_width == + num_pages) { for (TensorMemoryLayout layout : {TensorMemoryLayout::HEIGHT_SHARDED, TensorMemoryLayout::BLOCK_SHARDED, @@ -157,17 +162,18 @@ vector generate_sharded_sub_buffer_test_config page_shape_height_div_factor, tt::constants::TILE_WIDTH / page_shape_width_div_factor}, - .tensor2d_shape = - {tensor2d_shape_height, tensor2d_shape_width}, + .tensor2d_shape_in_pages = + {tensor2d_shape_in_pages_height, + tensor2d_shape_in_pages_width}, .layout = layout, .orientation = orientation}; configs.push_back(config); } } } - tensor2d_shape_width += page_shape_width_div_factor; + tensor2d_shape_in_pages_width += page_shape_width_div_factor; } - tensor2d_shape_height += page_shape_height_div_factor; + tensor2d_shape_in_pages_height += page_shape_height_div_factor; } } page_shape_width_div_factor += 1; @@ -1018,7 +1024,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestReadWriteShardedSubBufferForL1) tt::log_debug( tt::LogTest, "Device: {} buffer_size: {} page_size: {} region_offset: {} region_size: {} shard_shape: [{}, {}] " - "page_shape: [{}, {}] tensor2d_shape: [{}, {}] layout: {} orientation: {} cores: {}", + "page_shape: [{}, {}] tensor2d_shape_in_pages: [{}, {}] layout: {} orientation: {} cores: {}", device->id(), config.buffer_size, config.page_size, @@ -1028,8 +1034,8 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestReadWriteShardedSubBufferForL1) config.shard_shape.width(), config.page_shape.height(), config.page_shape.width(), - config.tensor2d_shape.height(), - config.tensor2d_shape.width(), + config.tensor2d_shape_in_pages.height(), + config.tensor2d_shape_in_pages.width(), magic_enum::enum_name(config.layout).data(), magic_enum::enum_name(config.orientation).data(), config.cores.str()); @@ -1039,7 +1045,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestReadWriteShardedSubBufferForL1) {tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH}, config.orientation, config.page_shape, - config.tensor2d_shape); + config.tensor2d_shape_in_pages); auto buffer = Buffer::create(device, 
config.buffer_size, config.page_size, BufferType::L1, config.layout, shard_spec); diff --git a/tt_metal/api/tt-metalium/buffer.hpp b/tt_metal/api/tt-metalium/buffer.hpp index 119900e5929..e52f45b2105 100644 --- a/tt_metal/api/tt-metalium/buffer.hpp +++ b/tt_metal/api/tt-metalium/buffer.hpp @@ -86,33 +86,33 @@ std::ostream& operator<<(std::ostream& os, const ShardSpec& spec); struct ShardSpecBuffer { ShardSpec tensor_shard_spec; std::array page_shape; - std::array tensor2d_shape; + std::array tensor2d_shape_in_pages; ShardSpecBuffer( - const CoreRangeSet &core_sets_, - const std::array &shard_shape_, - const ShardOrientation &shard_orientation_, - const std::array &page_shape, - const std::array &tensor2d_shape) : + const CoreRangeSet& core_sets_, + const std::array& shard_shape_, + const ShardOrientation& shard_orientation_, + const std::array& page_shape, + const std::array& tensor2d_shape_in_pages) : tensor_shard_spec(core_sets_, shard_shape_, shard_orientation_) { this->page_shape = page_shape; - this->tensor2d_shape = tensor2d_shape; + this->tensor2d_shape_in_pages = tensor2d_shape_in_pages; } ShardSpecBuffer( - const ShardSpec &shard_spec, - const std::array &page_shape, - const std::array &tensor2d_shape) : + const ShardSpec& shard_spec, + const std::array& page_shape, + const std::array& tensor2d_shape_in_pages) : tensor_shard_spec(shard_spec) { this->page_shape = page_shape; - this->tensor2d_shape = tensor2d_shape; + this->tensor2d_shape_in_pages = tensor2d_shape_in_pages; } CoreRangeSet grid() const { return tensor_shard_spec.grid; } std::array shape() const { return tensor_shard_spec.shape; } ShardOrientation orientation() const { return tensor_shard_spec.orientation; } void set_shard_spec(const ShardSpec& shard_spec) { tensor_shard_spec = shard_spec; }; - /* Shape in pages of the full tensor, not per core */ + /* Shape in pages of the full shard */ std::array shape_in_pages() const; - DeviceAddr size() const; + DeviceAddr num_pages() const; }; inline namespace v0 { diff --git a/tt_metal/api/tt-metalium/tt_metal.hpp b/tt_metal/api/tt-metalium/tt_metal.hpp index c5d3bf708b2..b56b6fd168d 100644 --- a/tt_metal/api/tt-metalium/tt_metal.hpp +++ b/tt_metal/api/tt-metalium/tt_metal.hpp @@ -112,7 +112,7 @@ void ReadShard(Buffer& buffer, uint8_t* host_buffer, const uint32_t& core_id); */ template void ReadShard(Buffer& buffer, std::vector& host_buffer, const uint32_t& core_id) { - host_buffer.resize(buffer.page_size() * buffer.shard_spec().size()); + host_buffer.resize(buffer.page_size() * buffer.shard_spec().num_pages()); ReadShard(buffer, reinterpret_cast(host_buffer.data()), core_id); } diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index e615e87669c..29cdf05c980 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -208,12 +208,12 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer& buffer) { uint32_t num_dev_pages = buffer.num_dev_pages(); auto [core_host_page_indices, shard_shape] = core_to_host_pages( num_dev_pages, - shard_spec.size(), + shard_spec.num_pages(), num_cores, buffer.buffer_layout(), shard_spec.page_shape, shard_spec.shape(), - shard_spec.tensor2d_shape); + shard_spec.tensor2d_shape_in_pages); buffer_page_mapping.core_host_page_indices_ = std::vector>(num_cores); @@ -229,7 +229,7 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer& buffer) { auto shape_in_pages = shard_spec.shape_in_pages(); for (uint32_t core_index = 0; core_index < core_host_page_indices.size(); core_index++) { 
uint32_t valid_shard_page = 0; - buffer_page_mapping.core_host_page_indices_[core_index].reserve(shard_spec.size()); + buffer_page_mapping.core_host_page_indices_[core_index].reserve(shard_spec.num_pages()); uint32_t shard_page_id = 0; for (uint32_t shard_page_x = 0; shard_page_x < shape_in_pages[0]; shard_page_x++) { for (uint32_t shard_page_y = 0; shard_page_y < shape_in_pages[1]; shard_page_y++) { @@ -469,7 +469,7 @@ uint32_t Buffer::num_dev_pages() const { return this->num_pages(); } - return this->shard_spec().size() * this->num_cores().value(); + return this->shard_spec().num_pages() * this->num_cores().value(); } CoreType Buffer::core_type() const { @@ -523,7 +523,7 @@ DeviceAddr Buffer::bank_local_page_address(uint32_t bank_id, uint32_t page_index uint32_t offset; if (is_sharded(this->buffer_layout())) { auto shard_spec = this->shard_spec(); - uint32_t pages_offset_within_bank = page_index % shard_spec.size(); + uint32_t pages_offset_within_bank = page_index % shard_spec.num_pages(); offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); } else { uint32_t pages_offset_within_bank = page_index / num_banks; @@ -550,7 +550,7 @@ DeviceAddr Buffer::aligned_size_per_bank() const { DeviceAddr Buffer::sharded_page_address(uint32_t bank_id, uint32_t page_index) const { TT_FATAL(is_sharded(this->buffer_layout()), "Buffer not sharded"); auto shard_spec = this->shard_spec(); - uint32_t pages_offset_within_bank = page_index % shard_spec.size(); + uint32_t pages_offset_within_bank = page_index % shard_spec.num_pages(); auto offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); return translate_page_address(offset, bank_id); } @@ -591,12 +591,12 @@ bool ShardSpec::operator==(const ShardSpec&) const = default; bool ShardSpec::operator!=(const ShardSpec&) const = default; std::array ShardSpecBuffer::shape_in_pages() const { - auto width_in_pages = page_shape[0] == 0 ? 0 : tensor_shard_spec.shape[0] / page_shape[0]; - auto height_in_pages = page_shape[1] == 0 ? 0 : tensor_shard_spec.shape[1] / page_shape[1]; - return {width_in_pages, height_in_pages}; + auto height_in_pages = page_shape[0] == 0 ? 0 : tensor_shard_spec.shape[0] / page_shape[0]; + auto width_in_pages = page_shape[1] == 0 ? 0 : tensor_shard_spec.shape[1] / page_shape[1]; + return {height_in_pages, width_in_pages}; } -DeviceAddr ShardSpecBuffer::size() const { +DeviceAddr ShardSpecBuffer::num_pages() const { auto shape_in_pages_ = this->shape_in_pages(); return shape_in_pages_[0] * shape_in_pages_[1]; } diff --git a/tt_metal/impl/buffers/dispatch.cpp b/tt_metal/impl/buffers/dispatch.cpp index 8655c830709..f1de42f22e9 100644 --- a/tt_metal/impl/buffers/dispatch.cpp +++ b/tt_metal/impl/buffers/dispatch.cpp @@ -77,11 +77,12 @@ ShardedBufferWriteDispatchParams initialize_sharded_buf_dispatch_params( const BufferDispatchConstants& buf_dispatch_constants, const BufferRegion& region) { ShardedBufferWriteDispatchParams dispatch_params; - dispatch_params.width_split = buffer.shard_spec().shape_in_pages()[1] != buffer.shard_spec().tensor2d_shape[1]; + dispatch_params.width_split = + buffer.shard_spec().shape_in_pages()[1] != buffer.shard_spec().tensor2d_shape_in_pages[1]; dispatch_params.buffer_page_mapping = (dispatch_params.width_split) ? 
buffer.get_buffer_page_mapping() : nullptr; dispatch_params.total_pages_to_write = region.size / buffer.page_size(); dispatch_params.total_pages_written = 0; - dispatch_params.max_pages_per_shard = buffer.shard_spec().size(); + dispatch_params.max_pages_per_shard = buffer.shard_spec().num_pages(); dispatch_params.page_size_to_write = buffer.aligned_page_size(); dispatch_params.dst_page_index = region.offset / buffer.page_size(); dispatch_params.starting_dst_host_page_index = region.offset / buffer.page_size(); @@ -587,11 +588,12 @@ ShardedBufferReadDispatchParams initialize_sharded_buf_read_dispatch_params( dispatch_params.src_page_index = region.offset / buffer.page_size(); dispatch_params.starting_src_host_page_index = region.offset / buffer.page_size(); dispatch_params.unpadded_dst_offset = 0; - dispatch_params.width_split = buffer.shard_spec().shape_in_pages()[1] != buffer.shard_spec().tensor2d_shape[1]; + dispatch_params.width_split = + buffer.shard_spec().shape_in_pages()[1] != buffer.shard_spec().tensor2d_shape_in_pages[1]; dispatch_params.buffer_page_mapping = (dispatch_params.width_split) ? buffer.get_buffer_page_mapping() : nullptr; dispatch_params.total_pages_to_read = region.size / buffer.page_size(); dispatch_params.total_pages_read = 0; - dispatch_params.max_pages_per_shard = buffer.shard_spec().size(); + dispatch_params.max_pages_per_shard = buffer.shard_spec().num_pages(); dispatch_params.expected_num_workers_completed = expected_num_workers_completed; return dispatch_params; } diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 4caeae9b22c..59e6543a82e 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -293,8 +293,6 @@ inline void SetRuntimeArgsImpl( } // namespace -// #define DEBUG_PRINT_SHARD - namespace detail { bool WriteToDeviceDRAMChannel(IDevice* device, int dram_channel, uint32_t address, std::vector& host_buffer) { @@ -586,9 +584,6 @@ void ReadFromDeviceSharded(Buffer& buffer, uint8_t* host_buffer, bool shard_orde TensorMemoryLayout buffer_layout = buffer.buffer_layout(); auto device = buffer.device(); -#ifdef DEBUG_PRINT_SHARD - std::cout << "Reading From Device Height Sharded " << std::endl; -#endif auto total_pages = buffer.num_dev_pages(); uint32_t page_size = buffer.page_size(); diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp index 6951764459f..a31309388e3 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp @@ -342,12 +342,12 @@ operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( log_trace(tt::LogOp, "input_buffer->page_size: {}", input_page_size); log_trace( tt::LogOp, - "input_buffer->shard_spec().tensor2d_shape[0]: {}", - input_buffer->shard_spec().tensor2d_shape[0]); + "input_buffer->shard_spec().tensor2d_shape_in_pages[0]: {}", + input_buffer->shard_spec().tensor2d_shape_in_pages[0]); log_trace( tt::LogOp, - "input_buffer->shard_spec().tensor2d_shape[1]: {}", - input_buffer->shard_spec().tensor2d_shape[1]); + "input_buffer->shard_spec().tensor2d_shape_in_pages[1]: {}", + input_buffer->shard_spec().tensor2d_shape_in_pages[1]); } const uint32_t max_buffer_per_chunk = tt::round_down(all_gather_config.get_eth_buffer_size(), input_page_size); const uint32_t max_pages_per_chunk = max_buffer_per_chunk / input_page_size; diff --git 
a/ttnn/cpp/ttnn/operations/ccl/sharding_addrgen_helper.cpp b/ttnn/cpp/ttnn/operations/ccl/sharding_addrgen_helper.cpp index 1bb57fa6e51..5e221b3fdf7 100644 --- a/ttnn/cpp/ttnn/operations/ccl/sharding_addrgen_helper.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/sharding_addrgen_helper.cpp @@ -155,16 +155,17 @@ std::vector generate_compile_time_args(const tt::tt_metal::Tensor& t) shard_addr_gen_consts::ContiguityType contiguity = (t.buffer()->aligned_page_size() != t.buffer()->page_size()) ? shard_addr_gen_consts::ContiguityType::PADDING_BETWEEN_PAGES - : (buf_shard_spec.tensor2d_shape[1] == (pages_per_shard_x * get_sharding_core_count(t))) + : (buf_shard_spec.tensor2d_shape_in_pages[1] == (pages_per_shard_x * get_sharding_core_count(t))) ? shard_addr_gen_consts::ContiguityType::NO_SHARD_PADDING : shard_addr_gen_consts::ContiguityType::PADDING_IN_RIGHTMOST_SHARD; args.push_back(static_cast(t.memory_config().memory_layout)); // Memory layout args.push_back(static_cast(get_sharding_core_count(t))); // The number of sharding cores args.push_back(static_cast(t.buffer()->aligned_page_size())); // The page size we offset each write to TT_FATAL(t.buffer()->aligned_page_size() > 0, "aligned page size is 0"); - TT_FATAL(buf_shard_spec.tensor2d_shape[1] > 0, "the page is empty"); + TT_FATAL(buf_shard_spec.tensor2d_shape_in_pages[1] > 0, "the page is empty"); args.push_back(static_cast( - buf_shard_spec.tensor2d_shape[1])); // The number of pages in each sharding row not including padding pages + buf_shard_spec + .tensor2d_shape_in_pages[1])); // The number of pages in each sharding row not including padding pages args.push_back(static_cast(contiguity)); // This defines times when contiguous pages can't be calculated args.push_back(pages_per_shard_x); args.push_back(pages_per_shard_y); diff --git a/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp b/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp index 1a7aaf2fa0d..0753f8468dc 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp @@ -108,7 +108,7 @@ Tensor tensor_reshape( shard_spec.shape[1] = new_logical_shape[-1]; shard_spec_buffer.page_shape = {1, new_logical_shape[-1]}; - shard_spec_buffer.tensor2d_shape = {shard_spec.shape[0], 1}; + shard_spec_buffer.tensor2d_shape_in_pages = {shard_spec.shape[0], 1}; shard_spec_buffer.set_shard_spec(shard_spec); device_buffer->set_shard_spec(shard_spec_buffer); diff --git a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp index f119c7bc621..298f9c6f5e6 100644 --- a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp +++ b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp @@ -184,7 +184,7 @@ std::optional TensorLayout::compute_shard_spec_buffer(const ttn page_shape.height()); const auto width_in_pages = physical_size.width() / page_shape.width(); const auto height_in_pages = physical_size.height() / page_shape.height(); - const std::array tensor2d_shape{height_in_pages, width_in_pages}; + const std::array tensor2d_shape_in_pages{height_in_pages, width_in_pages}; auto shard_spec = memory_config_.shard_spec.value(); @@ -198,7 +198,7 @@ std::optional TensorLayout::compute_shard_spec_buffer(const ttn default: TT_THROW("Unsupported shard mode {} in compute_shard_spec_buffer!", shard_spec.mode); } - ShardSpecBuffer shard_spec_buffer(shard_spec, std::array(page_shape), tensor2d_shape); + ShardSpecBuffer shard_spec_buffer(shard_spec, std::array(page_shape), tensor2d_shape_in_pages); return shard_spec_buffer; 
} diff --git a/ttnn/cpp/ttnn/tensor/tensor.cpp b/ttnn/cpp/ttnn/tensor/tensor.cpp index 1e5e153417b..fef10f167c2 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor.cpp @@ -809,8 +809,8 @@ bool Tensor::is_allocated() const { std::vector Tensor::host_page_ordering() { const auto& buffer_page_mapping = *this->buffer()->get_buffer_page_mapping(); auto cores = buffer_page_mapping.all_cores_; - auto shard_size = buffer()->shard_spec().size(); - auto num_pages = cores.size() * shard_size; + auto shard_num_pages = buffer()->shard_spec().num_pages(); + auto num_pages = cores.size() * shard_num_pages; std::vector ret_vec; ret_vec.reserve(num_pages); From b904dcfc97a9c5a672ff5ff3fc22bc4c5ddae8c0 Mon Sep 17 00:00:00 2001 From: John Bauman Date: Mon, 17 Feb 2025 19:23:47 +0000 Subject: [PATCH 134/316] #0: Update pgm_dispatch_golden.json *_all_cores_1_rta* and kernel_groups_*_shadow improved with the increase to 8 launch message slots. --- .../dispatch/pgm_dispatch_golden.json | 1052 ++++++++--------- 1 file changed, 526 insertions(+), 526 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json index 7c26e13390b..99404547dc7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json @@ -1,10 +1,10 @@ { "context": { - "date": "2025-01-30T07:41:06+00:00", - "host_name": "tt-metal-ci-vm-46", + "date": "2025-02-17T16:09:05+00:00", + "host_name": "tt-metal-ci-vm-190", "executable": "./build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_wormhole_b0", "num_cpus": 14, - "mhz_per_cpu": 3000, + "mhz_per_cpu": 2300, "cpu_scaling_enabled": false, "caches": [ { @@ -32,7 +32,7 @@ "num_sharing": 1 } ], - "load_avg": [4.38,5.15,5.13], + "load_avg": [8.73,8.27,8.15], "library_version": "v1.9.1", "library_build_type": "debug", "json_schema_version": 1 @@ -48,10 +48,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.6528769230769236e+07, - "cpu_time": 2.4539615384614768e+04, + "real_time": 2.6730076923076924e+07, + "cpu_time": 2.3336153846153637e+04, "time_unit": "ns", - "IterationTime": 2.6528769230769236e-06 + "IterationTime": 2.6730076923076924e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/512/manual_time", @@ -63,10 +63,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.7177769230769232e+07, - "cpu_time": 6.6236615384615341e+05, + "real_time": 2.6894346153846148e+07, + "cpu_time": 2.4738846153846353e+04, "time_unit": "ns", - "IterationTime": 2.7177769230769232e-06 + "IterationTime": 2.6894346153846151e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/1024/manual_time", @@ -78,10 +78,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.6907692307692308e+07, - "cpu_time": 2.5008846153847287e+04, + "real_time": 2.7130807692307692e+07, + "cpu_time": 2.3016923076922227e+04, "time_unit": "ns", - "IterationTime": 2.6907692307692305e-06 + "IterationTime": 2.7130807692307694e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/2048/manual_time", @@ -92,11 +92,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 26, - "real_time": 2.7348384615384616e+07, - "cpu_time": 2.6198846153846491e+04, + "iterations": 25, + "real_time": 2.7683120000000004e+07, + "cpu_time": 2.3659639999999981e+04, "time_unit": "ns", - "IterationTime": 
2.7348384615384615e-06 + "IterationTime": 2.7683120000000002e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/4096/manual_time", @@ -108,10 +108,10 @@ "repetition_index": 0, "threads": 1, "iterations": 24, - "real_time": 2.9458791666666668e+07, - "cpu_time": 2.4594583333331178e+04, + "real_time": 2.9706791666666672e+07, + "cpu_time": 2.2529416666666744e+04, "time_unit": "ns", - "IterationTime": 2.9458791666666667e-06 + "IterationTime": 2.9706791666666672e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/8192/manual_time", @@ -123,10 +123,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.2293863636363629e+07, - "cpu_time": 3.4424999999999542e+04, + "real_time": 3.2475590909090903e+07, + "cpu_time": 2.4634954545455952e+04, "time_unit": "ns", - "IterationTime": 3.2293863636363629e-06 + "IterationTime": 3.2475590909090901e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/12288/manual_time", @@ -138,10 +138,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.5449499999999985e+07, - "cpu_time": 3.3763000000003318e+04, + "real_time": 3.5464200000000007e+07, + "cpu_time": 2.2655500000001717e+04, "time_unit": "ns", - "IterationTime": 3.5449499999999986e-06 + "IterationTime": 3.5464200000000010e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/256/manual_time", @@ -153,10 +153,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.6535307692307692e+07, - "cpu_time": 2.4758076923075540e+04, + "real_time": 2.6713653846153848e+07, + "cpu_time": 2.2773076923076318e+04, "time_unit": "ns", - "IterationTime": 2.6535307692307693e-06 + "IterationTime": 2.6713653846153849e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/512/manual_time", @@ -168,10 +168,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.6651076923076920e+07, - "cpu_time": 2.3059999999997824e+04, + "real_time": 2.6892884615384616e+07, + "cpu_time": 2.3196538461534874e+04, "time_unit": "ns", - "IterationTime": 2.6651076923076916e-06 + "IterationTime": 2.6892884615384616e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/1024/manual_time", @@ -183,10 +183,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.6915653846153848e+07, - "cpu_time": 2.1726153846153458e+04, + "real_time": 2.7130423076923076e+07, + "cpu_time": 2.1398461538454285e+04, "time_unit": "ns", - "IterationTime": 2.6915653846153843e-06 + "IterationTime": 2.7130423076923079e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/2048/manual_time", @@ -197,11 +197,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 26, - "real_time": 2.7345269230769232e+07, - "cpu_time": 2.5469230769228700e+04, + "iterations": 25, + "real_time": 2.7683520000000000e+07, + "cpu_time": 2.2990679999992382e+04, "time_unit": "ns", - "IterationTime": 2.7345269230769231e-06 + "IterationTime": 2.7683520000000004e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/4096/manual_time", @@ -213,10 +213,10 @@ "repetition_index": 0, "threads": 1, "iterations": 24, - "real_time": 2.9465708333333332e+07, - "cpu_time": 2.4081250000002052e+04, + "real_time": 2.9707708333333340e+07, + "cpu_time": 2.4864708333331248e+04, "time_unit": "ns", - "IterationTime": 2.9465708333333327e-06 + "IterationTime": 2.9707708333333341e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/8192/manual_time", @@ -227,11 +227,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 21, - "real_time": 3.2290095238095239e+07, - "cpu_time": 2.9266190476183780e+04, + 
"iterations": 22, + "real_time": 3.2475227272727262e+07, + "cpu_time": 2.3398636363641304e+04, "time_unit": "ns", - "IterationTime": 3.2290095238095240e-06 + "IterationTime": 3.2475227272727262e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/12288/manual_time", @@ -243,10 +243,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.5448250000000000e+07, - "cpu_time": 3.0898500000009488e+04, + "real_time": 3.5465350000000000e+07, + "cpu_time": 2.4466999999994689e+04, "time_unit": "ns", - "IterationTime": 3.5448250000000000e-06 + "IterationTime": 3.5465349999999997e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/256/manual_time", @@ -258,10 +258,10 @@ "repetition_index": 0, "threads": 1, "iterations": 24, - "real_time": 2.9081875000000000e+07, - "cpu_time": 2.8146249999999596e+04, + "real_time": 2.9075708333333332e+07, + "cpu_time": 2.3487499999993073e+04, "time_unit": "ns", - "IterationTime": 2.9081874999999999e-06 + "IterationTime": 2.9075708333333332e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/512/manual_time", @@ -273,10 +273,10 @@ "repetition_index": 0, "threads": 1, "iterations": 24, - "real_time": 2.9074791666666668e+07, - "cpu_time": 2.1040416666673333e+04, + "real_time": 2.9075458333333340e+07, + "cpu_time": 2.5067874999988122e+04, "time_unit": "ns", - "IterationTime": 2.9074791666666663e-06 + "IterationTime": 2.9075458333333340e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/1024/manual_time", @@ -287,11 +287,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 24, - "real_time": 2.9619875000000000e+07, - "cpu_time": 2.9417083333334780e+04, + "iterations": 23, + "real_time": 2.9828217391304348e+07, + "cpu_time": 2.2127217391293176e+04, "time_unit": "ns", - "IterationTime": 2.9619875000000002e-06 + "IterationTime": 2.9828217391304348e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/2048/manual_time", @@ -303,10 +303,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.3206285714285720e+07, - "cpu_time": 2.6806666666667228e+04, + "real_time": 3.3546238095238108e+07, + "cpu_time": 2.2843809523807682e+04, "time_unit": "ns", - "IterationTime": 3.3206285714285720e-06 + "IterationTime": 3.3546238095238102e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/4096/manual_time", @@ -318,10 +318,10 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.8654333333333336e+07, - "cpu_time": 2.1724999999997133e+04, + "real_time": 3.8659222222222216e+07, + "cpu_time": 2.3362222222224183e+04, "time_unit": "ns", - "IterationTime": 3.8654333333333337e-06 + "IterationTime": 3.8659222222222217e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/8192/manual_time", @@ -333,10 +333,10 @@ "repetition_index": 0, "threads": 1, "iterations": 15, - "real_time": 4.5888933333333336e+07, - "cpu_time": 3.0266666666663627e+04, + "real_time": 4.6317666666666664e+07, + "cpu_time": 2.5929333333341019e+04, "time_unit": "ns", - "IterationTime": 4.5888933333333334e-06 + "IterationTime": 4.6317666666666669e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/12288/manual_time", @@ -348,10 +348,10 @@ "repetition_index": 0, "threads": 1, "iterations": 13, - "real_time": 5.4422076923076935e+07, - "cpu_time": 2.7813076923070941e+04, + "real_time": 5.4694230769230768e+07, + "cpu_time": 2.7805461538474508e+04, "time_unit": "ns", - "IterationTime": 5.4422076923076937e-06 + "IterationTime": 5.4694230769230770e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/256/manual_time", @@ -362,11 +362,11 @@ "repetitions": 1, 
"repetition_index": 0, "threads": 1, - "iterations": 24, - "real_time": 2.9736291666666668e+07, - "cpu_time": 2.7123333333323175e+04, + "iterations": 23, + "real_time": 2.9950565217391301e+07, + "cpu_time": 2.1679434782619621e+04, "time_unit": "ns", - "IterationTime": 2.9736291666666669e-06 + "IterationTime": 2.9950565217391299e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/512/manual_time", @@ -378,10 +378,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0138434782608692e+07, - "cpu_time": 1.1968652173913788e+05, + "real_time": 3.0197434782608695e+07, + "cpu_time": 2.2568478260875934e+04, "time_unit": "ns", - "IterationTime": 3.0138434782608690e-06 + "IterationTime": 3.0197434782608692e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/1024/manual_time", @@ -393,10 +393,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.1487136363636352e+07, - "cpu_time": 2.2024545454544445e+04, + "real_time": 3.1887909090909086e+07, + "cpu_time": 2.3819681818183399e+04, "time_unit": "ns", - "IterationTime": 3.1487136363636350e-06 + "IterationTime": 3.1887909090909085e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/2048/manual_time", @@ -407,11 +407,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 20, - "real_time": 3.5620800000000000e+07, - "cpu_time": 2.1623500000012009e+04, + "iterations": 19, + "real_time": 3.5937210526315793e+07, + "cpu_time": 2.1740000000004005e+04, "time_unit": "ns", - "IterationTime": 3.5620800000000005e-06 + "IterationTime": 3.5937210526315797e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/4096/manual_time", @@ -423,10 +423,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.1212882352941178e+07, - "cpu_time": 2.2929411764718690e+04, + "real_time": 4.1428294117647067e+07, + "cpu_time": 2.6309411764709432e+04, "time_unit": "ns", - "IterationTime": 4.1212882352941174e-06 + "IterationTime": 4.1428294117647069e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/8192/manual_time", @@ -438,10 +438,10 @@ "repetition_index": 0, "threads": 1, "iterations": 13, - "real_time": 5.2495692307692297e+07, - "cpu_time": 2.6122307692294937e+04, + "real_time": 5.2825692307692304e+07, + "cpu_time": 2.5559999999988584e+04, "time_unit": "ns", - "IterationTime": 5.2495692307692305e-06 + "IterationTime": 5.2825692307692300e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/12288/manual_time", @@ -453,10 +453,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.4178272727272741e+07, - "cpu_time": 2.9057272727264011e+04, + "real_time": 6.4249545454545468e+07, + "cpu_time": 2.4714545454566789e+04, "time_unit": "ns", - "IterationTime": 6.4178272727272731e-06 + "IterationTime": 6.4249545454545459e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/256/manual_time", @@ -467,11 +467,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 23, - "real_time": 3.1110000000000004e+07, - "cpu_time": 2.7360000000002841e+04, + "iterations": 22, + "real_time": 3.1338136363636352e+07, + "cpu_time": 2.3316954545463374e+04, "time_unit": "ns", - "IterationTime": 3.1110000000000004e-06 + "IterationTime": 3.1338136363636358e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/512/manual_time", @@ -483,10 +483,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.1720136363636363e+07, - "cpu_time": 2.4381363636350914e+04, + "real_time": 3.1957136363636363e+07, + "cpu_time": 2.4401090909075374e+04, 
"time_unit": "ns", - "IterationTime": 3.1720136363636365e-06 + "IterationTime": 3.1957136363636368e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/1024/manual_time", @@ -498,10 +498,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.3226571428571433e+07, - "cpu_time": 2.4255714285738504e+04, + "real_time": 3.3438000000000007e+07, + "cpu_time": 2.2249333333332477e+04, "time_unit": "ns", - "IterationTime": 3.3226571428571433e-06 + "IterationTime": 3.3438000000000005e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/2048/manual_time", @@ -513,10 +513,10 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.8702166666666664e+07, - "cpu_time": 3.0808888888881702e+04, + "real_time": 3.8705333333333336e+07, + "cpu_time": 2.1913888888885285e+04, "time_unit": "ns", - "IterationTime": 3.8702166666666665e-06 + "IterationTime": 3.8705333333333330e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/4096/manual_time", @@ -528,10 +528,10 @@ "repetition_index": 0, "threads": 1, "iterations": 15, - "real_time": 4.5186800000000000e+07, - "cpu_time": 2.9478666666656511e+04, + "real_time": 4.5641533333333343e+07, + "cpu_time": 2.3505999999991665e+04, "time_unit": "ns", - "IterationTime": 4.5186799999999999e-06 + "IterationTime": 4.5641533333333340e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/8192/manual_time", @@ -543,10 +543,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.9359500000000007e+07, - "cpu_time": 3.5621666666670513e+04, + "real_time": 5.9665083333333321e+07, + "cpu_time": 2.5379166666672503e+04, "time_unit": "ns", - "IterationTime": 5.9359500000000012e-06 + "IterationTime": 5.9665083333333329e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/12288/manual_time", @@ -557,11 +557,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 10, - "real_time": 7.3335700000000000e+07, - "cpu_time": 2.1444000000014894e+04, + "iterations": 9, + "real_time": 7.3753111111111119e+07, + "cpu_time": 2.4642222222216584e+04, "time_unit": "ns", - "IterationTime": 7.3335700000000000e-06 + "IterationTime": 7.3753111111111126e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/256/manual_time", @@ -572,11 +572,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 23, - "real_time": 3.1027391304347824e+07, - "cpu_time": 2.1345652173898128e+04, + "iterations": 22, + "real_time": 3.1155954545454539e+07, + "cpu_time": 2.2925454545448658e+04, "time_unit": "ns", - "IterationTime": 3.1027391304347827e-06 + "IterationTime": 3.1155954545454542e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/512/manual_time", @@ -588,10 +588,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.1538681818181816e+07, - "cpu_time": 6.6108636363642654e+04, + "real_time": 3.1700909090909079e+07, + "cpu_time": 2.3464227272729233e+04, "time_unit": "ns", - "IterationTime": 3.1538681818181818e-06 + "IterationTime": 3.1700909090909077e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/1024/manual_time", @@ -603,10 +603,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.3231714285714280e+07, - "cpu_time": 2.5581904761904996e+04, + "real_time": 3.3428095238095231e+07, + "cpu_time": 2.2474714285730934e+04, "time_unit": "ns", - "IterationTime": 3.3231714285714278e-06 + "IterationTime": 3.3428095238095233e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/2048/manual_time", @@ -618,10 +618,10 @@ 
"repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.8709611111111112e+07, - "cpu_time": 3.1501666666667992e+04, + "real_time": 3.8703722222222224e+07, + "cpu_time": 2.3273944444469744e+04, "time_unit": "ns", - "IterationTime": 3.8709611111111112e-06 + "IterationTime": 3.8703722222222221e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/4096/manual_time", @@ -633,10 +633,10 @@ "repetition_index": 0, "threads": 1, "iterations": 15, - "real_time": 4.5198066666666664e+07, - "cpu_time": 3.2696000000006126e+04, + "real_time": 4.5644800000000000e+07, + "cpu_time": 3.3046666666673256e+04, "time_unit": "ns", - "IterationTime": 4.5198066666666663e-06 + "IterationTime": 4.5644800000000004e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/8192/manual_time", @@ -648,10 +648,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.9581500000000007e+07, - "cpu_time": 2.8343333333347153e+04, + "real_time": 5.9704833333333321e+07, + "cpu_time": 2.4242500000030512e+04, "time_unit": "ns", - "IterationTime": 5.9581500000000007e-06 + "IterationTime": 5.9704833333333331e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/12288/manual_time", @@ -662,11 +662,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 10, - "real_time": 7.3603599999999985e+07, - "cpu_time": 2.6549000000031239e+04, + "iterations": 9, + "real_time": 7.3861777777777776e+07, + "cpu_time": 2.5335555555629064e+04, "time_unit": "ns", - "IterationTime": 7.3603599999999988e-06 + "IterationTime": 7.3861777777777777e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/256/manual_time", @@ -678,10 +678,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.4208100000000000e+07, - "cpu_time": 4.2618999999977088e+04, + "real_time": 3.4477300000000000e+07, + "cpu_time": 2.3501999999986368e+04, "time_unit": "ns", - "IterationTime": 3.4208100000000006e-06 + "IterationTime": 3.4477299999999996e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/512/manual_time", @@ -693,10 +693,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.4671650000000000e+07, - "cpu_time": 2.2378000000000677e+04, + "real_time": 3.4912649999999993e+07, + "cpu_time": 2.4015000000021661e+04, "time_unit": "ns", - "IterationTime": 3.4671649999999997e-06 + "IterationTime": 3.4912649999999992e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/1024/manual_time", @@ -708,10 +708,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.6346789473684214e+07, - "cpu_time": 2.4048947368416368e+04, + "real_time": 3.6714894736842096e+07, + "cpu_time": 2.4035315789486402e+04, "time_unit": "ns", - "IterationTime": 3.6346789473684213e-06 + "IterationTime": 3.6714894736842097e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/2048/manual_time", @@ -723,10 +723,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.1517000000000000e+07, - "cpu_time": 2.7388823529412068e+04, + "real_time": 4.1945941176470585e+07, + "cpu_time": 2.5924117647079052e+04, "time_unit": "ns", - "IterationTime": 4.1517000000000004e-06 + "IterationTime": 4.1945941176470588e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/4096/manual_time", @@ -738,10 +738,10 @@ "repetition_index": 0, "threads": 1, "iterations": 14, - "real_time": 4.8400428571428575e+07, - "cpu_time": 2.1936428571464210e+04, + "real_time": 4.8923285714285716e+07, + "cpu_time": 2.6736428571475353e+04, 
"time_unit": "ns", - "IterationTime": 4.8400428571428574e-06 + "IterationTime": 4.8923285714285717e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/8192/manual_time", @@ -753,10 +753,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.2822727272727273e+07, - "cpu_time": 2.7970909090881756e+04, + "real_time": 6.3098818181818180e+07, + "cpu_time": 2.2529999999934800e+04, "time_unit": "ns", - "IterationTime": 6.2822727272727275e-06 + "IterationTime": 6.3098818181818184e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/256/manual_time", @@ -768,10 +768,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.4406349999999993e+07, - "cpu_time": 3.2772999999997053e+04, + "real_time": 3.4805099999999993e+07, + "cpu_time": 2.4124099999989212e+04, "time_unit": "ns", - "IterationTime": 3.4406349999999996e-06 + "IterationTime": 3.4805099999999994e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/512/manual_time", @@ -783,10 +783,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.4864350000000000e+07, - "cpu_time": 2.5396500000018917e+04, + "real_time": 3.5100100000000007e+07, + "cpu_time": 2.5931549999969051e+04, "time_unit": "ns", - "IterationTime": 3.4864350000000001e-06 + "IterationTime": 3.5100100000000006e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/1024/manual_time", @@ -798,10 +798,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.6571105263157904e+07, - "cpu_time": 2.8427894736846472e+04, + "real_time": 3.7149842105263159e+07, + "cpu_time": 3.0253684210560106e+04, "time_unit": "ns", - "IterationTime": 3.6571105263157904e-06 + "IterationTime": 3.7149842105263159e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/2048/manual_time", @@ -813,10 +813,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.2001294117647052e+07, - "cpu_time": 2.9972941176491975e+04, + "real_time": 4.2246647058823526e+07, + "cpu_time": 2.9003529411721647e+04, "time_unit": "ns", - "IterationTime": 4.2001294117647055e-06 + "IterationTime": 4.2246647058823523e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/4096/manual_time", @@ -828,10 +828,10 @@ "repetition_index": 0, "threads": 1, "iterations": 14, - "real_time": 4.8543642857142843e+07, - "cpu_time": 3.2606428571454620e+04, + "real_time": 4.9113000000000000e+07, + "cpu_time": 3.1937142857112784e+04, "time_unit": "ns", - "IterationTime": 4.8543642857142838e-06 + "IterationTime": 4.9112999999999999e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/8192/manual_time", @@ -843,10 +843,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.6060363636363648e+07, - "cpu_time": 2.3154545454560284e+04, + "real_time": 6.6463000000000007e+07, + "cpu_time": 3.2335727272761716e+04, "time_unit": "ns", - "IterationTime": 6.6060363636363638e-06 + "IterationTime": 6.6463000000000011e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/256/manual_time", @@ -858,10 +858,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.4203399999999985e+07, - "cpu_time": 2.3524999999979809e+04, + "real_time": 3.4480349999999993e+07, + "cpu_time": 2.8031049999999166e+04, "time_unit": "ns", - "IterationTime": 3.4203399999999987e-06 + "IterationTime": 3.4480349999999989e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/512/manual_time", @@ -873,10 +873,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, 
- "real_time": 3.4668699999999993e+07, - "cpu_time": 2.0978999999998749e+04, + "real_time": 3.4916699999999993e+07, + "cpu_time": 2.8380200000022171e+04, "time_unit": "ns", - "IterationTime": 3.4668699999999998e-06 + "IterationTime": 3.4916699999999991e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/1024/manual_time", @@ -888,10 +888,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.6354105263157889e+07, - "cpu_time": 3.4621578947361573e+04, + "real_time": 3.6713736842105277e+07, + "cpu_time": 3.5802631578961627e+04, "time_unit": "ns", - "IterationTime": 3.6354105263157889e-06 + "IterationTime": 3.6713736842105279e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/2048/manual_time", @@ -903,10 +903,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.1516411764705889e+07, - "cpu_time": 2.4685882352963352e+04, + "real_time": 4.1953000000000007e+07, + "cpu_time": 3.1220588235308609e+04, "time_unit": "ns", - "IterationTime": 4.1516411764705891e-06 + "IterationTime": 4.1953000000000003e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/4096/manual_time", @@ -918,10 +918,10 @@ "repetition_index": 0, "threads": 1, "iterations": 14, - "real_time": 4.8427785714285716e+07, - "cpu_time": 3.2893571428628733e+04, + "real_time": 4.8927500000000000e+07, + "cpu_time": 3.0061428571442102e+04, "time_unit": "ns", - "IterationTime": 4.8427785714285715e-06 + "IterationTime": 4.8927499999999990e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/8192/manual_time", @@ -933,10 +933,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.2665909090909094e+07, - "cpu_time": 3.8618181818202269e+04, + "real_time": 6.2969909090909101e+07, + "cpu_time": 3.1834636363631769e+04, "time_unit": "ns", - "IterationTime": 6.2665909090909089e-06 + "IterationTime": 6.2969909090909095e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/256/manual_time", @@ -948,10 +948,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.6816083333333336e+07, - "cpu_time": 3.6818333333317925e+04, + "real_time": 5.7600500000000000e+07, + "cpu_time": 3.8343500000056119e+04, "time_unit": "ns", - "IterationTime": 5.6816083333333333e-06 + "IterationTime": 5.7600500000000000e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/512/manual_time", @@ -963,10 +963,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.6961083333333336e+07, - "cpu_time": 5.0632500000036271e+04, + "real_time": 5.7762833333333336e+07, + "cpu_time": 3.0340916666649064e+04, "time_unit": "ns", - "IterationTime": 5.6961083333333329e-06 + "IterationTime": 5.7762833333333342e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/1024/manual_time", @@ -978,10 +978,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.7168166666666664e+07, - "cpu_time": 3.5675833333304043e+04, + "real_time": 5.8090666666666664e+07, + "cpu_time": 2.9895833333348779e+04, "time_unit": "ns", - "IterationTime": 5.7168166666666668e-06 + "IterationTime": 5.8090666666666666e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/2048/manual_time", @@ -993,10 +993,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.7815583333333321e+07, - "cpu_time": 2.5740833333361599e+04, + "real_time": 5.8695666666666664e+07, + "cpu_time": 3.0913333333308183e+04, "time_unit": "ns", - "IterationTime": 5.7815583333333331e-06 + "IterationTime": 
5.8695666666666663e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/4096/manual_time", @@ -1008,10 +1008,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 6.0225916666666664e+07, - "cpu_time": 4.8669999999972904e+04, + "real_time": 6.0850166666666657e+07, + "cpu_time": 3.4490833333252383e+04, "time_unit": "ns", - "IterationTime": 6.0225916666666664e-06 + "IterationTime": 6.0850166666666669e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/8192/manual_time", @@ -1023,10 +1023,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.3566636363636367e+07, - "cpu_time": 4.6558181818151017e+04, + "real_time": 6.3639545454545468e+07, + "cpu_time": 2.4531909090958430e+04, "time_unit": "ns", - "IterationTime": 6.3566636363636358e-06 + "IterationTime": 6.3639545454545460e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/256/manual_time", @@ -1037,11 +1037,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 17, - "real_time": 4.0642352941176474e+07, - "cpu_time": 3.4324705882310729e+04, + "iterations": 19, + "real_time": 3.7484105263157889e+07, + "cpu_time": 2.1082684210533014e+04, "time_unit": "ns", - "IterationTime": 4.0642352941176473e-06 + "IterationTime": 3.7484105263157885e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/512/manual_time", @@ -1052,11 +1052,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 17, - "real_time": 4.0742764705882363e+07, - "cpu_time": 3.4803529411738222e+04, + "iterations": 19, + "real_time": 3.7578157894736834e+07, + "cpu_time": 2.0652526315825377e+04, "time_unit": "ns", - "IterationTime": 4.0742764705882361e-06 + "IterationTime": 3.7578157894736839e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/1024/manual_time", @@ -1067,11 +1067,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 17, - "real_time": 4.0922823529411770e+07, - "cpu_time": 3.6109411764684301e+04, + "iterations": 19, + "real_time": 3.7757578947368421e+07, + "cpu_time": 2.0148947368394791e+04, "time_unit": "ns", - "IterationTime": 4.0922823529411774e-06 + "IterationTime": 3.7757578947368423e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/2048/manual_time", @@ -1082,11 +1082,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 17, - "real_time": 4.1291941176470600e+07, - "cpu_time": 3.6387647058835086e+04, + "iterations": 18, + "real_time": 3.8168833333333336e+07, + "cpu_time": 1.8871666666599020e+04, "time_unit": "ns", - "IterationTime": 4.1291941176470603e-06 + "IterationTime": 3.8168833333333331e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/4096/manual_time", @@ -1097,11 +1097,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 17, - "real_time": 4.2081882352941178e+07, - "cpu_time": 2.9368823529447458e+04, + "iterations": 18, + "real_time": 3.9009111111111112e+07, + "cpu_time": 2.0109444444453096e+04, "time_unit": "ns", - "IterationTime": 4.2081882352941182e-06 + "IterationTime": 3.9009111111111116e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/8192/manual_time", @@ -1112,11 +1112,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 16, - "real_time": 4.3890374999999993e+07, - "cpu_time": 3.6661249999903055e+04, + "iterations": 17, + "real_time": 4.1178411764705881e+07, + "cpu_time": 3.0142941176503722e+04, "time_unit": "ns", - "IterationTime": 4.3890374999999999e-06 + 
"IterationTime": 4.1178411764705887e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/256/manual_time", @@ -1127,11 +1127,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 16, - "real_time": 4.4284249999999993e+07, - "cpu_time": 3.2343125000000582e+04, + "iterations": 17, + "real_time": 4.0965764705882341e+07, + "cpu_time": 3.2121941176508615e+04, "time_unit": "ns", - "IterationTime": 4.4284249999999990e-06 + "IterationTime": 4.0965764705882342e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/512/manual_time", @@ -1142,11 +1142,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 16, - "real_time": 4.4468812500000000e+07, - "cpu_time": 2.6208124999982374e+04, + "iterations": 17, + "real_time": 4.1141235294117637e+07, + "cpu_time": 2.9815529411770989e+04, "time_unit": "ns", - "IterationTime": 4.4468812500000001e-06 + "IterationTime": 4.1141235294117641e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/1024/manual_time", @@ -1157,11 +1157,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 16, - "real_time": 4.4998062500000007e+07, - "cpu_time": 2.3879375000035452e+04, + "iterations": 17, + "real_time": 4.1674705882352941e+07, + "cpu_time": 3.0351529411815398e+04, "time_unit": "ns", - "IterationTime": 4.4998062500000010e-06 + "IterationTime": 4.1674705882352947e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/2048/manual_time", @@ -1172,11 +1172,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 15, - "real_time": 4.6238066666666664e+07, - "cpu_time": 2.4299333333388517e+04, + "iterations": 16, + "real_time": 4.4369937500000007e+07, + "cpu_time": 3.1336250000069122e+04, "time_unit": "ns", - "IterationTime": 4.6238066666666665e-06 + "IterationTime": 4.4369937500000004e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/4096/manual_time", @@ -1188,10 +1188,10 @@ "repetition_index": 0, "threads": 1, "iterations": 14, - "real_time": 5.1613500000000000e+07, - "cpu_time": 2.9413571428525491e+04, + "real_time": 4.9822928571428575e+07, + "cpu_time": 3.2757142857141120e+04, "time_unit": "ns", - "IterationTime": 5.1613500000000010e-06 + "IterationTime": 4.9822928571428567e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/8192/manual_time", @@ -1203,10 +1203,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 7.2194500000000000e+07, - "cpu_time": 3.6729999999884909e+04, + "real_time": 6.9507500000000015e+07, + "cpu_time": 3.1938000000053533e+04, "time_unit": "ns", - "IterationTime": 7.2194500000000000e-06 + "IterationTime": 6.9507500000000012e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/256/manual_time", @@ -1218,10 +1218,10 @@ "repetition_index": 0, "threads": 1, "iterations": 13, - "real_time": 5.5028076923076920e+07, - "cpu_time": 7.6855384615343442e+04, + "real_time": 5.5500076923076913e+07, + "cpu_time": 3.6943769230810212e+04, "time_unit": "ns", - "IterationTime": 5.5028076923076928e-06 + "IterationTime": 5.5500076923076912e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/512/manual_time", @@ -1233,10 +1233,10 @@ "repetition_index": 0, "threads": 1, "iterations": 13, - "real_time": 5.5557769230769239e+07, - "cpu_time": 3.0360000000092452e+04, + "real_time": 5.5804769230769232e+07, + "cpu_time": 3.2049923076918130e+04, "time_unit": "ns", - "IterationTime": 5.5557769230769238e-06 + "IterationTime": 5.5804769230769237e-06 }, { "name": 
"BM_pgm_dispatch/all_processors_all_cores_32_rta/1024/manual_time", @@ -1248,10 +1248,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.7755416666666664e+07, - "cpu_time": 2.7314999999935215e+04, + "real_time": 5.7422916666666657e+07, + "cpu_time": 3.0158166666627294e+04, "time_unit": "ns", - "IterationTime": 5.7755416666666676e-06 + "IterationTime": 5.7422916666666659e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/2048/manual_time", @@ -1263,10 +1263,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.2133090909090921e+07, - "cpu_time": 2.8591818181731491e+04, + "real_time": 6.2508999999999993e+07, + "cpu_time": 3.7220090909138227e+04, "time_unit": "ns", - "IterationTime": 6.2133090909090908e-06 + "IterationTime": 6.2508999999999980e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/4096/manual_time", @@ -1278,10 +1278,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.9159299999999985e+07, - "cpu_time": 3.2427000000012642e+04, + "real_time": 7.0115900000000015e+07, + "cpu_time": 3.5648000000065847e+04, "time_unit": "ns", - "IterationTime": 6.9159299999999989e-06 + "IterationTime": 7.0115900000000001e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/8192/manual_time", @@ -1293,10 +1293,10 @@ "repetition_index": 0, "threads": 1, "iterations": 8, - "real_time": 8.4782000000000000e+07, - "cpu_time": 4.7415000000050612e+04, + "real_time": 8.5774750000000015e+07, + "cpu_time": 3.3160000000087566e+04, "time_unit": "ns", - "IterationTime": 8.4781999999999996e-06 + "IterationTime": 8.5774750000000021e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/256/manual_time", @@ -1308,10 +1308,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1739150000000001e+08, - "cpu_time": 4.6193333333259070e+04, + "real_time": 1.1872416666666667e+08, + "cpu_time": 3.5832500000054781e+04, "time_unit": "ns", - "IterationTime": 1.1739150000000001e-05 + "IterationTime": 1.1872416666666667e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/512/manual_time", @@ -1323,10 +1323,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1783783333333333e+08, - "cpu_time": 3.4633333333348543e+04, + "real_time": 1.1916200000000000e+08, + "cpu_time": 3.4728499999895728e+04, "time_unit": "ns", - "IterationTime": 1.1783783333333332e-05 + "IterationTime": 1.1916200000000001e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/1024/manual_time", @@ -1338,10 +1338,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1965316666666667e+08, - "cpu_time": 3.0425000000278145e+04, + "real_time": 1.2089416666666664e+08, + "cpu_time": 2.3970000000280343e+04, "time_unit": "ns", - "IterationTime": 1.1965316666666664e-05 + "IterationTime": 1.2089416666666665e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/2048/manual_time", @@ -1353,10 +1353,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2517616666666667e+08, - "cpu_time": 3.9178333333111936e+04, + "real_time": 1.2610266666666667e+08, + "cpu_time": 2.4575166666688612e+04, "time_unit": "ns", - "IterationTime": 1.2517616666666667e-05 + "IterationTime": 1.2610266666666667e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/4096/manual_time", @@ -1368,10 +1368,10 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.3123179999999997e+08, - "cpu_time": 
5.1429999999896841e+04, + "real_time": 1.3209140000000003e+08, + "cpu_time": 2.9534000000097651e+04, "time_unit": "ns", - "IterationTime": 1.3123179999999999e-05 + "IterationTime": 1.3209140000000003e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/8192/manual_time", @@ -1383,10 +1383,10 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.4652700000000000e+08, - "cpu_time": 4.9845999999575950e+04, + "real_time": 1.4751780000000000e+08, + "cpu_time": 2.7633999999920889e+04, "time_unit": "ns", - "IterationTime": 1.4652699999999999e-05 + "IterationTime": 1.4751780000000000e-05 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/256/manual_time", @@ -1397,11 +1397,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 24, - "real_time": 2.9750791666666672e+07, - "cpu_time": 2.3049166666696172e+04, + "iterations": 23, + "real_time": 3.0070826086956523e+07, + "cpu_time": 1.9661304347930236e+04, "time_unit": "ns", - "IterationTime": 2.9750791666666672e-06 + "IterationTime": 3.0070826086956525e-06 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/512/manual_time", @@ -1413,10 +1413,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 2.9881043478260878e+07, - "cpu_time": 2.9393043478342446e+04, + "real_time": 3.0183391304347832e+07, + "cpu_time": 2.0213999999958043e+04, "time_unit": "ns", - "IterationTime": 2.9881043478260880e-06 + "IterationTime": 3.0183391304347831e-06 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/1024/manual_time", @@ -1428,10 +1428,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0100869565217391e+07, - "cpu_time": 2.8044347826101271e+04, + "real_time": 3.0480260869565219e+07, + "cpu_time": 1.9658826087010082e+04, "time_unit": "ns", - "IterationTime": 3.0100869565217392e-06 + "IterationTime": 3.0480260869565220e-06 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/2048/manual_time", @@ -1443,10 +1443,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0822086956521746e+07, - "cpu_time": 2.6283043478359934e+04, + "real_time": 3.1034043478260871e+07, + "cpu_time": 1.8955478260807013e+04, "time_unit": "ns", - "IterationTime": 3.0822086956521745e-06 + "IterationTime": 3.1034043478260867e-06 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/4096/manual_time", @@ -1458,10 +1458,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.2712571428571429e+07, - "cpu_time": 2.8664761904804796e+04, + "real_time": 3.2993238095238108e+07, + "cpu_time": 1.9665619047616801e+04, "time_unit": "ns", - "IterationTime": 3.2712571428571428e-06 + "IterationTime": 3.2993238095238104e-06 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/8192/manual_time", @@ -1472,11 +1472,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 20, - "real_time": 3.5663249999999993e+07, - "cpu_time": 2.9508999999983131e+04, + "iterations": 19, + "real_time": 3.5972473684210517e+07, + "cpu_time": 1.8976315789655619e+04, "time_unit": "ns", - "IterationTime": 3.5663249999999993e-06 + "IterationTime": 3.5972473684210520e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/256/manual_time", @@ -1487,11 +1487,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 24, - "real_time": 2.9767750000000000e+07, - "cpu_time": 4.1820000000007225e+04, + "iterations": 23, + "real_time": 3.0070695652173907e+07, + "cpu_time": 2.0065217391309332e+04, 
"time_unit": "ns", - "IterationTime": 2.9767750000000000e-06 + "IterationTime": 3.0070695652173906e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/512/manual_time", @@ -1503,10 +1503,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 2.9881173913043484e+07, - "cpu_time": 2.9256521739192154e+04, + "real_time": 3.0182782608695649e+07, + "cpu_time": 1.9268260869586622e+04, "time_unit": "ns", - "IterationTime": 2.9881173913043482e-06 + "IterationTime": 3.0182782608695648e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/1024/manual_time", @@ -1518,10 +1518,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0127913043478265e+07, - "cpu_time": 5.0092173913061859e+04, + "real_time": 3.0480173913043477e+07, + "cpu_time": 2.0814782608624682e+04, "time_unit": "ns", - "IterationTime": 3.0127913043478269e-06 + "IterationTime": 3.0480173913043482e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/2048/manual_time", @@ -1533,10 +1533,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0837826086956523e+07, - "cpu_time": 3.1766956521758639e+04, + "real_time": 3.1036086956521735e+07, + "cpu_time": 1.9879521739063006e+04, "time_unit": "ns", - "IterationTime": 3.0837826086956521e-06 + "IterationTime": 3.1036086956521736e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/4096/manual_time", @@ -1548,10 +1548,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.2726619047619052e+07, - "cpu_time": 3.3239047618950834e+04, + "real_time": 3.3019095238095239e+07, + "cpu_time": 2.0720428571406403e+04, "time_unit": "ns", - "IterationTime": 3.2726619047619052e-06 + "IterationTime": 3.3019095238095238e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/8192/manual_time", @@ -1562,11 +1562,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 20, - "real_time": 3.5720450000000007e+07, - "cpu_time": 3.0909500000042557e+04, + "iterations": 19, + "real_time": 3.5973947368421055e+07, + "cpu_time": 2.0178684210529689e+04, "time_unit": "ns", - "IterationTime": 3.5720450000000000e-06 + "IterationTime": 3.5973947368421058e-06 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/256/manual_time", @@ -1578,10 +1578,10 @@ "repetition_index": 0, "threads": 1, "iterations": 7, - "real_time": 1.0229942857142857e+08, - "cpu_time": 4.1028571428418967e+04, + "real_time": 1.0377071428571427e+08, + "cpu_time": 2.2170000000138705e+04, "time_unit": "ns", - "IterationTime": 1.0229942857142856e-05 + "IterationTime": 1.0377071428571427e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/512/manual_time", @@ -1593,10 +1593,10 @@ "repetition_index": 0, "threads": 1, "iterations": 7, - "real_time": 1.0272971428571428e+08, - "cpu_time": 5.3112857142727829e+04, + "real_time": 1.0426657142857143e+08, + "cpu_time": 2.3283000000365715e+04, "time_unit": "ns", - "IterationTime": 1.0272971428571428e-05 + "IterationTime": 1.0426657142857143e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/1024/manual_time", @@ -1608,10 +1608,10 @@ "repetition_index": 0, "threads": 1, "iterations": 7, - "real_time": 1.0450971428571430e+08, - "cpu_time": 3.8204285714422374e+04, + "real_time": 1.0614242857142857e+08, + "cpu_time": 2.7466428570781838e+04, "time_unit": "ns", - "IterationTime": 1.0450971428571430e-05 + "IterationTime": 1.0614242857142859e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/2048/manual_time", @@ -1623,10 
+1623,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.0965600000000000e+08, - "cpu_time": 3.6405000000054126e+04, + "real_time": 1.1098866666666667e+08, + "cpu_time": 2.3233333333649851e+04, "time_unit": "ns", - "IterationTime": 1.0965599999999999e-05 + "IterationTime": 1.1098866666666666e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/4096/manual_time", @@ -1638,10 +1638,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1631933333333333e+08, - "cpu_time": 2.9645000000281623e+04, + "real_time": 1.1733233333333333e+08, + "cpu_time": 2.4433333333462317e+04, "time_unit": "ns", - "IterationTime": 1.1631933333333333e-05 + "IterationTime": 1.1733233333333333e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/8192/manual_time", @@ -1653,10 +1653,10 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.3114020000000000e+08, - "cpu_time": 2.9339999999677955e+04, + "real_time": 1.3236920000000000e+08, + "cpu_time": 2.6089799999340357e+04, "time_unit": "ns", - "IterationTime": 1.3114020000000000e-05 + "IterationTime": 1.3236920000000002e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/256/manual_time", @@ -1668,10 +1668,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2280300000000000e+08, - "cpu_time": 3.5453333333398026e+04, + "real_time": 1.2223816666666667e+08, + "cpu_time": 2.6801666667353173e+04, "time_unit": "ns", - "IterationTime": 1.2280300000000002e-05 + "IterationTime": 1.2223816666666666e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/512/manual_time", @@ -1683,10 +1683,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2349466666666667e+08, - "cpu_time": 3.7411666666523997e+04, + "real_time": 1.2258733333333333e+08, + "cpu_time": 2.7776666667496858e+04, "time_unit": "ns", - "IterationTime": 1.2349466666666666e-05 + "IterationTime": 1.2258733333333330e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/1024/manual_time", @@ -1698,10 +1698,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2518999999999999e+08, - "cpu_time": 8.0054999999745749e+04, + "real_time": 1.2489916666666667e+08, + "cpu_time": 2.6563333333247861e+04, "time_unit": "ns", - "IterationTime": 1.2519000000000000e-05 + "IterationTime": 1.2489916666666665e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/2048/manual_time", @@ -1713,10 +1713,10 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.4144700000000000e+08, - "cpu_time": 1.2695599999972274e+05, + "real_time": 1.4246980000000000e+08, + "cpu_time": 2.9727999999806798e+04, "time_unit": "ns", - "IterationTime": 1.4144699999999999e-05 + "IterationTime": 1.4246980000000001e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/4096/manual_time", @@ -1728,10 +1728,10 @@ "repetition_index": 0, "threads": 1, "iterations": 3, - "real_time": 2.0084833333333334e+08, - "cpu_time": 1.2952333333278906e+05, + "real_time": 2.0078166666666666e+08, + "cpu_time": 3.5603333332782466e+04, "time_unit": "ns", - "IterationTime": 2.0084833333333335e-05 + "IterationTime": 2.0078166666666670e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/8192/manual_time", @@ -1743,10 +1743,10 @@ "repetition_index": 0, "threads": 1, "iterations": 2, - "real_time": 3.1806550000000000e+08, - "cpu_time": 8.4689999997777937e+04, + "real_time": 3.1837400000000000e+08, + "cpu_time": 7.5791000000435815e+04, "time_unit": "ns", - "IterationTime": 3.1806549999999998e-05 + "IterationTime": 
3.1837399999999994e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/256/manual_time", @@ -1758,10 +1758,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1880933333333333e+08, - "cpu_time": 3.3488333333053786e+04, + "real_time": 1.1883483333333333e+08, + "cpu_time": 3.1042833333809012e+04, "time_unit": "ns", - "IterationTime": 1.1880933333333333e-05 + "IterationTime": 1.1883483333333336e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/512/manual_time", @@ -1773,10 +1773,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1882700000000000e+08, - "cpu_time": 3.7786666666761448e+04, + "real_time": 1.1884550000000000e+08, + "cpu_time": 3.5406666666422854e+04, "time_unit": "ns", - "IterationTime": 1.1882700000000001e-05 + "IterationTime": 1.1884549999999998e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/1024/manual_time", @@ -1788,10 +1788,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1891783333333333e+08, - "cpu_time": 3.1728499999180334e+04, + "real_time": 1.1890100000000000e+08, + "cpu_time": 3.3865000000095810e+04, "time_unit": "ns", - "IterationTime": 1.1891783333333332e-05 + "IterationTime": 1.1890100000000000e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/2048/manual_time", @@ -1803,10 +1803,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1946583333333336e+08, - "cpu_time": 2.6834999999891807e+04, + "real_time": 1.1947133333333333e+08, + "cpu_time": 3.3283333332671340e+04, "time_unit": "ns", - "IterationTime": 1.1946583333333335e-05 + "IterationTime": 1.1947133333333333e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/4096/manual_time", @@ -1818,10 +1818,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2124800000000000e+08, - "cpu_time": 2.6059999999716863e+04, + "real_time": 1.2130549999999999e+08, + "cpu_time": 3.2995499999799453e+04, "time_unit": "ns", - "IterationTime": 1.2124800000000001e-05 + "IterationTime": 1.2130549999999999e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/8192/manual_time", @@ -1833,10 +1833,10 @@ "repetition_index": 0, "threads": 1, "iterations": 4, - "real_time": 1.6583399999999997e+08, - "cpu_time": 2.6357500001239488e+04, + "real_time": 1.6620975000000003e+08, + "cpu_time": 2.9792750000368073e+04, "time_unit": "ns", - "IterationTime": 1.6583399999999998e-05 + "IterationTime": 1.6620975000000001e-05 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/256/manual_time", @@ -1848,10 +1848,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.8061800000000000e+07, - "cpu_time": 2.9687800000033349e+04, + "real_time": 6.8096700000000000e+07, + "cpu_time": 2.6223100000066781e+04, "time_unit": "ns", - "IterationTime": 6.8061799999999988e-06 + "IterationTime": 6.8096699999999990e-06 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/512/manual_time", @@ -1863,10 +1863,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.8067600000000000e+07, - "cpu_time": 2.2842899999631070e+04, + "real_time": 6.8104800000000015e+07, + "cpu_time": 3.1231999999903335e+04, "time_unit": "ns", - "IterationTime": 6.8067600000000012e-06 + "IterationTime": 6.8104800000000006e-06 }, { "name": 
"BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/1024/manual_time", @@ -1878,10 +1878,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.8170400000000015e+07, - "cpu_time": 2.2918400000548900e+04, + "real_time": 6.8165500000000000e+07, + "cpu_time": 2.5873999999959098e+04, "time_unit": "ns", - "IterationTime": 6.8170400000000012e-06 + "IterationTime": 6.8165500000000008e-06 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/2048/manual_time", @@ -1893,10 +1893,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.8726600000000000e+07, - "cpu_time": 2.5596999999777381e+04, + "real_time": 6.8736599999999985e+07, + "cpu_time": 3.0934999999487900e+04, "time_unit": "ns", - "IterationTime": 6.8726600000000009e-06 + "IterationTime": 6.8736599999999988e-06 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/4096/manual_time", @@ -1908,10 +1908,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 7.0519899999999985e+07, - "cpu_time": 2.6065000000130567e+04, + "real_time": 7.0558000000000015e+07, + "cpu_time": 2.3976199999964367e+04, "time_unit": "ns", - "IterationTime": 7.0519899999999989e-06 + "IterationTime": 7.0558000000000011e-06 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/8192/manual_time", @@ -1923,14 +1923,14 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1566050000000000e+08, - "cpu_time": 3.1591666666959860e+04, + "real_time": 1.1595766666666667e+08, + "cpu_time": 2.9203333333782666e+04, "time_unit": "ns", - "IterationTime": 1.1566049999999999e-05 + "IterationTime": 1.1595766666666667e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/256/manual_time", - "family_index": 18, + "family_index": 20, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/256/manual_time", "run_type": "iteration", @@ -1938,14 +1938,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 7.3889200000000000e+08, - "cpu_time": 6.6459999999324282e+04, + "real_time": 5.4237800000000000e+08, + "cpu_time": 4.8290000002282337e+04, "time_unit": "ns", - "IterationTime": 7.3889199999999997e-05 + "IterationTime": 5.4237800000000004e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/512/manual_time", - "family_index": 18, + "family_index": 20, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/512/manual_time", "run_type": "iteration", @@ -1953,14 +1953,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 7.4147600000000000e+08, - "cpu_time": 7.4659999995674298e+04, + "real_time": 5.4552000000000000e+08, + "cpu_time": 4.1389999999807973e+04, "time_unit": "ns", - "IterationTime": 7.4147600000000008e-05 + "IterationTime": 5.4551999999999995e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/1024/manual_time", - "family_index": 18, + "family_index": 20, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/1024/manual_time", "run_type": "iteration", @@ -1968,14 +1968,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 7.4624400000000000e+08, - "cpu_time": 7.7210000000604850e+04, + "real_time": 5.5493300000000000e+08, + "cpu_time": 4.2209000000070773e+04, "time_unit": "ns", - "IterationTime": 7.4624399999999997e-05 + "IterationTime": 5.5493299999999999e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/2048/manual_time", - "family_index": 18, + 
"family_index": 20, "per_family_instance_index": 3, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/2048/manual_time", "run_type": "iteration", @@ -1983,14 +1983,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 7.5857400000000000e+08, - "cpu_time": 7.8590000001099717e+04, + "real_time": 5.9554600000000000e+08, + "cpu_time": 3.8520000003927635e+04, "time_unit": "ns", - "IterationTime": 7.5857400000000003e-05 + "IterationTime": 5.9554600000000001e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/4096/manual_time", - "family_index": 18, + "family_index": 20, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/4096/manual_time", "run_type": "iteration", @@ -1998,14 +1998,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 8.5193400000000000e+08, - "cpu_time": 5.7419999997421197e+04, + "real_time": 8.5543900000000000e+08, + "cpu_time": 4.7340000001838693e+04, "time_unit": "ns", - "IterationTime": 8.5193399999999996e-05 + "IterationTime": 8.5543899999999999e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/8192/manual_time", - "family_index": 18, + "family_index": 20, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/8192/manual_time", "run_type": "iteration", @@ -2013,14 +2013,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.5725240000000000e+09, - "cpu_time": 7.8990000005774171e+04, + "real_time": 1.5866330000000000e+09, + "cpu_time": 6.2331000002302520e+04, "time_unit": "ns", - "IterationTime": 1.5725240000000001e-04 + "IterationTime": 1.5866329999999999e-04 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/256/manual_time", - "family_index": 19, + "family_index": 21, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/256/manual_time", "run_type": "iteration", @@ -2028,14 +2028,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 8.4809500000000000e+08, - "cpu_time": 4.7850000001403714e+04, + "real_time": 6.5096400000000000e+08, + "cpu_time": 4.1160000002093962e+04, "time_unit": "ns", - "IterationTime": 8.4809500000000002e-05 + "IterationTime": 6.5096400000000002e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/512/manual_time", - "family_index": 19, + "family_index": 21, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/512/manual_time", "run_type": "iteration", @@ -2043,14 +2043,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 8.5168000000000000e+08, - "cpu_time": 7.4699999998983913e+04, + "real_time": 6.5486500000000000e+08, + "cpu_time": 3.6379999997393497e+04, "time_unit": "ns", - "IterationTime": 8.5167999999999995e-05 + "IterationTime": 6.5486499999999997e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/1024/manual_time", - "family_index": 19, + "family_index": 21, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/1024/manual_time", "run_type": "iteration", @@ -2058,14 +2058,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 8.5788800000000000e+08, - "cpu_time": 7.2150000001158784e+04, + "real_time": 6.6611600000000000e+08, + "cpu_time": 3.5420000003227869e+04, "time_unit": "ns", - "IterationTime": 8.5788800000000001e-05 + "IterationTime": 6.6611600000000004e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/2048/manual_time", - "family_index": 19, + "family_index": 21, "per_family_instance_index": 3, "run_name": 
"BM_pgm_dispatch/kernel_groups_5_shadow/2048/manual_time", "run_type": "iteration", @@ -2073,14 +2073,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 8.7446000000000000e+08, - "cpu_time": 9.0679999999565553e+04, + "real_time": 7.1765000000000000e+08, + "cpu_time": 3.1180000000574637e+04, "time_unit": "ns", - "IterationTime": 8.7446000000000003e-05 + "IterationTime": 7.1765000000000002e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/4096/manual_time", - "family_index": 19, + "family_index": 21, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/4096/manual_time", "run_type": "iteration", @@ -2088,14 +2088,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.0215010000000000e+09, - "cpu_time": 6.7319999999426727e+04, + "real_time": 1.0249530000000000e+09, + "cpu_time": 3.6509000004514295e+04, "time_unit": "ns", - "IterationTime": 1.0215010000000001e-04 + "IterationTime": 1.0249529999999999e-04 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/8192/manual_time", - "family_index": 19, + "family_index": 21, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/8192/manual_time", "run_type": "iteration", @@ -2103,14 +2103,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.8475720000000000e+09, - "cpu_time": 6.4230000006659793e+04, + "real_time": 1.8616840000000000e+09, + "cpu_time": 3.8631000002453671e+04, "time_unit": "ns", - "IterationTime": 1.8475719999999998e-04 + "IterationTime": 1.8616840000000001e-04 }, { "name": "BM_pgm_dispatch/eth_dispatch/256/manual_time", - "family_index": 20, + "family_index": 22, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/eth_dispatch/256/manual_time", "run_type": "iteration", @@ -2118,14 +2118,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9670388888888881e+07, - "cpu_time": 9.0561111111103150e+04, + "real_time": 3.9575555555555552e+07, + "cpu_time": 2.2015555555378163e+04, "time_unit": "ns", - "IterationTime": 3.9670388888888881e-06 + "IterationTime": 3.9575555555555552e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/512/manual_time", - "family_index": 20, + "family_index": 22, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/eth_dispatch/512/manual_time", "run_type": "iteration", @@ -2133,14 +2133,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9587888888888881e+07, - "cpu_time": 2.8096111111134785e+04, + "real_time": 3.9568499999999993e+07, + "cpu_time": 1.8607777777488209e+04, "time_unit": "ns", - "IterationTime": 3.9587888888888878e-06 + "IterationTime": 3.9568499999999992e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/1024/manual_time", - "family_index": 20, + "family_index": 22, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/eth_dispatch/1024/manual_time", "run_type": "iteration", @@ -2148,14 +2148,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9585444444444448e+07, - "cpu_time": 2.6070555555356299e+04, + "real_time": 3.9578277777777784e+07, + "cpu_time": 2.2552444444477893e+04, "time_unit": "ns", - "IterationTime": 3.9585444444444451e-06 + "IterationTime": 3.9578277777777777e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/2048/manual_time", - "family_index": 20, + "family_index": 22, "per_family_instance_index": 3, "run_name": "BM_pgm_dispatch/eth_dispatch/2048/manual_time", "run_type": "iteration", @@ -2163,14 +2163,14 @@ "repetition_index": 0, "threads": 1, "iterations": 
18, - "real_time": 3.9588000000000000e+07, - "cpu_time": 2.6295000000213047e+04, + "real_time": 3.9572277777777784e+07, + "cpu_time": 1.9345055555675117e+04, "time_unit": "ns", - "IterationTime": 3.9588000000000001e-06 + "IterationTime": 3.9572277777777781e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/4096/manual_time", - "family_index": 20, + "family_index": 22, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/eth_dispatch/4096/manual_time", "run_type": "iteration", @@ -2178,14 +2178,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9599777777777776e+07, - "cpu_time": 3.6100555555880950e+04, + "real_time": 3.9572444444444448e+07, + "cpu_time": 2.3290999999956184e+04, "time_unit": "ns", - "IterationTime": 3.9599777777777774e-06 + "IterationTime": 3.9572444444444448e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/8192/manual_time", - "family_index": 20, + "family_index": 22, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/eth_dispatch/8192/manual_time", "run_type": "iteration", @@ -2193,14 +2193,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9588722222222216e+07, - "cpu_time": 3.3049999999769221e+04, + "real_time": 3.9588333333333336e+07, + "cpu_time": 2.9833888889009409e+04, "time_unit": "ns", - "IterationTime": 3.9588722222222222e-06 + "IterationTime": 3.9588333333333335e-06 }, { "name": "BM_pgm_dispatch/tensix_eth_2/256/manual_time", - "family_index": 21, + "family_index": 23, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/tensix_eth_2/256/manual_time", "run_type": "iteration", @@ -2208,14 +2208,14 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.4899440000000000e+08, - "cpu_time": 4.8180000000286331e+04, + "real_time": 1.4142620000000000e+08, + "cpu_time": 3.8329999999575652e+04, "time_unit": "ns", - "IterationTime": 1.4899439999999998e-05 + "IterationTime": 1.4142619999999999e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/512/manual_time", - "family_index": 21, + "family_index": 23, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/tensix_eth_2/512/manual_time", "run_type": "iteration", @@ -2223,14 +2223,14 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.5149100000000000e+08, - "cpu_time": 3.5819999999375796e+04, + "real_time": 1.4812320000000000e+08, + "cpu_time": 3.3424000000081833e+04, "time_unit": "ns", - "IterationTime": 1.5149100000000000e-05 + "IterationTime": 1.4812319999999998e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/1024/manual_time", - "family_index": 21, + "family_index": 23, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/tensix_eth_2/1024/manual_time", "run_type": "iteration", @@ -2238,14 +2238,14 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.5150920000000000e+08, - "cpu_time": 3.9678000000265005e+04, + "real_time": 1.5148540000000000e+08, + "cpu_time": 2.4277999999355870e+04, "time_unit": "ns", - "IterationTime": 1.5150920000000001e-05 + "IterationTime": 1.5148539999999998e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/2048/manual_time", - "family_index": 21, + "family_index": 23, "per_family_instance_index": 3, "run_name": "BM_pgm_dispatch/tensix_eth_2/2048/manual_time", "run_type": "iteration", @@ -2253,14 +2253,14 @@ "repetition_index": 0, "threads": 1, "iterations": 4, - "real_time": 1.6228725000000000e+08, - "cpu_time": 5.8867499999948333e+04, + "real_time": 1.6367274999999997e+08, + "cpu_time": 3.2517750000238266e+04, "time_unit": "ns", - "IterationTime": 
1.6228724999999998e-05 + "IterationTime": 1.6367274999999997e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/4096/manual_time", - "family_index": 21, + "family_index": 23, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/tensix_eth_2/4096/manual_time", "run_type": "iteration", @@ -2268,14 +2268,14 @@ "repetition_index": 0, "threads": 1, "iterations": 3, - "real_time": 2.1681099999999997e+08, - "cpu_time": 4.1203333331907292e+04, + "real_time": 2.1807833333333334e+08, + "cpu_time": 2.7522999999973763e+04, "time_unit": "ns", - "IterationTime": 2.1681099999999998e-05 + "IterationTime": 2.1807833333333332e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/8192/manual_time", - "family_index": 21, + "family_index": 23, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/tensix_eth_2/8192/manual_time", "run_type": "iteration", @@ -2283,14 +2283,14 @@ "repetition_index": 0, "threads": 1, "iterations": 2, - "real_time": 3.2368400000000000e+08, - "cpu_time": 5.7760000000683933e+04, + "real_time": 3.2477100000000006e+08, + "cpu_time": 3.4020000001078188e+04, "time_unit": "ns", - "IterationTime": 3.2368399999999993e-05 + "IterationTime": 3.2477100000000001e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/256/manual_time", - "family_index": 22, + "family_index": 24, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/256/manual_time", "run_type": "iteration", @@ -2298,14 +2298,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.1004590000000000e+09, - "cpu_time": 5.8760000001711887e+04, + "real_time": 1.0864170000000000e+09, + "cpu_time": 3.6670000000071923e+04, "time_unit": "ns", - "IterationTime": 1.1004590000000000e-04 + "IterationTime": 1.0864170000000000e-04 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/512/manual_time", - "family_index": 22, + "family_index": 24, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/512/manual_time", "run_type": "iteration", @@ -2313,14 +2313,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.1251760000000000e+09, - "cpu_time": 6.8139999996219558e+04, + "real_time": 1.1051990000000000e+09, + "cpu_time": 3.6348999998381260e+04, "time_unit": "ns", - "IterationTime": 1.1251759999999999e-04 + "IterationTime": 1.1051990000000001e-04 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/1024/manual_time", - "family_index": 22, + "family_index": 24, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/1024/manual_time", "run_type": "iteration", @@ -2328,14 +2328,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.1302340000000000e+09, - "cpu_time": 6.0260000005030175e+04, + "real_time": 1.1301090000000000e+09, + "cpu_time": 3.0899999998723615e+04, "time_unit": "ns", - "IterationTime": 1.1302339999999999e-04 + "IterationTime": 1.1301090000000001e-04 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/2048/manual_time", - "family_index": 22, + "family_index": 24, "per_family_instance_index": 3, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/2048/manual_time", "run_type": "iteration", @@ -2343,14 +2343,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.1302240000000000e+09, - "cpu_time": 5.9579999998504718e+04, + "real_time": 1.1301990000000000e+09, + "cpu_time": 3.8449999998135812e+04, "time_unit": "ns", - "IterationTime": 1.1302240000000000e-04 + "IterationTime": 1.1301989999999999e-04 }, { "name": 
"BM_pgm_dispatch/tensix_eth_2_4_shadow/4096/manual_time", - "family_index": 22, + "family_index": 24, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/4096/manual_time", "run_type": "iteration", @@ -2358,14 +2358,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.2231670000000000e+09, - "cpu_time": 7.3360000001798646e+04, + "real_time": 1.2371950000000000e+09, + "cpu_time": 3.1809999995857652e+04, "time_unit": "ns", - "IterationTime": 1.2231669999999999e-04 + "IterationTime": 1.2371950000000001e-04 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/8192/manual_time", - "family_index": 22, + "family_index": 24, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/8192/manual_time", "run_type": "iteration", @@ -2373,10 +2373,10 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.8250840000000000e+09, - "cpu_time": 6.4179999995417347e+04, + "real_time": 1.8342070000000000e+09, + "cpu_time": 3.7970999997583021e+04, "time_unit": "ns", - "IterationTime": 1.8250840000000001e-04 + "IterationTime": 1.8342070000000000e-04 } ] } From ef443a7fb3d3524a29caf2bb03f5cff7d7b99af3 Mon Sep 17 00:00:00 2001 From: Joseph Chu Date: Fri, 14 Feb 2025 08:34:38 +0000 Subject: [PATCH 135/316] #0: Provide an example of hybrid TP/DP using all-gather w/ line topo --- ...brid_data_tensor_parallel_example_T3000.py | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 tests/ttnn/distributed/test_hybrid_data_tensor_parallel_example_T3000.py diff --git a/tests/ttnn/distributed/test_hybrid_data_tensor_parallel_example_T3000.py b/tests/ttnn/distributed/test_hybrid_data_tensor_parallel_example_T3000.py new file mode 100644 index 00000000000..65c8b954784 --- /dev/null +++ b/tests/ttnn/distributed/test_hybrid_data_tensor_parallel_example_T3000.py @@ -0,0 +1,94 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + + +import ttnn +import torch +import transformers +import pytest + +from tests.ttnn.utils_for_testing import assert_with_pcc +from ttnn.model_preprocessing import preprocess_model_parameters + +CLUSTER_AXIS_X = 1 + + +class TtFalconMLP: + def __init__(self, parameters, mesh_device): + super().__init__() + self.mesh_device = mesh_device + self.dense_h_to_4h_weights = parameters.dense_h_to_4h.weight + self.dense_4h_to_h_weights = parameters.dense_4h_to_h.weight + + def __call__(self, x: ttnn.Tensor) -> ttnn.Tensor: + ff1_linear: ttnn.Tensor = ttnn.linear(x, self.dense_h_to_4h_weights) + gelu = ttnn.gelu(ff1_linear) + + # Effectively invokes CCL Line All Gather for every row of the mesh + gelu = ttnn.all_gather( + gelu, + dim=-1, + num_links=1, + cluster_axis=CLUSTER_AXIS_X, + mesh_device=self.mesh_device, + topology=ttnn.Topology.Linear, + ) + + ff2_linear: ttnn.Tensor = ttnn.linear(gelu, self.dense_4h_to_h_weights) + + return ff2_linear + + +def test_tensor_parallel_falcon_mlp(): + if ttnn.get_num_devices() < 8: + pytest.skip() + + mesh_device = ttnn.open_mesh_device( + ttnn.MeshShape(2, 4), + ) + mesh_device.enable_async(True) + + # Set PyTorch seed for reproducibility + torch.manual_seed(0) + + # Load Falcon MLP model from huggingface + config = transformers.FalconConfig.from_pretrained("tiiuae/falcon-7b-instruct") + model = transformers.models.falcon.modeling_falcon.FalconMLP(config).eval() + + # Initialize hidden states + batch_size, sequence_length = 2, 256 + torch_hidden_states = (torch.rand(batch_size, 1, sequence_length, config.hidden_size, dtype=torch.float32) * 2) - 1 + torch_output = model.forward(torch_hidden_states) + + # DP = 2; shard activations on batch-dim: [2,1,sequence_length,hidden_size] and replicate along columns of the mesh + # [A0, A0, A0, A0] + # [A1, A1, A1, A1] + hidden_states, parameters = None, None + mesh_shape = tuple(mesh_device.shape) + + with ttnn.distribute(ttnn.ShardTensor2dMesh(mesh_device, mesh_shape=mesh_shape, dims=(0, None))): + hidden_states = ttnn.from_torch( + torch_hidden_states, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=mesh_device, + ) + + # TP = 4; ctx manager replicate model weights along rows of the mesh and shards replicas on columns of the mesh + # [W0, W1, W2, W3] + # [W0, W1, W2, W3] + with ttnn.distribute(ttnn.ShardTensor2dMesh(mesh_device, mesh_shape=mesh_shape, dims=(None, -1))): + parameters = ttnn.model_preprocessing.preprocess_model_parameters( + initialize_model=lambda: model, + device=mesh_device, + ) + + # Initialize Model + ttnn_model = TtFalconMLP(parameters, mesh_device) + + # Run Model + ttnn_output = ttnn_model(hidden_states) + + with ttnn.distribute(ttnn.ConcatMesh2dToTensor(mesh_device, mesh_shape=(2, 4), dims=(0, -1))): + assert_with_pcc(torch_output, ttnn.to_torch(ttnn_output), 0.98) From b3faec4a37e318c6365dcb19f1d89781aa8c0d8d Mon Sep 17 00:00:00 2001 From: Jay Kruer Date: Tue, 4 Feb 2025 21:45:30 +0000 Subject: [PATCH 136/316] #0: Fix includes in clip_grad_norm.cpp --- tt-train/sources/ttml/core/clip_grad_norm.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tt-train/sources/ttml/core/clip_grad_norm.cpp b/tt-train/sources/ttml/core/clip_grad_norm.cpp index 6577be9c8de..80f49b3ca83 100644 --- a/tt-train/sources/ttml/core/clip_grad_norm.cpp +++ b/tt-train/sources/ttml/core/clip_grad_norm.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include #include -#include +#include "core/clip_grad_norm.hpp" +#include 
"core/compute_kernel_config.hpp" +#include "serialization/serializable.hpp" namespace ttml::core { From 81451213a655ed1bd02a43036261d7e06407cb9a Mon Sep 17 00:00:00 2001 From: Jay Kruer Date: Tue, 4 Feb 2025 21:46:09 +0000 Subject: [PATCH 137/316] #0: Move grad clipping w.r.t grad accumulation in nanogpt --- tt-train/sources/examples/nano_gpt/main.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tt-train/sources/examples/nano_gpt/main.cpp b/tt-train/sources/examples/nano_gpt/main.cpp index 03dfa68eed0..737d81f3171 100644 --- a/tt-train/sources/examples/nano_gpt/main.cpp +++ b/tt-train/sources/examples/nano_gpt/main.cpp @@ -635,14 +635,13 @@ int main(int argc, char **argv) { auto samples = features->get_value().get_logical_shape()[0]; gradient_accumulator_helper.update(loss_float, samples); - // synchronize gradients for multi-device case, no-op if single device - auto parameters = model->parameters(); - ttml::core::distributed::synchronize_parameters(parameters); - if (config.use_clip_grad_norm) { - ttml::core::clip_grad_norm(parameters, config.clip_grad_norm_max_norm); - } - if (gradient_accumulator_helper.should_step()) { + // synchronize gradients for multi-device case, no-op if single device + auto parameters = model->parameters(); + ttml::core::distributed::synchronize_parameters(parameters); + if (config.use_clip_grad_norm) { + ttml::core::clip_grad_norm(parameters, config.clip_grad_norm_max_norm); + } optimizer->step(); scheduler->step(); auto global_step = optimizer->get_steps(); From e8f974e4548a93763fdbfcbbf4d478f9f624058c Mon Sep 17 00:00:00 2001 From: Jay Kruer Date: Wed, 5 Feb 2025 18:00:45 +0000 Subject: [PATCH 138/316] #0: Add ddp to nanogpt yaml --- tt-train/configs/training_shakespear_nanogpt.yaml | 1 + tt-train/sources/examples/nano_gpt/main.cpp | 15 +++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tt-train/configs/training_shakespear_nanogpt.yaml b/tt-train/configs/training_shakespear_nanogpt.yaml index 3ce6e32d1f8..45f734f7720 100644 --- a/tt-train/configs/training_shakespear_nanogpt.yaml +++ b/tt-train/configs/training_shakespear_nanogpt.yaml @@ -11,6 +11,7 @@ training_config: use_kahan_summation: false use_clip_grad_norm: false clip_grad_norm_max_norm: 1.0 + use_ddp: false transformer_config: num_heads: 6 embedding_dim: 384 diff --git a/tt-train/sources/examples/nano_gpt/main.cpp b/tt-train/sources/examples/nano_gpt/main.cpp index 737d81f3171..927814c8741 100644 --- a/tt-train/sources/examples/nano_gpt/main.cpp +++ b/tt-train/sources/examples/nano_gpt/main.cpp @@ -333,6 +333,7 @@ struct TrainingConfig { std::string scheduler_type = "identity"; bool use_clip_grad_norm = false; float clip_grad_norm_max_norm = 1.0F; + bool use_ddp = false; ttml::models::gpt2::TransformerConfig transformer_config; }; @@ -356,6 +357,7 @@ TrainingConfig parse_config(const YAML::Node &yaml_config) { config.tokenizer_type = training_config["tokenizer_type"].as(config.tokenizer_type); config.scheduler_type = training_config["scheduler_type"].as(config.scheduler_type); config.use_clip_grad_norm = training_config["use_clip_grad_norm"].as(config.use_clip_grad_norm); + config.use_ddp = training_config["use_ddp"].as(config.use_ddp); config.clip_grad_norm_max_norm = training_config["clip_grad_norm_max_norm"].as(config.clip_grad_norm_max_norm); @@ -377,14 +379,18 @@ int main(int argc, char **argv) { bool is_eval = false; bool add_time_to_name = true; bool enable_wandb = true; - bool ddp = false; app.add_option("-c,--config", 
config_name, "Yaml Config name")->default_val(config_name); app.add_option("-e,--eval", is_eval, "Is evaluation")->default_val(is_eval); app.add_option("-t,--add_time_to_name", add_time_to_name, "Add time to run name")->default_val(add_time_to_name); app.add_option("-w,--wandb", enable_wandb, "Enable wandb logging")->default_val(enable_wandb); - app.add_option("-d,--ddp", ddp, "Enable DDP")->default_val(ddp); CLI11_PARSE(app, argc, argv); + auto yaml_config = YAML::LoadFile(config_name); + TrainingConfig config = parse_config(yaml_config); + EvalConfig eval_config = parse_eval_config(yaml_config); + + bool ddp = config.use_ddp; + initialize_device(ddp); if (enable_wandb) { @@ -395,10 +401,6 @@ int main(int argc, char **argv) { } } - auto yaml_config = YAML::LoadFile(config_name); - TrainingConfig config = parse_config(yaml_config); - EvalConfig eval_config = parse_eval_config(yaml_config); - if (enable_wandb) { wandbcpp::init({.project = config.project_name, .name = generate_run_name(config, add_time_to_name)}); wandbcpp::update_config({ @@ -424,6 +426,7 @@ int main(int argc, char **argv) { {"scheduler_type", config.scheduler_type}, {"using_clip_grad_norm", config.use_clip_grad_norm}, {"clip_grad_norm_max_norm", config.clip_grad_norm_max_norm}, + {"use_ddp", config.use_ddp}, }); } From 4abdb6a888b618b9befbd552b99aacdf62909232 Mon Sep 17 00:00:00 2001 From: Jay Kruer Date: Tue, 18 Feb 2025 17:00:48 +0000 Subject: [PATCH 139/316] Revert "#0: Add ddp to nanogpt yaml" This reverts commit b2ccef2672ea8763f398325a32db867bf1f92683. --- tt-train/configs/training_shakespear_nanogpt.yaml | 1 - tt-train/sources/examples/nano_gpt/main.cpp | 15 ++++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/tt-train/configs/training_shakespear_nanogpt.yaml b/tt-train/configs/training_shakespear_nanogpt.yaml index 45f734f7720..3ce6e32d1f8 100644 --- a/tt-train/configs/training_shakespear_nanogpt.yaml +++ b/tt-train/configs/training_shakespear_nanogpt.yaml @@ -11,7 +11,6 @@ training_config: use_kahan_summation: false use_clip_grad_norm: false clip_grad_norm_max_norm: 1.0 - use_ddp: false transformer_config: num_heads: 6 embedding_dim: 384 diff --git a/tt-train/sources/examples/nano_gpt/main.cpp b/tt-train/sources/examples/nano_gpt/main.cpp index 927814c8741..737d81f3171 100644 --- a/tt-train/sources/examples/nano_gpt/main.cpp +++ b/tt-train/sources/examples/nano_gpt/main.cpp @@ -333,7 +333,6 @@ struct TrainingConfig { std::string scheduler_type = "identity"; bool use_clip_grad_norm = false; float clip_grad_norm_max_norm = 1.0F; - bool use_ddp = false; ttml::models::gpt2::TransformerConfig transformer_config; }; @@ -357,7 +356,6 @@ TrainingConfig parse_config(const YAML::Node &yaml_config) { config.tokenizer_type = training_config["tokenizer_type"].as(config.tokenizer_type); config.scheduler_type = training_config["scheduler_type"].as(config.scheduler_type); config.use_clip_grad_norm = training_config["use_clip_grad_norm"].as(config.use_clip_grad_norm); - config.use_ddp = training_config["use_ddp"].as(config.use_ddp); config.clip_grad_norm_max_norm = training_config["clip_grad_norm_max_norm"].as(config.clip_grad_norm_max_norm); @@ -379,18 +377,14 @@ int main(int argc, char **argv) { bool is_eval = false; bool add_time_to_name = true; bool enable_wandb = true; + bool ddp = false; app.add_option("-c,--config", config_name, "Yaml Config name")->default_val(config_name); app.add_option("-e,--eval", is_eval, "Is evaluation")->default_val(is_eval); app.add_option("-t,--add_time_to_name", 
add_time_to_name, "Add time to run name")->default_val(add_time_to_name); app.add_option("-w,--wandb", enable_wandb, "Enable wandb logging")->default_val(enable_wandb); + app.add_option("-d,--ddp", ddp, "Enable DDP")->default_val(ddp); CLI11_PARSE(app, argc, argv); - auto yaml_config = YAML::LoadFile(config_name); - TrainingConfig config = parse_config(yaml_config); - EvalConfig eval_config = parse_eval_config(yaml_config); - - bool ddp = config.use_ddp; - initialize_device(ddp); if (enable_wandb) { @@ -401,6 +395,10 @@ int main(int argc, char **argv) { } } + auto yaml_config = YAML::LoadFile(config_name); + TrainingConfig config = parse_config(yaml_config); + EvalConfig eval_config = parse_eval_config(yaml_config); + if (enable_wandb) { wandbcpp::init({.project = config.project_name, .name = generate_run_name(config, add_time_to_name)}); wandbcpp::update_config({ @@ -426,7 +424,6 @@ int main(int argc, char **argv) { {"scheduler_type", config.scheduler_type}, {"using_clip_grad_norm", config.use_clip_grad_norm}, {"clip_grad_norm_max_norm", config.clip_grad_norm_max_norm}, - {"use_ddp", config.use_ddp}, }); } From b5b199bac7d6c45fc62ae65676bc188c1a0381df Mon Sep 17 00:00:00 2001 From: Michael Chiou <156848643+ttmchiou@users.noreply.github.com> Date: Tue, 18 Feb 2025 11:25:32 -0800 Subject: [PATCH 140/316] Revert "#17094: fill implicit pad sharded using the new shardedAddrGen (#17692)" This reverts commit ed210e7dae8dafba91a5434d6fbb50dc7dce8932. --- .../unit_tests/operations/test_fill_pad.py | 153 +----------------- .../fill_pad/device/fill_pad_op.cpp | 6 + .../device/fill_pad_program_factory.cpp | 13 +- .../kernels/dataflow/fill_pad_writer.cpp | 28 +--- 4 files changed, 13 insertions(+), 187 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_fill_pad.py b/tests/ttnn/unit_tests/operations/test_fill_pad.py index 489cb371325..48dff554b6c 100644 --- a/tests/ttnn/unit_tests/operations/test_fill_pad.py +++ b/tests/ttnn/unit_tests/operations/test_fill_pad.py @@ -5,7 +5,6 @@ import pytest import torch import ttnn -import math from tests.ttnn.utils_for_testing import assert_with_pcc from models.utility_functions import torch_random, run_for_wormhole_b0 @@ -53,12 +52,12 @@ def create_nd_padded_tiled_tensor(shape, tile_size, fill_value, dtype): ttnn.bfloat16: torch.float32, } -# torch.set_printoptions(threshold=10000) - +# @pytest.mark.parametrize("shape", [(2, 32, 300, 256)]) @pytest.mark.parametrize( "shape", [ + # 2D shapes with edge cases for fill_pad (1, 16), (16, 1), (1, 17), @@ -68,7 +67,6 @@ def create_nd_padded_tiled_tensor(shape, tile_size, fill_value, dtype): (31, 31), (33, 33), (65, 65), - (97, 97), (1, 2, 3, 2, 1, 2, 97, 97), ], ) @@ -98,150 +96,3 @@ def test_fill_pad( padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor) - - -@pytest.mark.parametrize("fill_value", [1]) -@pytest.mark.parametrize( - "shape", - [ - (1, 16), - (97, 97), - ], -) -@pytest.mark.parametrize( - "shard_scheme", - [ - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.TensorMemoryLayout.WIDTH_SHARDED, - ttnn.TensorMemoryLayout.BLOCK_SHARDED, - ], -) -@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.uint32]) -def test_fill_pad_complex_sharding(device, fill_value, shape, shard_scheme, dtype): - torch.manual_seed(1234) - torch_input_tensor, padded_torch_tensor = create_nd_padded_tiled_tensor( - shape, 32, fill_value, ttnn_dtype_to_torch_dtype[dtype] - ) - num_cores_xblock = 2 - num_cores_yblock = 4 - num_cores = 
num_cores_xblock * num_cores_yblock - - # Add complex shard grid with 2 X 4 = 8 cores - shard_grid = ttnn.CoreRangeSet( - [ - ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 1)), - ttnn.CoreRange(ttnn.CoreCoord(2, 0), ttnn.CoreCoord(3, 1)), - ttnn.CoreRange(ttnn.CoreCoord(0, 4), ttnn.CoreCoord(0, 5)), - ] - ) - - tiles_per_2d = padded_torch_tensor.shape[-2] * padded_torch_tensor.shape[-1] / (32 * 32) - dims_b4_last_dim = 1 - for i in range(len(padded_torch_tensor.shape) - 1): - dims_b4_last_dim *= padded_torch_tensor.shape[i] - - shard_shape = [32, 32] - if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED: - shard_shape = (dims_b4_last_dim, 32 * math.ceil((math.ceil(padded_torch_tensor.shape[-1] / 32) / num_cores))) - elif shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED: - tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores) - shard_shape = (32 * tile_widths_per_core, padded_torch_tensor.shape[-1]) - elif shard_scheme == ttnn.TensorMemoryLayout.BLOCK_SHARDED: - tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores_xblock) - shard_shape = ( - 32 * tile_widths_per_core, - 32 * math.ceil((math.ceil(padded_torch_tensor.shape[-1] / 32) / num_cores_yblock)), - ) - else: - shard_shape = (math.ceil(math.sqrt(tiles_per_core)), math.ceil(math.sqrt(tiles_per_core))) - - shard_spec = ttnn.ShardSpec(shard_grid, shard_shape, ttnn.ShardOrientation.ROW_MAJOR) - output_mem_config = ttnn.MemoryConfig( - shard_scheme, - ttnn.BufferType.L1, - shard_spec, - ) - - input_tensor = ttnn.to_device( - ttnn.from_torch(torch_input_tensor, dtype=dtype, layout=ttnn.TILE_LAYOUT), - device, - memory_config=output_mem_config, - ) - - output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=ttnn.DRAM_MEMORY_CONFIG) - padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() - - assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor, 0.99) - - -@pytest.mark.parametrize("fill_value", [1]) -@pytest.mark.parametrize( - "shape", - [ - (1, 16), - (16, 1), - (17, 17), - (17, 1), - (16, 16), - (17, 17), - (31, 31), - (33, 33), - (97, 97), - ], -) -@pytest.mark.parametrize( - "shard_scheme", - [ - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.TensorMemoryLayout.WIDTH_SHARDED, - ttnn.TensorMemoryLayout.BLOCK_SHARDED, - ], -) -@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.uint32]) -def test_fill_pad_sharded(device, fill_value, shape, shard_scheme, dtype): - torch.manual_seed(1234) - torch_input_tensor, padded_torch_tensor = create_nd_padded_tiled_tensor( - shape, 32, fill_value, ttnn_dtype_to_torch_dtype[dtype] - ) - - num_cores_x = 8 - num_cores_y = 7 - num_cores = num_cores_x * num_cores_y - shard_grid = ttnn.CoreRangeSet( - [ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(num_cores_x - 1, num_cores_y - 1))] - ) - - tiles_per_2d = padded_torch_tensor.shape[-2] * padded_torch_tensor.shape[-1] / (32 * 32) - dims_b4_last_dim = 1 - for i in range(len(padded_torch_tensor.shape) - 1): - dims_b4_last_dim *= padded_torch_tensor.shape[i] - - shard_shape = [32, 32] - if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED: - shard_shape = (dims_b4_last_dim, 32 * math.ceil((math.ceil(padded_torch_tensor.shape[-1] / 32) / num_cores))) - elif shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED: - tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores) - shard_shape = (32 * tile_widths_per_core, padded_torch_tensor.shape[-1]) - elif shard_scheme == ttnn.TensorMemoryLayout.BLOCK_SHARDED: - tile_widths_per_core = 
math.ceil(dims_b4_last_dim / num_cores_x) - shard_shape = (32 * tile_widths_per_core, 32 * math.ceil((padded_torch_tensor.shape[-1] / 32 / num_cores_y))) - else: - shard_shape = (math.ceil(math.sqrt(tiles_per_core)), math.ceil(math.sqrt(tiles_per_core))) - - shard_spec = ttnn.ShardSpec(shard_grid, shard_shape, ttnn.ShardOrientation.ROW_MAJOR) - output_mem_config = ttnn.MemoryConfig( - shard_scheme, - ttnn.BufferType.L1, - shard_spec, - ) - - input_tensor = ttnn.to_device( - ttnn.from_torch(torch_input_tensor, dtype=dtype, layout=ttnn.TILE_LAYOUT), - device, - memory_config=output_mem_config, - ) - - output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=ttnn.DRAM_MEMORY_CONFIG) - padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() - - assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor, 0.99) diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp index 3de81f581ff..78c13267c69 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp @@ -14,6 +14,12 @@ namespace ttnn::operations::data_movement { void FillPad::validate(const std::vector& input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); TT_FATAL(input_tensor_a.get_layout() == TILE_LAYOUT, "FillPad should only be used for tile layout"); + TT_FATAL( + input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::INTERLEAVED, + "FillPad does not currently support sharding"); + TT_FATAL( + this->output_mem_config.memory_layout == TensorMemoryLayout::INTERLEAVED, + "FillPad does not currently support sharding"); } std::vector FillPad::compute_output_specs(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp index b07c6e65bf0..e798d9f0c3f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp @@ -9,7 +9,6 @@ #include #include #include -#include "ttnn/operations/ccl/sharding_addrgen_helper.hpp" bool is_power_of_two_at_least_32(uint32_t value) { return value >= 32 && (value & (value - 1)) == 0; } @@ -69,8 +68,6 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, padded_height / tt::constants::TILE_HEIGHT * padded_width / tt::constants::TILE_HEIGHT; uint32_t tiles_per_tile_row = padded_width / tt::constants::TILE_HEIGHT; - bool sharded = input_tensor.memory_config().memory_layout != TensorMemoryLayout::INTERLEAVED; - // create kernel // reader compile time args std::vector writer_compile_time_args = { @@ -85,12 +82,7 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, (std::uint32_t)tiles_per_2d_tensor, (std::uint32_t)tiles_per_tile_row, (std::uint32_t)tt::constants::TILE_HEIGHT, - (std::uint32_t)tt::constants::FACE_HEIGHT, - (std::uint32_t)sharded}; - - if (sharded) { - shard_builder::extend_sharding_compile_time_args(input_tensor, writer_compile_time_args); - } + (std::uint32_t)tt::constants::FACE_HEIGHT}; tt::tt_metal::KernelHandle writer_kernel_id = tt::tt_metal::CreateKernel( program, @@ -110,9 +102,6 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, { 
writer_runtime_args[2] = tile_offset; writer_runtime_args[3] = local_num_2d_tensors; - if (sharded) { - shard_builder::extend_sharding_run_time_args(input_tensor, writer_runtime_args); - } tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, writer_runtime_args); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp index 91d166e9510..a94aa7fdea0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp @@ -3,8 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include "dataflow_api.h" -#include "cpp/ttnn/operations/ccl/shared_with_host/sharded_tensor_addr_gen.hpp" -#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/sharding_addrgen.hpp" void kernel_main() { constexpr uint32_t cb_id_0 = get_compile_time_arg_val(0); @@ -21,38 +19,20 @@ void kernel_main() { constexpr uint32_t tile_size = get_compile_time_arg_val(10); constexpr uint32_t tile_hw = tile_size * tile_size; constexpr uint32_t face_size = get_compile_time_arg_val(11); -#define SHARDED get_compile_time_arg_val(12) == 1 constexpr uint32_t face_hw = face_size * face_size; constexpr uint32_t alignment_adjustor = 16; - uint32_t rt_arg_ind = 0; - uint32_t dst_addr = get_arg_val(rt_arg_ind++); - uint32_t cb_page_size = get_arg_val(rt_arg_ind++); - uint32_t starting_tile_offset = get_arg_val(rt_arg_ind++); - uint32_t num_2d_tensors = get_arg_val(rt_arg_ind++); + uint32_t dst_addr = get_arg_val(0); + uint32_t cb_page_size = get_arg_val(1); + uint32_t starting_tile_offset = get_arg_val(2); + uint32_t num_2d_tensors = get_arg_val(3); -#if (SHARDED) - typedef ShardedInfo< - get_compile_time_arg_val(13), - get_compile_time_arg_val(14), - get_compile_time_arg_val(15), - get_compile_time_arg_val(16), - get_compile_time_arg_val(17), - get_compile_time_arg_val(18), - get_compile_time_arg_val(19)> - tensor_shard_info; - - const auto [mapping_table, rt_increment] = - experimental::shard_addr_gen_utils::get_shard_map(get_arg_addr(rt_arg_ind)); - experimental::ShardedAddrGen s0 = {.bank_base_address = dst_addr, .shard_array = mapping_table}; -#else const DataFormat data_format = get_dataformat(cb_id_0); const InterleavedAddrGenFast s0 = { .bank_base_address = dst_addr, .page_size = tile_hw * element_size_bytes, .data_format = data_format // page_size needs to be tile_size_bytes }; -#endif // Reserve and push the fill value into the circular buffer cb_reserve_back(cb_id_0, 1); From 2578433dbac501d02103ac8a35f716b689f1ecc6 Mon Sep 17 00:00:00 2001 From: Ata Tuzuner Date: Tue, 18 Feb 2025 14:43:43 -0500 Subject: [PATCH 141/316] #15450: Remove default values from circular buffer parameters in LLK compute APIs: Matmul (#16571) ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/15450) ### Problem description Default values for circular buffer arguments in the LLK compute API can cause errors. Forgetting to set these arguments explicitly may lead to errors due to wrong cb usage. This PR is specific to the changes in the matmul kernel APIs: ./tt_metal/include/compute_kernel_api/matmul.h ### What's changed Default values for the circular buffer parameters have been removed from functions within these files. The call chains invoking these functions have been updated to contain explicit arguments for these parameters. 
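For reviewers, a minimal sketch of what a call site looks like after this change, following the pattern in the updated kernels below (the CB indices `c_0`/`c_1`/`c_16` are simply the in0/in1/out buffers those kernels happen to use, not required values):

```cpp
namespace NAMESPACE {
void MAIN {
    // Previously `mm_init();` relied on the now-removed defaults (0, 1, 16).
    // The input and output circular buffers must be passed explicitly.
    mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16);

    // ... cb_wait_front / matmul_tiles loop elided ...

    // Re-initializing matmul after an intermediate op likewise needs explicit CBs.
    mm_init_short(tt::CBIndex::c_0, tt::CBIndex::c_1);
}
}  // namespace NAMESPACE
```
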
### Checklist - [x] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13395111513) - [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13395116648) (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- METALIUM_GUIDE.md | 4 +-- ...m_large_block_zm_fused_bias_activation.cpp | 2 +- ...m_large_block_zm_fused_bias_activation.cpp | 2 +- .../old/matmul/kernels/compute_local_l1.cpp | 2 +- .../tt_metal/test_kernels/compute/bmm.cpp | 2 +- .../compute/bmm_large_block_zm.cpp | 4 +-- ...m_large_block_zm_fused_bias_activation.cpp | 2 +- .../compute/bmm_tilize_untilize.cpp | 6 ++--- .../tt_metal/test_kernels/compute/matmul.cpp | 2 +- .../compute/matmul_large_block.cpp | 8 +++--- .../matmul_large_block_generalized.cpp | 8 +++--- .../compute/matmul_large_block_zm.cpp | 4 +-- .../test_kernels/compute/matmul_with_bias.cpp | 2 +- .../unit_tests/matmul/multi_block_compute.cpp | 4 +-- .../unit_tests/matmul/multi_tile_compute.cpp | 2 +- .../unit_tests/matmul/single_tile_compute.cpp | 2 +- tt_metal/include/compute_kernel_api/matmul.h | 27 +++++++++---------- .../matmul_common/kernels/compute/bmm.cpp | 2 +- .../kernels/compute/bmm_large_block_zm.cpp | 4 +-- .../kernels/compute/bmm_tilize_untilize.cpp | 6 ++--- .../compute/rotary_embedding_llama.cpp | 2 +- .../rotary_embedding_llama_sharded.cpp | 2 +- .../matmul/device/kernels/compute/bmm.cpp | 2 +- .../device/kernels/compute/reduce_w.cpp | 2 +- .../device/kernels/compute/joint_sdpa.cpp | 2 +- .../sdpa/device/kernels/compute/sdpa.cpp | 2 +- .../kernels/compute/sdpa_flash_decode.cpp | 2 +- 27 files changed, 54 insertions(+), 55 deletions(-) diff --git a/METALIUM_GUIDE.md b/METALIUM_GUIDE.md index 5ddc05de55e..96233f76355 100644 --- a/METALIUM_GUIDE.md +++ b/METALIUM_GUIDE.md @@ -125,7 +125,7 @@ kernel: ``` namespace NAMESPACE { void MAIN { - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); acquire_dst(); cb_wait_front(tt::CBIndex::c_0, /* number of tiles */ 1); @@ -297,7 +297,7 @@ with `tile_regs_..()` functions like: ``` namespace NAMESPACE { void MAIN { - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); cb_wait_front(tt::CBIndex::c_0, /* number of tiles */ 1); cb_wait_front(tt::CBIndex::c_1, /* number of tiles */ 1); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp index 43ba5dee588..fb23c6513d0 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp @@ -115,7 +115,7 @@ void MAIN { } cb_pop_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles); // reconfigure init for matmul - mm_init_short(); + mm_init_short(in0_cb_id, in1_cb_id); // reconfigure unpacker df for src B reconfig_data_format(in1_cb_id, in0_cb_id); #endif diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp 
b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp index 8a1ec1c7c45..71010733509 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp @@ -115,7 +115,7 @@ void MAIN { } cb_pop_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles); // reconfigure init for matmul - mm_init_short(); + mm_init_short(in0_cb_id, in1_cb_id); // reconfigure unpacker df for src B reconfig_data_format(in1_cb_id, in0_cb_id); #endif diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp index cf6377c765b..c4cbb82b508 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp @@ -12,7 +12,7 @@ void MAIN { constexpr int onetile = 1; - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); for (uint32_t mt = 0; mt < sub_Mt; ++mt) { for (uint32_t nt = 0; nt < sub_Nt; ++nt) { diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp index 06df68f3425..249f2ed5f23 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp @@ -22,7 +22,7 @@ void MAIN { uint32_t Kt = get_compile_time_arg_val(2); uint32_t Nt = get_compile_time_arg_val(3); - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); // the simplest possible version of outer product blocked matmul // the reader is expected to read the A's and B's tile rows and tile columns for each output tile diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp index 2ec32305293..e456700b57c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp @@ -22,7 +22,7 @@ void MAIN { uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; uint32_t batch = get_compile_time_arg_val(11); // batch dim - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); for (uint32_t b = 0; b < batch; b++) { bool spill = num_blocks > 1; @@ -47,7 +47,7 @@ void MAIN { copy_tile(tt::CBIndex::c_24, i, i); } cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles); - mm_init_short(); + mm_init_short(tt::CBIndex::c_0, tt::CBIndex::c_1); } // Compute output sub-block from in0_subblock x in1_subblock diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp index 8a1ec1c7c45..71010733509 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp @@ -115,7 +115,7 @@ void MAIN { } cb_pop_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles); // reconfigure init for matmul - mm_init_short(); + mm_init_short(in0_cb_id, in1_cb_id); // reconfigure unpacker df for src B reconfig_data_format(in1_cb_id, in0_cb_id); #endif diff 
--git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp index ff2660e047a..5c408f7935c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp @@ -147,7 +147,7 @@ void MAIN { bool last_out = (in0_block_w_i == in0_num_blocks_w - 1); if (tilize_in0) { tilize_in(in0_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks, tilized_in0_cb_id); - mm_init_short(); + mm_init_short(tilized_in0_cb_id, in1_cb_id); cb_wait_front(tilized_in0_cb_id, in0_block_num_tiles); } else { cb_wait_front(in0_cb_id, in0_block_num_tiles); @@ -217,7 +217,7 @@ void MAIN { // do not pop front bias as it may be used again for subsequent blocks cb_pop_front(out_for_bias_cb_id, out_subblock_num_tiles); // reconfig for matmul - mm_init_short(); + mm_init_short(tilize_in0 ? tilized_in0_cb_id : in0_cb_id, in1_cb_id); // reconfig unpacker df for srcB // reconfig_data_format(in1_cb_id, in0_cb_id); } @@ -251,7 +251,7 @@ void MAIN { untilize_mode_final_matmul_partials_cb, untilize_mode_reblock_cb, out_cb_id); - mm_init_short(); + mm_init_short(tilize_in0 ? tilized_in0_cb_id : in0_cb_id, in1_cb_id); } // last_out #endif in0_index_subblock_offset += in0_subblock_num_tiles; diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp index b4a264f32a4..fdb66ca4d62 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp @@ -17,7 +17,7 @@ void MAIN { uint32_t in1_block_tile_cnt = get_compile_time_arg_val(5); uint32_t out_block_tile_cnt = get_compile_time_arg_val(6); - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); acquire_dst(); for (uint32_t b = 0; b < block_cnt; ++b) { diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp index d1a1e46d6fc..8104239a4e1 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp @@ -135,12 +135,12 @@ void MAIN { uint32_t untilize_mode_reblock_cb = tt::CBIndex::c_27; uint32_t out0_cb = tt::CBIndex::c_16; - mm_init(); + mm_init(in0_cb, tt::CBIndex::c_1, out0_cb); for (uint32_t block = 0; block < num_blocks; block++) { bool last_out = block == (num_blocks - 1); if (tilize_in) { tilize_activation(in0_cb, in0_subblock_h, in0_block_w, in0_num_subblocks, tilize_mode_tilized_in0_cb); - mm_init_short(); + mm_init_short(tilize_mode_tilized_in0_cb, tt::CBIndex::c_1); cb_wait_front(tilize_mode_tilized_in0_cb, in0_block_num_tiles); } else { cb_wait_front(in0_cb, in0_block_num_tiles); @@ -160,7 +160,7 @@ void MAIN { copy_tile(matmul_partials_cb, i, i); } cb_pop_front(matmul_partials_cb, out_subblock_num_tiles); - mm_init_short(); + mm_init_short(tilize_in ? tilize_mode_tilized_in0_cb : in0_cb, tt::CBIndex::c_1); } // Compute output sub-block from in0_subblock x in1_subblock @@ -217,7 +217,7 @@ void MAIN { untilize_mode_final_matmul_partials_cb, untilize_mode_reblock_cb, out0_cb); - mm_init_short(); + mm_init_short(tilize_in ? 
tilize_mode_tilized_in0_cb : in0_cb, tt::CBIndex::c_1); } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp index 1094c3463bf..9ed54e19152 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp @@ -132,7 +132,7 @@ void MAIN { uint32_t untilize_mode_final_matmul_partials_cb = tt::CBIndex::c_26; uint32_t untilize_mode_reblock_cb = tt::CBIndex::c_27; uint32_t out0_cb = tt::CBIndex::c_16; - mm_init(); + mm_init(in0_cb, tt::CBIndex::c_1, out0_cb); for (uint32_t block_in0_h = 0; block_in0_h < num_blocks_in0_h; block_in0_h++) { for (uint32_t block_in1_w = 0; block_in1_w < num_blocks_in1_w; block_in1_w++) { enable_reload = false; @@ -142,7 +142,7 @@ void MAIN { if (tilize_in) { tilize_activation( in0_cb, in0_subblock_h, in0_block_w, in0_num_subblocks, tilize_mode_tilized_in0_cb); - mm_init_short(); + mm_init_short(tilize_mode_tilized_in0_cb, tt::CBIndex::c_1); cb_wait_front(tilize_mode_tilized_in0_cb, in0_block_num_tiles); } else { @@ -164,7 +164,7 @@ void MAIN { copy_tile(matmul_partials_cb, i, i); } cb_pop_front(matmul_partials_cb, out_subblock_num_tiles); - mm_init_short(); + mm_init_short(tilize_in ? tilize_mode_tilized_in0_cb : in0_cb, tt::CBIndex::c_1); } // Compute output sub-block from in0_subblock x in1_subblock @@ -224,7 +224,7 @@ void MAIN { untilize_mode_final_matmul_partials_cb, untilize_mode_reblock_cb, out0_cb); - mm_init_short(); + mm_init_short(tilize_in ? tilize_mode_tilized_in0_cb : in0_cb, tt::CBIndex::c_1); } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp index 1b0234e9832..447fef413ee 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp @@ -24,7 +24,7 @@ void MAIN { bool spill = num_blocks > uint32_t(1); - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); bool enable_reload = false; for (uint32_t block = 0; block < num_blocks; block++) { @@ -45,7 +45,7 @@ void MAIN { copy_tile(tt::CBIndex::c_24, i, i); } cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles); - mm_init_short(); + mm_init_short(tt::CBIndex::c_0, tt::CBIndex::c_1); } // Compute output sub-block from in0_subblock x in1_subblock diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp index ea630993cff..94e51299482 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp @@ -23,7 +23,7 @@ void MAIN { acquire_dst(); - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); for (uint32_t b = 0; b < block_cnt; ++b) { cb_wait_front(tt::CBIndex::c_0, in0_block_tile_cnt); cb_wait_front(tt::CBIndex::c_1, in1_block_tile_cnt); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp index 0e1dc0c1216..7a325557a10 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp 
@@ -25,7 +25,7 @@ void MAIN { // we are looking at block // out = in0[r x k]*in1[k x c] - mm_init(); + mm_init(in0_cb, in1_cb, out_cb); for (uint32_t block_id = 0; block_id < num_blocks; block_id++) { acquire_dst(); if (block_id > 0) { @@ -35,7 +35,7 @@ void MAIN { copy_tile(partials_cb, i, i); } cb_pop_front(partials_cb, out_block_num_tiles); - mm_init_short(); + mm_init_short(in0_cb, in1_cb); } uint32_t out_tile_index = 0; uint32_t in0_index_r_offset = 0; diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp index 8c50ca5a30e..ebb68fc031f 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp @@ -22,7 +22,7 @@ void MAIN { // we are looking at block // out = in0[r x k]*in1[k x c] - mm_init(); + mm_init(in0_cb, in1_cb, out_cb); acquire_dst(); uint32_t out_tile_index = 0; diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp index cb8eb194d98..c09c4064a85 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp @@ -19,7 +19,7 @@ void MAIN { const uint32_t in1_tile_index = 0; const uint32_t out_tile_index = 0; const bool transpose = false; - mm_init(); + mm_init(in0_cb, in1_cb, out_cb); cb_reserve_back(out_cb, num_out_tiles); acquire_dst(); cb_wait_front(in0_cb, num_in0_tiles); diff --git a/tt_metal/include/compute_kernel_api/matmul.h b/tt_metal/include/compute_kernel_api/matmul.h index 2e90cecfd4f..d36924d3954 100644 --- a/tt_metal/include/compute_kernel_api/matmul.h +++ b/tt_metal/include/compute_kernel_api/matmul.h @@ -27,9 +27,8 @@ namespace ckernel { * | out_cb_id | The identifier of the output circular buffer (CB) | uint32_t | 0 to 31 | False | * | transpose | The transpose flag for performing transpose operation on B | uint32_t | Any positive value will indicate tranpose is set | False | */ - // clang-format on -ALWI void mm_init( - uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t out_cb_id = 16, const uint32_t transpose = 0) { +// clang-format on +ALWI void mm_init(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t out_cb_id, const uint32_t transpose = 0) { UNPACK((llk_unpack_AB_matmul_hw_configure_disaggregated(in0_cb_id, in1_cb_id))); UNPACK((llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, transpose))); @@ -103,8 +102,8 @@ ALWI void matmul_tiles_math(uint32_t idst) { * | in1_cb_id | The identifier of the second input circular buffer (CB) | uint32_t | 0 to 31 | False | * | transpose | The transpose flag for performing transpose operation on B | uint32_t | Any positive value will indicate tranpose is set | False | */ - // clang-format on -ALWI void mm_init_short(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, const uint32_t transpose = 0) { +// clang-format on +ALWI void mm_init_short(uint32_t in0_cb_id, uint32_t in1_cb_id, const uint32_t transpose = 0) { MATH((llk_math_matmul_init(in0_cb_id, in1_cb_id, transpose))); UNPACK((llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, transpose))); } @@ -125,7 +124,7 @@ ALWI void mm_init_short(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, const ui */ // clang-format on ALWI void mm_init_short_with_dt( - 
uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t c_in_old_srca = 2, const uint32_t transpose = 0) { + uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t c_in_old_srca, const uint32_t transpose = 0) { UNPACK((llk_unpack_reconfig_data_format_srca(c_in_old_srca, in1_cb_id))); MATH((llk_math_reconfig_data_format_srca(c_in_old_srca, in1_cb_id))); mm_init_short(in0_cb_id, in1_cb_id, transpose); @@ -148,9 +147,9 @@ ALWI void mm_init_short_with_dt( */ // clang-format on ALWI void mm_block_init( - uint32_t in0_cb_id = 0, - uint32_t in1_cb_id = 1, - uint32_t out_cb_id = 16, + uint32_t in0_cb_id, + uint32_t in1_cb_id, + uint32_t out_cb_id, const uint32_t transpose = 0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, @@ -221,8 +220,8 @@ ALWI void matmul_block( */ // clang-format on ALWI void mm_block_init_short( - uint32_t in0_cb_id = 0, - uint32_t in1_cb_id = 1, + uint32_t in0_cb_id, + uint32_t in1_cb_id, const uint32_t transpose = 0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, @@ -249,9 +248,9 @@ ALWI void mm_block_init_short( */ // clang-format on ALWI void mm_block_init_short_with_dt( - uint32_t in0_cb_id = 0, - uint32_t in1_cb_id = 1, - uint32_t old_in1_cb_id = 2, + uint32_t in0_cb_id, + uint32_t in1_cb_id, + uint32_t old_in1_cb_id, const uint32_t transpose = 0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, diff --git a/tt_metal/programming_examples/matmul_common/kernels/compute/bmm.cpp b/tt_metal/programming_examples/matmul_common/kernels/compute/bmm.cpp index 06df68f3425..249f2ed5f23 100644 --- a/tt_metal/programming_examples/matmul_common/kernels/compute/bmm.cpp +++ b/tt_metal/programming_examples/matmul_common/kernels/compute/bmm.cpp @@ -22,7 +22,7 @@ void MAIN { uint32_t Kt = get_compile_time_arg_val(2); uint32_t Nt = get_compile_time_arg_val(3); - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); // the simplest possible version of outer product blocked matmul // the reader is expected to read the A's and B's tile rows and tile columns for each output tile diff --git a/tt_metal/programming_examples/matmul_common/kernels/compute/bmm_large_block_zm.cpp b/tt_metal/programming_examples/matmul_common/kernels/compute/bmm_large_block_zm.cpp index 2ec32305293..e456700b57c 100644 --- a/tt_metal/programming_examples/matmul_common/kernels/compute/bmm_large_block_zm.cpp +++ b/tt_metal/programming_examples/matmul_common/kernels/compute/bmm_large_block_zm.cpp @@ -22,7 +22,7 @@ void MAIN { uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; uint32_t batch = get_compile_time_arg_val(11); // batch dim - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); for (uint32_t b = 0; b < batch; b++) { bool spill = num_blocks > 1; @@ -47,7 +47,7 @@ void MAIN { copy_tile(tt::CBIndex::c_24, i, i); } cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles); - mm_init_short(); + mm_init_short(tt::CBIndex::c_0, tt::CBIndex::c_1); } // Compute output sub-block from in0_subblock x in1_subblock diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp index 1d6e1b23807..ab6294cf55d 100644 --- a/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp +++ b/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp @@ -157,7 +157,7 @@ void MAIN { bool last_out = (in0_block_w_i == in0_num_blocks_w - 1); if (tilize_in0) { tilize_in(in0_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks, tilized_in0_cb_id); 
- mm_init_short(); + mm_init_short(tilized_in0_cb_id, in1_cb_id); cb_wait_front(tilized_in0_cb_id, in0_block_num_tiles); } else { cb_wait_front(in0_cb_id, in0_block_num_tiles); @@ -229,7 +229,7 @@ void MAIN { // do not pop front bias as it may be used again for subsequent blocks cb_pop_front(out_for_bias_cb_id, out_subblock_num_tiles); // reconfig for matmul - mm_init_short(); + mm_init_short(tilize_in0 ? tilized_in0_cb_id : in0_cb_id, in1_cb_id); // reconfig unpacker df for srcB // reconfig_data_format(in1_cb_id, in0_cb_id); } @@ -266,7 +266,7 @@ void MAIN { untilize_mode_final_matmul_partials_cb, untilize_mode_reblock_cb, out_cb_id); - mm_init_short(); + mm_init_short(tilize_in0 ? tilized_in0_cb_id : in0_cb_id, in1_cb_id); reconfig_data_format(in1_cb_id, in0_cb_id); } // last_out #endif diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/kernels/compute/rotary_embedding_llama.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/kernels/compute/rotary_embedding_llama.cpp index 7d9dc699d61..d06c6ec1740 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/kernels/compute/rotary_embedding_llama.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/kernels/compute/rotary_embedding_llama.cpp @@ -36,7 +36,7 @@ void MAIN { const uint32_t my_seq_tiles = seq_t_end - seq_t_start; const uint32_t my_cos_sin_tiles = my_seq_tiles * Wt; - mm_init(); + mm_init(in_cb, trans_mat_cb, out_cb); binary_op_init_common(rotated_in_interm_cb, cos_cb, out_cb); // General Init for all binary ops // Get the trans_mat diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/kernels/compute/rotary_embedding_llama_sharded.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/kernels/compute/rotary_embedding_llama_sharded.cpp index 2a4c2562e73..d8456b9d819 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/kernels/compute/rotary_embedding_llama_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/kernels/compute/rotary_embedding_llama_sharded.cpp @@ -27,7 +27,7 @@ void MAIN { constexpr uint32_t Wt = get_compile_time_arg_val(8); constexpr uint32_t Ht = get_compile_time_arg_val(9); // How many rows (tiles) in n_heads dimension - mm_init(); + mm_init(in_cb, trans_mat_cb, out_cb); binary_op_init_common(rotated_in_interm_cb, sin_cb, sin_interm_cb); // General Init for all binary ops // Get the trans_mat diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm.cpp index e8f891f8efd..a74e2fd963d 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm.cpp @@ -23,7 +23,7 @@ void MAIN { uint32_t Kt = get_compile_time_arg_val(2); uint32_t Nt = get_compile_time_arg_val(3); - mm_init(); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); // the simplest possible version of outer product blocked matmul // the reader is expected to read the A's and B's tile rows and tile columns for each output tile diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_w.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_w.cpp index 734d4bea149..7bfafaf009b 100644 --- 
a/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_w.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_w.cpp @@ -20,7 +20,7 @@ void MAIN { #ifndef REDUCE_ROW_SUM_VIA_MM reduce_init(tt::CBIndex::c_0, tt::CBIndex::c_2, tt::CBIndex::c_16); #else - mm_init(tt::CBIndex::c_0, tt::CBIndex::c_2); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_2, tt::CBIndex::c_16); #endif cb_wait_front(tt::CBIndex::c_2, 1); // scaler tile from the reader diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/joint_sdpa.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/joint_sdpa.cpp index f7178265a14..48121372ad4 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/joint_sdpa.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/joint_sdpa.cpp @@ -68,7 +68,7 @@ void MAIN { constexpr uint32_t cb_out = tt::CBIndex::c_16; - mm_init(); + mm_init(cb_q_in, cb_k_in, cb_qk_im); for (uint32_t nb = local_batch_start; nb < local_batch_end; ++nb) { for (uint32_t nq = local_nh_start; nq < local_nh_end; ++nq) { diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp index 99852a49683..fc33507275a 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp @@ -76,7 +76,7 @@ void MAIN { constexpr uint32_t cb_out = tt::CBIndex::c_16; - mm_init(); + mm_init(cb_q_in, cb_k_in, cb_out); for (uint32_t nb = local_batch_start; nb < local_batch_end; ++nb) { for (uint32_t nq = local_nh_start; nq < local_nh_end; ++nq) { diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp index 8b571c1b1b1..ff662fc020e 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp @@ -127,7 +127,7 @@ void MAIN { num_cores_to_wait = k_num_chunks - 1; } - mm_init(); + mm_init(cb_q_in, cb_k_in, cb_out_final); cb_wait_front(cb_q_in, q_chunk_tiles); for (uint32_t cur_head_work = 0; cur_head_work < num_heads_per_core; ++cur_head_work) { From a0ea595018cd3b3d0d242a5058d2e05106a4ee7e Mon Sep 17 00:00:00 2001 From: Brian Liu Date: Fri, 31 Jan 2025 21:37:23 +0000 Subject: [PATCH 142/316] #0: Switch ttnn.to_torch to use tensor.to_torch_with_logical_shape - Modify get_logical_and_physical_shard_shapes for encode/decode tensor data * Switch to use padded shape as physical shard shape instead of deriving for tile layout ** This adds support for row major interleaved tensors with arbitrary 2D padding * Switch to use logical shape as logical shard shape if physically sharded and has padding ** In general, cannot use shard shape as logical shard shape if tensor has padding - Add support for empty tensors in encode/decode tensor data - Skip test with incorrect usage of reshape * We should not support modifying the logical shape of a borrowed buffer --- tests/ttnn/unit_tests/test_to_layout.py | 1 + ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp | 3 +- ttnn/cpp/ttnn/tensor/tensor_impl.cpp | 28 ++++++++++++------- ttnn/ttnn/operations/core.py | 15 +--------- 4 files changed, 22 insertions(+), 25 deletions(-) diff --git 
a/tests/ttnn/unit_tests/test_to_layout.py b/tests/ttnn/unit_tests/test_to_layout.py index 436ce03f0d6..735eda892c6 100644 --- a/tests/ttnn/unit_tests/test_to_layout.py +++ b/tests/ttnn/unit_tests/test_to_layout.py @@ -25,6 +25,7 @@ def test_to_layout_2D(device, height, width, on_device, from_layout, to_layout, pad_h = (ttnn.TILE_SIZE - height % ttnn.TILE_SIZE) % ttnn.TILE_SIZE pad_w = (ttnn.TILE_SIZE - width % ttnn.TILE_SIZE) % ttnn.TILE_SIZE if start_with_padding: + pytest.skip("Modifying logical shape with borrowed buffer is not supported!") torch_padded_input_tensor = torch.nn.functional.pad( torch_input_tensor, (0, pad_w, 0, pad_h), mode="constant", value=0.0 ) diff --git a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp index 298f9c6f5e6..8b501714a93 100644 --- a/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp +++ b/ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp @@ -236,7 +236,8 @@ Shape2D TensorLayout::get_logical_shard_shape() const { TT_FATAL( memory_config_.shard_spec.has_value(), "Shard spec must have value for TensorLayout::get_logical_shard_shape!"); - // Shape in shard spec will always represent logical shard shape in either mode + // In physical mode, shape in shard spec is logical shard shape if no padding + // Otherwise, not possible to infer logical shard shape in general return Shape2D(memory_config_.shard_spec.value().shape); } diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp index 1f2706e91a8..edcf4a2ad4d 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp @@ -882,23 +882,23 @@ Tensor to_device_mesh_tensor( namespace { namespace CMAKE_UNIQUE_NAMESPACE { -// TODO: Remove when we generalize interleaved and sharded; when we do, directly get from TensorLayout +// TODO: Remove when we get rid of physical sharding and generalize interleaved and sharded; when we do, directly get +// from TensorLayout std::array get_logical_and_physical_shard_shapes(const TensorSpec& tensor_spec) { - if (tensor_spec.memory_config().is_sharded()) { + const auto& logical_shape = tensor_spec.logical_shape(); + const auto& padded_shape = tensor_spec.padded_shape(); + + // TODO: get_logical_shard_shape always returns shard shape from shard spec, which is not correct in physical mode + // if there is padding + if (tensor_spec.memory_config().is_sharded() and + (tensor_spec.memory_config().shard_spec.value().mode == ShardMode::LOGICAL or logical_shape == padded_shape)) { return { tensor_spec.tensor_layout().get_logical_shard_shape(), tensor_spec.tensor_layout().get_physical_shard_shape()}; } - const auto& logical_shape = tensor_spec.logical_shape(); Shape2D logical_shard_shape{logical_shape[-2], logical_shape[-1]}; - auto physical_shard_shape = logical_shard_shape; - if (tensor_spec.layout() == Layout::TILE) { - const auto& tile = tensor_spec.tile(); - auto physical_shard_height = tt::round_up(logical_shard_shape.height(), tile.get_height()); - auto physical_shard_width = tt::round_up(logical_shard_shape.width(), tile.get_width()); - physical_shard_shape = Shape2D{physical_shard_height, physical_shard_width}; - } + Shape2D physical_shard_shape = {padded_shape[-2], padded_shape[-1]}; return {logical_shard_shape, physical_shard_shape}; } @@ -942,6 +942,10 @@ std::vector compute_logical_to_physical_shards_mapping( template std::vector encode_tensor_data(std::vector&& logical_data, const TensorSpec& tensor_spec) { + if (logical_data.size() == 0) { + return {}; + } + const auto& 
logical_shape = tensor_spec.logical_shape(); TT_FATAL( logical_data.size() == logical_shape.volume(), @@ -1005,6 +1009,10 @@ template std::vector encode_tensor_data( template std::vector decode_tensor_data(std::vector&& physical_data, const TensorSpec& tensor_spec) { + if (physical_data.size() == 0) { + return {}; + } + const auto& physical_shape = tensor_spec.physical_shape(); TT_FATAL( physical_data.size() == physical_shape.height() * physical_shape.width(), diff --git a/ttnn/ttnn/operations/core.py b/ttnn/ttnn/operations/core.py index e5392ed8b86..179cb169384 100644 --- a/ttnn/ttnn/operations/core.py +++ b/ttnn/ttnn/operations/core.py @@ -319,20 +319,7 @@ def to_torch( ): tensor = tensor.to(ttnn.ROW_MAJOR_LAYOUT, device) - shape_without_tile_padding = tuple(tensor.shape) - logical_shape_rank = len(tensor.shape) - - while len(shape_without_tile_padding) < len(tensor.padded_shape): - shape_without_tile_padding = (1,) + shape_without_tile_padding - - tensor = tensor.to_torch() - slices = [slice(None, x) for x in shape_without_tile_padding] - tensor = tensor[slices] - - while len(tensor.shape) > logical_shape_rank: - if tensor.shape[0] != 1: - raise RuntimeError("ttnn: Unable to squeeze to desired rank!") - tensor = tensor.squeeze(0) + tensor = tensor.to_torch_with_logical_shape() if torch_rank is not None: while len(tensor.shape) > torch_rank: From b97324333f3e1b7675a09806ec91fbddbcea65ad Mon Sep 17 00:00:00 2001 From: Ata Tuzuner Date: Tue, 18 Feb 2025 14:49:45 -0500 Subject: [PATCH 143/316] #15450: Remove default values from circular buffer parameters in LLK compute APIs: Docs (#17567) ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/15450) ### Problem description Default values for circular buffer arguments in the LLK API can cause errors. Forgetting to set these arguments explicitly may lead to errors due to wrong cb usage. This PR is specific to the changes in the documentation (.rst files) for compute kernel APIs. ### What's changed Default values for the circular buffer parameters have been removed from functions within these files. This PR assumes changes from [this PR](https://github.com/tenstorrent/tt-metal/pull/16571) and will be merged once that is merged. 
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes - [x] [Doc build test](https://github.com/tenstorrent/tt-metal/actions/runs/13145490034) --- .../tt_metal/apis/kernel_apis/compute/add_tiles.rst | 2 +- .../tt_metal/apis/kernel_apis/compute/add_tiles_bcast.rst | 4 ++-- .../tt_metal/apis/kernel_apis/compute/matmul_block.rst | 6 +++--- .../tt_metal/apis/kernel_apis/compute/matmul_tiles.rst | 6 +++--- .../tt_metal/apis/kernel_apis/compute/move_copy_tile.rst | 2 +- .../tt_metal/apis/kernel_apis/compute/mul_tiles.rst | 2 +- .../tt_metal/apis/kernel_apis/compute/mul_tiles_bcast.rst | 6 +++--- .../tt_metal/apis/kernel_apis/compute/sub_tiles.rst | 2 +- .../tt_metal/apis/kernel_apis/compute/sub_tiles_bcast.rst | 2 +- .../tt_metal/apis/kernel_apis/compute/transpose_wh_tile.rst | 2 +- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/add_tiles.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/add_tiles.rst index 2773817627d..803734b1456 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/add_tiles.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/add_tiles.rst @@ -3,5 +3,5 @@ add_tiles .. doxygenfunction:: add_tiles_init_nof() -.. doxygenfunction:: add_tiles_init(uint32_t icb0 = 0, uint32_t icb1 = 1, bool acc_to_dest = false) +.. doxygenfunction:: add_tiles_init(uint32_t icb0, uint32_t icb1, bool acc_to_dest = false) .. doxygenfunction:: add_tiles(uint32_t icb0, uint32_t icb1, uint32_t itile0, uint32_t itile1, uint32_t idst) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/add_tiles_bcast.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/add_tiles_bcast.rst index edb1279324c..71f77751ada 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/add_tiles_bcast.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/add_tiles_bcast.rst @@ -1,6 +1,6 @@ add_tiles_bcast =============== -.. doxygenfunction:: add_bcast_cols_init_short(uint32_t icb0 = 0, uint32_t icb1 = 1) -.. doxygenfunction:: add_bcast_rows_init_short(uint32_t icb0 = 0, uint32_t icb1 = 1) +.. doxygenfunction:: add_bcast_cols_init_short(uint32_t icb0, uint32_t icb1) +.. doxygenfunction:: add_bcast_rows_init_short(uint32_t icb0, uint32_t icb1) .. 
doxygenfunction:: add_tiles_bcast(uint32_t icb0, uint32_t icb1, uint32_t itile0, uint32_t itile1, uint32_t idst) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/matmul_block.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/matmul_block.rst index be39d92bd01..ad2b54bbb44 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/matmul_block.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/matmul_block.rst @@ -1,7 +1,7 @@ matmul_block ============ -.. doxygenfunction:: mm_block_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t out_cb_id = 16, const uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1) -.. doxygenfunction:: mm_block_init_short(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, const uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1) -.. doxygenfunction:: mm_block_init_short_with_dt(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t old_in1_cb_id=2, const uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1) +.. doxygenfunction:: mm_block_init(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t out_cb_id, const uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1) +.. doxygenfunction:: mm_block_init_short(uint32_t in0_cb_id, uint32_t in1_cb_id, const uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1) +.. doxygenfunction:: mm_block_init_short_with_dt(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t old_in1_cb_id, const uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1) .. doxygenfunction:: matmul_block(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t in0_tile_index, uint32_t in1_tile_index, uint32_t idst, const uint32_t transpose, uint32_t ct_dim, uint32_t rt_dim, uint32_t kt_dim) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/matmul_tiles.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/matmul_tiles.rst index 81c89a67dc2..ef06d4bd424 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/matmul_tiles.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/matmul_tiles.rst @@ -1,7 +1,7 @@ matmul_tiles ============ -.. doxygenfunction:: mm_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t out_cb_id = 16, const uint32_t transpose=0) -.. doxygenfunction:: mm_init_short_with_dt(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t c_in_old_srca = 2, const uint32_t transpose=0) -.. doxygenfunction:: mm_init_short(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, const uint32_t transpose=0) +.. doxygenfunction:: mm_init(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t out_cb_id, const uint32_t transpose=0) +.. doxygenfunction:: mm_init_short_with_dt(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t c_in_old_srca, const uint32_t transpose=0) +.. doxygenfunction:: mm_init_short(uint32_t in0_cb_id, uint32_t in1_cb_id, const uint32_t transpose=0) .. 
doxygenfunction:: matmul_tiles(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t in0_tile_index, uint32_t in1_tile_index, uint32_t idst, const uint32_t transpose) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/move_copy_tile.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/move_copy_tile.rst index bfa7a320b3c..bd6dc6e7920 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/move_copy_tile.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/move_copy_tile.rst @@ -3,5 +3,5 @@ move_copy_tile .. doxygenfunction:: copy_tile_to_dst_init_short_with_dt(uint32_t old_cbid, uint32_t new_cbid, uint32_t transpose = 0) -.. doxygenfunction:: copy_tile_to_dst_init_short(uint32_t cbid = 0, uint32_t transpose = 0) +.. doxygenfunction:: copy_tile_to_dst_init_short(uint32_t cbid, uint32_t transpose = 0) .. doxygenfunction:: copy_tile_init(uint32_t cbid) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/mul_tiles.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/mul_tiles.rst index ab445b9f820..a8c6dabbeda 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/mul_tiles.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/mul_tiles.rst @@ -2,5 +2,5 @@ mul_tiles ========= .. doxygenfunction:: mul_tiles_init_f() -.. doxygenfunction:: mul_tiles_init(uint32_t icb0 = 0, uint32_t icb1 = 1) +.. doxygenfunction:: mul_tiles_init(uint32_t icb0, uint32_t icb1) .. doxygenfunction:: mul_tiles(uint32_t icb0, uint32_t icb1, uint32_t itile0, uint32_t itile1, uint32_t idst) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/mul_tiles_bcast.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/mul_tiles_bcast.rst index 8b0b59cb0b4..3f7b624bf3b 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/mul_tiles_bcast.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/mul_tiles_bcast.rst @@ -1,8 +1,8 @@ mul_tiles_bcast =============== -.. doxygenfunction:: mul_bcast_cols_init_short(uint32_t icb0 = 0, uint32_t icb1 = 1) -.. doxygenfunction:: mul_bcast_rows_init_short(uint32_t icb0 = 0, uint32_t icb1 = 1) +.. doxygenfunction:: mul_bcast_cols_init_short(uint32_t icb0, uint32_t icb1) +.. doxygenfunction:: mul_bcast_rows_init_short(uint32_t icb0, uint32_t icb1) .. doxygenfunction:: mul_tiles_bcast(uint32_t icb0, uint32_t icb1, uint32_t itile0, uint32_t itile1, uint32_t idst) -.. doxygenfunction:: mul_tiles_bcast_scalar_init_short(uint32_t icb0 = 0, uint32_t icb1 = 1) +.. doxygenfunction:: mul_tiles_bcast_scalar_init_short(uint32_t icb0, uint32_t icb1) .. doxygenfunction:: mul_tiles_bcast_scalar(uint32_t icb0, uint32_t icb1, uint32_t itile0, uint32_t itile1, uint32_t idst) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/sub_tiles.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/sub_tiles.rst index 1dfaa5f98c2..407fb0d5637 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/sub_tiles.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/sub_tiles.rst @@ -2,5 +2,5 @@ sub_tiles ========= .. doxygenfunction:: sub_tiles_init_nof() -.. doxygenfunction:: sub_tiles_init(uint32_t icb0 = 0, uint32_t icb1 = 1, bool acc_to_dest = false) +.. doxygenfunction:: sub_tiles_init(uint32_t icb0, uint32_t icb1, bool acc_to_dest = false) .. 
doxygenfunction:: sub_tiles( uint32_t icb0, uint32_t icb1, uint32_t itile0, uint32_t itile1, uint32_t idst) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/sub_tiles_bcast.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/sub_tiles_bcast.rst index d3ed375e3df..dc9fe366f0e 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/sub_tiles_bcast.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/sub_tiles_bcast.rst @@ -2,5 +2,5 @@ sub_tiles_bcast =============== -.. doxygenfunction:: sub_bcast_cols_init_short(uint32_t icb0 = 0, uint32_t icb1 = 1) +.. doxygenfunction:: sub_bcast_cols_init_short(uint32_t icb0, uint32_t icb1) .. doxygenfunction:: sub_tiles_bcast(uint32_t icb0, uint32_t icb1, uint32_t itile0, uint32_t itile1, uint32_t idst) diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/transpose_wh_tile.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/transpose_wh_tile.rst index d6640cc5222..1a61b44d058 100644 --- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/transpose_wh_tile.rst +++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/transpose_wh_tile.rst @@ -1,5 +1,5 @@ transpose_wh_tile ================= -.. doxygenfunction:: transpose_wh_init(uint32_t icb, uint32_t ocb = 16) +.. doxygenfunction:: transpose_wh_init(uint32_t icb, uint32_t ocb) .. doxygenfunction:: transpose_wh_tile(uint32_t icb, uint32_t itile, uint32_t idst) From e0fe53d5136133a77d7ba76313d4a8dbc5efd033 Mon Sep 17 00:00:00 2001 From: Jeffrey Jiang Date: Tue, 18 Feb 2025 13:50:14 -0600 Subject: [PATCH 144/316] Replace List Mesh to Tensor (#17667) ### Ticket Link to Github Issue https://github.com/tenstorrent/tt-metal/issues/15061 ### Problem description ListMeshToTensor was a python class in distributed.py that didn't match the xtensor->torch.tensor convention and instead output a list[torch.tensor]. I've added the utility method ttnn.shardedtensor_to_tensorlist(ttnn.tensor)->list[torch.tensor] instead. 
### What's changed -ListMeshToTensor removed, all usages replaced with ttnn.sharded_tensor_to_torch_tensor_list (tensor: ttnn.tensor) hook -Added temporary python hook in operations/core.py to convert from ttnn.tensor lists to torch.tensor lists ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13335705560 - [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [x] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [x] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [x] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [x] New/Existing tests provide coverage for changes: [tests/ttnn/unit_tests/test_multi_device_async.py](https://github.com/tenstorrent/tt-metal/blob/8063f0ab02bec18561f9461e8d1ff868ba0eeaf0/tests/ttnn/unit_tests/test_multi_device_async.py#L87), [tests/ttnn/unit_tests/operations/test_creation.py](https://github.com/tenstorrent/tt-metal/blob/8063f0ab02bec18561f9461e8d1ff868ba0eeaf0/tests/ttnn/unit_tests/operations/test_creation.py#L261), [tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py](https://github.com/tenstorrent/tt-metal/blob/8063f0ab02bec18561f9461e8d1ff868ba0eeaf0/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py#L82), [tests/ttnn/distributed/test_multidevice_TG.py](https://github.com/tenstorrent/tt-metal/blob/8063f0ab02bec18561f9461e8d1ff868ba0eeaf0/tests/ttnn/distributed/test_multidevice_TG.py#L19), [models/demos/t3000/falcon40b/tests/test_falcon_model.py](https://github.com/tenstorrent/tt-metal/blob/8063f0ab02bec18561f9461e8d1ff868ba0eeaf0/models/demos/t3000/falcon40b/tests/test_falcon_model.py#L9) --- .../falcon40b/tests/test_falcon_model.py | 6 +- .../llama2_70b/tests/test_llama_generation.py | 2 +- .../tests/test_llama_attention_galaxy.py | 2 +- tests/ttnn/distributed/test_multidevice_TG.py | 27 +- .../unit_tests/operations/test_creation.py | 10 +- .../tensor/test_tensor_prealloc_and_write.py | 8 +- tests/ttnn/unit_tests/test_multi_device.py | 8 +- .../unit_tests/test_multi_device_async.py | 16 +- .../unit_tests/test_multi_device_events.py | 2 +- .../unit_tests/test_multi_device_trace.py | 2 +- .../unit_tests/test_multi_device_trace_TG.py | 2 +- .../unit_tests/test_multi_device_trace_tgg.py | 2 +- .../ttnn/distributed/distributed_pybind.cpp | 230 ++++++++++-------- .../ttnn/distributed/distributed_pybind.hpp | 1 + ttnn/ttnn/distributed/__init__.py | 1 - ttnn/ttnn/distributed/distributed.py | 12 - 16 files changed, 168 insertions(+), 163 deletions(-) diff --git a/models/demos/t3000/falcon40b/tests/test_falcon_model.py b/models/demos/t3000/falcon40b/tests/test_falcon_model.py index 5bebb599db6..287984dec02 100644 --- a/models/demos/t3000/falcon40b/tests/test_falcon_model.py +++ b/models/demos/t3000/falcon40b/tests/test_falcon_model.py @@ -6,7 +6,7 @@ import pytest from loguru import logger import ttnn -from ttnn import ShardTensorToMesh, ConcatMeshToTensor, ListMeshToTensor +from ttnn import ShardTensorToMesh, ConcatMeshToTensor from models.demos.t3000.falcon40b.reference.hf_modeling_falcon import ( FalconForCausalLM, ) @@ -196,7 
+196,7 @@ def run_test_FalconModel_inference( use_cache=use_cache, ) # output of model is replicated - tensors = ttnn.to_torch(tt_out, device=mesh_device, mesh_composer=ListMeshToTensor(mesh_device)) + tensors = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(tt_out.cpu())] tt_outs.append(tensors[0].squeeze(1)) tt_out = torch.vstack(tt_outs) @@ -213,7 +213,7 @@ def run_test_FalconModel_inference( use_cache=use_cache, ) # Output of model is replicated - tensors = ttnn.to_torch(tt_out, device=mesh_device, mesh_composer=ListMeshToTensor(mesh_device)) + tensors = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(tt_out.cpu())] tt_out = tensors[0].squeeze(1).transpose(0, 1) # check outputs ---------------------------------------------------------------------- diff --git a/models/demos/t3000/llama2_70b/tests/test_llama_generation.py b/models/demos/t3000/llama2_70b/tests/test_llama_generation.py index f5af555dc39..babfe3b3657 100644 --- a/models/demos/t3000/llama2_70b/tests/test_llama_generation.py +++ b/models/demos/t3000/llama2_70b/tests/test_llama_generation.py @@ -6,7 +6,7 @@ import torch from torch import nn import ttnn -from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor, ListMeshToTensor +from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor import scipy diff --git a/models/demos/tg/llama3_70b/tests/test_llama_attention_galaxy.py b/models/demos/tg/llama3_70b/tests/test_llama_attention_galaxy.py index 5874891de72..a2ea1b7c792 100644 --- a/models/demos/tg/llama3_70b/tests/test_llama_attention_galaxy.py +++ b/models/demos/tg/llama3_70b/tests/test_llama_attention_galaxy.py @@ -337,7 +337,7 @@ def run_test_LlamaAttention_inference( attn_mask, mode=mode, ) - # tt_out = ttnn.to_torch(tt_out, mesh_composer=ListMeshToTensor(mesh_device))[0] + # tt_out = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(tt_out.cpu())] tt_out = ttnn.to_torch( tt_out, mesh_composer=ConcatMesh2DToTensor(mesh_device, dims=(3, 1), cluster_shape=cluster_shape) diff --git a/tests/ttnn/distributed/test_multidevice_TG.py b/tests/ttnn/distributed/test_multidevice_TG.py index 58dd4d8b320..6c1c84c5dd9 100644 --- a/tests/ttnn/distributed/test_multidevice_TG.py +++ b/tests/ttnn/distributed/test_multidevice_TG.py @@ -16,7 +16,6 @@ ReplicateTensorToMesh, ConcatMeshToTensor, ConcatMesh2dToTensor, - ListMeshToTensor, MeshToTensor, ) from models.utility_functions import nearest_32 @@ -384,7 +383,7 @@ def test_galaxy_eltwise_add(M, N, mesh_device): memory_config=LN_OUTPUT_MEMCFG, ) - out = ttnn.to_torch(out, mesh_composer=ListMeshToTensor(mesh_device))[0] + out = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(out.cpu())][0] out_pass, out_pcc = comp_pcc(gt, out, pcc=0.99999) logger.info(f"PCC value: {out_pcc}") @@ -564,17 +563,19 @@ def test_galaxy_nlp_create_heads_decode( ) # compare - q_heads_tt_cpu = ttnn.to_torch(q_heads_tt, mesh_composer=ListMeshToTensor(mesh_device))[0][..., :n_local_heads, :] + q_heads_tt_cpu = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(q_heads_tt.cpu())][0][ + ..., :n_local_heads, : + ] out_pass_q, output_pcc_q = comp_pcc(q_heads_tt_cpu, q_heads_pt, pcc=0.9999) logger.info(f"PCC value: {output_pcc_q}") - k_heads_tt_cpu = ttnn.to_torch(k_heads_tt, mesh_composer=ListMeshToTensor(mesh_device))[0][ + k_heads_tt_cpu = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(k_heads_tt.cpu())][0][ ..., :n_local_kv_heads, : ] out_pass_k, output_pcc_k = comp_pcc(k_heads_tt_cpu, k_heads_pt, pcc=0.9999) 
logger.info(f"PCC value: {output_pcc_k}") - v_heads_tt_cpu = ttnn.to_torch(v_heads_tt, mesh_composer=ListMeshToTensor(mesh_device))[0][ + v_heads_tt_cpu = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(v_heads_tt.cpu())][0][ ..., :n_local_kv_heads, : ] out_pass_v, output_pcc_v = comp_pcc(v_heads_tt_cpu, v_heads_pt, pcc=0.9999) @@ -690,8 +691,8 @@ def test_galaxy_rotary_matmul(batch, seq_len, head_dim, n_local_heads, n_local_k query_layer_gt = q_heads_pt @ rot_mat_pt key_layer_gt = k_heads_pt @ rot_mat_pt - query_layer_cpu = ttnn.to_torch(query_layer, mesh_composer=ListMeshToTensor(mesh_device))[0] - key_layer_cpu = ttnn.to_torch(key_layer, mesh_composer=ListMeshToTensor(mesh_device))[0] + query_layer_cpu = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(query_layer.cpu())][0] + key_layer_cpu = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(key_layer.cpu())][0] out_pass_q, out_pcc_q = comp_pcc(query_layer_cpu, query_layer_gt, pcc=0.999) logger.info(f"PCC value: {out_pcc_q}") @@ -758,7 +759,7 @@ def test_fill_cache( cachett = ttnn.fill_cache(cachett, xt, i) cache[i : i + 1, :, : x.shape[-2], :] = x - tt_got_back = ttnn.to_torch(cachett, mesh_composer=ListMeshToTensor(mesh_device))[0] + tt_got_back = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(cachett.cpu())][0] eq, output = comp_pcc(cache, tt_got_back) logger.info(output) assert eq @@ -833,7 +834,7 @@ def test_update_cache_decode( cachett = ttnn.update_cache(cachett, xt, cache_idx, batch_offset=batch_offset) cache[0:num_users, 0:num_heads, cache_idx : cache_idx + x.shape[-2], 0 : x.shape[-1]] = x - tt_got_back = ttnn.to_torch(cachett, mesh_composer=ListMeshToTensor(mesh_device))[0] + tt_got_back = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(cachett.cpu())][0] eq_cache, output_cache = comp_pcc(cache, tt_got_back) # checks the entire kv cache eq_update, output_update = comp_pcc( @@ -978,7 +979,7 @@ def run_test_sdpa_decode_single_iter( memory_config=height_sharded_memcfg if sharded_out else dram_memcfg, ) - tt_back = ttnn.to_torch(tt_back, mesh_composer=ListMeshToTensor(mesh_device))[0] + tt_back = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(tt_back.cpu())][0] tt_back = tt_back[:, :, :nh, :] Q_slice = Q[:, :, :nh, :].permute(1, 2, 0, 3) # b, nh, 1, d @@ -1078,7 +1079,7 @@ def test_galaxy_nlp_concat_heads_decode( concat_head_output_pt = concat_head_input[:, :, :n_local_heads].reshape(1, 1, batch, head_dim * n_local_heads) # Compare - concat_head_output_tt_cpu = ttnn.to_torch(concat_head_output, mesh_composer=ListMeshToTensor(mesh_device))[0] + concat_head_output_tt_cpu = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(concat_head_output.cpu())][0] concat_head_output_tt_unpadded = concat_head_output_tt_cpu[:, :, :batch, :] out_pass, output_pcc = comp_pcc(concat_head_output_tt_unpadded, concat_head_output_pt, pcc=0.9999) logger.info(f"PCC value: {output_pcc}") @@ -1172,7 +1173,7 @@ def test_galaxy_layernorm(M, N, mesh_device): # Compare beta = torch.zeros(1, 1, N // 32, 32) - norm_output_tt_cpu = ttnn.to_torch(norm_output, mesh_composer=ListMeshToTensor(mesh_device))[0] + norm_output_tt_cpu = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(norm_output.cpu())][0] ref_rmsnorm = rmsnorm(layernorm_input, norm_weights.flatten(), beta.flatten(), norm_eps) out_pass, output_pcc = comp_pcc(norm_output_tt_cpu, ref_rmsnorm, pcc=0.999) @@ -1420,7 +1421,7 @@ def test_line_all_gather_column_major(mesh_device): ttnn_tensor = ttnn.all_gather( ttnn_tensor, dim=3, 
cluster_axis=0, mesh_device=mesh_device, num_links=1, topology=ttnn.Topology.Linear ) - tt_outputs = ttnn.to_torch(ttnn_tensor, mesh_composer=ListMeshToTensor(mesh_device)) + tt_outputs = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(ttnn_tensor.cpu())] for output in tt_outputs[1:]: assert output.shape == (1, 1, 32, 32 * 8) assert torch.allclose(output, tt_outputs[0]) diff --git a/tests/ttnn/unit_tests/operations/test_creation.py b/tests/ttnn/unit_tests/operations/test_creation.py index 79f09ca122d..36bb95e07bf 100644 --- a/tests/ttnn/unit_tests/operations/test_creation.py +++ b/tests/ttnn/unit_tests/operations/test_creation.py @@ -258,8 +258,7 @@ def test_full_multi_device(mesh_device, input_shape, fill_value, layout): tensor = ttnn.full(input_shape, device=mesh_device, fill_value=fill_value, layout=layout) assert ttnn.is_tensor_storage_on_device(tensor) - output_tensors = ttnn.to_torch(tensor, mesh_composer=ttnn.ListMeshToTensor(mesh_device)) - + output_tensors = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(tensor.cpu())] for output_tensor in output_tensors: assert_with_pcc(torch_tensor, output_tensor, 0.9999) assert torch.allclose(torch_tensor, output_tensor) @@ -293,7 +292,6 @@ def test_arange(device, start, end, step): output_tensor = output_tensor[-1, -1, -1, :] if divup((end - start), step) % 2 != 0: output_tensor = output_tensor[:-1] - assert_with_pcc(torch_output_tensor, output_tensor, 0.9999) @@ -322,7 +320,7 @@ def test_arange_multi_device(mesh_device, start, end, step): ) output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT) output_tensor = ttnn.from_device(output_tensor) - output_tensors = ttnn.to_torch(output_tensor, mesh_composer=ttnn.ListMeshToTensor(mesh_device)) + output_tensors = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(output_tensor.cpu())] for output_tensor in output_tensors: output_tensor = output_tensor[-1, -1, -1, :] if divup((end - start), step) % 2 != 0: @@ -369,7 +367,7 @@ def test_empty_multi_device(mesh_device, input_shapes): output_tensor = ttnn.empty(input_shapes, ttnn.bfloat16, ttnn.TILE_LAYOUT, mesh_device, ttnn.DRAM_MEMORY_CONFIG) output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT) output_tensor = ttnn.from_device(output_tensor) - output_tensors = ttnn.to_torch(output_tensor, mesh_composer=ttnn.ListMeshToTensor(mesh_device)) + output_tensors = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(output_tensor.cpu())] for output_tensor in output_tensors: assert list(torch_output_tensor.shape) == list(output_tensor.shape) @@ -417,6 +415,6 @@ def test_empty_like_multi_device(mesh_device, input_shapes): output_tensor = ttnn.empty_like(input_tensor, layout=ttnn.TILE_LAYOUT) output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT) output_tensor = ttnn.from_device(output_tensor) - output_tensors = ttnn.to_torch(output_tensor, mesh_composer=ttnn.ListMeshToTensor(mesh_device)) + output_tensors = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(output_tensor.cpu())] for output_tensor in output_tensors: assert list(torch_input_tensor.shape) == list(output_tensor.shape) diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py b/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py index 68df7937879..029da544301 100644 --- a/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py +++ b/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py @@ -77,10 +77,10 @@ def test_tensor_preallocation_and_write_apis( 
mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), ) ttnn.copy_host_to_device_tensor(tt_input_tensor_a, preallocated_tensor) - readback_tensors = ttnn.to_torch( - preallocated_tensor.cpu().to(ttnn.ROW_MAJOR_LAYOUT), - mesh_composer=ttnn.ListMeshToTensor(mesh_device), - ) + readback_tensors = [ + ttnn.to_torch(shard) + for shard in ttnn.get_device_tensors(preallocated_tensor.cpu().to(ttnn.ROW_MAJOR_LAYOUT)) + ] for readback_tensor in readback_tensors: allclose, output = comp_pcc(readback_tensor, input_tensor_a) assert allclose, f"FAILED: {output}" diff --git a/tests/ttnn/unit_tests/test_multi_device.py b/tests/ttnn/unit_tests/test_multi_device.py index f81039d1728..71ccbbceddc 100644 --- a/tests/ttnn/unit_tests/test_multi_device.py +++ b/tests/ttnn/unit_tests/test_multi_device.py @@ -11,7 +11,7 @@ from tests.ttnn.utils_for_testing import assert_with_pcc -from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor, ListMeshToTensor +from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor ####### @@ -183,7 +183,7 @@ def test_multi_device_check_per_device_shard(mesh_device, layout, memory_config, @pytest.mark.parametrize("memory_config", [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]) def test_multi_device_replicate(mesh_device, shape, layout, memory_config): """Test ReplicateTensorToMesh to broadcast a tensor across multiple devices""" - from ttnn import ReplicateTensorToMesh, ListMeshToTensor + from ttnn import ReplicateTensorToMesh full_tensor = torch.rand(shape, dtype=torch.bfloat16) @@ -196,7 +196,9 @@ def test_multi_device_replicate(mesh_device, shape, layout, memory_config): ) ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device) ttnn_loop_back_tensor = ttnn.from_device(ttnn_tensor) - loopback_replicated_tensors = ttnn.to_torch(ttnn_loop_back_tensor, mesh_composer=ListMeshToTensor(mesh_device)) + loopback_replicated_tensors = [ + ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(ttnn_loop_back_tensor.cpu()) + ] for loopback_replicated_tensor in loopback_replicated_tensors: assert torch.all(full_tensor == loopback_replicated_tensor) diff --git a/tests/ttnn/unit_tests/test_multi_device_async.py b/tests/ttnn/unit_tests/test_multi_device_async.py index 5a8890c497e..3b1e75f500d 100644 --- a/tests/ttnn/unit_tests/test_multi_device_async.py +++ b/tests/ttnn/unit_tests/test_multi_device_async.py @@ -84,7 +84,7 @@ def test_multi_device_check_per_device_shard(pcie_mesh_device, layout, memory_co @pytest.mark.parametrize("memory_config", [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]) def test_multi_device_replicate(pcie_mesh_device, shape, layout, memory_config): """Test ReplicateTensorToMesh to broadcast a tensor across multiple devices""" - from ttnn import ReplicateTensorToMesh, ListMeshToTensor + from ttnn import ReplicateTensorToMesh pcie_mesh_device.enable_async(True) @@ -100,9 +100,9 @@ def test_multi_device_replicate(pcie_mesh_device, shape, layout, memory_config): ) ttnn_tensor = ttnn.to_device(ttnn_tensor, pcie_mesh_device) ttnn_loop_back_tensor = ttnn.from_device(ttnn_tensor) - loopback_replicated_tensors = ttnn.to_torch( - ttnn_loop_back_tensor, mesh_composer=ListMeshToTensor(pcie_mesh_device) - ) + loopback_replicated_tensors = [ + ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(ttnn_loop_back_tensor.cpu()) + ] for loopback_replicated_tensor in loopback_replicated_tensors: assert torch.all(full_tensor == loopback_replicated_tensor) @@ -114,7 +114,7 @@ def test_multi_device_replicate(pcie_mesh_device, shape, layout, 
memory_config): @pytest.mark.parametrize("dtype", [ttnn.bfloat8_b]) def test_ttnn_to_multi_device_tilized_parallel(pcie_mesh_device, layout, memory_config, dtype): """Test multi chip layout conversions on worker threads""" - from ttnn import ShardTensorToMesh, ConcatMeshToTensor, ListMeshToTensor + from ttnn import ShardTensorToMesh, ConcatMeshToTensor shard_dim = 3 pcie_mesh_device.enable_async(True) @@ -134,9 +134,7 @@ def test_ttnn_to_multi_device_tilized_parallel(pcie_mesh_device, layout, memory_ ) else: # Test Mesh Composer - readback_tensors = ttnn.to_torch( - ttnn_tensor, mesh_composer=ListMeshToTensor(pcie_mesh_device), device=pcie_mesh_device - ) + readback_tensors = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(ttnn_tensor.cpu())] readback_tensor = torch.cat(readback_tensors, dim=shard_dim) assert torch.all(readback_tensor == torch_tensor) pcie_mesh_device.enable_async(False) @@ -320,7 +318,7 @@ def test_add_1D_tensor_and_scalar(pcie_mesh_device, scalar, size): mesh_mapper=ttnn.ReplicateTensorToMesh(pcie_mesh_device), ) output_tensor = input_tensor + scalar - output_tensors = ttnn.to_torch(output_tensor, mesh_composer=ttnn.ListMeshToTensor(pcie_mesh_device)) + output_tensors = [ttnn.to_torch(shard) for shard in ttnn.get_device_tensors(output_tensor.cpu())] for output_tensor in output_tensors: assert ttnn.pearson_correlation_coefficient(torch_output_tensor, output_tensor) >= 0.99988 assert output_tensor.shape == (1, size) diff --git a/tests/ttnn/unit_tests/test_multi_device_events.py b/tests/ttnn/unit_tests/test_multi_device_events.py index 0217fe9f33f..1eb2a98ae32 100644 --- a/tests/ttnn/unit_tests/test_multi_device_events.py +++ b/tests/ttnn/unit_tests/test_multi_device_events.py @@ -10,7 +10,7 @@ from loguru import logger import os from tests.ttnn.utils_for_testing import assert_with_pcc -from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor, ListMeshToTensor +from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor @pytest.mark.parametrize("shape", [(1, 1, 512, 512)]) diff --git a/tests/ttnn/unit_tests/test_multi_device_trace.py b/tests/ttnn/unit_tests/test_multi_device_trace.py index 2e81db7b248..4e4063b7f8a 100644 --- a/tests/ttnn/unit_tests/test_multi_device_trace.py +++ b/tests/ttnn/unit_tests/test_multi_device_trace.py @@ -10,7 +10,7 @@ from loguru import logger import os from tests.ttnn.utils_for_testing import assert_with_pcc -from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor, ListMeshToTensor +from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor NUM_TRACE_LOOPS = int(os.getenv("NUM_TRACE_LOOPS", 15)) diff --git a/tests/ttnn/unit_tests/test_multi_device_trace_TG.py b/tests/ttnn/unit_tests/test_multi_device_trace_TG.py index 60c5f57d613..86bc27aa1aa 100644 --- a/tests/ttnn/unit_tests/test_multi_device_trace_TG.py +++ b/tests/ttnn/unit_tests/test_multi_device_trace_TG.py @@ -10,7 +10,7 @@ from loguru import logger import os from tests.ttnn.utils_for_testing import assert_with_pcc -from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor, ListMeshToTensor +from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor NUM_TRACE_LOOPS = int(os.getenv("NUM_TRACE_LOOPS", 15)) diff --git a/tests/ttnn/unit_tests/test_multi_device_trace_tgg.py b/tests/ttnn/unit_tests/test_multi_device_trace_tgg.py index f7c9fb0c8e1..3036acf4a77 100644 --- a/tests/ttnn/unit_tests/test_multi_device_trace_tgg.py +++ 
b/tests/ttnn/unit_tests/test_multi_device_trace_tgg.py @@ -10,7 +10,7 @@ from loguru import logger import os from tests.ttnn.utils_for_testing import assert_with_pcc -from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor, ListMeshToTensor +from ttnn import ShardTensorToMesh, ReplicateTensorToMesh, ConcatMeshToTensor NUM_TRACE_LOOPS = int(os.getenv("NUM_TRACE_LOOPS", 15)) diff --git a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp index fc49c0cdf09..83cb636335f 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp +++ b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp @@ -3,14 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 #include "ttnn/distributed/distributed_pybind.hpp" -#include +#include +#include #include "ttnn/distributed/api.hpp" -#include "ttnn/tensor/tensor_utils.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/types.hpp" -#include -#include "pybind11/stl.h" using namespace tt::tt_metal; @@ -101,11 +99,12 @@ void py_module(py::module& module) { &MeshDevice::get_devices, py::return_value_policy::reference, R"doc( - Get the devices in the device mesh. + Get the devices in the device mesh. - Returns: - List[Device]: The devices in the device mesh. - )doc") + + Returns: + List[Device]: The devices in the device mesh. + )doc") .def( "create_submesh", &MeshDevice::create_submesh, @@ -121,86 +120,94 @@ void py_module(py::module& module) { "compute_with_storage_grid_size", &MeshDevice::compute_with_storage_grid_size, R"doc( - Get the compute grid size (x, y) of the first device in the device mesh denoting region that can be targeted by ops. + Get the compute grid size (x, y) of the first device in the device mesh denoting region that can be targeted by ops. + - Returns: - CoreCoord: The compute grid size of the first device in the device mesh. - )doc") + Returns: + CoreCoord: The compute grid size of the first device in the device mesh. + )doc") .def( "dram_grid_size", &MeshDevice::dram_grid_size, R"doc( - Get the dram grid size (x, y) of the first device in the device mesh. + Get the dram grid size (x, y) of the first device in the device mesh. - Returns: - CoreCoord: The dram grid size of the first device in the device mesh. - )doc") + + Returns: + CoreCoord: The dram grid size of the first device in the device mesh. + )doc") .def( "arch", &MeshDevice::arch, R"doc( - Get the arch of the first device in the device mesh. + Get the arch of the first device in the device mesh. + - Returns: - Arch: The arch of the first device in the device mesh. - )doc") + Returns: + Arch: The arch of the first device in the device mesh. + )doc") .def( "enable_async", &MeshDevice::enable_async, py::arg("enable"), R"doc( - Enable or disable async mode across all devices in the mesh. + Enable or disable async mode across all devices in the mesh. + - Args: - enable (bool): True to enable async mode, False to disable it. - )doc") + Args: + enable (bool): True to enable async mode, False to disable it. + )doc") .def( "enable_program_cache", &MeshDevice::enable_program_cache, R"doc( - Enable program cache across all devices in the mesh. - )doc") + Enable program cache across all devices in the mesh. + )doc") .def( "disable_and_clear_program_cache", &MeshDevice::disable_and_clear_program_cache, R"doc( - Disable program cache across all devices in the mesh. - )doc") + Disable program cache across all devices in the mesh. + )doc") .def_property_readonly( "shape", &MeshDevice::shape, R"doc( - Get the shape of the device mesh. 
+ Get the shape of the device mesh. - Returns: - Tuple[int, int]: The shape of the device mesh as (num_rows, num_cols). - )doc") + + Returns: + Tuple[int, int]: The shape of the device mesh as (num_rows, num_cols). + )doc") .def( "reshape", &MeshDevice::reshape, py::arg("new_shape"), R"doc( - Reshapes the logical mesh and re-maps the physical devices to the new logical coordinates. - - Reshaping Rules: - 1. The old_shape volume must equal the new_shape volume (i.e. number of devices must remain constant) - 2. Line-to-Line Reshaping (when either dimension is 1): - - Always possible between 1xN and Nx1 shapes (e.g.: 1x8 <-> 8x1) - 3. Grid-to-Grid Reshaping: - - Only possible if the devices can form a connected physical mesh in the new shape - - Must maintain physical connectivity between adjacent devices - 4. Line-to-Grid Reshaping: - - Only possible if the physical devices can form a connected physical mesh in the new shape - - Example: 1x8 -> 2x4 is possible only if physical mesh permits a 2x4 configuration - - Args: - new_shape (MeshShape): The new shape of the mesh. - - Raises: - RuntimeError: If the reshaping constraints are not met: - 1. The old_shape volume must equal the new_shape volume (i.e. number of devices must remain constant) - 2. For Grid-to-Grid or Line-to-Grid reshaping: physical connectivity must be possible with current devices - )doc") + Reshapes the logical mesh and re-maps the physical devices to the new logical coordinates. + + + Reshaping Rules: + 1. The old_shape volume must equal the new_shape volume (i.e. number of devices must remain constant) + 2. Line-to-Line Reshaping (when either dimension is 1): + - Always possible between 1xN and Nx1 shapes (e.g.: 1x8 <-> 8x1) + 3. Grid-to-Grid Reshaping: + - Only possible if the devices can form a connected physical mesh in the new shape + - Must maintain physical connectivity between adjacent devices + 4. Line-to-Grid Reshaping: + - Only possible if the physical devices can form a connected physical mesh in the new shape + - Example: 1x8 -> 2x4 is possible only if physical mesh permits a 2x4 configuration + + + Args: + new_shape (MeshShape): The new shape of the mesh. + + + Raises: + RuntimeError: If the reshaping constraints are not met: + 1. The old_shape volume must equal the new_shape volume (i.e. number of devices must remain constant) + 2. For Grid-to-Grid or Line-to-Grid reshaping: physical connectivity must be possible with current devices + )doc") .def("__repr__", &MeshDevice::to_string) .def( "create_sub_device_manager", @@ -210,16 +217,18 @@ void py_module(py::module& module) { py::arg("sub_devices"), py::arg("local_l1_size"), R"doc( - Creates a sub-device manager for the given mesh device. + Creates a sub-device manager for the given mesh device. - Args: - sub_devices (List[ttnn.SubDevice]): The sub-devices to include in the sub-device manager. - This configuration will be used for each device in the MeshDevice. - local_l1_size (int): The size of the local allocators of each sub-device. The global allocator will be shrunk by this amount. - Returns: - MeshSubDeviceManagerId: The ID of the created sub-device manager. - )doc") + Args: + sub_devices (List[ttnn.SubDevice]): The sub-devices to include in the sub-device manager. + This configuration will be used for each device in the MeshDevice. + local_l1_size (int): The size of the local allocators of each sub-device. The global allocator will be shrunk by this amount. + + + Returns: + MeshSubDeviceManagerId: The ID of the created sub-device manager. 
+ )doc") .def( "create_sub_device_manager_with_fabric", [](MeshDevice& self, const std::vector& sub_devices, DeviceAddr local_l1_size) { @@ -228,44 +237,48 @@ void py_module(py::module& module) { py::arg("sub_devices"), py::arg("local_l1_size"), R"doc( - Creates a sub-device manager for the given mesh device. This will automatically create a sub-device of ethernet cores for use with fabric. - Note that this is a temporary API until migration to actual fabric is complete. - - Args: - sub_devices (List[ttnn.SubDevice]): The sub-devices to include in the sub-device manager. No ethernet cores should be included in this list. - This configuration will be used for each device in the MeshDevice. - local_l1_size (int): The size of the local allocators of each sub-device. The global allocator will be shrunk by this amount. - - Returns: - MeshSubDeviceManagerId: The ID of the created sub-device manager. - SubDeviceId: The ID of the sub-device that will be used for fabric. - )doc") + Creates a sub-device manager for the given mesh device. This will automatically create a sub-device of ethernet cores for use with fabric. + Note that this is a temporary API until migration to actual fabric is complete. + + + Args: + sub_devices (List[ttnn.SubDevice]): The sub-devices to include in the sub-device manager. No ethernet cores should be included in this list. + This configuration will be used for each device in the MeshDevice. + local_l1_size (int): The size of the local allocators of each sub-device. The global allocator will be shrunk by this amount. + + + Returns: + MeshSubDeviceManagerId: The ID of the created sub-device manager. + SubDeviceId: The ID of the sub-device that will be used for fabric. + )doc") .def( "load_sub_device_manager", &MeshDevice::mesh_load_sub_device_manager, py::arg("mesh_sub_device_manager_id"), R"doc( - Loads the sub-device manager with the given ID. + Loads the sub-device manager with the given ID. + - Args: - mesh_sub_device_manager_id (MeshSubDeviceManagerId): The ID of the sub-device manager to load. - )doc") + Args: + mesh_sub_device_manager_id (MeshSubDeviceManagerId): The ID of the sub-device manager to load. + )doc") .def( "clear_loaded_sub_device_manager", &MeshDevice::mesh_clear_loaded_sub_device_manager, R"doc( - Clears the loaded sub-device manager for the given mesh device. - )doc") + Clears the loaded sub-device manager for the given mesh device. + )doc") .def( "remove_sub_device_manager", &MeshDevice::mesh_remove_sub_device_manager, py::arg("mesh_sub_device_manager_id"), R"doc( - Removes the sub-device manager with the given ID. + Removes the sub-device manager with the given ID. - Args: - mesh_sub_device_manager_id (MeshSubDeviceManagerId): The ID of the sub-device manager to remove. - )doc") + + Args: + mesh_sub_device_manager_id (MeshSubDeviceManagerId): The ID of the sub-device manager to remove. + )doc") .def( "set_sub_device_stall_group", [](MeshDevice& self, const std::vector& sub_device_ids) { @@ -273,20 +286,21 @@ void py_module(py::module& module) { }, py::arg("sub_device_ids"), R"doc( - Set the SubDevice IDs that will be stalled on by default for Fast Dispatch commands such as reading, writing, synchronizing. - Stalling here refers to the Fast Dispatch cores waiting for programs to complete execution on the specified SubDevices before proceeding with the specified instruction. - The default SubDevice IDs to stall on are set to all SubDevice IDs, and whenever a new SubDevice Manager is loaded. 
+ Set the SubDevice IDs that will be stalled on by default for Fast Dispatch commands such as reading, writing, synchronizing. + Stalling here refers to the Fast Dispatch cores waiting for programs to complete execution on the specified SubDevices before proceeding with the specified instruction. + The default SubDevice IDs to stall on are set to all SubDevice IDs, and whenever a new SubDevice Manager is loaded. + - Args: - sub_device_ids (List[SubDeviceId]): The IDs of the SubDevices to stall on. - )doc") + Args: + sub_device_ids (List[SubDeviceId]): The IDs of the SubDevices to stall on. + )doc") .def( "reset_sub_device_stall_group", &MeshDevice::mesh_reset_sub_device_stall_group, R"doc( - Resets the sub_device_ids that will be stalled on by default for Fast Dispatch commands such as reading, writing, synchronizing - back to all SubDevice IDs. - )doc"); + Resets the sub_device_ids that will be stalled on by default for Fast Dispatch commands such as reading, writing, synchronizing + back to all SubDevice IDs. + )doc"); module.def( "open_mesh_device", @@ -308,15 +322,17 @@ void py_module(py::module& module) { py::arg("device_id"), py::kw_only(), R"doc( - Get the tensor shard corresponding to the device_id. + Get the tensor shard corresponding to the device_id. + - Args: - tensor (Tensor): The tensor to get the shard from. - device_id (int): The device id to get the shard for. + Args: + tensor (Tensor): The tensor to get the shard from. + device_id (int): The device id to get the shard for. - Returns: - Tensor: The shard of the tensor corresponding to the device_id. - )doc"); + + Returns: + Tensor: The shard of the tensor corresponding to the device_id. + )doc"); module.def( "get_device_tensor", py::overload_cast(&ttnn::distributed::get_device_tensor), @@ -324,15 +340,17 @@ void py_module(py::module& module) { py::arg("device"), py::kw_only(), R"doc( - Get the tensor shard corresponding to the device. + Get the tensor shard corresponding to the device. + + + Args: + tensor (Tensor): The tensor to get the shard from. + device (Device): The device to get the shard for. - Args: - tensor (Tensor): The tensor to get the shard from. - device (Device): The device to get the shard for. - Returns: - Tensor: The shard of the tensor corresponding to the device. - )doc"); + Returns: + Tensor: The shard of the tensor corresponding to the device. 
+ )doc"); module.def("get_device_tensors", &get_device_tensors, py::arg("tensor"), py::kw_only()); module.def( "aggregate_as_tensor", diff --git a/ttnn/cpp/ttnn/distributed/distributed_pybind.hpp b/ttnn/cpp/ttnn/distributed/distributed_pybind.hpp index e197599e165..93d26f3f2d6 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_pybind.hpp +++ b/ttnn/cpp/ttnn/distributed/distributed_pybind.hpp @@ -4,6 +4,7 @@ #pragma once #include "pybind11/pybind_fwd.hpp" +#include namespace py = pybind11; diff --git a/ttnn/ttnn/distributed/__init__.py b/ttnn/ttnn/distributed/__init__.py index 02b0c03e677..4b1a970eaa7 100644 --- a/ttnn/ttnn/distributed/__init__.py +++ b/ttnn/ttnn/distributed/__init__.py @@ -19,7 +19,6 @@ ReplicateTensorToMesh, MeshToTensor, ConcatMeshToTensor, - ListMeshToTensor, visualize_mesh_device, ConcatMesh2dToTensor, distribute, diff --git a/ttnn/ttnn/distributed/distributed.py b/ttnn/ttnn/distributed/distributed.py index cf3221e8158..46ee1e58c73 100644 --- a/ttnn/ttnn/distributed/distributed.py +++ b/ttnn/ttnn/distributed/distributed.py @@ -442,18 +442,6 @@ def compose(self, tensor: ttnn.Tensor) -> "torch.Tensor": return torch.cat(device_shards_converted_to_torch, dim=self.concat_dim) -# TODO: #15061 - Remove this function, as it does not abide to the MeshToTensor interface. -# Instead, lift this implementation to the caller. -class ListMeshToTensor(MeshToTensor): - def __init__(self, mesh_device: MeshDevice): - self.mesh_device = mesh_device - - def compose(self, tensor: ttnn.Tensor) -> List["torch.Tensor"]: - return [ - ttnn.to_torch(tt_input_tensor, mesh_composer=None) for tt_input_tensor in ttnn.get_device_tensors(tensor) - ] - - @contextlib.contextmanager def distribute(default: Union[TensorToMesh, MeshToTensor]): """ From 899b181701695d32a6797dc75f8328323ffb2b85 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Tue, 18 Feb 2025 12:14:12 -0800 Subject: [PATCH 145/316] [skip ci] Fix build and test wheel workflow (#17962) ### Problem description Workflow doesn't work, because build-wheel option wasn't passed. ### Checklist https://github.com/tenstorrent/tt-metal/actions/runs/13398987372 --- .github/workflows/build-and-test-wheels.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-and-test-wheels.yaml b/.github/workflows/build-and-test-wheels.yaml index 27494489a25..fddffdfee67 100644 --- a/.github/workflows/build-and-test-wheels.yaml +++ b/.github/workflows/build-and-test-wheels.yaml @@ -15,6 +15,8 @@ jobs: if: ${{ github.event_name == 'workflow_dispatch' && inputs.from-precompiled }} uses: ./.github/workflows/build-artifact.yaml secrets: inherit + with: + build-wheel: true test-wheels: needs: build-artifact if: ${{ always() }} From 25518393d229466ead6a0584f337ab60bd0b279b Mon Sep 17 00:00:00 2001 From: Martin Chang Date: Wed, 19 Feb 2025 04:47:03 +0800 Subject: [PATCH 146/316] Fix for failing to build on GCC-14 (#17906) ### Ticket #17905 ### Problem description Latest tt-metal can't build correctly on GCC-14 ### What's changed Base on https://stackoverflow.com/questions/76867698/what-does-ignoring-attributes-on-template-argument-mean-in-this-context the error is due to the attribute being lost when casting to function pointer. The simply workaround is to write a class and pass that around. This also has the benefit of not needing the `unique_ptr` to store 2 pointers. Please help run CI and merge the patch. 
### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13384193646) CI passes --- ttnn/cpp/ttnn/tensor/serialization.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ttnn/cpp/ttnn/tensor/serialization.cpp b/ttnn/cpp/ttnn/tensor/serialization.cpp index ee5209a0aa2..4d4940404c0 100644 --- a/ttnn/cpp/ttnn/tensor/serialization.cpp +++ b/ttnn/cpp/ttnn/tensor/serialization.cpp @@ -24,6 +24,14 @@ using MeshDevice = distributed::MeshDevice; namespace { +struct FileCloser { + void operator()(FILE* file) const { + if (file) { + TT_ASSERT(fclose(file) == 0, "Failed to close file"); + } + } +}; + struct Padding { enum class PadValue { Any, Zero, Infinity, NegativeInfinity }; struct PadDimension { @@ -393,7 +401,7 @@ Tensor load_tensor_helper(const std::string& file_name, T device) { if (not input_file) { TT_THROW("Cannot open \"{}\"", file_name); } - std::unique_ptr file_guard(input_file, &fclose); + std::unique_ptr file_guard(input_file); std::size_t read_sentinel; safe_fread(&read_sentinel, sizeof(read_sentinel), 1, input_file); @@ -435,7 +443,7 @@ void dump_tensor( if (not output_file) { TT_THROW("Cannot open \"{}\"", file_name); } - std::unique_ptr file_guard(output_file, &fclose); + std::unique_ptr file_guard(output_file); safe_fwrite(&SENTINEL_VALUE, sizeof(SENTINEL_VALUE), 1, output_file); safe_fwrite(&VERSION_ID, sizeof(VERSION_ID), 1, output_file); @@ -495,7 +503,7 @@ void dump_memory_config(const std::string& file_name, const MemoryConfig& memory if (not output_file) { TT_THROW("Cannot open \"{}\"", file_name); } - std::unique_ptr file_guard(output_file, &fclose); + std::unique_ptr file_guard(output_file); dump_memory_config(output_file, memory_config); } @@ -533,7 +541,7 @@ MemoryConfig load_memory_config(const std::string& file_name) { if (not input_file) { TT_THROW("Cannot open \"{}\"", file_name); } - std::unique_ptr file_guard(input_file, &fclose); + std::unique_ptr file_guard(input_file); return load_memory_config(input_file); } From a94728d0e76eecc15716ecc260b1e96cc22d4c5d Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Tue, 18 Feb 2025 21:16:21 +0000 Subject: [PATCH 147/316] Restore Moreh::sum behavior to be the same as before shape changes (#17772) ### Ticket ### Problem description NanoGPT training currently explodes, this PR tries to resolve the issue. In particular, porting Moreh sum to use SimpleShape affected handling of 1D shapes. Nano gpt training is still not fixed, the bisect shows that the other offending commit is the new repeat. It causes ttnn::add behavior to be different. This can be reproduced on shapes [64, 1, 256, 384] and [1, 1, 256, 384]. But the exact reason is not clear and will be further investigated by @rfurko-tt ### What's changed Updated Moreh sum operation to preserve the old behavior - always return rank >= 2. 
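To make the restored behavior concrete, here is a small standalone illustration of the rank-promotion guard (a plain `std::vector<uint32_t>` stands in for the shape type, and `to_rank_at_least` is an invented name; the actual change simply calls `to_rank(2)` on the input's logical shape when its rank is below 2):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for a logical shape: dims ordered outermost to innermost.
using Shape = std::vector<uint32_t>;

// Prepend 1-sized dims until the shape has at least `min_rank` dimensions,
// mirroring "always operate on rank >= 2" for reductions over 1D inputs.
Shape to_rank_at_least(Shape shape, std::size_t min_rank) {
    while (shape.size() < min_rank) {
        shape.insert(shape.begin(), 1u);
    }
    return shape;
}

int main() {
    const Shape one_d = {384};                          // a 1D input shape
    const Shape promoted = to_rank_at_least(one_d, 2);  // becomes {1, 384}
    std::printf("rank %zu -> rank %zu\n", one_d.size(), promoted.size());
    return 0;
}
```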
### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13301141641) - [x] New/Existing tests provide coverage for changes --- tt-train/sources/ttml/ops/losses.cpp | 2 +- .../moreh/moreh_sum/device/moreh_sum_device_operation.cpp | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tt-train/sources/ttml/ops/losses.cpp b/tt-train/sources/ttml/ops/losses.cpp index 3763f2a9c9b..317e2dd4153 100644 --- a/tt-train/sources/ttml/ops/losses.cpp +++ b/tt-train/sources/ttml/ops/losses.cpp @@ -68,7 +68,7 @@ autograd::TensorPtr nll_loss( } auto* device = &autograd::ctx().get_device(); - auto divisor = core::empty(ttnn::Shape({1}), device, prediction->get_value().memory_config()); + auto divisor = core::empty(ttnn::Shape({1, 1}), device, prediction->get_value().memory_config()); auto tensor_shape = prediction->get_value().get_logical_shape(); uint32_t Ndim = tensor_shape[0] * tensor_shape[1] * tensor_shape[2]; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.cpp index f2df3ab4dc4..d11c5de16fa 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.cpp @@ -69,7 +69,10 @@ MorehSumOperation::spec_return_value_t MorehSumOperation::compute_output_specs( } const auto& input = tensor_args.input; - const auto& input_shape = input.get_logical_shape(); + auto input_shape = input.get_logical_shape(); + if (input_shape.rank() < 2) { + input_shape = input_shape.to_rank(2); + } const auto input_rank = input_shape.rank(); const bool is_tile_dim = (operation_attributes.dim == input_rank - 1 || operation_attributes.dim == input_rank - 2); log_debug( From 76596070df47545d07a03a7ae07d5c66c7c2fc63 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Tue, 18 Feb 2025 16:27:42 -0500 Subject: [PATCH 148/316] Afuller/rm rf archname (#17894) ### Ticket A step towards #17851 ### Problem description Some lines in these tests weren't using HAL and thus had to be built 1x/arch and produced multiple binaries. ### What's changed Switch to HAL and drop the arch-specific binaries. 
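As a generic illustration of the motivation (this is not tt-metal's HAL API; the enum, function, and addresses below are invented placeholders), replacing per-arch compile-time constants with a runtime query is what allows one test binary to cover every architecture:

```cpp
#include <cstdint>
#include <cstdio>
#include <stdexcept>

// Illustrative stand-in only: in the real tests the HAL object answers these
// queries at runtime (e.g. hal.get_dev_addr(...)), so no arch-specific header
// is pulled in at compile time and no per-arch binary needs to be built.
enum class Arch { Grayskull, Wormhole, Blackhole };

std::uint64_t unreserved_l1_base(Arch arch) {
    switch (arch) {
        case Arch::Grayskull: return 0x1000;  // placeholder values, not real addresses
        case Arch::Wormhole:  return 0x2000;
        case Arch::Blackhole: return 0x3000;
    }
    throw std::runtime_error("unknown arch");
}

int main() {
    const Arch arch = Arch::Wormhole;  // would be detected from the attached device
    std::printf("unreserved L1 base: 0x%llx\n",
                static_cast<unsigned long long>(unreserved_l1_base(arch)));
    return 0;
}
```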
### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13381523927) --- .github/workflows/build-and-unit-tests.yaml | 2 +- .github/workflows/cpp-post-commit.yaml | 4 +- tests/scripts/run_cpp_unit_tests.sh | 2 +- tests/scripts/run_tests.sh | 2 +- tests/scripts/t3000/run_t3000_unit_tests.sh | 4 +- tests/scripts/tg/run_tg_unit_tests.sh | 4 +- tests/scripts/tgg/run_tgg_unit_tests.sh | 2 +- .../tools/profiler/test_device_profiler.py | 6 +- tests/tt_metal/tt_metal/CMakeLists.txt | 4 +- .../tt_metal/tt_metal/dispatch/CMakeLists.txt | 99 +++++++------------ .../dispatch/dispatch_buffer/CMakeLists.txt | 5 - .../dispatch/dispatch_event/CMakeLists.txt | 5 - .../dispatch/dispatch_program/CMakeLists.txt | 9 -- .../dispatch_program/test_EnqueueProgram.cpp | 9 +- .../dispatch_program/test_dispatch.cpp | 7 +- .../dispatch/dispatch_trace/CMakeLists.txt | 5 - .../dispatch/dispatch_util/CMakeLists.txt | 5 - .../dispatch/sub_device_test_utils.hpp | 7 +- 18 files changed, 62 insertions(+), 119 deletions(-) delete mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt delete mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt delete mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt delete mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt delete mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_util/CMakeLists.txt diff --git a/.github/workflows/build-and-unit-tests.yaml b/.github/workflows/build-and-unit-tests.yaml index aa0a14264b4..145fad832af 100644 --- a/.github/workflows/build-and-unit-tests.yaml +++ b/.github/workflows/build-and-unit-tests.yaml @@ -58,7 +58,7 @@ jobs: {name: api, cmd: "./build/test/tt_metal/unit_tests_api_${{ inputs.arch }}"}, {name: debug_tools, cmd: "./build/test/tt_metal/unit_tests_debug_tools_${{ inputs.arch }}"}, {name: device, cmd: "./build/test/tt_metal/unit_tests_device"}, - {name: dispatch, cmd: "./build/test/tt_metal/unit_tests_dispatch_${{ inputs.arch }}"}, + {name: dispatch, cmd: "./build/test/tt_metal/unit_tests_dispatch"}, {name: eth, cmd: "./build/test/tt_metal/unit_tests_eth_${{ inputs.arch }}"}, {name: llk, cmd: "./build/test/tt_metal/unit_tests_llk"}, {name: stl, cmd: "./build/test/tt_metal/unit_tests_stl"}, diff --git a/.github/workflows/cpp-post-commit.yaml b/.github/workflows/cpp-post-commit.yaml index ed0c1f165e7..f9689deec4e 100644 --- a/.github/workflows/cpp-post-commit.yaml +++ b/.github/workflows/cpp-post-commit.yaml @@ -58,13 +58,13 @@ jobs: {name: api, cmd: "./build/test/tt_metal/unit_tests_api_${{ inputs.arch }}"}, {name: debug_tools, cmd: "./build/test/tt_metal/unit_tests_debug_tools_${{ inputs.arch }}"}, {name: device, cmd: "./build/test/tt_metal/unit_tests_device"}, - {name: dispatch, cmd: "./build/test/tt_metal/unit_tests_dispatch_${{ inputs.arch }}"}, + {name: dispatch, cmd: "./build/test/tt_metal/unit_tests_dispatch"}, {name: eth, cmd: "./build/test/tt_metal/unit_tests_eth_${{ inputs.arch }}"}, {name: llk, cmd: "./build/test/tt_metal/unit_tests_llk"}, {name: stl, cmd: "./build/test/tt_metal/unit_tests_stl"}, {name: distributed, cmd: "./build/test/tt_metal/distributed/distributed_unit_tests_${{ inputs.arch }}"}, {name: lightmetal, cmd: "./build/test/tt_metal/unit_tests_lightmetal"}, - {name: dispatch multicmd queue, cmd: "TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch_${{ inputs.arch }} 
--gtest_filter=MultiCommandQueue*Fixture.*"}, + {name: dispatch multicmd queue, cmd: "TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter=MultiCommandQueue*Fixture.*"}, {name: ttnn cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn}, {name: ttnn ccl cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_ccl}, {name: ttnn tensor cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_tensor}, diff --git a/tests/scripts/run_cpp_unit_tests.sh b/tests/scripts/run_cpp_unit_tests.sh index 1b1efc533cb..f035d3fec4a 100755 --- a/tests/scripts/run_cpp_unit_tests.sh +++ b/tests/scripts/run_cpp_unit_tests.sh @@ -18,7 +18,7 @@ if [[ ! -z "$TT_METAL_SLOW_DISPATCH_MODE" ]]; then else # Enable this on BH after #14613 if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then - TT_METAL_GTEST_ETH_DISPATCH=1 ./build/test/tt_metal/unit_tests_dispatch_${ARCH_NAME} + TT_METAL_GTEST_ETH_DISPATCH=1 ./build/test/tt_metal/unit_tests_dispatch fi env python3 tests/scripts/run_tt_eager.py --dispatch-mode fast env python3 tests/scripts/run_tt_metal.py --dispatch-mode fast diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 9448fbb0ae6..0f4d4480a11 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -79,7 +79,7 @@ run_frequent_api_pipeline_tests() { local dispatch_mode=$3 if [[ $dispatch_mode == "slow" ]]; then - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_dispatch_${ARCH_NAME} --gtest_filter=DispatchStress.TensixRunManyTimes + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter=DispatchStress.TensixRunManyTimes echo "Running Python API unit tests in SD for frequent..." ./tests/scripts/run_python_api_unit_tests.sh fi diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index 87df13c964e..3eff90e9879 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -18,8 +18,8 @@ run_t3000_ttmetal_tests() { TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth_${ARCH_NAME} --gtest_filter="DeviceFixture.ActiveEthKernelsSendInterleavedBufferAllConnectedChips" ; fail+=$? TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth_${ARCH_NAME} --gtest_filter="DeviceFixture.ActiveEthKernelsDirectRingGatherAllChips" ; fail+=$? TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth_${ARCH_NAME} --gtest_filter="DeviceFixture.ActiveEthKernelsInterleavedRingGatherAllChips" ; fail+=$? - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch_${ARCH_NAME} --gtest_filter="CommandQueueSingleCard*Fixture.*" ; fail+=$? - ./build/test/tt_metal/unit_tests_dispatch_${ARCH_NAME} --gtest_filter="CommandQueueMultiDevice*Fixture.*" ; fail+=$? + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueSingleCard*Fixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueMultiDevice*Fixture.*" ; fail+=$? ./build/test/tt_metal/unit_tests_debug_tools_${ARCH_NAME} --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$? 
# Programming examples diff --git a/tests/scripts/tg/run_tg_unit_tests.sh b/tests/scripts/tg/run_tg_unit_tests.sh index c82a51861b7..f5b3752f840 100755 --- a/tests/scripts/tg/run_tg_unit_tests.sh +++ b/tests/scripts/tg/run_tg_unit_tests.sh @@ -105,11 +105,11 @@ run_tg_prefetcher_tests() { run_tg_tests() { if [[ "$1" == "unit" ]]; then echo "LOG_METAL: running run_tg_unit_tests" - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch_${ARCH_NAME} --gtest_filter="CommandQueueSingleCard*Fixture.*" + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueSingleCard*Fixture.*" ./build/test/ttnn/galaxy_unit_tests_ttnn TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGFixture.*" ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGFixture.*" - TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch_${ARCH_NAME} --gtest_filter="MultiCommandQueueMultiDevice*Fixture.*" + TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="MultiCommandQueueMultiDevice*Fixture.*" elif [[ "$1" == "fabric" ]]; then echo "LOG_FABRIC: running run_tg_fabric_tests" diff --git a/tests/scripts/tgg/run_tgg_unit_tests.sh b/tests/scripts/tgg/run_tgg_unit_tests.sh index 44005118903..0eb73d5e823 100755 --- a/tests/scripts/tgg/run_tgg_unit_tests.sh +++ b/tests/scripts/tgg/run_tgg_unit_tests.sh @@ -5,7 +5,7 @@ run_tgg_tests() { echo "LOG_METAL: running run_tgg_unit_tests" - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch_${ARCH_NAME} --gtest_filter="CommandQueueSingleCard*Fixture.*" + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueSingleCard*Fixture.*" ./build/test/ttnn/galaxy_unit_tests_ttnn TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGGFixture.*" ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGGFixture.*" diff --git a/tests/tt_metal/tools/profiler/test_device_profiler.py b/tests/tt_metal/tools/profiler/test_device_profiler.py index eb32531bae5..dbb2d6313f8 100644 --- a/tests/tt_metal/tools/profiler/test_device_profiler.py +++ b/tests/tt_metal/tools/profiler/test_device_profiler.py @@ -354,16 +354,16 @@ def test_timestamped_events(): def test_sub_device_profiler(): ARCH_NAME = os.getenv("ARCH_NAME") run_gtest_profiler_test( - "./build/test/tt_metal/unit_tests_dispatch" + "_" + ARCH_NAME, + "./build/test/tt_metal/unit_tests_dispatch", "CommandQueueSingleCardFixture.TensixTestSubDeviceBasicPrograms", ) os.environ["TT_METAL_PROFILER_SYNC"] = "1" run_gtest_profiler_test( - "./build/test/tt_metal/unit_tests_dispatch" + "_" + ARCH_NAME, + "./build/test/tt_metal/unit_tests_dispatch", "CommandQueueSingleCardFixture.TensixActiveEthTestSubDeviceBasicEthPrograms", ) os.environ["TT_METAL_PROFILER_SYNC"] = "0" run_gtest_profiler_test( - "./build/test/tt_metal/unit_tests_dispatch" + "_" + ARCH_NAME, + "./build/test/tt_metal/unit_tests_dispatch", "CommandQueueSingleCardTraceFixture.TensixTestSubDeviceTraceBasicPrograms", ) diff --git a/tests/tt_metal/tt_metal/CMakeLists.txt b/tests/tt_metal/tt_metal/CMakeLists.txt index e162b7cbc13..bafab7885dd 100644 --- a/tests/tt_metal/tt_metal/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/CMakeLists.txt @@ -83,9 +83,7 @@ add_custom_target( unit_tests_debug_tools_wormhole_b0 unit_tests_debug_tools_blackhole unit_tests_device - unit_tests_dispatch_grayskull - 
unit_tests_dispatch_wormhole_b0 - unit_tests_dispatch_blackhole + unit_tests_dispatch unit_tests_eth_grayskull unit_tests_eth_wormhole_b0 unit_tests_eth_blackhole diff --git a/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt index fe13c3a77b3..d98671566ea 100644 --- a/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt @@ -1,64 +1,41 @@ -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_buffer) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_event) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_program) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_trace) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_util) - -# Define the function to create test executables for each architecture -function(create_unit_test_executable arch_name) - # Define the test executable name using the architecture name - set(exec_name unit_tests_dispatch_${arch_name}) - string(REPLACE "wormhole" "wormhole_b0" exec_name ${exec_name}) - - # Create the test executable - add_executable(${exec_name}) - - target_sources( - ${exec_name} - PRIVATE - ${UNIT_TESTS_DISPATCH_BUFFER_SRC} - ${UNIT_TESTS_DISPATCH_EVENT_SRC} - ${UNIT_TESTS_DISPATCH_PROGRAM_SRC} - ${UNIT_TESTS_DISPATCH_TRACE_SRC} - ${UNIT_TESTS_DISPATCH_UTIL_SRC} - ) - - # Enable unity build for the executable - TT_ENABLE_UNITY_BUILD(${exec_name}) - - # Link libraries - target_link_libraries(${exec_name} PRIVATE test_metal_common_libs) - - # Set include directories - target_include_directories( - ${exec_name} - BEFORE - PRIVATE - ${PROJECT_SOURCE_DIR}/tt_metal/hw/inc/${arch_name} - "$" - ${PROJECT_SOURCE_DIR}/tests - ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common - ) - - # Set runtime output directory - set_target_properties( - ${exec_name} - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/test/tt_metal - ) -endfunction() +add_executable(unit_tests_dispatch) + +target_sources( + unit_tests_dispatch + PRIVATE + dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp + dispatch_buffer/test_sub_device.cpp + dispatch_event/test_EnqueueWaitForEvent.cpp + dispatch_event/test_events.cpp + dispatch_program/test_dispatch_stress.cpp + dispatch_program/test_dispatch.cpp + dispatch_program/test_EnqueueProgram.cpp + dispatch_program/test_global_circular_buffers.cpp + dispatch_program/test_sub_device.cpp + dispatch_program/test_program_reuse.cpp + dispatch_trace/test_EnqueueTrace.cpp + dispatch_trace/test_sub_device.cpp + dispatch_util/test_dispatch_settings.cpp + dispatch_util/test_device_command.cpp +) -# Define the architectures for which to create test executables -set(ARCHITECTURES - "grayskull" - "wormhole" - "blackhole" +set_target_properties( + unit_tests_dispatch + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch) + +target_include_directories( + unit_tests_dispatch + BEFORE + PRIVATE + "$" + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common ) -# Create a test executable for each architecture -foreach(arch ${ARCHITECTURES}) - create_unit_test_executable(${arch}) -endforeach() +target_link_libraries(unit_tests_dispatch PRIVATE test_metal_common_libs) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt 
b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt deleted file mode 100644 index 3f3336dacd4..00000000000 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(UNIT_TESTS_DISPATCH_BUFFER_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp - PARENT_SCOPE -) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt deleted file mode 100644 index 4d392b999c9..00000000000 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(UNIT_TESTS_DISPATCH_EVENT_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueWaitForEvent.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/test_events.cpp - PARENT_SCOPE -) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt deleted file mode 100644 index 68ad95357be..00000000000 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -set(UNIT_TESTS_DISPATCH_PROGRAM_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch_stress.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueProgram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/test_global_circular_buffers.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/test_program_reuse.cpp - PARENT_SCOPE -) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp index 1d036aeceb8..6c80d6a0561 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp @@ -12,14 +12,12 @@ #include #include #include +#include #include #include #include #include "umd/device/tt_soc_descriptor.h" -// TODO: ARCH_NAME specific, must remove -#include "eth_l1_address_map.h" - using std::vector; using namespace tt::tt_metal; @@ -129,7 +127,8 @@ bool test_dummy_EnqueueProgram_with_runtime_args(IDevice* device, const CoreCoor auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_core_coord); constexpr uint32_t num_runtime_args0 = 9; - constexpr uint32_t rta_base0 = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + uint32_t rta_base0 = + hal.get_dev_addr(tt::tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt::tt_metal::HalL1MemAddrType::UNRESERVED); std::map dummy_defines0 = { {"DATA_MOVEMENT", "1"}, {"NUM_RUNTIME_ARGS", std::to_string(num_runtime_args0)}, @@ -151,7 +150,7 @@ bool test_dummy_EnqueueProgram_with_runtime_args(IDevice* device, const CoreCoor vector dummy_kernel0_args_readback = tt::llrt::read_hex_vec_from_core( device->id(), eth_noc_xy, - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, + hal.get_dev_addr(tt::tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt::tt_metal::HalL1MemAddrType::UNRESERVED), dummy_kernel0_args.size() * sizeof(uint32_t)); pass &= (dummy_kernel0_args == dummy_kernel0_args_readback); diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp index 5104eb63dba..e6b6f8c2829 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp @@ -6,8 +6,7 @@ #include 
"dispatch_fixture.hpp" -// TODO: ARCH_NAME specific, must remove -#include "noc/noc_parameters.h" +#include using std::vector; @@ -64,7 +63,7 @@ static void test_sems_across_core_types( // Set up args vector eth_rtas = { - NOC_XY_ENCODING(phys_tensix_core.x, phys_tensix_core.y), + hal.noc_xy_encoding(phys_tensix_core.x, phys_tensix_core.y), eth_sem_id, tensix_sem_id, eth_sem_init_val, @@ -80,7 +79,7 @@ static void test_sems_across_core_types( SetRuntimeArgs(program, eth_kernel, eth_core, eth_rtas); vector tensix_rtas = { - NOC_XY_ENCODING(phys_eth_core.x, phys_eth_core.y), + hal.noc_xy_encoding(phys_eth_core.x, phys_eth_core.y), tensix_sem_id, eth_sem_id, tensix_sem_init_val, diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt deleted file mode 100644 index a444d080617..00000000000 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(UNIT_TESTS_DISPATCH_TRACE_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueTrace.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp - PARENT_SCOPE -) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_util/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_util/CMakeLists.txt deleted file mode 100644 index 374623fd0d9..00000000000 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_util/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(UNIT_TESTS_DISPATCH_UTIL_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch_settings.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/test_device_command.cpp - PARENT_SCOPE -) diff --git a/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp b/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp index 3444ca4b829..d5b27e598fd 100644 --- a/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp +++ b/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp @@ -6,9 +6,7 @@ #include #include - -// TODO: ARCH_NAME specific, must remove -#include "eth_l1_address_map.h" +#include inline std::tuple create_single_sync_program( IDevice* device, SubDevice sub_device) { @@ -102,7 +100,8 @@ inline std::tuple create_basic_eth_s syncer_core_physical.y, tensix_waiter_core_physical.x, tensix_waiter_core_physical.y, - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE}; + hal.get_dev_addr(tt::tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt::tt_metal::HalL1MemAddrType::UNRESERVED) + }; SetRuntimeArgs(waiter_program, waiter_kernel, waiter_core, waiter_rt_args); Program syncer_program = CreateProgram(); From ac9c6b7a3b84949241a187e216f3c14558a1ac9b Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Tue, 18 Feb 2025 13:52:20 +0000 Subject: [PATCH 149/316] #0: Add new fabric apis for atomics and add a mode to bypass router lookup on device to resolve on host instead Cleanup some parts of fabric --- .../test_tt_fabric_multi_hop_sanity.cpp | 4 +- .../routing/test_tt_fabric_sanity.cpp | 4 +- .../routing/test_tt_fabric_socket_sanity.cpp | 4 +- tt_fabric/control_plane.cpp | 33 +-- tt_fabric/hw/inc/tt_fabric.h | 8 +- tt_fabric/hw/inc/tt_fabric_api.h | 190 +++++++++++++++--- 6 files changed, 174 insertions(+), 69 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index 8ac6dbd69b3..d6aab9503dd 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -228,9 +228,7 @@ int main(int argc, char** argv) { CoreCoord gk_core = {gk_x, gk_y}; - std::map defines = { - {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth - }; + std::map defines; try { const std::filesystem::path tg_mesh_graph_desc_path = diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index f9ff6e03670..eba9b2ed24e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -1406,9 +1406,7 @@ int main(int argc, char **argv) { bool pass = true; uint32_t num_available_devices, num_allocated_devices = 0; - std::map defines = { - {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth - }; + std::map defines; if (benchmark_mode) { prng_seed = 100; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp index cf140eeaf80..b6b81e575e1 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp @@ -226,9 +226,7 @@ int main(int argc, char** argv) { CoreCoord gk_core = {gk_x, gk_y}; - std::map defines = { - {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth - }; + std::map defines; try { const std::filesystem::path tg_mesh_graph_desc_path = diff --git a/tt_fabric/control_plane.cpp b/tt_fabric/control_plane.cpp index 70bba401531..c4ba715a7dd 100644 --- a/tt_fabric/control_plane.cpp +++ b/tt_fabric/control_plane.cpp @@ -561,33 +561,22 @@ std::vector> ControlPlane::get_fabric_route( dst_chip_id); } auto physical_chip_id = logical_mesh_chip_id_to_physical_chip_id_mapping_[src_mesh_id][src_chip_id]; + chan_id_t next_chan_id = 0; if (src_mesh_id != dst_mesh_id) { // Inter-mesh routing - chan_id_t next_chan_id = - this->inter_mesh_routing_tables_[src_mesh_id][src_chip_id][src_chan_id][dst_mesh_id]; - if (src_chan_id != next_chan_id) { - // Chan to chan within chip - route.push_back({physical_chip_id, next_chan_id}); - } - std::tie(src_mesh_id, src_chip_id, src_chan_id) = - this->get_connected_mesh_chip_chan_ids(src_mesh_id, src_chip_id, next_chan_id); - auto connected_physical_chip_id = - logical_mesh_chip_id_to_physical_chip_id_mapping_[src_mesh_id][src_chip_id]; - route.push_back({connected_physical_chip_id, src_chan_id}); + next_chan_id = this->inter_mesh_routing_tables_[src_mesh_id][src_chip_id][src_chan_id][dst_mesh_id]; } else if (src_chip_id != dst_chip_id) { // Intra-mesh routing - chan_id_t next_chan_id = - this->intra_mesh_routing_tables_[src_mesh_id][src_chip_id][src_chan_id][dst_chip_id]; - if (src_chan_id != next_chan_id) { - // Chan to chan within chip - route.push_back({physical_chip_id, next_chan_id}); - } - std::tie(src_mesh_id, src_chip_id, src_chan_id) = - this->get_connected_mesh_chip_chan_ids(src_mesh_id, src_chip_id, next_chan_id); - auto connected_physical_chip_id = - logical_mesh_chip_id_to_physical_chip_id_mapping_[src_mesh_id][src_chip_id]; - route.push_back({connected_physical_chip_id, src_chan_id}); + next_chan_id = this->intra_mesh_routing_tables_[src_mesh_id][src_chip_id][src_chan_id][dst_chip_id]; + } + if (src_chan_id != next_chan_id) 
{ + // Chan to chan within chip + route.push_back({physical_chip_id, next_chan_id}); } + std::tie(src_mesh_id, src_chip_id, src_chan_id) = + this->get_connected_mesh_chip_chan_ids(src_mesh_id, src_chip_id, next_chan_id); + auto connected_physical_chip_id = logical_mesh_chip_id_to_physical_chip_id_mapping_[src_mesh_id][src_chip_id]; + route.push_back({connected_physical_chip_id, src_chan_id}); } return route; diff --git a/tt_fabric/hw/inc/tt_fabric.h b/tt_fabric/hw/inc/tt_fabric.h index 6065f927953..02ae486c69d 100644 --- a/tt_fabric/hw/inc/tt_fabric.h +++ b/tt_fabric/hw/inc/tt_fabric.h @@ -15,11 +15,9 @@ using namespace tt::tt_fabric; -constexpr ProgrammableCoreType fd_core_type = static_cast(FD_CORE_TYPE); - -const uint32_t SYNC_BUF_SIZE = 16; // must be 2^N -const uint32_t SYNC_BUF_SIZE_MASK = (SYNC_BUF_SIZE - 1); -const uint32_t SYNC_BUF_PTR_MASK = ((SYNC_BUF_SIZE << 1) - 1); +constexpr uint32_t SYNC_BUF_SIZE = 16; // must be 2^N +constexpr uint32_t SYNC_BUF_SIZE_MASK = (SYNC_BUF_SIZE - 1); +constexpr uint32_t SYNC_BUF_PTR_MASK = ((SYNC_BUF_SIZE << 1) - 1); extern uint64_t xy_local_addr; extern volatile local_pull_request_t* local_pull_request; diff --git a/tt_fabric/hw/inc/tt_fabric_api.h b/tt_fabric/hw/inc/tt_fabric_api.h index fd96de1a1bd..b3c63d1da4f 100644 --- a/tt_fabric/hw/inc/tt_fabric_api.h +++ b/tt_fabric/hw/inc/tt_fabric_api.h @@ -13,13 +13,17 @@ using namespace tt::tt_fabric; -extern volatile local_pull_request_t* local_pull_request; extern volatile fabric_client_interface_t* client_interface; -#define ASYNC_WR_ALL 1 -#define ASYNC_WR_ADD_PR 2 -#define ASYNC_WR_SEND 3 +#define ASYNC_WR_ADD_PR 1 +#define ASYNC_WR_SEND 2 #define ASYNC_WR_ADD_HEADER 4 +#define ASYNC_WR_ALL ASYNC_WR_ADD_HEADER | ASYNC_WR_ADD_PR | ASYNC_WR_SEND + +enum RoutingType : uint8_t { + ROUTING_TABLE, + ROUTER_XY, +}; inline uint32_t get_next_hop_router_noc_xy(uint32_t routing_plane, uint32_t dst_mesh_id, uint32_t dst_dev_id) { ASSERT(routing_plane < client_interface->num_routing_planes); @@ -47,12 +51,37 @@ inline void fabric_setup_pull_request(uint32_t src_addr, uint32_t size) { client_interface->local_pull_request.pull_request.flags = FORWARD; } -inline void fabric_send_pull_request(uint32_t routing_plane, uint16_t dst_mesh_id, uint16_t dst_dev_id) { - uint64_t router_addr = ((uint64_t)get_next_hop_router_noc_xy(routing_plane, dst_mesh_id, dst_dev_id) << 32) | - FABRIC_ROUTER_REQ_QUEUE_START; +template +inline void fabric_send_pull_request(uint32_t routing, uint16_t dst_mesh_id, uint16_t dst_dev_id) { + uint64_t router_addr; + if constexpr (routing_type == RoutingType::ROUTING_TABLE) { + router_addr = ((uint64_t)get_next_hop_router_noc_xy(routing, dst_mesh_id, dst_dev_id) << 32) | + FABRIC_ROUTER_REQ_QUEUE_START; + } else { + router_addr = get_noc_addr_helper(routing, FABRIC_ROUTER_REQ_QUEUE_START); + } tt_fabric_send_pull_request(router_addr, (volatile local_pull_request_t*)&client_interface->local_pull_request); } +FORCE_INLINE void fabric_wait_for_pull_request_words_flushed(uint32_t words) { + while (client_interface->local_pull_request.pull_request.words_read < words) { +#pragma GCC unroll 4 + for (int i = 0; i < 4; i++) { + asm("nop"); + } + } +} + +inline void fabric_wait_for_pull_request_bytes_flushed(uint32_t size) { + uint32_t size_in_words = (size + PACKET_WORD_SIZE_BYTES - 1) >> 4; + fabric_wait_for_pull_request_words_flushed(size_in_words); +} + +inline void fabric_wait_for_pull_request_flushed() { + uint32_t words_written = client_interface->local_pull_request.pull_request.words_written; + 
fabric_wait_for_pull_request_words_flushed(words_written); +} + inline void fabric_async_write_add_header( uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, @@ -70,27 +99,28 @@ inline void fabric_async_write_add_header( packet_header->session.target_offset_h = dst_addr >> 32; tt_fabric_add_header_checksum(packet_header); } + // Write packetized data over fabric to dst_mesh, dst_dev. // Packet is at src_addr in sender L1. -template +template inline void fabric_async_write( - uint32_t routing_plane, // the network plane to use for this transaction - uint32_t src_addr, // source address in sender’s memory + uint32_t routing, // the network plane to use for this transaction + uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, uint16_t dst_dev_id, uint64_t dst_addr, uint32_t size // number of bytes to write to remote destination ) { - if constexpr (mode == ASYNC_WR_ALL or mode == ASYNC_WR_ADD_HEADER) { + if constexpr (mode & ASYNC_WR_ADD_HEADER) { fabric_async_write_add_header(src_addr, dst_mesh_id, dst_dev_id, dst_addr, size); } - if constexpr (mode == ASYNC_WR_ALL or mode == ASYNC_WR_ADD_PR) { + if constexpr (mode & ASYNC_WR_ADD_PR) { fabric_setup_pull_request(src_addr, size); } - if constexpr (mode == ASYNC_WR_ALL or mode == ASYNC_WR_SEND) { - fabric_send_pull_request(routing_plane, dst_mesh_id, dst_dev_id); + if constexpr (mode & ASYNC_WR_SEND) { + fabric_send_pull_request(routing, dst_mesh_id, dst_dev_id); } } @@ -100,10 +130,10 @@ inline void fabric_async_write_multicast_add_header( uint16_t dst_dev_id, uint64_t dst_addr, uint32_t size, // number of bytes to write to remote destination - uint32_t e_depth, - uint32_t w_depth, - uint32_t n_depth, - uint32_t s_depth) { + uint16_t e_depth, + uint16_t w_depth, + uint16_t n_depth, + uint16_t s_depth) { packet_header_t* packet_header = (packet_header_t*)(src_addr); packet_header->routing.flags = FORWARD | MCAST_DATA; packet_header->routing.packet_size_bytes = size; @@ -120,7 +150,7 @@ inline void fabric_async_write_multicast_add_header( } // Write packetized data over fabric to dst_mesh, dst_dev. // Packet is at src_addr in sender L1. 
-template +template inline void fabric_async_write_multicast( uint32_t routing_plane, // the network plane to use for this transaction uint32_t src_addr, // source address in sender’s memory @@ -128,21 +158,113 @@ inline void fabric_async_write_multicast( uint16_t dst_dev_id, uint64_t dst_addr, uint32_t size, // number of bytes to write to remote destination - uint32_t e_depth, - uint32_t w_depth, - uint32_t n_depth, - uint32_t s_depth) { - if constexpr (mode == ASYNC_WR_ALL or mode == ASYNC_WR_ADD_HEADER) { + uint16_t e_depth, + uint16_t w_depth, + uint16_t n_depth, + uint16_t s_depth) { + if constexpr (mode & ASYNC_WR_ADD_HEADER) { fabric_async_write_multicast_add_header( src_addr, dst_mesh_id, dst_dev_id, dst_addr, size, e_depth, w_depth, n_depth, s_depth); } - if constexpr (mode == ASYNC_WR_ALL or mode == ASYNC_WR_ADD_PR) { + if constexpr (mode & ASYNC_WR_ADD_PR) { + fabric_setup_pull_request(src_addr, size); + } + + if constexpr (mode & ASYNC_WR_SEND) { + fabric_send_pull_request(routing_plane, dst_mesh_id, dst_dev_id); + } +} + +inline void fabric_atomic_inc_add_header( + uint32_t src_addr, // source address in sender’s memory + uint16_t dst_mesh_id, + uint16_t dst_dev_id, + uint64_t dst_addr, + uint32_t atomic_inc, + uint32_t wrap_boundary) { + packet_header_t* packet_header = (packet_header_t*)(src_addr); + packet_header->routing.flags = INLINE_FORWARD; + packet_header->routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + packet_header->routing.dst_mesh_id = dst_mesh_id; + packet_header->routing.dst_dev_id = dst_dev_id; + packet_header->session.command = ATOMIC_INC; + packet_header->session.target_offset_l = (uint32_t)dst_addr; + packet_header->session.target_offset_h = dst_addr >> 32; + packet_header->packet_parameters.atomic_parameters.wrap_boundary = wrap_boundary; + packet_header->packet_parameters.atomic_parameters.increment = atomic_inc; + tt_fabric_add_header_checksum(packet_header); +} + +// Write packetized data over fabric to dst_mesh, dst_dev. +// Packet is at src_addr in sender L1. 
+template +inline void fabric_atomic_inc( + uint32_t routing, // the network plane to use for this transaction + uint32_t src_addr, // source address in sender’s memory + uint16_t dst_mesh_id, + uint16_t dst_dev_id, + uint64_t dst_addr, + uint32_t atomic_inc, + uint32_t wrap_boundary) { + if constexpr (mode & ASYNC_WR_ADD_HEADER) { + fabric_atomic_inc_add_header(src_addr, dst_mesh_id, dst_dev_id, dst_addr, atomic_inc, wrap_boundary); + } + + if constexpr (mode & ASYNC_WR_ADD_PR) { + fabric_setup_pull_request(src_addr, PACKET_HEADER_SIZE_BYTES); + } + + if constexpr (mode & ASYNC_WR_SEND) { + fabric_send_pull_request(routing, dst_mesh_id, dst_dev_id); + } +} + +inline void fabric_async_write_atomic_inc_add_header( + uint32_t src_addr, // source address in sender’s memory + uint16_t dst_mesh_id, + uint16_t dst_dev_id, + uint64_t dst_write_addr, + uint64_t dst_atomic_addr, + uint32_t size, // number of bytes to write to remote destination + uint32_t atomic_inc) { + packet_header_t* packet_header = (packet_header_t*)(src_addr); + packet_header->routing.flags = FORWARD; + packet_header->routing.packet_size_bytes = size; + packet_header->routing.dst_mesh_id = dst_mesh_id; + packet_header->routing.dst_dev_id = dst_dev_id; + packet_header->session.command = ASYNC_WR | ATOMIC_INC; + packet_header->session.target_offset_l = (uint32_t)dst_write_addr; + packet_header->session.target_offset_h = dst_atomic_addr >> 32; + packet_header->packet_parameters.async_wr_atomic_parameters.noc_xy = dst_atomic_addr >> 32; + packet_header->packet_parameters.async_wr_atomic_parameters.l1_offset = (uint32_t)dst_atomic_addr; + packet_header->packet_parameters.async_wr_atomic_parameters.increment = atomic_inc; + tt_fabric_add_header_checksum(packet_header); +} + +// Write packetized data over fabric to dst_mesh, dst_dev. +// Packet is at src_addr in sender L1. 
+template +inline void fabric_async_write_atomic_inc( + uint32_t routing, // the network plane to use for this transaction + uint32_t src_addr, // source address in sender’s memory + uint16_t dst_mesh_id, + uint16_t dst_dev_id, + uint64_t dst_write_addr, + uint64_t dst_atomic_addr, + uint32_t size, // number of bytes to write to remote destination + uint32_t atomic_inc) { + if constexpr (mode & ASYNC_WR_ADD_HEADER) { + fabric_async_write_atomic_inc_add_header( + src_addr, dst_mesh_id, dst_dev_id, dst_write_addr, dst_atomic_addr, size, atomic_inc); + } + + if constexpr (mode & ASYNC_WR_ADD_PR) { fabric_setup_pull_request(src_addr, size); } - if constexpr (mode == ASYNC_WR_ALL or mode == ASYNC_WR_SEND) { - fabric_send_pull_request(routing_plane, dst_mesh_id, dst_dev_id); + if constexpr (mode & ASYNC_WR_SEND) { + fabric_send_pull_request(routing, dst_mesh_id, dst_dev_id); } } @@ -245,9 +367,9 @@ inline void fabric_socket_connect(socket_handle_t* socket_handle) { while (((volatile socket_handle_t*)socket_handle)->socket_state != SocketState::ACTIVE); } +template inline void fabric_endpoint_init(uint32_t base_address, uint32_t outbound_eth_chan) { tt_fabric_init(); - client_interface = (volatile fabric_client_interface_t*)base_address; uint32_t routing_tables_offset = base_address + sizeof(fabric_client_interface_t); @@ -255,9 +377,11 @@ inline void fabric_endpoint_init(uint32_t base_address, uint32_t outbound_eth_ch client_interface->routing_tables_l1_offset = routing_tables_offset; client_interface->num_routing_planes = 1; - // read routing table - uint64_t dest_addr = get_noc_addr_helper( - eth_chan_to_noc_xy[noc_index][outbound_eth_chan], eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); - noc_async_read_one_packet(dest_addr, routing_tables_offset, sizeof(fabric_router_l1_config_t)); - noc_async_read_barrier(); + if constexpr (routing_type == RoutingType::ROUTING_TABLE) { + // read routing table + uint64_t dest_addr = get_noc_addr_helper( + eth_chan_to_noc_xy[noc_index][outbound_eth_chan], eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); + noc_async_read_one_packet(dest_addr, routing_tables_offset, sizeof(fabric_router_l1_config_t)); + noc_async_read_barrier(); + } } From 911e5c8e6710e851b7f96b346b7773bdcce0f2d4 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Tue, 18 Feb 2025 16:43:17 -0500 Subject: [PATCH 150/316] #17477: Adopt ND coordinate system in system mesh, coordinate translation (#17926) ### Ticket #17477 ### Problem description TT-distributed needs to adopt ND coordinate system for mesh primitives. ### What's changed Plumbed `SimpleMeshShape` in `SystemMesh`, logical to physical coordinate translation mapping. 
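
For context, here is a minimal, self-contained sketch (illustrative only, not the tt-metal implementation; the `shape`/`coord` vectors and helper names are stand-ins) of the behaviour the ND plumbing relies on: flattening an N-dimensional mesh coordinate into a row-major linear index and hashing a coordinate so it can key an `unordered_map`/`unordered_set`, in the spirit of the new `std::hash<MeshCoordinate>` specialization.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// Row-major linearization of an ND coordinate within an ND shape.
// Illustrative sketch of what a SimpleMeshShape/MeshCoordinate pair provides.
std::size_t to_linear_index(const std::vector<uint32_t>& shape, const std::vector<uint32_t>& coord) {
    assert(shape.size() == coord.size());
    std::size_t index = 0;
    for (std::size_t d = 0; d < shape.size(); ++d) {
        assert(coord[d] < shape[d]);          // coordinate must be in bounds for every dimension
        index = index * shape[d] + coord[d];  // accumulate strides dimension by dimension
    }
    return index;
}

// Simple hash combiner over the coordinate values, analogous in spirit to the
// std::hash<MeshCoordinate> specialization added in this change.
std::size_t hash_coord(const std::vector<uint32_t>& coord) {
    std::size_t seed = 0;
    for (uint32_t v : coord) {
        seed ^= std::hash<uint32_t>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    }
    return seed;
}

int main() {
    // A 2x4 mesh (T3000-like): coordinate (1, 2) maps to linear index 1*4 + 2 = 6.
    assert(to_linear_index({2, 4}, {1, 2}) == 6);
    // Equal coordinates hash equally, so they can key an unordered container.
    assert(hash_coord({1, 2}) == hash_coord({1, 2}));
    return 0;
}
```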
### Checklist - [X] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13395057290) - [X] New/Existing tests provide coverage for changes --- .../tt_metal/distributed/test_mesh_coord.cpp | 21 ++++- tests/ttnn/distributed/test_distributed.cpp | 16 ++-- .../distributed/test_distributed_atexit.cpp | 7 +- tt_metal/api/tt-metalium/mesh_coord.hpp | 39 ++++++---- tt_metal/api/tt-metalium/mesh_device.hpp | 4 +- tt_metal/api/tt-metalium/system_mesh.hpp | 9 +-- .../distributed/coordinate_translation.cpp | 58 ++++++-------- .../distributed/coordinate_translation.hpp | 9 ++- tt_metal/distributed/mesh_device.cpp | 31 ++++---- tt_metal/distributed/mesh_workload.cpp | 10 +-- tt_metal/distributed/system_mesh.cpp | 77 ++++++++++--------- ttnn/cpp/ttnn/distributed/api.cpp | 2 +- 12 files changed, 153 insertions(+), 130 deletions(-) diff --git a/tests/tt_metal/distributed/test_mesh_coord.cpp b/tests/tt_metal/distributed/test_mesh_coord.cpp index 09853a488a0..9c364c735b4 100644 --- a/tests/tt_metal/distributed/test_mesh_coord.cpp +++ b/tests/tt_metal/distributed/test_mesh_coord.cpp @@ -4,6 +4,7 @@ #include #include +#include #include "mesh_coord.hpp" @@ -11,7 +12,7 @@ namespace tt::tt_metal::distributed { namespace { using ::testing::ElementsAre; - +using ::testing::UnorderedElementsAre; TEST(SimpleMeshShapeTest, Construction) { SimpleMeshShape shape_1d(3); EXPECT_EQ(shape_1d.dims(), 1); @@ -100,6 +101,21 @@ TEST(MeshCoordinateTest, Comparison) { EXPECT_NE(coord1, MeshCoordinate(1, 2, 1)); } +TEST(MeshCoordinateTest, UnorderedSet) { + std::unordered_set set; + set.insert(MeshCoordinate(0, 0, 0)); + set.insert(MeshCoordinate(0, 0, 1)); + set.insert(MeshCoordinate(0, 0, 2)); + + EXPECT_FALSE(set.insert(MeshCoordinate(0, 0, 2)).second); + EXPECT_THAT( + set, + UnorderedElementsAre( + MeshCoordinate(0, 0, 0), // + MeshCoordinate(0, 0, 1), + MeshCoordinate(0, 0, 2))); +} + TEST(MeshCoordinateRangeTest, FromShape) { SimpleMeshShape shape(2, 3); MeshCoordinateRange range(shape); @@ -232,6 +248,7 @@ TEST(MeshContainerTest, ElementAccessRowMajor) { MeshCoordinate(1, 1), MeshCoordinate(1, 2))); EXPECT_THAT(values, ElementsAre(0, 1, 2, 3, 4, 5)); + EXPECT_THAT(container.values(), ElementsAre(0, 1, 2, 3, 4, 5)); } TEST(MeshContainerTest, ConstContainer) { @@ -254,6 +271,7 @@ TEST(MeshContainerTest, ConstContainer) { MeshCoordinate(1, 1), MeshCoordinate(1, 2))); EXPECT_THAT(values, ElementsAre(0, 0, 0, 0, 0, 0)); + EXPECT_THAT(container.values(), ElementsAre(0, 0, 0, 0, 0, 0)); } TEST(MeshContainerTest, MutateThroughProxy) { @@ -276,6 +294,7 @@ TEST(MeshContainerTest, MutateThroughProxy) { values.push_back(value); } EXPECT_THAT(values, ElementsAre(0, 1, 2, 3, 4, 5)); + EXPECT_THAT(container.values(), ElementsAre(0, 1, 2, 3, 4, 5)); } TEST(MeshContainerTest, OutOfBounds) { diff --git a/tests/ttnn/distributed/test_distributed.cpp b/tests/ttnn/distributed/test_distributed.cpp index cb4d22448c5..f6e4cf7d5da 100644 --- a/tests/ttnn/distributed/test_distributed.cpp +++ b/tests/ttnn/distributed/test_distributed.cpp @@ -4,7 +4,6 @@ #include -#include #include #include @@ -19,11 +18,16 @@ class DistributedTest : public ::testing::Test { TEST_F(DistributedTest, TestSystemMeshTearDownWithoutClose) { auto& sys = SystemMesh::instance(); auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - - auto [rows, cols] = sys.get_shape(); - EXPECT_GT(rows, 0); - EXPECT_GT(cols, 0); + /*mesh_shape=*/{2, 4}, + 
DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); + + const auto system_shape = sys.get_shape(); + ASSERT_EQ(system_shape.dims(), 2); + EXPECT_EQ(system_shape[0], 2); + EXPECT_EQ(system_shape[1], 4); } TEST_F(DistributedTest, TestMemoryAllocationStatistics) { diff --git a/tests/ttnn/distributed/test_distributed_atexit.cpp b/tests/ttnn/distributed/test_distributed_atexit.cpp index 283076076b2..6d4461f7386 100644 --- a/tests/ttnn/distributed/test_distributed_atexit.cpp +++ b/tests/ttnn/distributed/test_distributed_atexit.cpp @@ -18,9 +18,10 @@ TEST(DistributedTestStandalone, TestSystemMeshTearDownWithoutClose) { mesh = ttnn::distributed::open_mesh_device( {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - auto [rows, cols] = sys.get_shape(); - EXPECT_GT(rows, 0); - EXPECT_GT(cols, 0); + const auto system_shape = sys.get_shape(); + ASSERT_EQ(system_shape.dims(), 2); + EXPECT_EQ(system_shape[0], 2); + EXPECT_EQ(system_shape[1], 4); } } // namespace ttnn::distributed::test diff --git a/tt_metal/api/tt-metalium/mesh_coord.hpp b/tt_metal/api/tt-metalium/mesh_coord.hpp index e346ce2ca83..5160bdb745f 100644 --- a/tt_metal/api/tt-metalium/mesh_coord.hpp +++ b/tt_metal/api/tt-metalium/mesh_coord.hpp @@ -9,6 +9,7 @@ #include #include "shape_base.hpp" +#include "utils.hpp" namespace tt::tt_metal::distributed { @@ -21,7 +22,7 @@ class SimpleMeshShape : public ShapeBase { using ShapeBase::operator[]; // Shorthands for constructing 1D, 2D and 3D shapes. - SimpleMeshShape(uint32_t x); + explicit SimpleMeshShape(uint32_t x); SimpleMeshShape(uint32_t x, uint32_t y); SimpleMeshShape(uint32_t x, uint32_t y, uint32_t z); @@ -56,7 +57,7 @@ class SimpleMeshShape : public ShapeBase { class MeshCoordinate { public: // Shorthands for constructing 1D, 2D and 3D coordinates. - MeshCoordinate(uint32_t x); + explicit MeshCoordinate(uint32_t x); MeshCoordinate(uint32_t x, uint32_t y); MeshCoordinate(uint32_t x, uint32_t y, uint32_t z); @@ -199,7 +200,10 @@ class MeshContainer { using ValueProxy = detail::MeshCoordinateValueProxy; Iterator& operator++(); - ValueProxy& operator*(); + ValueProxy& operator*() { return value_proxy_; } + const ValueProxy& operator*() const { return value_proxy_; } + ValueProxy* operator->() { return &value_proxy_; } + const ValueProxy* operator->() const { return &value_proxy_; } bool operator==(const Iterator& other) const; bool operator!=(const Iterator& other) const; @@ -220,7 +224,8 @@ class MeshContainer { using ValueProxy = detail::MeshCoordinateValueProxy; ConstIterator& operator++(); - const ValueProxy& operator*() const; + const ValueProxy& operator*() const { return value_proxy_; } + const ValueProxy* operator->() const { return &value_proxy_; } bool operator==(const ConstIterator& other) const; bool operator!=(const ConstIterator& other) const; @@ -237,11 +242,16 @@ class MeshContainer { ValueProxy value_proxy_; }; + // Iterators provide a reference to the value along with the coordinate. Iterator begin(); Iterator end(); ConstIterator begin() const; ConstIterator end() const; + // View of the flat container of values. 
+ std::vector& values() { return values_; } + const std::vector& values() const { return values_; } + private: SimpleMeshShape shape_; MeshCoordinateRange coord_range_; @@ -283,11 +293,6 @@ typename MeshContainer::Iterator& MeshContainer::Iterator::operator++() { return *this; } -template -typename MeshContainer::Iterator::ValueProxy& MeshContainer::Iterator::operator*() { - return value_proxy_; -} - template MeshContainer::ConstIterator::ConstIterator( const MeshContainer* container, const MeshCoordinateRange::Iterator& coord_iter, size_t linear_index) : @@ -304,11 +309,6 @@ typename MeshContainer::ConstIterator& MeshContainer::ConstIterator::opera return *this; } -template -const typename MeshContainer::ConstIterator::ValueProxy& MeshContainer::ConstIterator::operator*() const { - return value_proxy_; -} - template bool MeshContainer::Iterator::operator==(const Iterator& other) const { return container_ == other.container_ && coord_iter_ == other.coord_iter_ && linear_index_ == other.linear_index_; @@ -367,4 +367,15 @@ struct tuple_element<1, tt::tt_metal::distributed::detail::MeshCoordinateValuePr using type = T; }; +template <> +struct hash { + size_t operator()(const tt::tt_metal::distributed::MeshCoordinate& coord) const noexcept { + size_t seed = 0; + for (const auto coord_value : coord.coords()) { + tt::utils::hash_combine(seed, coord_value); + } + return seed; + } +}; + } // namespace std diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index 979e603a6cd..1ff63629b16 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -33,7 +33,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this opened_devices_; - std::vector devices_; + MeshContainer devices_; public: // Constructor acquires physical resources @@ -50,6 +50,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this& get_devices() const; + IDevice* get_device(const MeshCoordinate& coord) const; }; std::shared_ptr scoped_devices_; @@ -202,7 +203,6 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this get_devices() const; - IDevice* get_device_index(size_t logical_device_id) const; IDevice* get_device(chip_id_t physical_device_id) const; IDevice* get_device(size_t row_idx, size_t col_idx) const; IDevice* get_device(const MeshCoordinate& coord) const; diff --git a/tt_metal/api/tt-metalium/system_mesh.hpp b/tt_metal/api/tt-metalium/system_mesh.hpp index 64c040edf82..1ee91588dcc 100644 --- a/tt_metal/api/tt-metalium/system_mesh.hpp +++ b/tt_metal/api/tt-metalium/system_mesh.hpp @@ -8,8 +8,7 @@ #include #include "mesh_config.hpp" -#include "mesh_device.hpp" -#include "device.hpp" +#include "mesh_coord.hpp" namespace tt::tt_metal::distributed { @@ -21,7 +20,6 @@ class SystemMesh { class Impl; // Forward declaration only std::unique_ptr pimpl_; SystemMesh(); - ~SystemMesh(); public: static SystemMesh& instance(); @@ -30,11 +28,10 @@ class SystemMesh { SystemMesh(SystemMesh&&) = delete; SystemMesh& operator=(SystemMesh&&) = delete; - const MeshShape& get_shape() const; - size_t get_num_devices() const; + const SimpleMeshShape& get_shape() const; // Gets the physical device ID for a given logical row and column index - chip_id_t get_physical_device_id(size_t logical_row_idx, size_t logical_col_idx) const; + chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; // Get the physical device IDs mapped to a MeshDevice std::vector get_mapped_physical_device_ids(const 
MeshDeviceConfig& config) const; diff --git a/tt_metal/distributed/coordinate_translation.cpp b/tt_metal/distributed/coordinate_translation.cpp index e834ae37e2d..2070a138ed0 100644 --- a/tt_metal/distributed/coordinate_translation.cpp +++ b/tt_metal/distributed/coordinate_translation.cpp @@ -36,7 +36,7 @@ CoordinateTranslationMap load_translation_map(const std::string& filename, const TT_THROW("Invalid coordinate format in JSON file: {}", filename); } result.emplace( - Coordinate{mapping[0][0], mapping[0][1]}, + MeshCoordinate(mapping[0][0], mapping[0][1]), PhysicalCoordinate{ mapping[1][0], // cluster_id mapping[1][2], // x @@ -49,49 +49,39 @@ CoordinateTranslationMap load_translation_map(const std::string& filename, const return result; } -MeshShape get_system_mesh_shape(size_t system_num_devices) { - static const std::unordered_map system_mesh_to_shape = { - {1, MeshShape{1, 1}}, // single-device - {2, MeshShape{1, 2}}, // N300 - {8, MeshShape{2, 4}}, // T3000; as ring to match existing tests - {32, MeshShape{8, 4}}, // TG, QG - {64, MeshShape{8, 8}}, // TGG - }; - TT_FATAL( - system_mesh_to_shape.contains(system_num_devices), "Unsupported number of devices: {}", system_num_devices); - auto shape = system_mesh_to_shape.at(system_num_devices); - log_debug(LogMetal, "Logical SystemMesh Shape: {}x{}", shape.num_rows, shape.num_cols); - return shape; -} - } // namespace -std::pair get_system_mesh_coordinate_translation_map() { - static const auto* cached_translation_map = new std::pair([] { - auto system_num_devices = tt::Cluster::instance().number_of_user_devices(); +const std::pair& get_system_mesh_coordinate_translation_map() { + static const auto* cached_translation_map = new std::pair([] { + const auto system_num_devices = tt::Cluster::instance().number_of_user_devices(); - std::string galaxy_mesh_descriptor = "TG.json"; - if (tt::Cluster::instance().number_of_pci_devices() == system_num_devices) { - galaxy_mesh_descriptor = "QG.json"; - } + const bool is_qg = tt::Cluster::instance().number_of_pci_devices() == system_num_devices; - const std::unordered_map system_mesh_translation_map = { - {1, "device.json"}, - {2, "N300.json"}, - {8, "T3000.json"}, - {32, galaxy_mesh_descriptor}, - {64, "TGG.json"}, + // TODO: #17477 - This assumes shapes and coordinates are in 2D. This will be extended for 3D. + // Consider if 1D can be used for single device and N300. + const std::unordered_map> system_mesh_translation_map = { + {1, std::make_pair("device.json", SimpleMeshShape(1, 1))}, + {2, std::make_pair("N300.json", SimpleMeshShape(1, 2))}, + {8, std::make_pair("T3000.json", SimpleMeshShape(2, 4))}, + {32, std::make_pair(is_qg ? 
"QG.json" : "TG.json", SimpleMeshShape(8, 4))}, + {64, std::make_pair("TGG.json", SimpleMeshShape(8, 8))}, }; - TT_FATAL( system_mesh_translation_map.contains(system_num_devices), "Unsupported number of devices: {}", system_num_devices); - auto translation_config_file = get_config_path(system_mesh_translation_map.at(system_num_devices)); - return std::pair{ - load_translation_map(translation_config_file, "logical_to_physical_coordinates"), - get_system_mesh_shape(system_num_devices)}; + const auto [translation_config_file, shape] = system_mesh_translation_map.at(system_num_devices); + TT_FATAL( + system_num_devices == shape.mesh_size(), + "Mismatch between number of devices and the mesh shape: {} != {}", + system_num_devices, + shape.mesh_size()); + log_debug(LogMetal, "Logical SystemMesh Shape: {}", shape); + + return std::pair{ + load_translation_map(get_config_path(translation_config_file), /*key=*/"logical_to_physical_coordinates"), + shape}; }()); return *cached_translation_map; diff --git a/tt_metal/distributed/coordinate_translation.hpp b/tt_metal/distributed/coordinate_translation.hpp index b4fc5c21b85..5aa0f7242f0 100644 --- a/tt_metal/distributed/coordinate_translation.hpp +++ b/tt_metal/distributed/coordinate_translation.hpp @@ -7,17 +7,18 @@ #include #include "umd/device/types/cluster_descriptor_types.h" +#include #include namespace tt::tt_metal::distributed { // TODO: Consider conversion to StrongType instead of alias -using LogicalCoordinate = Coordinate; using PhysicalCoordinate = eth_coord_t; -using CoordinateTranslationMap = std::unordered_map; +using CoordinateTranslationMap = std::unordered_map; -// Returns a translation map between logical coordinates in logical 2D space +// Returns a translation map between logical coordinates in logical ND space // to the physical coordinates as defined by the UMD layer. -std::pair get_system_mesh_coordinate_translation_map(); +// TODO: #17477 - Return MeshContainer that contains everything we need. 
+const std::pair& get_system_mesh_coordinate_translation_map(); } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 603ce95212e..63cf7a6621a 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -68,27 +68,36 @@ MeshDevice::ScopedDevices::ScopedDevices( size_t trace_region_size, size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, - const MeshDeviceConfig& config) { + const MeshDeviceConfig& config) : + devices_(SimpleMeshShape(config.mesh_shape), /*fill_value=*/nullptr) { auto& system_mesh = SystemMesh::instance(); auto physical_device_ids = system_mesh.request_available_devices(config); opened_devices_ = tt::tt_metal::detail::CreateDevices( physical_device_ids, num_command_queues, l1_small_size, trace_region_size, dispatch_core_config); + TT_FATAL( + physical_device_ids.size() == devices_.shape().mesh_size(), + "Device size mismatch; expected: {}, actual: {}", + devices_.shape().mesh_size(), + opened_devices_.size()); + + auto it = devices_.begin(); for (auto physical_device_id : physical_device_ids) { - devices_.push_back(opened_devices_.at(physical_device_id)); + it->value() = opened_devices_.at(physical_device_id); + ++it; } } MeshDevice::ScopedDevices::~ScopedDevices() { - if (not opened_devices_.empty()) { + if (!opened_devices_.empty()) { tt::tt_metal::detail::CloseDevices(opened_devices_); - opened_devices_.clear(); - devices_.clear(); } } -const std::vector& MeshDevice::ScopedDevices::get_devices() const { return devices_; } +const std::vector& MeshDevice::ScopedDevices::get_devices() const { return devices_.values(); } + +IDevice* MeshDevice::ScopedDevices::get_device(const MeshCoordinate& coord) const { return devices_.at(coord); } uint8_t MeshDevice::num_hw_cqs() const { return validate_and_get_reference_value( @@ -192,12 +201,6 @@ std::vector> MeshDevice::create_submeshes(const Mesh MeshDevice::~MeshDevice() {} -IDevice* MeshDevice::get_device_index(size_t device_index) const { - TT_FATAL(device_index >= 0 and device_index < num_devices(), "Invalid device index"); - const auto& devices = scoped_devices_->get_devices(); - return devices.at(device_index); -} - IDevice* MeshDevice::get_device(chip_id_t physical_device_id) const { for (auto device : this->get_devices()) { if (device->id() == physical_device_id) { @@ -214,9 +217,7 @@ IDevice* MeshDevice::get_device(size_t row_idx, size_t col_idx) const { return get_device(MeshCoordinate{row_idx, col_idx}); } -IDevice* MeshDevice::get_device(const MeshCoordinate& coord) const { - return this->get_device_index(to_linear_index(SimpleMeshShape(mesh_shape_), coord)); -} +IDevice* MeshDevice::get_device(const MeshCoordinate& coord) const { return scoped_devices_->get_device(coord); } MeshCommandQueue& MeshDevice::mesh_command_queue(std::size_t cq_id) const { TT_FATAL(this->using_fast_dispatch(), "Can only access the MeshCommandQueue when using Fast Dispatch."); diff --git a/tt_metal/distributed/mesh_workload.cpp b/tt_metal/distributed/mesh_workload.cpp index 21fd77cc409..a9efcb406c7 100644 --- a/tt_metal/distributed/mesh_workload.cpp +++ b/tt_metal/distributed/mesh_workload.cpp @@ -257,12 +257,11 @@ uint32_t MeshWorkload::get_sem_size( std::shared_ptr& mesh_device, CoreCoord logical_core, CoreType core_type) { uint32_t sem_size = 0; uint32_t program_idx = 0; - IDevice* device = mesh_device->get_device_index(0); for (auto& [device_range, program] : programs_) { if (program_idx) { - 
TT_ASSERT(sem_size == program.get_sem_size(device, logical_core, core_type)); + TT_ASSERT(sem_size == program.get_sem_size(mesh_device.get(), logical_core, core_type)); } else { - sem_size = program.get_sem_size(device, logical_core, core_type); + sem_size = program.get_sem_size(mesh_device.get(), logical_core, core_type); } program_idx++; } @@ -281,12 +280,11 @@ uint32_t MeshWorkload::get_cb_size( std::shared_ptr& mesh_device, CoreCoord logical_core, CoreType core_type) { uint32_t cb_size = 0; uint32_t program_idx = 0; - IDevice* device = mesh_device->get_device_index(0); for (auto& [device_range, program] : programs_) { if (program_idx) { - TT_ASSERT(cb_size == program.get_cb_size(device, logical_core, core_type)); + TT_ASSERT(cb_size == program.get_cb_size(mesh_device.get(), logical_core, core_type)); } else { - cb_size = program.get_cb_size(device, logical_core, core_type); + cb_size = program.get_cb_size(mesh_device.get(), logical_core, core_type); } program_idx++; } diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index c90fed6f897..20d912a3b1a 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -7,31 +7,30 @@ #include "umd/device/types/cluster_descriptor_types.h" #include "tt_metal/distributed/coordinate_translation.hpp" +#include "mesh_coord.hpp" #include "tt_cluster.hpp" namespace tt::tt_metal::distributed { class SystemMesh::Impl { private: - MeshShape logical_mesh_shape_; + SimpleMeshShape logical_mesh_shape_; CoordinateTranslationMap logical_to_physical_coordinates_; - std::unordered_map logical_to_device_id_; + std::unordered_map logical_to_device_id_; std::unordered_map physical_coordinate_to_device_id_; std::unordered_map physical_device_id_to_coordinate_; public: Impl() = default; - ~Impl() = default; bool is_system_mesh_initialized() const; void initialize(); - const MeshShape& get_shape() const; - size_t get_num_devices() const; + const SimpleMeshShape& get_shape() const; std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; - IDevice* get_device(const chip_id_t physical_device_id) const; - chip_id_t get_physical_device_id(size_t logical_row_idx, size_t logical_col_idx) const; + IDevice* get_device(const chip_id_t physical_device_id) const; + chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; }; // Implementation of public methods @@ -69,30 +68,34 @@ void SystemMesh::Impl::initialize() { } } -const MeshShape& SystemMesh::Impl::get_shape() const { return logical_mesh_shape_; } -size_t SystemMesh::Impl::get_num_devices() const { - auto [num_rows, num_cols] = this->get_shape(); - return num_rows * num_cols; -} +const SimpleMeshShape& SystemMesh::Impl::get_shape() const { return logical_mesh_shape_; } -chip_id_t SystemMesh::Impl::get_physical_device_id(size_t logical_row_idx, size_t logical_col_idx) const { +chip_id_t SystemMesh::Impl::get_physical_device_id(const MeshCoordinate& coord) const { TT_FATAL( - logical_row_idx < logical_mesh_shape_.num_rows, - "Row index out of bounds: {} >= {}", - logical_row_idx, - logical_mesh_shape_.num_rows); - TT_FATAL( - logical_col_idx < logical_mesh_shape_.num_cols, - "Column index out of bounds: {} >= {}", - logical_col_idx, - logical_mesh_shape_.num_cols); - auto logical_coordinate = Coordinate{logical_row_idx, logical_col_idx}; - return logical_to_device_id_.at(logical_coordinate); + coord.dims() == logical_mesh_shape_.dims(), + 
"Coordinate dimensions mismatch: {} != {}", + coord.dims(), + logical_mesh_shape_.dims()); + for (size_t i = 0; i < coord.dims(); ++i) { + TT_FATAL( + coord[i] < logical_mesh_shape_[i], + "Coordinate at index {} out of bounds; mesh shape {}, coordinate {}", + i, + logical_mesh_shape_, + coord); + } + return logical_to_device_id_.at(coord); } std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const MeshDeviceConfig& config) const { std::vector physical_device_ids; - auto [system_mesh_rows, system_mesh_cols] = this->get_shape(); + // TODO: #17477 - Extend to ND. + TT_FATAL( + logical_mesh_shape_.dims() == 2, + "SystemMesh only supports 2D meshes; requested dimensions: {}", + logical_mesh_shape_.dims()); + + auto [system_mesh_rows, system_mesh_cols] = std::make_tuple(logical_mesh_shape_[0], logical_mesh_shape_[1]); auto [requested_num_rows, requested_num_cols] = config.mesh_shape; auto [row_offset, col_offset] = config.offset; @@ -112,7 +115,8 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me auto line_coords = MeshDeviceView::get_line_coordinates( line_length, Coordinate{row_offset, col_offset}, system_mesh_rows, system_mesh_cols); for (const auto& logical_coordinate : line_coords) { - auto physical_device_id = logical_to_device_id_.at(logical_coordinate); + auto physical_device_id = + logical_to_device_id_.at(MeshCoordinate(logical_coordinate.row, logical_coordinate.col)); physical_device_ids.push_back(physical_device_id); log_debug( @@ -178,17 +182,18 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me } TT_FATAL( - logical_coordinate.row < logical_mesh_shape_.num_rows, + logical_coordinate.row < system_mesh_rows, "Row coordinate out of bounds: {} >= {}", logical_coordinate.row, - logical_mesh_shape_.num_rows); + system_mesh_rows); TT_FATAL( - logical_coordinate.col < logical_mesh_shape_.num_cols, + logical_coordinate.col < system_mesh_cols, "Column coordinate out of bounds: {} >= {}", logical_coordinate.col, - logical_mesh_shape_.num_cols); + system_mesh_cols); - auto physical_device_id = logical_to_device_id_.at(logical_coordinate); + auto physical_device_id = + logical_to_device_id_.at(MeshCoordinate(logical_coordinate.row, logical_coordinate.col)); physical_device_ids.push_back(physical_device_id); log_debug( @@ -200,7 +205,6 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me std::vector SystemMesh::Impl::request_available_devices(const MeshDeviceConfig& config) const { auto [requested_num_rows, requested_num_cols] = config.mesh_shape; - auto [max_num_rows, max_num_cols] = logical_mesh_shape_; auto [row_offset, col_offset] = config.offset; log_debug( @@ -216,7 +220,6 @@ std::vector SystemMesh::Impl::request_available_devices(const MeshDev } SystemMesh::SystemMesh() : pimpl_(std::make_unique()) {} -SystemMesh::~SystemMesh() = default; SystemMesh& SystemMesh::instance() { static SystemMesh instance; @@ -226,13 +229,11 @@ SystemMesh& SystemMesh::instance() { return instance; } -chip_id_t SystemMesh::get_physical_device_id(size_t logical_row_idx, size_t logical_col_idx) const { - return pimpl_->get_physical_device_id(logical_row_idx, logical_col_idx); +chip_id_t SystemMesh::get_physical_device_id(const MeshCoordinate& coord) const { + return pimpl_->get_physical_device_id(coord); } -const MeshShape& SystemMesh::get_shape() const { return pimpl_->get_shape(); } - -size_t SystemMesh::get_num_devices() const { return pimpl_->get_num_devices(); } +const SimpleMeshShape& SystemMesh::get_shape() const { return 
pimpl_->get_shape(); } std::vector SystemMesh::request_available_devices(const MeshDeviceConfig& config) const { return pimpl_->request_available_devices(config); diff --git a/ttnn/cpp/ttnn/distributed/api.cpp b/ttnn/cpp/ttnn/distributed/api.cpp index bd0fd35a206..9133ec419ac 100644 --- a/ttnn/cpp/ttnn/distributed/api.cpp +++ b/ttnn/cpp/ttnn/distributed/api.cpp @@ -124,7 +124,7 @@ Tensor aggregate_as_tensor( std::vector get_t3k_physical_device_ids_ring() { using namespace tt::tt_metal::distributed; auto& instance = SystemMesh::instance(); - auto num_devices = instance.get_num_devices(); + auto num_devices = instance.get_shape().mesh_size(); TT_FATAL(num_devices == 8, "T3000 ring topology only works with 8 devices"); auto physical_device_ids = From 2456417965d7bc11bbe30f94dab5151f634f786b Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Tue, 18 Feb 2025 05:41:50 +0000 Subject: [PATCH 151/316] Slight refactor of eth ubenchmarks --- ...te_worker_with_transaction_id_bandwidth.py | 91 +++------ ...write_worker_with_transaction_id_common.py | 8 +- ...rite_worker_with_transaction_id_latency.py | 109 +++------- ...t_ethernet_write_worker_latency_no_edm.cpp | 83 ++++---- .../unit_tests/erisc/eth_ubenchmark_types.hpp | 29 +++ ...net_write_worker_latency_ubench_common.hpp | 191 ++++++++++++++---- ...t_write_worker_latency_ubench_receiver.cpp | 178 ++++++---------- ...net_write_worker_latency_ubench_sender.cpp | 148 ++++++-------- 8 files changed, 412 insertions(+), 425 deletions(-) create mode 100644 tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_ubenchmark_types.hpp diff --git a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_bandwidth.py b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_bandwidth.py index eeaa1c399af..ddffe910ac1 100644 --- a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_bandwidth.py +++ b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_bandwidth.py @@ -27,7 +27,7 @@ def run_erisc_write_worker_bw( - sample_count, sample_size_expected_bw, channel_count, num_directions, enable_worker, disable_trid, file_name + benchmark_type, sample_count, sample_size_expected_bw, channel_count, disable_trid, file_name ): os.system(f"rm -rf {os.environ['TT_METAL_HOME']}/generated/profiler/.logs/profile_log_device.csv") @@ -40,12 +40,11 @@ def run_erisc_write_worker_bw( ARCH_NAME = os.getenv("ARCH_NAME") cmd = f"TT_METAL_DEVICE_PROFILER=1 \ {os.environ['TT_METAL_HOME']}/build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm_{ARCH_NAME} \ + {benchmark_type} \ {sample_count} \ {sample_size} \ {channel_count} \ - {num_directions} \ {test_latency} \ - {enable_worker} \ {disable_trid}" rc = os.system(cmd) if rc != 0: @@ -53,7 +52,7 @@ def run_erisc_write_worker_bw( assert False main_loop_latency = profile_results( - sample_size, sample_count, channel_count, num_directions, test_latency, file_name + sample_size, sample_count, channel_count, benchmark_type, test_latency, file_name ) main_loop_bw = sample_size / main_loop_latency logger.info(f"sender_loop_latency {main_loop_latency}") @@ -62,103 +61,87 @@ def run_erisc_write_worker_bw( assert expected_bw_lower_bound <= main_loop_bw <= expected_bw_upper_bound -##################################### BW test ####################################################### -# uni-direction test for eth-sender <---> eth-receiver ---> worker 
+##################################### No Worker BW test ####################################################### +# uni-direction test for eth-sender <---> eth-receiver @pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") @pytest.mark.parametrize("sample_count", [256]) @pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [1]) -@pytest.mark.parametrize("enable_worker", [1]) -@pytest.mark.parametrize("disable_trid", [0]) @pytest.mark.parametrize( "sample_size_expected_bw", - [(16, 0.21), (128, 1.72), (256, 3.44), (512, 6.89), (1024, 11.73), (2048, 11.83), (4096, 12.04), (8192, 12.07)], + [(16, 0.28), (128, 2.25), (256, 4.39), (512, 8.35), (1024, 11.74), (2048, 11.84), (4096, 12.04), (8192, 12.07)], ) -def test_erisc_write_worker_bw_uni_dir( - sample_count, sample_size_expected_bw, channel_count, num_directions, enable_worker, disable_trid -): +def test_erisc_bw_uni_dir(sample_count, sample_size_expected_bw, channel_count): + benchmark_type_id = 0 + disable_trid = 0 # don't care in this case run_erisc_write_worker_bw( + benchmark_type_id, sample_count, sample_size_expected_bw, channel_count, - num_directions, - enable_worker, disable_trid, FILE_NAME, ) -# bi-direction test for eth-sender <---> eth-receiver ---> worker +# bi-direction test for eth-sender <---> eth-receiver @pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") @pytest.mark.parametrize("sample_count", [1000]) @pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [2]) -@pytest.mark.parametrize("enable_worker", [1]) -@pytest.mark.parametrize("disable_trid", [0]) @pytest.mark.parametrize( "sample_size_expected_bw", - [(16, 0.13), (128, 1.03), (256, 2.08), (512, 4.15), (1024, 8.31), (2048, 11.40), (4096, 11.82)], + [(16, 0.19), (128, 1.59), (256, 3.19), (512, 6.39), (1024, 10.9), (2048, 11.4), (4096, 11.82)], ) -def test_erisc_write_worker_bw_bi_dir( - sample_count, sample_size_expected_bw, channel_count, num_directions, enable_worker, disable_trid -): +def test_erisc_bw_bi_dir(sample_count, sample_size_expected_bw, channel_count): + benchmark_type_id = 1 + disable_trid = 0 # don't care in this case run_erisc_write_worker_bw( + benchmark_type_id, sample_count, sample_size_expected_bw, channel_count, - num_directions, - enable_worker, disable_trid, FILE_NAME, ) -##################################### No Worker BW test ####################################################### -# uni-direction test for eth-sender <---> eth-receiver +##################################### BW test ####################################################### +# uni-direction test for eth-sender <---> eth-receiver ---> worker @pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") @pytest.mark.parametrize("sample_count", [256]) @pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [1]) -@pytest.mark.parametrize("enable_worker", [0]) @pytest.mark.parametrize("disable_trid", [0]) @pytest.mark.parametrize( "sample_size_expected_bw", - [(16, 0.28), (128, 2.25), (256, 4.39), (512, 8.35), (1024, 11.74), (2048, 11.84), (4096, 12.04), (8192, 12.07)], + [(16, 0.21), (128, 1.72), (256, 3.44), (512, 6.89), (1024, 11.73), (2048, 11.83), (4096, 12.04), (8192, 12.07)], ) -def test_erisc_bw_uni_dir( - sample_count, sample_size_expected_bw, channel_count, num_directions, enable_worker, disable_trid -): +def test_erisc_write_worker_bw_uni_dir(sample_count, sample_size_expected_bw, channel_count, disable_trid): + benchmark_type_id = 2 
run_erisc_write_worker_bw( + benchmark_type_id, sample_count, sample_size_expected_bw, channel_count, - num_directions, - enable_worker, disable_trid, FILE_NAME, ) -# bi-direction test for eth-sender <---> eth-receiver +# bi-direction test for eth-sender <---> eth-receiver ---> worker @pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") @pytest.mark.parametrize("sample_count", [1000]) @pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [2]) -@pytest.mark.parametrize("enable_worker", [0]) @pytest.mark.parametrize("disable_trid", [0]) @pytest.mark.parametrize( "sample_size_expected_bw", - [(16, 0.19), (128, 1.59), (256, 3.19), (512, 6.39), (1024, 10.9), (2048, 11.4), (4096, 11.82)], + [(16, 0.13), (128, 1.03), (256, 2.08), (512, 4.15), (1024, 8.31), (2048, 11.40), (4096, 11.82)], ) -def test_erisc_bw_bi_dir( - sample_count, sample_size_expected_bw, channel_count, num_directions, enable_worker, disable_trid -): +def test_erisc_write_worker_bw_bi_dir(sample_count, sample_size_expected_bw, channel_count, disable_trid): + benchmark_type_id = 3 run_erisc_write_worker_bw( + benchmark_type_id, sample_count, sample_size_expected_bw, channel_count, - num_directions, - enable_worker, disable_trid, FILE_NAME, ) @@ -169,22 +152,18 @@ def test_erisc_bw_bi_dir( @pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") @pytest.mark.parametrize("sample_count", [256]) @pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [1]) -@pytest.mark.parametrize("enable_worker", [1]) @pytest.mark.parametrize("disable_trid", [1]) @pytest.mark.parametrize( "sample_size_expected_bw", [(16, 0.18), (128, 1.46), (256, 2.93), (512, 5.73), (1024, 9.15), (2048, 11.83), (4096, 12.04), (8192, 12.07)], ) -def test_erisc_write_worker_bw_uni_dir_no_trid( - sample_count, sample_size_expected_bw, channel_count, num_directions, enable_worker, disable_trid -): +def test_erisc_write_worker_bw_uni_dir_no_trid(sample_count, sample_size_expected_bw, channel_count, disable_trid): + benchmark_type_id = 2 run_erisc_write_worker_bw( + benchmark_type_id, sample_count, sample_size_expected_bw, channel_count, - num_directions, - enable_worker, disable_trid, FILE_NAME, ) @@ -194,22 +173,18 @@ def test_erisc_write_worker_bw_uni_dir_no_trid( @pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") @pytest.mark.parametrize("sample_count", [1000]) @pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [2]) -@pytest.mark.parametrize("enable_worker", [1]) @pytest.mark.parametrize("disable_trid", [1]) @pytest.mark.parametrize( "sample_size_expected_bw", [(16, 0.10), (128, 0.87), (256, 1.73), (512, 3.44), (1024, 5.99), (2048, 9.70), (4096, 11.82)], ) -def test_erisc_write_worker_bw_bi_dir_no_trid( - sample_count, sample_size_expected_bw, channel_count, num_directions, enable_worker, disable_trid -): +def test_erisc_write_worker_bw_bi_dir_no_trid(sample_count, sample_size_expected_bw, channel_count, disable_trid): + benchmark_type_id = 3 run_erisc_write_worker_bw( + benchmark_type_id, sample_count, sample_size_expected_bw, channel_count, - num_directions, - enable_worker, disable_trid, FILE_NAME, ) diff --git a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_common.py b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_common.py index 30343e6ae81..cb7cb8722e6 100644 --- 
a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_common.py +++ b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_common.py @@ -35,7 +35,7 @@ def get_device_freq(): return freq -def profile_results(sample_size, sample_count, channel_count, num_directions, test_latency, file_name): +def profile_results(sample_size, sample_count, channel_count, benchmark_type, test_latency, file_name): freq = get_device_freq() / 1000.0 setup = device_post_proc_config.default_setup() setup.deviceInputLog = profiler_log_path @@ -60,7 +60,7 @@ def profile_results(sample_size, sample_count, channel_count, num_directions, te if test_latency == 1: main_loop_latency = main_loop_cycle / freq header = [ - "NUM_DIRECTIONS", + "BENCHMARK ID", "SAMPLE_SIZE", "LATENCY (ns)", ] @@ -69,7 +69,7 @@ def profile_results(sample_size, sample_count, channel_count, num_directions, te main_loop_latency = main_loop_cycle / freq / sample_count / channel_count bw = sample_size / main_loop_latency header = [ - "NUM_DIRECTIONS", + "BENCHMARK ID", "SAMPLE_SIZE", "BW (B/c)", ] @@ -78,7 +78,7 @@ def profile_results(sample_size, sample_count, channel_count, num_directions, te append_to_csv( file_name, header, - [num_directions, sample_size, res], + [benchmark_type, sample_size, res], write_header, ) return main_loop_latency diff --git a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_latency.py b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_latency.py index 190a7f265f9..971d4a0d842 100644 --- a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_latency.py +++ b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_latency.py @@ -27,7 +27,7 @@ def run_erisc_write_worker_latency( - sample_count, sample_size_expected_latency, channel_count, num_directions, enable_worker, disable_trid, file_name + benchmark_type, sample_count, sample_size_expected_latency, channel_count, disable_trid, file_name ): os.system(f"rm -rf {os.environ['TT_METAL_HOME']}/generated/profiler/.logs/profile_log_device.csv") @@ -41,12 +41,11 @@ def run_erisc_write_worker_latency( ARCH_NAME = os.getenv("ARCH_NAME") cmd = f"TT_METAL_DEVICE_PROFILER=1 \ {os.environ['TT_METAL_HOME']}/build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm_{ARCH_NAME} \ + {benchmark_type} \ {sample_count} \ {sample_size} \ {channel_count} \ - {num_directions} \ {test_latency} \ - {enable_worker} \ {disable_trid} " rc = os.system(cmd) if rc != 0: @@ -54,79 +53,17 @@ def run_erisc_write_worker_latency( assert False main_loop_latency = profile_results( - sample_size, sample_count, channel_count, num_directions, test_latency, file_name + sample_size, sample_count, channel_count, benchmark_type, test_latency, file_name ) logger.info(f"sender_loop_latency {main_loop_latency}") assert expected_latency_lower_bound <= main_loop_latency <= expected_latency_upper_bound -# uni-direction test for eth-sender <---> eth-receiver ---> worker -@pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") -@pytest.mark.parametrize("sample_count", [1]) -@pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [1]) -@pytest.mark.parametrize("enable_worker", [1]) -@pytest.mark.parametrize("disable_trid", [0]) -@pytest.mark.parametrize( - "sample_size_expected_latency", - [ - (16, 984.0), - 
(128, 1002.0), - (256, 1019.0), - (512, 1074.0), - (1024, 1164.0), - (2048, 1308.0), - (4096, 1560.0), - (8192, 2048.0), - ], -) -def test_erisc_write_worker_latency_uni_dir( - sample_count, sample_size_expected_latency, channel_count, num_directions, enable_worker, disable_trid -): - run_erisc_write_worker_latency( - sample_count, - sample_size_expected_latency, - channel_count, - num_directions, - enable_worker, - disable_trid, - FILE_NAME, - ) - - -# bi-direction test for eth-sender <---> eth-receiver ---> worker -@pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") -@pytest.mark.parametrize("sample_count", [1]) -@pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [2]) -@pytest.mark.parametrize("enable_worker", [1]) -@pytest.mark.parametrize("disable_trid", [0]) -@pytest.mark.parametrize( - "sample_size_expected_latency", - [(16, 1077.0), (128, 1079.0), (256, 1077.0), (512, 1175.0), (1024, 1231.0), (2048, 1389.0), (4096, 1596.0)], -) -def test_erisc_write_worker_latency_bi_dir( - sample_count, sample_size_expected_latency, channel_count, num_directions, enable_worker, disable_trid -): - run_erisc_write_worker_latency( - sample_count, - sample_size_expected_latency, - channel_count, - num_directions, - enable_worker, - disable_trid, - FILE_NAME, - ) - - # uni-direction test for eth-sender <---> eth-receiver @pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") @pytest.mark.parametrize("sample_count", [1]) @pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [1]) -@pytest.mark.parametrize("enable_worker", [0]) -@pytest.mark.parametrize("disable_trid", [0]) @pytest.mark.parametrize( "sample_size_expected_latency", [ @@ -134,46 +71,50 @@ def test_erisc_write_worker_latency_bi_dir( (128, 911.0), (256, 966.0), (512, 984.0), - (1024, 1074.0), - (2048, 1200.0), - (4096, 1362.0), - (8192, 1686.0), + (1024, 1245.0), + (2048, 1479.0), + (4096, 1803.0), + (8192, 2451.0), ], ) -def test_erisc_latency_uni_dir( - sample_count, sample_size_expected_latency, channel_count, num_directions, enable_worker, disable_trid -): +def test_erisc_latency_uni_dir(sample_count, sample_size_expected_latency, channel_count): + benchmark_type_id = 0 + disable_trid = 0 # don't care in this case run_erisc_write_worker_latency( + benchmark_type_id, sample_count, sample_size_expected_latency, channel_count, - num_directions, - enable_worker, disable_trid, FILE_NAME, ) -# bi-direction test for eth-sender <---> eth-receiver ---> worker +# uni-direction test for eth-sender <---> eth-receiver ---> worker @pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS") @pytest.mark.parametrize("sample_count", [1]) @pytest.mark.parametrize("channel_count", [16]) -@pytest.mark.parametrize("num_directions", [2]) -@pytest.mark.parametrize("enable_worker", [0]) @pytest.mark.parametrize("disable_trid", [0]) @pytest.mark.parametrize( "sample_size_expected_latency", - [(16, 918.0), (128, 919.0), (256, 952.0), (512, 988.0), (1024, 1122.0), (2048, 1224.0), (4096, 1394.0)], + [ + (16, 984.0), + (128, 1002.0), + (256, 1019.0), + (512, 1074.0), + (1024, 1335.0), + (2048, 1609.0), + (4096, 2018.0), + (8192, 2811.0), + ], ) -def test_erisc_latency_bi_dir( - sample_count, sample_size_expected_latency, channel_count, num_directions, enable_worker, disable_trid -): +def test_erisc_write_worker_latency_uni_dir(sample_count, sample_size_expected_latency, channel_count, disable_trid): + benchmark_type_id = 2 run_erisc_write_worker_latency( + 
benchmark_type_id, sample_count, sample_size_expected_latency, channel_count, - num_directions, - enable_worker, disable_trid, FILE_NAME, ) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp index 3a4ed7661f8..b233aee0033 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp @@ -27,6 +27,8 @@ #include +#include "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_ubenchmark_types.hpp" + // TODO: ARCH_NAME specific, must remove #include "eth_l1_address_map.h" @@ -94,13 +96,12 @@ std::vector build( std::size_t num_samples, std::size_t sample_page_size, std::size_t num_buffer_slots, - std::size_t num_directions, + uint32_t benchmark_type, KernelHandle& local_kernel, KernelHandle& remote_kernel, std::shared_ptr& worker_buffer_0, std::shared_ptr& worker_buffer_1, bool test_latency, - bool enable_worker, bool disable_trid) { Program program0; Program program1; @@ -112,12 +113,26 @@ std::vector build( uint32_t worker_buffer_0_addr = worker_buffer_0->address(); uint32_t worker_buffer_1_addr = worker_buffer_1->address(); + uint32_t measurement_type = (uint32_t)(test_latency ? MeasurementType::Latency : MeasurementType::Bandwidth); + // eth core ct args const std::vector& eth_sender_ct_args = { - num_buffer_slots, worker_noc_x, worker_noc_y, worker_buffer_0_addr}; + benchmark_type, + measurement_type, + num_buffer_slots, + worker_noc_x, + worker_noc_y, + worker_buffer_0_addr, + uint32_t(disable_trid)}; const std::vector& eth_receiver_ct_args = { - num_buffer_slots, worker_noc_x, worker_noc_y, worker_buffer_1_addr}; + benchmark_type, + measurement_type, + num_buffer_slots, + worker_noc_x, + worker_noc_y, + worker_buffer_1_addr, + uint32_t(disable_trid)}; // eth core rt args const std::vector& eth_sender_receiver_rt_args = { @@ -125,29 +140,12 @@ std::vector build( static_cast(num_samples), static_cast(sample_page_size)}; - std::map sender_receiver_defines; - if (num_directions == 2) { - sender_receiver_defines["ENABLE_BI_DIRECTION"] = "1"; - } - if (test_latency) { - sender_receiver_defines["TEST_LATENCY"] = "1"; - } - if (enable_worker) { - sender_receiver_defines["ENABLE_WORKER"] = "1"; - } - if (disable_trid) { - sender_receiver_defines["DISABLE_TRID"] = "1"; - } - local_kernel = tt_metal::CreateKernel( program0, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/" "ethernet_write_worker_latency_ubench_sender.cpp", eth_sender_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = eth_sender_ct_args, - .defines = sender_receiver_defines}); + tt_metal::EthernetConfig{.noc = tt_metal::NOC::RISCV_0_default, .compile_args = eth_sender_ct_args}); tt_metal::SetRuntimeArgs(program0, local_kernel, eth_sender_core, eth_sender_receiver_rt_args); remote_kernel = tt_metal::CreateKernel( @@ -155,10 +153,7 @@ std::vector build( "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/" "ethernet_write_worker_latency_ubench_receiver.cpp", eth_receiver_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = eth_receiver_ct_args, - .defines = sender_receiver_defines}); + tt_metal::EthernetConfig{.noc = tt_metal::NOC::RISCV_0_default, .compile_args = eth_receiver_ct_args}); 
tt_metal::SetRuntimeArgs(program1, remote_kernel, eth_receiver_core, eth_sender_receiver_rt_args); // Launch @@ -181,10 +176,9 @@ void run( IDevice* device1, Program& program0, Program& program1, - std::size_t num_directions, + BenchmarkType benchmark_type, std::shared_ptr& worker_buffer_0, - std::shared_ptr& worker_buffer_1, - bool enable_worker) { + std::shared_ptr& worker_buffer_1) { if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE")) { std::thread th2 = std::thread([&] { tt_metal::detail::LaunchProgram(device0, program0); }); std::thread th1 = std::thread([&] { tt_metal::detail::LaunchProgram(device1, program1); }); @@ -202,9 +196,9 @@ void run( tt::tt_metal::detail::DumpDeviceProfileResults(device0); tt::tt_metal::detail::DumpDeviceProfileResults(device1); - if (enable_worker) { + if (benchmark_type == BenchmarkType::EthEthTensixUniDir or benchmark_type == BenchmarkType::EthEthTensixBiDir) { validation(worker_buffer_1); - if (num_directions == 2) { + if (benchmark_type == BenchmarkType::EthEthTensixBiDir) { validation(worker_buffer_0); } } @@ -212,14 +206,20 @@ void run( int main(int argc, char** argv) { std::size_t arg_idx = 1; + uint32_t benchmark_type = (uint32_t)std::stoi(argv[arg_idx++]); + + auto benchmark_type_enum = magic_enum::enum_cast(benchmark_type); + TT_FATAL( + benchmark_type_enum.has_value(), + "Unsupported benchmark {} specified, check BenchmarkType enum for supported values", + benchmark_type); + std::size_t num_samples = std::stoi(argv[arg_idx++]); std::size_t sample_page_size = std::stoi(argv[arg_idx++]); std::size_t num_buffer_slots = std::stoi(argv[arg_idx++]); - std::size_t num_directions = std::stoi(argv[arg_idx++]); + bool test_latency = std::stoi(argv[arg_idx++]); - bool enable_worker = std::stoi(argv[arg_idx++]); bool disable_trid = std::stoi(argv[arg_idx++]); - TT_FATAL(num_directions == 1 or num_directions == 2, "either uni-dir or bi-dir test"); auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); @@ -265,11 +265,12 @@ int main(int argc, char** argv) { try { log_info( tt::LogTest, - "num_samples: {}, sample_page_size: {}, num_buffer_slots: {}, num_directions: {}", + "benchmark type: {}, measurement type: {}, num_samples: {}, sample_page_size: {}, num_buffer_slots: {}", + magic_enum::enum_name(benchmark_type_enum.value()), + magic_enum::enum_name(test_latency ? 
MeasurementType::Latency : MeasurementType::Bandwidth), num_samples, sample_page_size, - num_buffer_slots, - num_directions); + num_buffer_slots); KernelHandle local_kernel; KernelHandle remote_kernel; try { @@ -301,22 +302,20 @@ int main(int argc, char** argv) { num_samples, sample_page_size, num_buffer_slots, - num_directions, + benchmark_type, local_kernel, remote_kernel, worker_buffer_0, worker_buffer_1, test_latency, - enable_worker, disable_trid); run(device_0, device_1, programs[0], programs[1], - num_directions, + benchmark_type_enum.value(), worker_buffer_0, - worker_buffer_1, - enable_worker); + worker_buffer_1); } catch (std::exception& e) { log_error(tt::LogTest, "Caught exception: {}", e.what()); test_fixture.TearDown(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_ubenchmark_types.hpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_ubenchmark_types.hpp new file mode 100644 index 00000000000..8313da5730f --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_ubenchmark_types.hpp @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This file is shared by host and device ethernet microbenchmarks + +#pragma once + +#include + +enum BenchmarkType : uint8_t { + EthOnlyUniDir = 0, + EthOnlyBiDir = 1, + EthEthTensixUniDir = 2, + EthEthTensixBiDir = 3, + TensixPushEth = 4, + EthMcastTensix = 5, + EthToLocalEth = 6, + EthToLocalEthAndMcastTensix = 7, +}; + +enum MeasurementType : uint8_t { Latency = 0, Bandwidth = 1 }; + +struct eth_buffer_slot_sync_t { + volatile uint32_t bytes_sent; + volatile uint32_t receiver_ack; + volatile uint32_t src_id; + uint32_t reserved_2; +}; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp index 34825404d9a..0e1b83b8b94 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp @@ -9,17 +9,10 @@ #include "ethernet/dataflow_api.h" #include "debug/assert.h" #include "debug/dprint.h" +#include "eth_ubenchmark_types.hpp" // #define ENABLE_DEBUG 1 -struct eth_buffer_slot_sync_t { - volatile uint32_t bytes_sent; - volatile uint32_t receiver_ack; - volatile uint32_t src_id; - - uint32_t reserved_2; -}; - FORCE_INLINE void eth_setup_handshake(std::uint32_t handshake_register_address, bool is_sender) { if (is_sender) { eth_send_bytes(handshake_register_address, handshake_register_address, 16); @@ -43,12 +36,15 @@ bool is_power_of_two(T val) { // ******************************* Common Ct Args ************************************************ -constexpr uint32_t NUM_BUFFER_SLOTS = get_compile_time_arg_val(0); +constexpr BenchmarkType benchmark_type = static_cast(get_compile_time_arg_val(0)); +constexpr MeasurementType measurement_type = static_cast(get_compile_time_arg_val(1)); +constexpr uint32_t NUM_BUFFER_SLOTS = get_compile_time_arg_val(2); constexpr uint32_t MAX_NUM_TRANSACTION_ID = NUM_BUFFER_SLOTS / 2; // the algorithm only works for NUM_BUFFER_SLOTS divisible by MAX_NUM_TRANSACTION_ID -constexpr uint32_t worker_noc_x = get_compile_time_arg_val(1); -constexpr uint32_t worker_noc_y = get_compile_time_arg_val(2); -constexpr uint32_t 
worker_buffer_addr = get_compile_time_arg_val(3); +constexpr uint32_t worker_noc_x = get_compile_time_arg_val(3); +constexpr uint32_t worker_noc_y = get_compile_time_arg_val(4); +constexpr uint32_t worker_buffer_addr = get_compile_time_arg_val(5); +constexpr uint32_t disable_trid = get_compile_time_arg_val(6); // ******************************* Sender APIs *************************************************** @@ -179,17 +175,24 @@ FORCE_INLINE bool write_worker_done(uint32_t trid) { return ncrisc_noc_nonposted_write_with_transaction_id_flushed(noc_index, trid); } -FORCE_INLINE void ack_complete(volatile eth_buffer_slot_sync_t* buffer_slot_sync_addr) { +FORCE_INLINE void ack_complete( + uint32_t buffer_slot_addr, volatile eth_buffer_slot_sync_t* buffer_slot_sync_addr, uint32_t full_payload_size) { buffer_slot_sync_addr->bytes_sent = 0; while (eth_txq_is_busy()) { switch_context_if_debug(); } - eth_send_bytes_over_channel_payload_only_unsafe_one_packet( - reinterpret_cast(buffer_slot_sync_addr), - reinterpret_cast(buffer_slot_sync_addr), - sizeof(eth_buffer_slot_sync_t)); + if constexpr (measurement_type == MeasurementType::Latency) { + // Send pack entire packet so measurement from sender -> receiver -> sender is symmetric + eth_send_bytes_over_channel_payload_only_unsafe_one_packet( + buffer_slot_addr, buffer_slot_addr, full_payload_size); + } else { + eth_send_bytes_over_channel_payload_only_unsafe_one_packet( + reinterpret_cast(buffer_slot_sync_addr), + reinterpret_cast(buffer_slot_sync_addr), + sizeof(eth_buffer_slot_sync_t)); + } } FORCE_INLINE void write_worker( @@ -210,7 +213,8 @@ FORCE_INLINE void write_worker( buffer_slot_sync_addr->bytes_sent = 0; } -FORCE_INLINE void check_incomping_packet_and_write_worker( +template +FORCE_INLINE void check_incoming_packet_and_write_worker( const std::array& buffer_slot_addrs, const std::array& buffer_slot_sync_addrs, uint32_t read_ptr, @@ -221,48 +225,165 @@ FORCE_INLINE void check_incomping_packet_and_write_worker( bool buffer_not_full = next_write_ptr != read_ptr; if (buffer_not_full && has_incoming_packet(buffer_slot_sync_addrs[write_ptr])) { -#ifdef ENABLE_WORKER - uint32_t curr_trid = get_buffer_slot_trid(write_ptr); - write_worker( - buffer_slot_addrs[write_ptr], buffer_slot_sync_addrs[write_ptr], worker_noc_addr, message_size, curr_trid); -#endif + if constexpr (write_to_worker) { + uint32_t curr_trid = get_buffer_slot_trid(write_ptr); + write_worker( + buffer_slot_addrs[write_ptr], + buffer_slot_sync_addrs[write_ptr], + worker_noc_addr, + message_size, + curr_trid); + } write_ptr = next_write_ptr; } } +template FORCE_INLINE void check_write_worker_done_and_send_ack( + const std::array& buffer_slot_addrs, const std::array& buffer_slot_sync_addrs, + uint32_t full_payload_size, uint32_t& read_ptr, uint32_t write_ptr, uint32_t& num_messages_ack) { bool buffer_not_empty = read_ptr != write_ptr; -#if defined(ENABLE_WORKER) and !defined(DISABLE_TRID) - uint32_t curr_trid = get_buffer_slot_trid(read_ptr); - if (buffer_not_empty && write_worker_done(curr_trid)) { -#else - if (buffer_not_empty) { -#endif - // DPRINT << "read_ptr " << read_ptr < FORCE_INLINE void update_receiver_state( const std::array& buffer_slot_addrs, const std::array& buffer_slot_sync_addrs, uint64_t worker_noc_addr, uint32_t message_size, + uint32_t full_payload_size, uint32_t& num_messages_ack, uint32_t& buffer_read_ptr, uint32_t& buffer_write_ptr) { // Check if there's an incoming packet for current buffer slot and write to worker if there's new packet - 
check_incomping_packet_and_write_worker( + check_incoming_packet_and_write_worker( buffer_slot_addrs, buffer_slot_sync_addrs, buffer_read_ptr, buffer_write_ptr, worker_noc_addr, message_size); // Check if the write for trid is done, and ack sender if the current buffer slot is done - check_write_worker_done_and_send_ack(buffer_slot_sync_addrs, buffer_read_ptr, buffer_write_ptr, num_messages_ack); + check_write_worker_done_and_send_ack( + buffer_slot_addrs, + buffer_slot_sync_addrs, + full_payload_size, + buffer_read_ptr, + buffer_write_ptr, + num_messages_ack); +} + +template +FORCE_INLINE void receiver_uni_dir( + const std::array& receiver_buffer_slot_addrs, + const std::array& receiver_buffer_slot_sync_addrs, + uint32_t message_size, + uint32_t full_payload_size, + uint32_t num_messages, + uint64_t worker_noc_addr) { + uint32_t total_msgs; + if constexpr (measurement_type == MeasurementType::Latency) { + total_msgs = num_messages; + } else { + total_msgs = num_messages * NUM_BUFFER_SLOTS; + } + + DPRINT << "RECEIVER MAIN LOOP" << ENDL(); + + uint32_t receiver_buffer_read_ptr = 0; + uint32_t receiver_buffer_write_ptr = 0; + uint32_t receiver_num_messages_ack = 0; + + if constexpr (write_to_worker) { + noc_async_write_one_packet_with_trid_set_state(worker_noc_addr); + } + + while (receiver_num_messages_ack < total_msgs) { + update_receiver_state( + receiver_buffer_slot_addrs, + receiver_buffer_slot_sync_addrs, + worker_noc_addr, + message_size, + full_payload_size, + receiver_num_messages_ack, + receiver_buffer_read_ptr, + receiver_buffer_write_ptr); + + // not called in normal execution mode + switch_context_if_debug(); + } +} + +// same as below so merge +template +FORCE_INLINE void send_receiver_bi_dir( + const std::array& sender_buffer_slot_addrs, + const std::array& sender_buffer_slot_sync_addrs, + const std::array& receiver_buffer_slot_addrs, + const std::array& receiver_buffer_slot_sync_addrs, + uint32_t full_payload_size, + uint32_t message_size, + uint32_t num_messages, + uint64_t worker_noc_addr) { + uint32_t total_msgs; + if constexpr (measurement_type == MeasurementType::Latency) { + total_msgs = num_messages * 2; + } else { + total_msgs = num_messages * NUM_BUFFER_SLOTS * 2; + } + + DPRINT << "SENDER-RECEIVER MAIN LOOP" << ENDL(); + + uint32_t sender_buffer_read_ptr = 0; + uint32_t sender_buffer_write_ptr = 0; + + uint32_t receiver_buffer_read_ptr = 0; + uint32_t receiver_buffer_write_ptr = 0; + + uint32_t num_messages_ack = 0; + uint32_t sender_num_messages_send; + if constexpr (measurement_type == MeasurementType::Latency) { + sender_num_messages_send = num_messages; + } else { + sender_num_messages_send = num_messages * NUM_BUFFER_SLOTS; + } + + if constexpr (write_to_worker) { + noc_async_write_one_packet_with_trid_set_state(worker_noc_addr); + } + + while (num_messages_ack < total_msgs) { + update_sender_state( + sender_buffer_slot_addrs, + sender_buffer_slot_sync_addrs, + full_payload_size, + num_messages_ack, + sender_num_messages_send, + sender_buffer_read_ptr, + sender_buffer_write_ptr); + + update_receiver_state( + receiver_buffer_slot_addrs, + receiver_buffer_slot_sync_addrs, + worker_noc_addr, + message_size, + full_payload_size, + num_messages_ack, + receiver_buffer_read_ptr, + receiver_buffer_write_ptr); + + // not called in normal execution mode + switch_context_if_debug(); + } } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_receiver.cpp 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_receiver.cpp index dc11308f5bb..ea59075824a 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_receiver.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_receiver.cpp @@ -4,100 +4,6 @@ #include "ethernet_write_worker_latency_ubench_common.hpp" -FORCE_INLINE void main_loop_uni_dir( - const std::array& receiver_buffer_slot_addrs, - const std::array& receiver_buffer_slot_sync_addrs, - uint32_t message_size, - uint32_t num_messages, - uint64_t worker_noc_addr) { - uint32_t total_msgs = -#ifdef TEST_LATENCY - num_messages; -#else - num_messages * NUM_BUFFER_SLOTS; -#endif - - DPRINT << "RECEIVER MAIN LOOP" << ENDL(); - - uint32_t receiver_buffer_read_ptr = 0; - uint32_t receiver_buffer_write_ptr = 0; - uint32_t receiver_num_messages_ack = 0; - - noc_async_write_one_packet_with_trid_set_state(worker_noc_addr); - - while (receiver_num_messages_ack < total_msgs) { - update_receiver_state( - receiver_buffer_slot_addrs, - receiver_buffer_slot_sync_addrs, - worker_noc_addr, - message_size, - receiver_num_messages_ack, - receiver_buffer_read_ptr, - receiver_buffer_write_ptr); - - // not called in normal execution mode - switch_context_if_debug(); - } -} - -FORCE_INLINE void main_loop_bi_dir( - const std::array& sender_buffer_slot_addrs, - const std::array& sender_buffer_slot_sync_addrs, - const std::array& receiver_buffer_slot_addrs, - const std::array& receiver_buffer_slot_sync_addrs, - uint32_t full_payload_size, - uint32_t message_size, - uint32_t num_messages, - uint64_t worker_noc_addr) { - uint32_t total_msgs = -#ifdef TEST_LATENCY - num_messages * 2; -#else - num_messages * NUM_BUFFER_SLOTS * 2; -#endif - - DPRINT << "RECEIVER MAIN LOOP" << ENDL(); - - uint32_t sender_buffer_read_ptr = 0; - uint32_t sender_buffer_write_ptr = 0; - - uint32_t receiver_buffer_read_ptr = 0; - uint32_t receiver_buffer_write_ptr = 0; - - uint32_t num_messages_ack = 0; - uint32_t sender_num_messages_send = -#ifdef TEST_LATENCY - num_messages; -#else - num_messages * NUM_BUFFER_SLOTS; -#endif - - noc_async_write_one_packet_with_trid_set_state(worker_noc_addr); - - while (num_messages_ack < total_msgs) { - update_sender_state( - sender_buffer_slot_addrs, - sender_buffer_slot_sync_addrs, - full_payload_size, - num_messages_ack, - sender_num_messages_send, - sender_buffer_read_ptr, - sender_buffer_write_ptr); - - update_receiver_state( - receiver_buffer_slot_addrs, - receiver_buffer_slot_sync_addrs, - worker_noc_addr, - message_size, - num_messages_ack, - receiver_buffer_read_ptr, - receiver_buffer_write_ptr); - - // not called in normal execution mode - switch_context_if_debug(); - } -} - void kernel_main() { uint32_t arg_idx = 0; const uint32_t handshake_addr = get_arg_val(arg_idx++); @@ -116,39 +22,83 @@ void kernel_main() { buffer_start_addr = setup_receiver_buffer( receiver_buffer_slot_addrs, receiver_buffer_slot_sync_addrs, buffer_start_addr, message_size); -#ifdef ENABLE_BI_DIRECTION + // Only used for bi-directional cases std::array sender_buffer_slot_addrs; std::array sender_buffer_slot_sync_addrs; - setup_sender_buffer(sender_buffer_slot_addrs, sender_buffer_slot_sync_addrs, buffer_start_addr, message_size); -#endif + if constexpr (benchmark_type == BenchmarkType::EthOnlyBiDir or benchmark_type == BenchmarkType::EthEthTensixBiDir) { + setup_sender_buffer(sender_buffer_slot_addrs, 
sender_buffer_slot_sync_addrs, buffer_start_addr, message_size); + } // Avoids hang in issue https://github.com/tenstorrent/tt-metal/issues/9963 for (uint32_t i = 0; i < 2000000000; i++) { asm volatile("nop"); } - // worker noc address uint64_t worker_noc_addr = get_noc_addr(worker_noc_x, worker_noc_y, worker_buffer_addr); eth_setup_handshake(handshake_addr, false); - { - DeviceZoneScopedN("MAIN-TEST-BODY"); -#ifdef ENABLE_BI_DIRECTION - main_loop_bi_dir( - sender_buffer_slot_addrs, - sender_buffer_slot_sync_addrs, - receiver_buffer_slot_addrs, - receiver_buffer_slot_sync_addrs, - full_payload_size, - message_size, - num_messages, - worker_noc_addr); -#else - main_loop_uni_dir( - receiver_buffer_slot_addrs, receiver_buffer_slot_sync_addrs, message_size, num_messages, worker_noc_addr); -#endif + switch (benchmark_type) { + case EthOnlyUniDir: { + DeviceZoneScopedN("MAIN-TEST-BODY"); + receiver_uni_dir( + receiver_buffer_slot_addrs, + receiver_buffer_slot_sync_addrs, + message_size, + full_payload_size, + num_messages, + worker_noc_addr); + } break; + case EthOnlyBiDir: { + DeviceZoneScopedN("MAIN-TEST-BODY"); + send_receiver_bi_dir( + sender_buffer_slot_addrs, + sender_buffer_slot_sync_addrs, + receiver_buffer_slot_addrs, + receiver_buffer_slot_sync_addrs, + full_payload_size, + message_size, + num_messages, + worker_noc_addr); + } break; + case EthEthTensixUniDir: { + DeviceZoneScopedN("MAIN-TEST-BODY"); + receiver_uni_dir( + receiver_buffer_slot_addrs, + receiver_buffer_slot_sync_addrs, + message_size, + full_payload_size, + num_messages, + worker_noc_addr); + } break; + case EthEthTensixBiDir: { + DeviceZoneScopedN("MAIN-TEST-BODY"); + send_receiver_bi_dir( + sender_buffer_slot_addrs, + sender_buffer_slot_sync_addrs, + receiver_buffer_slot_addrs, + receiver_buffer_slot_sync_addrs, + full_payload_size, + message_size, + num_messages, + worker_noc_addr); + + } break; + case TensixPushEth: { + ASSERT(0); + } break; + case EthMcastTensix: { + ASSERT(0); + } break; + case EthToLocalEth: { + ASSERT(0); + } break; + case EthToLocalEthAndMcastTensix: { + ASSERT(0); + } break; + default: WAYPOINT("!ETH"); ASSERT(0); } + // need to do a delay as trid writes are not waiting for acks, so need to make sure noc response is back. 
for (int i = 0; i < 1000; ++i) { asm volatile("nop"); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_sender.cpp index 799df166e6d..e5c2f37a2cb 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_sender.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_sender.cpp @@ -4,17 +4,17 @@ #include "ethernet_write_worker_latency_ubench_common.hpp" -FORCE_INLINE void main_loop_uni_dir( +FORCE_INLINE void send_uni_dir( const std::array& buffer_slot_addrs, const std::array& buffer_slot_sync_addrs, uint32_t full_payload_size, uint32_t num_messages) { - uint32_t total_msgs = -#ifdef TEST_LATENCY - num_messages; -#else - num_messages * NUM_BUFFER_SLOTS; -#endif + uint32_t total_msgs; + if constexpr (measurement_type == MeasurementType::Latency) { + total_msgs = num_messages; + } else { + total_msgs = num_messages * NUM_BUFFER_SLOTS; + } DPRINT << "SENDER MAIN LOOP" << ENDL(); @@ -38,64 +38,6 @@ FORCE_INLINE void main_loop_uni_dir( } } -FORCE_INLINE void main_loop_bi_dir( - const std::array& sender_buffer_slot_addrs, - const std::array& sender_buffer_slot_sync_addrs, - const std::array& receiver_buffer_slot_addrs, - const std::array& receiver_buffer_slot_sync_addrs, - uint32_t full_payload_size, - uint32_t message_size, - uint32_t num_messages, - uint64_t worker_noc_addr) { - uint32_t total_msgs = -#ifdef TEST_LATENCY - num_messages * 2; -#else - num_messages * NUM_BUFFER_SLOTS * 2; -#endif - - DPRINT << "SENDER MAIN LOOP" << ENDL(); - - uint32_t sender_buffer_read_ptr = 0; - uint32_t sender_buffer_write_ptr = 0; - - uint32_t receiver_buffer_read_ptr = 0; - uint32_t receiver_buffer_write_ptr = 0; - - uint32_t num_messages_ack = 0; - uint32_t sender_num_messages_send = -#ifdef TEST_LATENCY - num_messages; -#else - num_messages * NUM_BUFFER_SLOTS; -#endif - - noc_async_write_one_packet_with_trid_set_state(worker_noc_addr); - - while (num_messages_ack < total_msgs) { - update_sender_state( - sender_buffer_slot_addrs, - sender_buffer_slot_sync_addrs, - full_payload_size, - num_messages_ack, - sender_num_messages_send, - sender_buffer_read_ptr, - sender_buffer_write_ptr); - - update_receiver_state( - receiver_buffer_slot_addrs, - receiver_buffer_slot_sync_addrs, - worker_noc_addr, - message_size, - num_messages_ack, - receiver_buffer_read_ptr, - receiver_buffer_write_ptr); - - // not called in normal execution mode - switch_context_if_debug(); - } -} - void kernel_main() { uint32_t arg_idx = 0; const uint32_t handshake_addr = get_arg_val(arg_idx++); @@ -114,40 +56,70 @@ void kernel_main() { buffer_start_addr = setup_sender_buffer(sender_buffer_slot_addrs, sender_buffer_slot_sync_addrs, buffer_start_addr, message_size); -#ifdef ENABLE_BI_DIRECTION + // Only used for bi-directional cases std::array receiver_buffer_slot_addrs; std::array receiver_buffer_slot_sync_addrs; - setup_receiver_buffer(receiver_buffer_slot_addrs, receiver_buffer_slot_sync_addrs, buffer_start_addr, message_size); -#endif + if constexpr (benchmark_type == BenchmarkType::EthOnlyBiDir or benchmark_type == BenchmarkType::EthEthTensixBiDir) { + setup_receiver_buffer( + receiver_buffer_slot_addrs, receiver_buffer_slot_sync_addrs, buffer_start_addr, message_size); + } // Avoids hang in issue https://github.com/tenstorrent/tt-metal/issues/9963 for 
(uint32_t i = 0; i < 2000000000; i++) { asm volatile("nop"); } + eth_setup_handshake(handshake_addr, true); - // worker noc address -#ifdef ENABLE_BI_DIRECTION uint64_t worker_noc_addr = get_noc_addr(worker_noc_x, worker_noc_y, worker_buffer_addr); -#endif - - { - DeviceZoneScopedN("MAIN-TEST-BODY"); -#ifdef ENABLE_BI_DIRECTION - main_loop_bi_dir( - sender_buffer_slot_addrs, - sender_buffer_slot_sync_addrs, - receiver_buffer_slot_addrs, - receiver_buffer_slot_sync_addrs, - full_payload_size, - message_size, - num_messages, - worker_noc_addr); -#else - main_loop_uni_dir(sender_buffer_slot_addrs, sender_buffer_slot_sync_addrs, full_payload_size, num_messages); -#endif - } + switch (benchmark_type) { + case EthOnlyUniDir: { + DeviceZoneScopedN("MAIN-TEST-BODY"); + send_uni_dir(sender_buffer_slot_addrs, sender_buffer_slot_sync_addrs, full_payload_size, num_messages); + } break; + case EthOnlyBiDir: { + DeviceZoneScopedN("MAIN-TEST-BODY"); + send_receiver_bi_dir( + sender_buffer_slot_addrs, + sender_buffer_slot_sync_addrs, + receiver_buffer_slot_addrs, + receiver_buffer_slot_sync_addrs, + full_payload_size, + message_size, + num_messages, + worker_noc_addr); + } break; + case EthEthTensixUniDir: { + DeviceZoneScopedN("MAIN-TEST-BODY"); + send_uni_dir(sender_buffer_slot_addrs, sender_buffer_slot_sync_addrs, full_payload_size, num_messages); + } break; + case EthEthTensixBiDir: { + DeviceZoneScopedN("MAIN-TEST-BODY"); + send_receiver_bi_dir( + sender_buffer_slot_addrs, + sender_buffer_slot_sync_addrs, + receiver_buffer_slot_addrs, + receiver_buffer_slot_sync_addrs, + full_payload_size, + message_size, + num_messages, + worker_noc_addr); + } break; + case TensixPushEth: { + ASSERT(0); + } break; + case EthMcastTensix: { + ASSERT(0); + } break; + case EthToLocalEth: { + ASSERT(0); + } break; + case EthToLocalEthAndMcastTensix: { + ASSERT(0); + } break; + default: WAYPOINT("!ETH"); ASSERT(0); + } // need to do a delay as trid writes are not waiting for acks, so need to make sure noc response is back. 
for (int i = 0; i < 1000; ++i) { asm volatile("nop"); From 0dacb45f7b86f01407c852d93f8346d02145fed5 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> Date: Tue, 18 Feb 2025 14:59:55 -0800 Subject: [PATCH 152/316] [skip ci] Add support for both 20.04 and 22.04 in package and release workflow (#17755) ### Ticket None ### Problem description Need both 20.04 and 22.04 wheels for Pytorch backend ### What's changed Uploads [ttnn-0.0.dev1+any-cp310-cp310-linux_x86_64.whl](https://github.com/tenstorrent/tt-metal/releases/download/v0.56.0-rc16/ttnn-0.0.dev1+any-cp310-cp310-linux_x86_64.whl) [ttnn-0.0.dev1+any-cp38-cp38-linux_x86_64.whl](https://github.com/tenstorrent/tt-metal/releases/download/v0.56.0-rc16/ttnn-0.0.dev1+any-cp38-cp38-linux_x86_64.whl) See how version is wrong - 0.0.dev1 ### Checklist - [ ] [Package and release](https://github.com/tenstorrent/tt-metal/actions/runs/13296926538) - [ ] [All Post-Commit](https://github.com/tenstorrent/tt-metal/actions/runs/13296914267) --------- Co-authored-by: Bryan Wilder Field Lozano --- .github/workflows/package-and-release.yaml | 29 ++++++++++++++-------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/workflows/package-and-release.yaml b/.github/workflows/package-and-release.yaml index 0a1c6cbd8ea..b7676486ca8 100644 --- a/.github/workflows/package-and-release.yaml +++ b/.github/workflows/package-and-release.yaml @@ -13,10 +13,20 @@ permissions: jobs: build-artifact: needs: create-tag + strategy: + matrix: + config: + - version: "20.04" + publish-artifact: true + - version: "22.04" + publish-artifact: false uses: ./.github/workflows/build-artifact.yaml - secrets: inherit with: + version: ${{ matrix.config.version }} + distro: ubuntu + publish-artifact: ${{ matrix.config.publish-artifact }} build-wheel: true + secrets: inherit build-artifact-profiler: uses: ./.github/workflows/build-artifact.yaml with: @@ -123,14 +133,7 @@ jobs: path: RELEASE_NOTES.txt # Candidate for breaking up create-and-upload-draft-release: - needs: [ - create-tag, - create-release-notes, - build-artifact, - ] - strategy: - matrix: - os: [ubuntu-20.04] + needs: [create-tag, create-release-notes, build-artifact] # May accidentally create two releases without restricting to 1 job concurrency: create_upload_draft_release runs-on: ubuntu-latest @@ -143,10 +146,14 @@ jobs: uses: qmonnet/git-archive-all-action@791fb850881cf58b1d1fcc9b06c01940080bba0a with: output-files: tt-metalium.tar.gz - - name: Download eager Python packages + - name: Download eager 20.04 Python packages + uses: actions/download-artifact@v4 + with: + name: eager-dist-ubuntu-20.04-any + - name: Download eager 22.04 Python packages uses: actions/download-artifact@v4 with: - name: eager-dist-${{ matrix.os }}-any + name: eager-dist-ubuntu-22.04-any - name: Create VERSION run: echo ${{ needs.create-tag.outputs.version }} > VERSION - name : Download release notes From 244bc82acf5d8b45e2b418d28073af8ffef26eeb Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Tue, 18 Feb 2025 23:59:39 +0000 Subject: [PATCH 153/316] Fixed closing files in serialization (#17974) ### Ticket ### Problem description In the recent [PR](https://github.com/tenstorrent/tt-metal/pull/17906) we used TT_ASSERT to check if file close failed, but TT_ASSERTs get compiled out in release mode, so we end up not closing files. 
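For illustration only, here is a minimal standalone sketch of the failure mode, using the standard `assert` macro as a stand-in for `TT_ASSERT` (both are compiled out in release builds, so a side-effecting close inside the macro is silently dropped):

```cpp
// Sketch (hypothetical example, not tt-metal code): why a side-effecting
// assert leaks the file handle in release builds.
#include <cassert>
#include <cstdio>

int main() {
    std::FILE* file = std::fopen("example.txt", "w");
    if (file) {
        // With NDEBUG defined (typical release builds), assert(...) expands to
        // nothing, so std::fclose(file) is never evaluated and the handle leaks.
        assert(std::fclose(file) == 0 && "Failed to close file");
    }
    return 0;
}
```

The change below keeps the close unconditional and only logs a warning when it fails.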
### What's changed Changed TT_ASSERT to check and a log ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [x] New/Existing tests provide coverage for changes --- ttnn/cpp/ttnn/tensor/serialization.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ttnn/cpp/ttnn/tensor/serialization.cpp b/ttnn/cpp/ttnn/tensor/serialization.cpp index 4d4940404c0..c464dd50a44 100644 --- a/ttnn/cpp/ttnn/tensor/serialization.cpp +++ b/ttnn/cpp/ttnn/tensor/serialization.cpp @@ -27,7 +27,9 @@ namespace { struct FileCloser { void operator()(FILE* file) const { if (file) { - TT_ASSERT(fclose(file) == 0, "Failed to close file"); + if (fclose(file) != 0) { + log_warning("Failed to close file"); + } } } }; From 686a4f0ba4071a81caf924c668d0c73674a4cee0 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Tue, 18 Feb 2025 19:21:51 -0500 Subject: [PATCH 154/316] [skip ci] Restructure slightly the CMake file for clarity (#17978) ### Ticket #14001 ### Problem description Provide a simple example of a clear CMakeLists.txt file. ### What's changed Restructured slightly a CMake file to provide a more straightforward and declarative flow. One glaring issue remaining is that this target has include directories scoped higher than this target's directory. Do not use that detail as a recommended design. --- tt_metal/common/CMakeLists.txt | 42 ++++++++++++++++--------------- tt_metal/impl/CMakeLists.txt | 2 +- tt_metal/jit_build/CMakeLists.txt | 2 +- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt index 7d43d25d5b0..fed04bc7914 100644 --- a/tt_metal/common/CMakeLists.txt +++ b/tt_metal/common/CMakeLists.txt @@ -1,19 +1,27 @@ -set(COMMON_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/core_assignment.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/mesh_coord.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/metal_soc_descriptor.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/shape2d.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/shape_base.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tt_backend_api_types.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/work_split.cpp +add_library(common OBJECT) +add_library(TT::Metalium::Common ALIAS common) + +target_sources( + common + PRIVATE + core_assignment.cpp + core_coord.cpp + mesh_coord.cpp + metal_soc_descriptor.cpp + shape2d.cpp + shape_base.cpp + tt_backend_api_types.cpp + utils.cpp + work_split.cpp ) -add_library(common OBJECT ${COMMON_SRCS}) -add_library(Metalium::Metal::Common ALIAS common) +target_include_directories( + common + PUBLIC + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal +) -target_link_libraries(common PRIVATE yaml-cpp::yaml-cpp) target_link_libraries( common PUBLIC @@ -28,11 +36,5 @@ target_link_libraries( PRIVATE Tracy::TracyClient TT::Metalium::HostDevCommon -) - -target_include_directories( - common - PUBLIC - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal + yaml-cpp::yaml-cpp ) diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index 12515d909f8..7af67d6bada 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -89,7 +89,7 @@ target_link_libraries( Boost::smart_ptr FlatBuffers::FlatBuffers range-v3::range-v3 - Metalium::Metal::Common + TT::Metalium::Common Taskflow::Taskflow TT::Metalium::HostDevCommon Metalium::Metal::Hardware diff --git a/tt_metal/jit_build/CMakeLists.txt b/tt_metal/jit_build/CMakeLists.txt index 
80533221018..ea5260aa598 100644 --- a/tt_metal/jit_build/CMakeLists.txt +++ b/tt_metal/jit_build/CMakeLists.txt @@ -13,7 +13,7 @@ target_link_libraries( PUBLIC common PRIVATE - Metalium::Metal::Common + TT::Metalium::Common Metalium::Metal::LLRT Tracy::TracyClient Taskflow::Taskflow From c17e35ac4129a7c071be1ba214555e5f8ecdb1ba Mon Sep 17 00:00:00 2001 From: Jay Kruer Date: Tue, 18 Feb 2025 19:11:02 -0800 Subject: [PATCH 155/316] [tt-train] Add RMSNorm module (#16991) ### Problem description We need RMSNorm to train Llama 3 and some other exciting open source models. ### What's changed - Added RMS op - Added RMS module ### Checklist - [ ] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- .../sources/ttml/core/ttnn_all_includes.hpp | 1 + .../sources/ttml/modules/rms_norm_module.cpp | 28 ++++ .../sources/ttml/modules/rms_norm_module.hpp | 27 ++++ tt-train/sources/ttml/ops/rmsnorm_op.cpp | 116 ++++++++++++++ tt-train/sources/ttml/ops/rmsnorm_op.hpp | 12 ++ tt-train/tests/ops/rmsnorm_op_test.cpp | 149 ++++++++++++++++++ 6 files changed, 333 insertions(+) create mode 100644 tt-train/sources/ttml/modules/rms_norm_module.cpp create mode 100644 tt-train/sources/ttml/modules/rms_norm_module.hpp create mode 100644 tt-train/sources/ttml/ops/rmsnorm_op.cpp create mode 100644 tt-train/sources/ttml/ops/rmsnorm_op.hpp create mode 100644 tt-train/tests/ops/rmsnorm_op_test.cpp diff --git a/tt-train/sources/ttml/core/ttnn_all_includes.hpp b/tt-train/sources/ttml/core/ttnn_all_includes.hpp index 0dc4a096ea8..a7f3ecee73f 100644 --- a/tt-train/sources/ttml/core/ttnn_all_includes.hpp +++ b/tt-train/sources/ttml/core/ttnn_all_includes.hpp @@ -38,6 +38,7 @@ #include // NOLINT #include // NOLINT #include // NOLINT +#include // NOLINT #include // NOLINT #include // NOLINT #include // NOLINT diff --git a/tt-train/sources/ttml/modules/rms_norm_module.cpp b/tt-train/sources/ttml/modules/rms_norm_module.cpp new file mode 100644 index 00000000000..04f82a28c28 --- /dev/null +++ b/tt-train/sources/ttml/modules/rms_norm_module.cpp @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "rms_norm_module.hpp" + +#include "core/tt_tensor_utils.hpp" +#include "ops/rmsnorm_op.hpp" + +namespace ttml::modules { + +void RMSNormLayer::initialize_tensors(uint32_t features) { + m_gamma = + autograd::create_tensor(core::ones(core::create_shape({1, 1, 1, features}), &autograd::ctx().get_device())); +} + +RMSNormLayer::RMSNormLayer(uint32_t features, float epsilon) : m_epsilon(epsilon) { + initialize_tensors(features); + + create_name("rmsnorm"); + register_tensor(m_gamma, "gamma"); +} + +autograd::TensorPtr RMSNormLayer::operator()(const autograd::TensorPtr& tensor) { + return ops::rmsnorm(tensor, m_gamma, m_epsilon); +} + +} // namespace ttml::modules diff --git a/tt-train/sources/ttml/modules/rms_norm_module.hpp b/tt-train/sources/ttml/modules/rms_norm_module.hpp new file mode 100644 index 00000000000..721b3658c07 --- /dev/null +++ b/tt-train/sources/ttml/modules/rms_norm_module.hpp @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma 
once + +#include "autograd/auto_context.hpp" +#include "autograd/graph.hpp" +#include "autograd/module_base.hpp" +#include "autograd/tensor.hpp" +#include "ops/rmsnorm_op.hpp" + +namespace ttml::modules { + +class RMSNormLayer : public autograd::ModuleBase { +private: + float m_epsilon = 1e-5F; + autograd::TensorPtr m_gamma = nullptr; + +public: + void initialize_tensors(uint32_t features); + explicit RMSNormLayer(uint32_t features, float epsilon = 1e-5F); + + [[nodiscard]] autograd::TensorPtr operator()(const autograd::TensorPtr& tensor); +}; + +} // namespace ttml::modules diff --git a/tt-train/sources/ttml/ops/rmsnorm_op.cpp b/tt-train/sources/ttml/ops/rmsnorm_op.cpp new file mode 100644 index 00000000000..f232f663254 --- /dev/null +++ b/tt-train/sources/ttml/ops/rmsnorm_op.cpp @@ -0,0 +1,116 @@ +// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "rmsnorm_op.hpp" + +#include +#include +#include +#include +#include + +#include "autograd/auto_context.hpp" +#include "autograd/graph.hpp" +#include "autograd/graph_utils.hpp" +#include "autograd/tensor.hpp" +#include "core/compute_kernel_config.hpp" +#include "ttnn_fixed/trivial_ttnn_ops.hpp" + +namespace ttml::ops { + +autograd::TensorPtr rmsnorm(const autograd::TensorPtr &tensor, const autograd::TensorPtr &gamma, float epsilon) { + auto a_shape = tensor->get_value().logical_shape(); + if (a_shape.rank() != 4) { + throw std::runtime_error("rmsnorm only supports rank-4 input tensors."); + } + + auto ashape_arr = a_shape.to_array_4D(); + auto [B, N, S, C] = ashape_arr; + assert((N == 1)); // one sequence per batch + + // one gain parameter per channel + assert((gamma->get_value().logical_shape().to_array_4D() == std::array{1, 1, 1, C})); + + auto device = &autograd::ctx().get_device(); + + ttnn::Tensor squares = ttnn::square(tensor->get_value()); // [B,1,S,C] -> [B,1,S,C] + + ttnn::Tensor seq_means_of_squares = ttnn::mean(squares, /*dim_arg=*/-1, /*keep_dim=*/true); // [B,1,S,1] + + ttnn::Tensor seq_means_of_squares_plus_epsilon = + ttnn::experimental::add(seq_means_of_squares, epsilon); // [B,1,S,1] x. [1] -> [B,1,S,1] (bcast) + + ttnn::Tensor rms_a = ttnn::sqrt(seq_means_of_squares_plus_epsilon); // [B,1,S,1] -> [B,1,S,1] + + ttnn::Tensor gamma_times_activations = + ttnn::experimental::mul(gamma->get_value(), tensor->get_value()); // [1,1,1,C] x [B,1,S,C] -> [B,1,S,C] (bcast) + + ttnn::Tensor out_tensor = + ttnn::experimental::div(gamma_times_activations, rms_a); // [B,1,S,C] x [B,1,S,C] -> [B,1,S,C] + + auto out = autograd::create_tensor(out_tensor); + + autograd::GradFunction grad = [B, S, C, tensor, gamma, out, rms_a, device]() { + auto a = tensor->get_value(); // [B,1,S,C] + auto g = gamma->get_value(); // [1,1,1,C] + + // c is the number of activations; in the RMS1orm paper they call this + // "n". it is renamed here to avoid confusion with 1. + auto c = static_cast(a.logical_shape()[-1]); + + auto dL_dout = out->get_grad(); // Grad w.r.t normalized arctivations, hence [B,1,S,C] + + auto scaled_gain = ttnn::experimental::div(g, rms_a); // [1,1,1,C] x [B,1,S,1] -> [B,1,S,C] (bcast) + auto gained_dL_dout = ttnn::experimental::mul(scaled_gain, dL_dout); // [B,1,S,C] x [B,1,S,C] -> [B,1,S,C] + + // notation: + // _ · _ <- usual dot product + // _ @ _ <- matrix multiplication + // _ *. _ <- Hadamard product/eltwise multiplication with broadcasting + // _ /. 
_ <- eltwise division with broadcasting + + // have a : [B,1,S,C] + + // want to obtain scaled_outer = gained_dL_dout @ ((a@a^T)/n*rms(a)^2) + + // to avoid computing the large outer product matrix explicitly, we + // instead compute + // scale = (a^T · gained_dL_dout) : [B,1,S,C] x [B,1,S,C] -> [1] + // scaled_outer = scale *. a : [1] x [B,1,S,C] -> [B,1,S,C] + + auto scale = ttml::ttnn_fixed::sum_over_dim( + ttnn::experimental::mul(a, gained_dL_dout), 3); // [B,1,S,C] x [B,1,S,C] -> [B,1,S,C] -> [B,1,S,1] + + auto scaled_outer = ttnn::experimental::mul(scale, a); // [B,1,S,1] x [B,1,S,C] -> [B,1,S,C] (bcast) + + auto ms_a = ttnn::square(rms_a); // [B,1,S,1] -> [B,1,S,1] + + auto c_by_ms_a = ttnn::experimental::mul(ms_a, c); // [B,1,S,1] x [1] -> [B,1,S,1] (bcast) + + auto rhs = ttnn::experimental::div(scaled_outer, c_by_ms_a); // [B,1,S,C] x [B,1,S,1] -> [B,1,S,C] (bcast) + + auto dL_da = + ttnn::experimental::sub(gained_dL_dout, rhs); // [B,1,S,C] x [B,1,S,C] -> [B,1,S,C]; checked by add_grad + tensor->add_grad(dL_da); + + // dL_dgamma = (a / rms(a)) * dL_dout -> requires sum over batch due to broadcasting + auto dL_dg_components = ttnn::experimental::mul( + dL_dout, + ttnn::experimental::div(a, rms_a)); // [B,1,S,C] x [B,1,S,1] -> [B,1,S,C] (bcast); checked by add_grad + auto dL_dg = ttnn::sum( + dL_dg_components, + /* dim_arg */ ttnn::SmallVector{0, 1, 2}, + /* keep_dim */ true, + /* output_mem_config */ std::nullopt, + /*compute_kernel_config */ core::ComputeKernelConfig::precise()); // [B,1,S,C] -> [1,1,1,C] + gamma->add_grad(dL_dg); + }; + + auto links = autograd::get_links(tensor, gamma); + out->set_node(autograd::ctx().add_backward_node(std::move(grad), links)); + + return out; +} + +} // namespace ttml::ops diff --git a/tt-train/sources/ttml/ops/rmsnorm_op.hpp b/tt-train/sources/ttml/ops/rmsnorm_op.hpp new file mode 100644 index 00000000000..34499b75b4b --- /dev/null +++ b/tt-train/sources/ttml/ops/rmsnorm_op.hpp @@ -0,0 +1,12 @@ +// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "autograd/tensor.hpp" + +namespace ttml::ops { + +autograd::TensorPtr rmsnorm(const autograd::TensorPtr& tensor, const autograd::TensorPtr& gamma, float epsilon); + +} // namespace ttml::ops diff --git a/tt-train/tests/ops/rmsnorm_op_test.cpp b/tt-train/tests/ops/rmsnorm_op_test.cpp new file mode 100644 index 00000000000..83d02ff9d7d --- /dev/null +++ b/tt-train/tests/ops/rmsnorm_op_test.cpp @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ops/rmsnorm_op.hpp" + +#include + +#include +#include + +#include "autograd/auto_context.hpp" +#include "autograd/tensor.hpp" +#include "core/tt_tensor_utils.hpp" +#include "ops/losses.hpp" + +class RMSNormOpTest : public ::testing::Test { +protected: + void SetUp() override { + ttml::autograd::ctx().open_device(); + } + + void TearDown() override { + ttml::autograd::ctx().close_device(); + } +}; + +// Forward and backward tests are given by comparing with results from PyTorch: +// For test tensor `x` of shape [N,C,H,W] we set x.requires_grad = True +// and compute the RMSNorm as `x_norm_sum = torch.nn.functional.rms_norm(x).sum()` +// and compute its gradient with respect to `x` as `x_grad = torch.autograd.grad(x_norm_sum, x)[0]` +// We then compare the results of the RMSNorm and its gradient with the results of the RMSNorm and its gradient +// computed by the RMSNorm op in TTML. 
+TEST_F(RMSNormOpTest, RMSNorm_Small_Forward) { + using namespace ttml; + float eps = 0.0078125F; // default in PyTorch for bf16 + + uint32_t N = 1, C = 1, H = 1, W = 8; + + xt::xarray example_xtensor = {{{{1.F, 2.F, 3.F, 4.F, 1.F, 2.F, 3.F, 4.F}}}}; + auto example_tensor = autograd::create_tensor(core::from_xtensor(example_xtensor, &autograd::ctx().get_device())); + auto gamma = autograd::create_tensor(core::ones(core::create_shape({1, 1, 1, W}), &autograd::ctx().get_device())); + + auto result = ops::rmsnorm(example_tensor, gamma, 0.0078125F); + auto result_xtensor = core::to_xtensor(result->get_value()); + xt::xarray expected_result = {{0.3652F, 0.7305F, 1.0938F, 1.4609F, 0.3652F, 0.7305F, 1.0938F, 1.4609F}}; + EXPECT_TRUE(xt::allclose(result_xtensor, expected_result, 1e-2F)); +} + +TEST_F(RMSNormOpTest, RMSNorm_Small_Backward) { + using namespace ttml; + float eps = 0.0078125F; // default in PyTorch for bf16 + + uint32_t N = 1, C = 1, H = 1, W = 8; + + xt::xarray example_xtensor = {{{{1.F, 2.F, 3.F, 4.F, 1.F, 2.F, 3.F, 4.F}}}}; + auto example_tensor = autograd::create_tensor(core::from_xtensor(example_xtensor, &autograd::ctx().get_device())); + auto gamma = autograd::create_tensor(core::ones(core::create_shape({1, 1, 1, W}), &autograd::ctx().get_device())); + + auto result = ops::rmsnorm(example_tensor, gamma, 0.0078125F); + auto result_xtensor = core::to_xtensor(result->get_value()); + + auto target = autograd::create_tensor(core::zeros_like(result->get_value())); + auto mse_result = ttml::ops::mse_loss(result, target); + mse_result->backward(); + auto example_tensor_grad = core::to_xtensor(example_tensor->get_grad()); + auto expected_example_tensor_grad = xt::xarray( + {{{{5.2452e-05F, + 1.0490e-04F, + -2.0742e-05F, + 2.0981e-04F, + 5.2452e-05F, + 1.0490e-04F, + -2.0742e-05F, + 2.0981e-04F}}}}); + EXPECT_TRUE(xt::allclose(example_tensor_grad, expected_example_tensor_grad, 1.0e-3F, 1e-2F)); + + auto gamma_grad = core::to_xtensor(gamma->get_grad()); + auto expected_gamma_grad = + xt::xarray({{{{0.0334F, 0.1338F, 0.2988F, 0.5352F, 0.0334F, 0.1338F, 0.2988F, 0.5352F}}}}); + EXPECT_TRUE(xt::allclose(gamma_grad, expected_gamma_grad, 1.0e-3F, 1e-2F)); +} + +TEST_F(RMSNormOpTest, RMSNorm_Forward_Batch) { + using namespace ttml; + float eps = 0.0078125F; // default in PyTorch for bf16 + + // 2 batches, 1 sequence, 20 tokens, 5-dim'l embedding space. 
+ std::array a_shape = {2, 1, 20, 5}; + xt::xarray a_xarray = xt::xarray::from_shape(a_shape); + std::generate(a_xarray.begin(), a_xarray.end(), [cur = 0.0F]() mutable { return (cur++); }); + + auto example_tensor = autograd::create_tensor(core::from_xtensor(a_xarray, &autograd::ctx().get_device())); + auto gamma = autograd::create_tensor(core::ones(core::create_shape({1, 1, 1, 5}), &autograd::ctx().get_device())); + + auto result = ops::rmsnorm(example_tensor, gamma, 0.0078125F); + auto result_xtensor = core::to_xtensor(result->get_value()); + xt::xarray expected_result = { + {{{0.00000F, 0.40820F, 0.81641F, 1.22656F, 1.63281F}, {0.69922F, 0.83984F, 0.98047F, 1.11719F, 1.25781F}, + {0.82812F, 0.91016F, 0.99219F, 1.07812F, 1.15625F}, {0.87891F, 0.93750F, 0.99609F, 1.05469F, 1.11719F}, + {0.90625F, 0.95312F, 0.99609F, 1.04688F, 1.08594F}, {0.92578F, 0.96094F, 1.00000F, 1.03906F, 1.07031F}, + {0.93750F, 0.96875F, 1.00000F, 1.03125F, 1.06250F}, {0.94531F, 0.97266F, 1.00000F, 1.02344F, 1.05469F}, + {0.95312F, 0.97656F, 1.00000F, 1.02344F, 1.04688F}, {0.95703F, 0.97656F, 1.00000F, 1.02344F, 1.03906F}, + {0.96094F, 0.98047F, 1.00000F, 1.01562F, 1.03906F}, {0.96484F, 0.98047F, 1.00000F, 1.01562F, 1.03125F}, + {0.96875F, 0.98438F, 1.00000F, 1.01562F, 1.03125F}, {0.96875F, 0.98438F, 1.00000F, 1.01562F, 1.03125F}, + {0.97266F, 0.98438F, 1.00000F, 1.01562F, 1.03125F}, {0.97266F, 0.98828F, 1.00000F, 1.01562F, 1.02344F}, + {0.97656F, 0.98828F, 1.00000F, 1.01562F, 1.02344F}, {0.97656F, 0.98828F, 1.00000F, 1.00781F, 1.02344F}, + {0.97656F, 0.98828F, 1.00000F, 1.00781F, 1.02344F}, {0.98047F, 0.98828F, 1.00000F, 1.00781F, 1.02344F}}}, + {{{0.98047F, 0.98828F, 1.00000F, 1.00781F, 1.01562F}, {0.98047F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, + {0.98047F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, {0.98438F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, + {0.98438F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, {0.98438F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, + {0.98438F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, {0.98438F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, + {0.98438F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, {0.98828F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, + {0.98828F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, {0.98828F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, + {0.98828F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, {0.98828F, 0.99219F, 1.00000F, 1.00781F, 1.01562F}, + {0.98828F, 0.99609F, 1.00000F, 1.00781F, 1.00781F}, {0.98828F, 0.99609F, 1.00000F, 1.00781F, 1.00781F}, + {0.98828F, 0.99609F, 1.00000F, 1.00781F, 1.00781F}, {0.98828F, 0.99609F, 1.00000F, 1.00781F, 1.00781F}, + {0.98828F, 0.99609F, 1.00000F, 1.00781F, 1.00781F}, {0.98828F, 0.99609F, 1.00000F, 1.00781F, 1.00781F}}}}; + assert((expected_result.shape() == result_xtensor.shape())); + EXPECT_TRUE(xt::allclose(result_xtensor, expected_result, 6e-2F, 1e-8F)); +} + +TEST_F(RMSNormOpTest, RMSNorm_Backward_Batch) { + using namespace ttml; + float eps = 0.0078125F; // default in PyTorch for bf16 + + // 2 batches, 1 sequence, 20 tokens, 5-dim'l embedding space. 
+ std::array a_shape = {2, 1, 20, 5}; + xt::xarray a_xarray = xt::xarray::from_shape(a_shape); + std::generate(a_xarray.begin(), a_xarray.end(), [cur = 0.0F]() mutable { return (cur++); }); + + auto example_tensor = autograd::create_tensor(core::from_xtensor(a_xarray, &autograd::ctx().get_device())); + auto gamma = autograd::create_tensor(core::ones(core::create_shape({1, 1, 1, 5}), &autograd::ctx().get_device())); + + auto result = ops::rmsnorm(example_tensor, gamma, 0.0078125F); + auto result_xtensor = core::to_xtensor(result->get_value()); + + auto target = autograd::create_tensor(core::zeros_like(result->get_value())); + auto mse_result = ttml::ops::mse_loss(result, target); + mse_result->backward(); + + auto example_tensor_grad = core::to_xtensor(example_tensor->get_grad()); + xt::xarray expected_example_tensor_grad = xt::zeros_like(a_xarray); + EXPECT_TRUE(xt::allclose(example_tensor_grad, expected_example_tensor_grad, 5e-2F, 1e-3F)); + + auto gamma_grad = core::to_xtensor(gamma->get_grad()); + xt::xarray expected_gamma_grad = {{{{0.36111F, 0.37644F, 0.39589F, 0.41945F, 0.44712F}}}}; + EXPECT_TRUE(xt::allclose(gamma_grad, expected_gamma_grad, 5e-2F)); +} From 3200cb91afaaa05b33d7f23db424912ac370cfbb Mon Sep 17 00:00:00 2001 From: Atul Krishnadas Date: Tue, 18 Feb 2025 21:58:04 -0800 Subject: [PATCH 156/316] Atulk/fill pad sharded v2 -- reverted from main due to new assert, changed tests accordingly (#17963) ### Ticket [#17094](https://github.com/tenstorrent/tt-metal/issues/17094) ### Problem description Add sharded support for fill_implicit_padding ### What's changed First op to simply utilize the new shardedAddrGen by @jvegaTT ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13399211627) --------- Co-authored-by: Juan Camilo Vega --- .../unit_tests/operations/test_fill_pad.py | 144 +++++++++++++++++- .../fill_pad/device/fill_pad_op.cpp | 6 - .../device/fill_pad_program_factory.cpp | 13 +- .../kernels/dataflow/fill_pad_writer.cpp | 29 +++- 4 files changed, 179 insertions(+), 13 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_fill_pad.py b/tests/ttnn/unit_tests/operations/test_fill_pad.py index 48dff554b6c..3f1b9289e7f 100644 --- a/tests/ttnn/unit_tests/operations/test_fill_pad.py +++ b/tests/ttnn/unit_tests/operations/test_fill_pad.py @@ -5,6 +5,7 @@ import pytest import torch import ttnn +import math from tests.ttnn.utils_for_testing import assert_with_pcc from models.utility_functions import torch_random, run_for_wormhole_b0 @@ -53,11 +54,9 @@ def create_nd_padded_tiled_tensor(shape, tile_size, fill_value, dtype): } -# @pytest.mark.parametrize("shape", [(2, 32, 300, 256)]) @pytest.mark.parametrize( "shape", [ - # 2D shapes with edge cases for fill_pad (1, 16), (16, 1), (1, 17), @@ -67,6 +66,7 @@ def create_nd_padded_tiled_tensor(shape, tile_size, fill_value, dtype): (31, 31), (33, 33), (65, 65), + (97, 97), (1, 2, 3, 2, 1, 2, 97, 97), ], ) @@ -96,3 +96,143 @@ def test_fill_pad( padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor) + + +@pytest.mark.parametrize("fill_value", [1]) +@pytest.mark.parametrize( + "shape", + [ + (1, 16), + (97, 97), + ], +) +@pytest.mark.parametrize( + "shard_scheme", + [ + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + ], +) +@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.uint32]) +def test_fill_pad_complex_sharding(device, fill_value, shape, shard_scheme, dtype): + 
torch.manual_seed(1234) + torch_input_tensor, padded_torch_tensor = create_nd_padded_tiled_tensor( + shape, 32, fill_value, ttnn_dtype_to_torch_dtype[dtype] + ) + num_cores_xblock = 2 + num_cores_yblock = 4 + num_cores = num_cores_xblock * num_cores_yblock + + # Add complex shard grid with 2 X 4 = 8 cores + shard_grid = ttnn.CoreRangeSet( + [ + ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 1)), + ttnn.CoreRange(ttnn.CoreCoord(2, 0), ttnn.CoreCoord(3, 1)), + ttnn.CoreRange(ttnn.CoreCoord(0, 4), ttnn.CoreCoord(0, 5)), + ] + ) + + tiles_per_2d = padded_torch_tensor.shape[-2] * padded_torch_tensor.shape[-1] / (32 * 32) + dims_b4_last_dim = 1 + for i in range(len(padded_torch_tensor.shape) - 1): + dims_b4_last_dim *= padded_torch_tensor.shape[i] + + shard_shape = [32, 32] + if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED: + shard_shape = (dims_b4_last_dim, 32 * math.ceil((math.ceil(padded_torch_tensor.shape[-1] / 32) / num_cores))) + elif shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED: + tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores) + shard_shape = (32 * tile_widths_per_core, padded_torch_tensor.shape[-1]) + else: + shard_shape = (math.ceil(math.sqrt(tiles_per_core)), math.ceil(math.sqrt(tiles_per_core))) + + shard_spec = ttnn.ShardSpec(shard_grid, shard_shape, ttnn.ShardOrientation.ROW_MAJOR) + output_mem_config = ttnn.MemoryConfig( + shard_scheme, + ttnn.BufferType.L1, + shard_spec, + ) + + input_tensor = ttnn.to_device( + ttnn.from_torch(torch_input_tensor, dtype=dtype, layout=ttnn.TILE_LAYOUT), + device, + memory_config=output_mem_config, + ) + + output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=ttnn.DRAM_MEMORY_CONFIG) + padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() + + assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor, 0.99) + + +@pytest.mark.parametrize("fill_value", [1]) +@pytest.mark.parametrize( + "shape", + [ + (1, 16), + (16, 1), + (17, 17), + (17, 1), + (16, 16), + (17, 17), + (31, 31), + (33, 33), + (97, 97), + ], +) +@pytest.mark.parametrize( + "shard_scheme", + [ + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + ttnn.TensorMemoryLayout.BLOCK_SHARDED, + ], +) +@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.uint32]) +def test_fill_pad_sharded(device, fill_value, shape, shard_scheme, dtype): + torch.manual_seed(1234) + torch_input_tensor, padded_torch_tensor = create_nd_padded_tiled_tensor( + shape, 32, fill_value, ttnn_dtype_to_torch_dtype[dtype] + ) + + num_cores_x = 8 + num_cores_y = 7 + num_cores = num_cores_x * num_cores_y + shard_grid = ttnn.CoreRangeSet( + [ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(num_cores_x - 1, num_cores_y - 1))] + ) + + tiles_per_2d = padded_torch_tensor.shape[-2] * padded_torch_tensor.shape[-1] / (32 * 32) + dims_b4_last_dim = 1 + for i in range(len(padded_torch_tensor.shape) - 1): + dims_b4_last_dim *= padded_torch_tensor.shape[i] + + shard_shape = [32, 32] + if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED: + shard_shape = (dims_b4_last_dim, 32 * math.ceil((math.ceil(padded_torch_tensor.shape[-1] / 32) / num_cores))) + elif shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED: + tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores) + shard_shape = (32 * tile_widths_per_core, padded_torch_tensor.shape[-1]) + elif shard_scheme == ttnn.TensorMemoryLayout.BLOCK_SHARDED: + tile_widths_per_core = math.ceil(dims_b4_last_dim / num_cores_x) + shard_shape = (32 * 
tile_widths_per_core, 32 * math.ceil((padded_torch_tensor.shape[-1] / 32 / num_cores_y))) + else: + shard_shape = (math.ceil(math.sqrt(tiles_per_core)), math.ceil(math.sqrt(tiles_per_core))) + + shard_spec = ttnn.ShardSpec(shard_grid, shard_shape, ttnn.ShardOrientation.ROW_MAJOR) + output_mem_config = ttnn.MemoryConfig( + shard_scheme, + ttnn.BufferType.L1, + shard_spec, + ) + + input_tensor = ttnn.to_device( + ttnn.from_torch(torch_input_tensor, dtype=dtype, layout=ttnn.TILE_LAYOUT), + device, + memory_config=output_mem_config, + ) + + output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=ttnn.DRAM_MEMORY_CONFIG) + padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() + + assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor, 0.99) diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp index 78c13267c69..3de81f581ff 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp @@ -14,12 +14,6 @@ namespace ttnn::operations::data_movement { void FillPad::validate(const std::vector& input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); TT_FATAL(input_tensor_a.get_layout() == TILE_LAYOUT, "FillPad should only be used for tile layout"); - TT_FATAL( - input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::INTERLEAVED, - "FillPad does not currently support sharding"); - TT_FATAL( - this->output_mem_config.memory_layout == TensorMemoryLayout::INTERLEAVED, - "FillPad does not currently support sharding"); } std::vector FillPad::compute_output_specs(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp index e798d9f0c3f..b07c6e65bf0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp @@ -9,6 +9,7 @@ #include #include #include +#include "ttnn/operations/ccl/sharding_addrgen_helper.hpp" bool is_power_of_two_at_least_32(uint32_t value) { return value >= 32 && (value & (value - 1)) == 0; } @@ -68,6 +69,8 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, padded_height / tt::constants::TILE_HEIGHT * padded_width / tt::constants::TILE_HEIGHT; uint32_t tiles_per_tile_row = padded_width / tt::constants::TILE_HEIGHT; + bool sharded = input_tensor.memory_config().memory_layout != TensorMemoryLayout::INTERLEAVED; + // create kernel // reader compile time args std::vector writer_compile_time_args = { @@ -82,7 +85,12 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, (std::uint32_t)tiles_per_2d_tensor, (std::uint32_t)tiles_per_tile_row, (std::uint32_t)tt::constants::TILE_HEIGHT, - (std::uint32_t)tt::constants::FACE_HEIGHT}; + (std::uint32_t)tt::constants::FACE_HEIGHT, + (std::uint32_t)sharded}; + + if (sharded) { + shard_builder::extend_sharding_compile_time_args(input_tensor, writer_compile_time_args); + } tt::tt_metal::KernelHandle writer_kernel_id = tt::tt_metal::CreateKernel( program, @@ -102,6 +110,9 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, { writer_runtime_args[2] = tile_offset; writer_runtime_args[3] = 
local_num_2d_tensors; + if (sharded) { + shard_builder::extend_sharding_run_time_args(input_tensor, writer_runtime_args); + } tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, writer_runtime_args); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp index a94aa7fdea0..e2ecff02ddc 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include "dataflow_api.h" +#include "cpp/ttnn/operations/ccl/shared_with_host/sharded_tensor_addr_gen.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/sharding_addrgen.hpp" void kernel_main() { constexpr uint32_t cb_id_0 = get_compile_time_arg_val(0); @@ -19,20 +21,38 @@ void kernel_main() { constexpr uint32_t tile_size = get_compile_time_arg_val(10); constexpr uint32_t tile_hw = tile_size * tile_size; constexpr uint32_t face_size = get_compile_time_arg_val(11); +#define SHARDED get_compile_time_arg_val(12) == 1 constexpr uint32_t face_hw = face_size * face_size; constexpr uint32_t alignment_adjustor = 16; - uint32_t dst_addr = get_arg_val(0); - uint32_t cb_page_size = get_arg_val(1); - uint32_t starting_tile_offset = get_arg_val(2); - uint32_t num_2d_tensors = get_arg_val(3); + uint32_t rt_arg_ind = 0; + uint32_t dst_addr = get_arg_val(rt_arg_ind++); + uint32_t cb_page_size = get_arg_val(rt_arg_ind++); + uint32_t starting_tile_offset = get_arg_val(rt_arg_ind++); + uint32_t num_2d_tensors = get_arg_val(rt_arg_ind++); +#if (SHARDED) + typedef ShardedInfo< + get_compile_time_arg_val(13), + get_compile_time_arg_val(14), + get_compile_time_arg_val(15), + get_compile_time_arg_val(16), + get_compile_time_arg_val(17), + get_compile_time_arg_val(18), + get_compile_time_arg_val(19)> + tensor_shard_info; + + const auto [mapping_table, rt_increment] = + experimental::shard_addr_gen_utils::get_shard_map(get_arg_addr(rt_arg_ind)); + experimental::ShardedAddrGen s0 = {.bank_base_address = dst_addr, .shard_array = mapping_table}; +#else const DataFormat data_format = get_dataformat(cb_id_0); const InterleavedAddrGenFast s0 = { .bank_base_address = dst_addr, .page_size = tile_hw * element_size_bytes, .data_format = data_format // page_size needs to be tile_size_bytes }; +#endif // Reserve and push the fill value into the circular buffer cb_reserve_back(cb_id_0, 1); @@ -82,4 +102,5 @@ void kernel_main() { for (uint32_t t = 0; t < num_2d_tensors; t++) { fill_pad_2d_tensor(t * tiles_per_2d_tensor + starting_tile_offset); } + noc_async_write_barrier(); } From 4a2bc8106b11d41806c46a0c98c1630cf94d0dd0 Mon Sep 17 00:00:00 2001 From: David Ma Date: Wed, 19 Feb 2025 00:17:14 +0000 Subject: [PATCH 157/316] #0: Remove unused CommandQueue functions --- tt_metal/api/tt-metalium/command_queue.hpp | 19 +---- .../impl/dispatch/hardware_command_queue.cpp | 70 +++++-------------- .../impl/dispatch/hardware_command_queue.hpp | 19 +---- 3 files changed, 22 insertions(+), 86 deletions(-) diff --git a/tt_metal/api/tt-metalium/command_queue.hpp b/tt_metal/api/tt-metalium/command_queue.hpp index 3c1a57fe7e7..18a87e6a169 100644 --- a/tt_metal/api/tt-metalium/command_queue.hpp +++ b/tt_metal/api/tt-metalium/command_queue.hpp @@ -27,7 +27,6 @@ class CommandQueue { virtual ~CommandQueue() = default; virtual const CoreCoord& 
virtual_enqueue_program_dispatch_core() const = 0; - virtual const CoreCoord& completion_queue_writer_core() const = 0; virtual volatile bool is_dprint_server_hung() = 0; virtual volatile bool is_noc_hung() = 0; @@ -52,9 +51,7 @@ class CommandQueue { virtual IDevice* device() = 0; - // These functions are temporarily needed since MeshCommandQueue relies on the CommandQueue object - virtual uint32_t get_expected_num_workers_completed_for_sub_device(uint32_t sub_device_index) const = 0; - virtual void set_expected_num_workers_completed_for_sub_device(uint32_t sub_device_index, uint32_t num_workers) = 0; + // This function is temporarily needed since MeshCommandQueue relies on the CommandQueue object virtual WorkerConfigBufferMgr& get_config_buffer_mgr(uint32_t index) = 0; virtual void enqueue_trace(const uint32_t trace_id, bool blocking) = 0; @@ -62,13 +59,7 @@ class CommandQueue { virtual void enqueue_program(Program& program, bool blocking) = 0; virtual void enqueue_read_buffer( - std::shared_ptr& buffer, - void* dst, - const BufferRegion& region, - bool blocking, - tt::stl::Span sub_device_ids = {}) = 0; - virtual void enqueue_read_buffer( - Buffer& buffer, + const std::variant, std::shared_ptr>& buffer, void* dst, const BufferRegion& region, bool blocking, @@ -85,12 +76,6 @@ class CommandQueue { const BufferRegion& region, bool blocking, tt::stl::Span sub_device_ids = {}) = 0; - virtual void enqueue_write_buffer( - Buffer& buffer, - const void* src, - const BufferRegion& region, - bool blocking, - tt::stl::Span sub_device_ids = {}) = 0; virtual void finish(tt::stl::Span sub_device_ids) = 0; }; diff --git a/tt_metal/impl/dispatch/hardware_command_queue.cpp b/tt_metal/impl/dispatch/hardware_command_queue.cpp index d0aa1824264..ebbcca6781d 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.cpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.cpp @@ -91,21 +91,6 @@ std::optional HWCommandQueue::tid() const { return this->tid_; } SystemMemoryManager& HWCommandQueue::sysmem_manager() { return this->manager; } -uint32_t HWCommandQueue::get_expected_num_workers_completed_for_sub_device(uint32_t sub_device_index) const { - TT_FATAL( - sub_device_index < DispatchSettings::DISPATCH_MESSAGE_ENTRIES, - "Expected sub_device_index to be less than DispatchSettings::DISPATCH_MESSAGE_ENTRIES"); - return this->expected_num_workers_completed[sub_device_index]; -} - -void HWCommandQueue::set_expected_num_workers_completed_for_sub_device( - uint32_t sub_device_index, uint32_t num_workers) { - TT_FATAL( - sub_device_index < DispatchSettings::DISPATCH_MESSAGE_ENTRIES, - "Expected sub_device_index to be less than DispatchSettings::DISPATCH_MESSAGE_ENTRIES"); - this->expected_num_workers_completed[sub_device_index] = num_workers; -} - void HWCommandQueue::reset_worker_state( bool reset_launch_msg_state, uint32_t num_sub_devices, const vector_memcpy_aligned& go_signal_noc_data) { TT_FATAL(!this->manager.get_bypass_mode(), "Cannot reset worker state during trace capture"); @@ -182,45 +167,37 @@ void HWCommandQueue::enqueue_command(T& command, bool blocking, tt::stl::Span& buffer, - void* dst, - const BufferRegion& region, - bool blocking, - tt::stl::Span sub_device_ids) { - this->enqueue_read_buffer(*buffer, dst, region, blocking, sub_device_ids); -} - // Read buffer command is enqueued in the issue region and device writes requested buffer data into the completion // region void HWCommandQueue::enqueue_read_buffer( - Buffer& buffer, + const std::variant, std::shared_ptr>& buffer, void* dst, const 
BufferRegion& region, bool blocking, tt::stl::Span sub_device_ids) { ZoneScopedN("HWCommandQueue_read_buffer"); TT_FATAL(!this->manager.get_bypass_mode(), "Enqueue Read Buffer cannot be used with tracing"); + Buffer& buffer_obj = get_buffer_object(buffer); sub_device_ids = buffer_dispatch::select_sub_device_ids(this->device_, sub_device_ids); - if (is_sharded(buffer.buffer_layout())) { + if (is_sharded(buffer_obj.buffer_layout())) { // Forward data from each core to the completion queue. // Then have the completion queue reader thread copy this data to user space. auto dispatch_params = buffer_dispatch::initialize_sharded_buf_read_dispatch_params( - buffer, this->id_, this->expected_num_workers_completed, region); + buffer_obj, this->id_, this->expected_num_workers_completed, region); auto cores = buffer_dispatch::get_cores_for_sharded_buffer( - dispatch_params.width_split, dispatch_params.buffer_page_mapping, buffer); - for (uint32_t core_id = 0; core_id < buffer.num_cores(); ++core_id) { + dispatch_params.width_split, dispatch_params.buffer_page_mapping, buffer_obj); + for (uint32_t core_id = 0; core_id < buffer_obj.num_cores(); ++core_id) { buffer_dispatch::copy_sharded_buffer_from_core_to_completion_queue( core_id, - buffer, + buffer_obj, dispatch_params, sub_device_ids, cores[core_id], dispatch_core_manager::instance().get_dispatch_core_type(device_->id())); if (dispatch_params.pages_per_txn > 0) { this->issued_completion_q_reads.push( - buffer_dispatch::generate_sharded_buffer_read_descriptor(dst, dispatch_params, buffer)); + buffer_dispatch::generate_sharded_buffer_read_descriptor(dst, dispatch_params, buffer_obj)); this->increment_num_entries_in_completion_q(); } } @@ -228,15 +205,15 @@ void HWCommandQueue::enqueue_read_buffer( // Forward data from device to the completion queue. // Then have the completion queue reader thread copy this data to user space. 
auto dispatch_params = buffer_dispatch::initialize_interleaved_buf_read_dispatch_params( - buffer, this->id_, this->expected_num_workers_completed, region); + buffer_obj, this->id_, this->expected_num_workers_completed, region); buffer_dispatch::copy_interleaved_buffer_to_completion_queue( dispatch_params, - buffer, + buffer_obj, sub_device_ids, dispatch_core_manager::instance().get_dispatch_core_type(device_->id())); if (dispatch_params.pages_per_txn > 0) { this->issued_completion_q_reads.push( - buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params, buffer)); + buffer_dispatch::generate_interleaved_buffer_read_descriptor(dst, dispatch_params, buffer_obj)); this->increment_num_entries_in_completion_q(); } } @@ -251,6 +228,8 @@ void HWCommandQueue::enqueue_write_buffer( const BufferRegion& region, bool blocking, tt::stl::Span sub_device_ids) { + ZoneScopedN("HWCommandQueue_write_buffer"); + TT_FATAL(!this->manager.get_bypass_mode(), "Enqueue Write Buffer cannot be used with tracing"); // Top level API to accept different variants for buffer and src // For shared pointer variants, object lifetime is guaranteed at least till the end of this function auto* data = std::visit( @@ -259,33 +238,22 @@ void HWCommandQueue::enqueue_write_buffer( [](const auto& data) -> const void* { return data->data(); }}, src); Buffer& buffer_obj = get_buffer_object(buffer); - this->enqueue_write_buffer(buffer_obj, data, region, blocking, sub_device_ids); -} - -CoreType HWCommandQueue::get_dispatch_core_type() { - return dispatch_core_manager::instance().get_dispatch_core_type(device_->id()); -} - -void HWCommandQueue::enqueue_write_buffer( - Buffer& buffer, - const void* src, - const BufferRegion& region, - bool blocking, - tt::stl::Span sub_device_ids) { - ZoneScopedN("HWCommandQueue_write_buffer"); - TT_FATAL(!this->manager.get_bypass_mode(), "Enqueue Write Buffer cannot be used with tracing"); sub_device_ids = buffer_dispatch::select_sub_device_ids(this->device_, sub_device_ids); auto dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device_->id()); buffer_dispatch::write_to_device_buffer( - src, buffer, region, this->id_, this->expected_num_workers_completed, dispatch_core_type, sub_device_ids); + data, buffer_obj, region, this->id_, this->expected_num_workers_completed, dispatch_core_type, sub_device_ids); if (blocking) { this->finish(sub_device_ids); } } +CoreType HWCommandQueue::get_dispatch_core_type() { + return dispatch_core_manager::instance().get_dispatch_core_type(device_->id()); +} + void HWCommandQueue::enqueue_program(Program& program, bool blocking) { ZoneScopedN("HWCommandQueue_enqueue_program"); std::vector sub_device_ids = {program.determine_sub_device_ids(device_)}; @@ -565,8 +533,6 @@ const CoreCoord& HWCommandQueue::virtual_enqueue_program_dispatch_core() const { return this->virtual_enqueue_program_dispatch_core_; } -const CoreCoord& HWCommandQueue::completion_queue_writer_core() const { return this->completion_queue_writer_core_; } - void HWCommandQueue::record_begin(const uint32_t tid, const std::shared_ptr& ctx) { auto num_sub_devices = this->device_->num_sub_devices(); // Record the original value of expected_num_workers_completed, and reset it to 0. 
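Note: the hunk above collapses the separate `Buffer&` and `std::shared_ptr<Buffer>` overloads into one entry point taking a `std::variant`, resolved once through `get_buffer_object` so the rest of the function only sees a `Buffer&`. A minimal, self-contained sketch of that pattern follows; `Buffer`, `AnyBuffer` and `get_buffer_object` here are toy stand-ins for illustration, not the real tt-metal classes.

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <memory>
#include <type_traits>
#include <variant>

// Toy stand-in; only the dispatch pattern is the point.
struct Buffer {
    std::size_t size = 0;
};

using AnyBuffer = std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer>>;

// Resolve the variant to a plain reference once, so the body of the enqueue
// function has a single code path instead of one per overload.
Buffer& get_buffer_object(const AnyBuffer& buffer) {
    return std::visit(
        [](const auto& b) -> Buffer& {
            using T = std::decay_t<decltype(b)>;
            if constexpr (std::is_same_v<T, std::reference_wrapper<Buffer>>) {
                return b.get();
            } else {
                return *b;
            }
        },
        buffer);
}

void enqueue_read_buffer(const AnyBuffer& buffer, void* dst) {
    Buffer& buffer_obj = get_buffer_object(buffer);
    std::cout << "would read " << buffer_obj.size << " bytes into " << dst << "\n";
}

int main() {
    Buffer owned{1024};
    auto shared = std::make_shared<Buffer>(Buffer{2048});
    char dst[2048];
    enqueue_read_buffer(std::ref(owned), dst);  // caller keeps ownership
    enqueue_read_buffer(shared, dst);           // shared_ptr keeps the buffer alive for the call
}
```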
diff --git a/tt_metal/impl/dispatch/hardware_command_queue.hpp b/tt_metal/impl/dispatch/hardware_command_queue.hpp index eeb8c1b9fe8..a9a7a418d81 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.hpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.hpp @@ -28,7 +28,6 @@ class HWCommandQueue : public CommandQueue { ~HWCommandQueue() override; const CoreCoord& virtual_enqueue_program_dispatch_core() const override; - const CoreCoord& completion_queue_writer_core() const override; volatile bool is_dprint_server_hung() override; volatile bool is_noc_hung() override; @@ -51,21 +50,13 @@ class HWCommandQueue : public CommandQueue { void terminate() override; - // These functions are temporarily needed since MeshCommandQueue relies on the CommandQueue object - uint32_t get_expected_num_workers_completed_for_sub_device(uint32_t sub_device_index) const override; - void set_expected_num_workers_completed_for_sub_device(uint32_t sub_device_index, uint32_t num_workers) override; + // This function is temporarily needed since MeshCommandQueue relies on the CommandQueue object WorkerConfigBufferMgr& get_config_buffer_mgr(uint32_t index) override; void enqueue_trace(const uint32_t trace_id, bool blocking) override; void enqueue_program(Program& program, bool blocking) override; void enqueue_read_buffer( - std::shared_ptr& buffer, - void* dst, - const BufferRegion& region, - bool blocking, - tt::stl::Span sub_device_ids = {}) override; - void enqueue_read_buffer( - Buffer& buffer, + const std::variant, std::shared_ptr>& buffer, void* dst, const BufferRegion& region, bool blocking, @@ -81,12 +72,6 @@ class HWCommandQueue : public CommandQueue { const BufferRegion& region, bool blocking, tt::stl::Span sub_device_ids = {}) override; - void enqueue_write_buffer( - Buffer& buffer, - const void* src, - const BufferRegion& region, - bool blocking, - tt::stl::Span sub_device_ids = {}) override; void finish(tt::stl::Span sub_device_ids) override; From f9f72c5dff8864e59920e51528bc6e794f80581b Mon Sep 17 00:00:00 2001 From: Nemanja Grujic <109360083+nemanjagrujic@users.noreply.github.com> Date: Wed, 19 Feb 2025 10:21:39 +0100 Subject: [PATCH 158/316] #8865: Update changed ttnn ops in dispatch time profiling infra (#17675) ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/8865) ### Problem description Dispatch time profiling infra (tests/ttnn/profiling), is no longer working since many ttnn ops changes. ### What's changed Dispatch time profiling infra is fixed. 
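For context on what the fixed infra reports: each row of reference.txt carries the per-op call count plus the min/mean host dispatch time and the mean dispatch + sync time. The sketch below shows the measurement idea only; `op` and `sync` are hypothetical stand-ins for an enqueued ttnn op and a blocking device wait, not the real harness.

```cpp
#include <chrono>
#include <cstdio>
#include <functional>

struct DispatchTimes {
    double mean_dispatch_ms;            // host time to enqueue, device not waited on
    double mean_dispatch_plus_sync_ms;  // enqueue time plus a blocking device wait
};

DispatchTimes measure(const std::function<void()>& op, const std::function<void()>& sync, int iterations) {
    using clock = std::chrono::steady_clock;

    op();    // warm-up: the first call pays one-off costs (program creation, caches)
    sync();

    const auto t0 = clock::now();
    for (int i = 0; i < iterations; ++i) {
        op();                      // only host-side enqueue cost lands on this timeline
    }
    const auto t1 = clock::now();
    sync();                        // drain everything queued above
    const auto t2 = clock::now();

    const double dispatch_ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    const double with_sync_ms = std::chrono::duration<double, std::milli>(t2 - t0).count();
    return {dispatch_ms / iterations, with_sync_ms / iterations};
}

int main() {
    auto fake_op = [] { /* enqueue work here */ };
    auto fake_sync = [] { /* wait for the device here */ };
    const DispatchTimes t = measure(fake_op, fake_sync, 200);  // 200 matches the counts in reference.txt
    std::printf("mean dispatch %.3f ms, dispatch+sync %.3f ms\n", t.mean_dispatch_ms, t.mean_dispatch_plus_sync_ms);
}
```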
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes --- tests/ttnn/profiling/ops_for_profiling.py | 191 ++++++------------ .../profile_host_overhead_with_tracy.py | 6 +- tests/ttnn/profiling/reference.txt | 62 +++--- 3 files changed, 102 insertions(+), 157 deletions(-) diff --git a/tests/ttnn/profiling/ops_for_profiling.py b/tests/ttnn/profiling/ops_for_profiling.py index 8c669973a59..29b32f44ec9 100644 --- a/tests/ttnn/profiling/ops_for_profiling.py +++ b/tests/ttnn/profiling/ops_for_profiling.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 -import tt_lib import ttnn @@ -194,10 +193,6 @@ def threshold_bw(x, y): ttnn.threshold_bw(x, y, 0.7, 10) -def unary_eq_bw(x, y): - tt_lib.tensor.unary_eq_bw(x, y, other=0.7) - - def logiteps_bw(x, y): ttnn.logiteps_bw(x, y, eps=0.0001) @@ -228,51 +223,51 @@ def mseloss(x, y): def primary_moreh_softmax_backward_0(x, y): - tt_lib.operations.primary.moreh_softmax_backward(x, y, dim=0) + ttnn.operations.moreh.softmax_backward(x, y, dim=0) def primary_moreh_softmax_backward_1(x, y): - tt_lib.operations.primary.moreh_softmax_backward(x, y, dim=1) + ttnn.operations.moreh.softmax_backward(x, y, dim=1) def primary_moreh_softmax_backward_2(x, y): - tt_lib.operations.primary.moreh_softmax_backward(x, y, dim=2) + ttnn.operations.moreh.softmax_backward(x, y, dim=2) def primary_moreh_softmax_backward_3(x, y): - tt_lib.operations.primary.moreh_softmax_backward(x, y, dim=3) + ttnn.operations.moreh.softmax_backward(x, y, dim=3) def primary_moreh_softmin_backward_0(x, y): - tt_lib.operations.primary.moreh_softmin_backward(x, y, dim=0) + ttnn.operations.moreh.softmin_backward(x, y, dim=0) def primary_moreh_softmin_backward_1(x, y): - tt_lib.operations.primary.moreh_softmin_backward(x, y, dim=1) + ttnn.operations.moreh.softmin_backward(x, y, dim=1) def primary_moreh_softmin_backward_2(x, y): - tt_lib.operations.primary.moreh_softmin_backward(x, y, dim=2) + ttnn.operations.moreh.softmin_backward(x, y, dim=2) def primary_moreh_softmin_backward_3(x, y): - tt_lib.operations.primary.moreh_softmin_backward(x, y, dim=3) + ttnn.operations.moreh.softmin_backward(x, y, dim=3) def primary_moreh_logsoftmax_backward_0(x, y): - tt_lib.operations.primary.moreh_logsoftmax_backward(x, y, dim=0) + ttnn.operations.moreh.logsoftmax_backward(x, y, dim=0) def primary_moreh_logsoftmax_backward_1(x, y): - tt_lib.operations.primary.moreh_logsoftmax_backward(x, y, dim=1) + ttnn.operations.moreh.logsoftmax_backward(x, y, dim=1) def primary_moreh_logsoftmax_backward_2(x, y): - tt_lib.operations.primary.moreh_logsoftmax_backward(x, y, dim=2) + ttnn.operations.moreh.logsoftmax_backward(x, y, dim=2) def primary_moreh_logsoftmax_backward_3(x, y): - tt_lib.operations.primary.moreh_logsoftmax_backward(x, y, dim=3) + ttnn.operations.moreh.logsoftmax_backward(x, y, dim=3) def primary_scale_mask_softmax_in_place(x, y): @@ -898,10 +893,6 @@ def unary_div_bw(x, y): "op": threshold_bw, "name": "ttnn.threshold_bw", }, - { - "op": unary_eq_bw, - "name": "tt_lib.tensor.unary_eq_bw", - }, { "op": ttnn.logit_bw, "name": "ttnn.logit_bw", @@ -965,51 +956,51 @@ def unary_div_bw(x, y): }, { "op": primary_moreh_softmax_backward_0, - "name": "tt_lib.operations.primary.moreh_softmax_backward_dim_0", + "name": "ttnn.operations.moreh.softmax_backward_dim_0", }, { "op": primary_moreh_softmax_backward_1, - "name": "tt_lib.operations.primary.moreh_softmax_backward_dim_1", + "name": "ttnn.operations.moreh.softmax_backward_dim_1", }, { 
"op": primary_moreh_softmax_backward_2, - "name": "tt_lib.operations.primary.moreh_softmax_backward_dim_2", + "name": "ttnn.operations.moreh.softmax_backward_dim_2", }, { "op": primary_moreh_softmax_backward_3, - "name": "tt_lib.operations.primary.moreh_softmax_backward_dim_3", + "name": "ttnn.operations.moreh.softmax_backward_dim_3", }, { "op": primary_moreh_softmin_backward_0, - "name": "tt_lib.operations.primary.moreh_softmin_backward_dim_0", + "name": "ttnn.operations.moreh.softmin_backward_dim_0", }, { "op": primary_moreh_softmin_backward_1, - "name": "tt_lib.operations.primary.moreh_softmin_backward_dim_1", + "name": "ttnn.operations.moreh.softmin_backward_dim_1", }, { "op": primary_moreh_softmin_backward_2, - "name": "tt_lib.operations.primary.moreh_softmin_backward_dim_2", + "name": "ttnn.operations.moreh.softmin_backward_dim_2", }, { "op": primary_moreh_softmin_backward_3, - "name": "tt_lib.operations.primary.moreh_softmin_backward_dim_3", + "name": "ttnn.operations.moreh.softmin_backward_dim_3", }, { "op": primary_moreh_logsoftmax_backward_0, - "name": "tt_lib.operations.primary.moreh_logsoftmax_backward_dim_0", + "name": "ttnn.operations.moreh.logsoftmax_backward_dim_0", }, { "op": primary_moreh_logsoftmax_backward_1, - "name": "tt_lib.operations.primary.moreh_logsoftmax_backward_dim_1", + "name": "ttnn.operations.moreh.logsoftmax_backward_dim_1", }, { "op": primary_moreh_logsoftmax_backward_2, - "name": "tt_lib.operations.primary.moreh_logsoftmax_backward_dim_2", + "name": "ttnn.operations.moreh.logsoftmax_backward_dim_2", }, { "op": primary_moreh_logsoftmax_backward_3, - "name": "tt_lib.operations.primary.moreh_logsoftmax_backward_dim_3", + "name": "ttnn.operations.moreh.logsoftmax_backward_dim_3", }, { "op": primary_scale_mask_softmax_in_place, @@ -1048,24 +1039,24 @@ def unary_div_bw(x, y): # To make # { # "op": conv, -# "name": "tt_lib.tensor.conv", +# "name": "ttnn.conv", # }, # Crashing # { # "op": primaru_moreh_mean_0123, -# "name": "tt_lib.operations.primary.moreh_mean_dims_0123", +# "name": "ttnn.operations.moreh.mean_dims_0123", # "shape_func": primaru_moreh_mean_0123_shape_func, # }, # { # "op": primaru_moreh_mean_023, -# "name": "tt_lib.operations.primary.moreh_mean_dims_023", +# "name": "ttnn.operations.moreh.mean_dims_023", # "shape_func": primaru_moreh_mean_023_shape_func, # }, # { # "op": primaru_moreh_mean_123, -# "name": "tt_lib.operations.primary.moreh_mean_dims_123", +# "name": "ttnn.operations.moreh.mean_dims_123", # "shape_func": primaru_moreh_mean_123_shape_func, # }, @@ -1340,10 +1331,6 @@ def group_norm_no_weights(x): ttnn.group_norm(x, num_groups=32, epsilon=0.00001, weight=None, bias=None) -def convert_conv_weight_tensor_to_tiled_layout(x): - tt_lib.tensor.convert_conv_weight_tensor_to_tiled_layout(x, in1_block_h=32, in1_block_w=32) - - def logical_not_(x): ttnn.logical_not_(x) @@ -1441,51 +1428,51 @@ def argmin_all(x): def primary_moreh_softmax_0(x): - tt_lib.operations.primary.moreh_softmax(x, dim=0) + ttnn.operations.moreh.softmax(x, dim=0) def primary_moreh_softmax_1(x): - tt_lib.operations.primary.moreh_softmax(x, dim=1) + ttnn.operations.moreh.softmax(x, dim=1) def primary_moreh_softmax_2(x): - tt_lib.operations.primary.moreh_softmax(x, dim=2) + ttnn.operations.moreh.softmax(x, dim=2) def primary_moreh_softmax_3(x): - tt_lib.operations.primary.moreh_softmax(x, dim=3) + ttnn.operations.moreh.softmax(x, dim=3) def primary_moreh_softmin_0(x): - tt_lib.operations.primary.moreh_softmin(x, dim=0) + ttnn.operations.moreh.softmin(x, dim=0) def 
primary_moreh_softmin_1(x): - tt_lib.operations.primary.moreh_softmin(x, dim=1) + ttnn.operations.moreh.softmin(x, dim=1) def primary_moreh_softmin_2(x): - tt_lib.operations.primary.moreh_softmin(x, dim=2) + ttnn.operations.moreh.softmin(x, dim=2) def primary_moreh_softmin_3(x): - tt_lib.operations.primary.moreh_softmin(x, dim=3) + ttnn.operations.moreh.softmin(x, dim=3) def primary_moreh_logsoftmax_0(x): - tt_lib.operations.primary.moreh_logsoftmax(x, dim=0) + ttnn.operations.moreh.logsoftmax(x, dim=0) def primary_moreh_logsoftmax_1(x): - tt_lib.operations.primary.moreh_logsoftmax(x, dim=1) + ttnn.operations.moreh.logsoftmax(x, dim=1) def primary_moreh_logsoftmax_2(x): - tt_lib.operations.primary.moreh_logsoftmax(x, dim=2) + ttnn.operations.moreh.logsoftmax(x, dim=2) def primary_moreh_logsoftmax_3(x): - tt_lib.operations.primary.moreh_logsoftmax(x, dim=3) + ttnn.operations.moreh.logsoftmax(x, dim=3) def primary_moreh_norm_0(x): @@ -1501,7 +1488,7 @@ def primary_moreh_norm_2(x): def primary_moreh_norm_3(x): - ttnn.operations.moreh.moreh_norm(x, p=2.0, dim=3) + ttnn.operations.moreh.norm(x, p=2.0, dim=3) def split_dim_3(x): @@ -2096,15 +2083,11 @@ def assign_unary(x): }, { "op": fill_rm, - "name": "tt_lib.tensor.fill_rm", + "name": "ttnn.fill_rm", }, { "op": fill_ones_rm, - "name": "tt_lib.tensor.fill_ones_rm", - }, - { - "op": ttnn.mean, - "name": "tt_lib.tensor.mean_hw", + "name": "ttnn.fill_ones_rm", }, { "op": ttnn.var_hw, @@ -2178,11 +2161,11 @@ def assign_unary(x): }, { "op": pow_int, - "name": "tt_lib.tensor.pow_int", + "name": "ttnn.pow_int", }, { "op": pow_float, - "name": "tt_lib.tensor.pow_float", + "name": "ttnn.pow_float", }, { "op": ttnn.identity, @@ -2229,51 +2212,51 @@ def assign_unary(x): }, { "op": primary_moreh_softmax_0, - "name": "tt_lib.operations.primary.moreh_softmax_dim_0", + "name": "ttnn.operations.moreh.softmax_dim_0", }, { "op": primary_moreh_softmax_1, - "name": "tt_lib.operations.primary.moreh_softmax_dim_1", + "name": "ttnn.operations.moreh.softmax_dim_1", }, { "op": primary_moreh_softmax_2, - "name": "tt_lib.operations.primary.moreh_softmax_dim_2", + "name": "ttnn.operations.moreh.softmax_dim_2", }, { "op": primary_moreh_softmax_3, - "name": "tt_lib.operations.primary.moreh_softmax_dim_3", + "name": "ttnn.operations.moreh.softmax_dim_3", }, { "op": primary_moreh_softmin_0, - "name": "tt_lib.operations.primary.moreh_softmin_dim_0", + "name": "ttnn.operations.moreh.softmin_dim_0", }, { "op": primary_moreh_softmin_1, - "name": "tt_lib.operations.primary.moreh_softmin_dim_1", + "name": "ttnn.operations.moreh.softmin_dim_1", }, { "op": primary_moreh_softmin_2, - "name": "tt_lib.operations.primary.moreh_softmin_dim_2", + "name": "ttnn.operations.moreh.softmin_dim_2", }, { "op": primary_moreh_softmin_3, - "name": "tt_lib.operations.primary_moreh_softmin_dim_3", + "name": "ttnn.operations.moreh.softmin_dim_3", }, { "op": primary_moreh_logsoftmax_0, - "name": "tt_lib.operations.primary.moreh_logsoftmax_dim_0", + "name": "ttnn.operations.moreh.logsoftmax_dim_0", }, { "op": primary_moreh_logsoftmax_1, - "name": "tt_lib.operations.primary.moreh_logsoftmax_dim_1", + "name": "ttnn.operations.moreh.logsoftmax_dim_1", }, { "op": primary_moreh_logsoftmax_2, - "name": "tt_lib.operations.primary.moreh_logsoftmax_dim_2", + "name": "ttnn.operations.moreh.logsoftmax_dim_2", }, { "op": primary_moreh_logsoftmax_3, - "name": "tt_lib.operations.primary.moreh_logsoftmax_dim_3", + "name": "ttnn.operations.moreh.logsoftmax_dim_3", }, { "op": primary_moreh_norm_0, @@ -2355,35 +2338,13 @@ 
def assign_unary(x): # "name": "ttnn.group_norm_no_weights", # }, -# Unsupported storage type -# { -# "op": convert_conv_weight_tensor_to_tiled_layout, -# "name": "tt_lib.tensor.convert_conv_weight_tensor_to_tiled_layout", -# "layout": "ROW_MAJOR", -# }, - - -# Very slow - And crashes sometimes -# { -# "op": argmin_4, -# "name": "tt_lib.tensor.argmin_dim_0", -# }, -# { -# "op": argmax_4, -# "name": "tt_lib.tensor.argmax_dim_0", -# }, - def layernorm(x, y, z): - ttnn.layer_norm(input=x, epsilon=0.0001, weight=y, bias=z) - - -def primary_layernorm(x, y, z): - ttnn.layer_norm(input=x, epsilon=0.0001, weight=y, bias=z) + ttnn.layer_norm(x, epsilon=0.0001, weight=y, bias=z) def norm_shapes_func(input_shape): - input_shape_12 = [input_shape[0], input_shape[1], 32, input_shape[3]] + input_shape_12 = [input_shape[0], input_shape[1], 1, input_shape[3]] return input_shape, input_shape_12, input_shape_12 @@ -2391,10 +2352,6 @@ def add_layernorm(x, y, z): ttnn.layer_norm(x, residual_input_tensor=x, epsilon=0.0001, weight=y, bias=z) -def primary_add_layernorm(x, y, z): - ttnn.layer_norm(x, residual_input_tensor=x, epsilon=0.0001, weight=y, bias=z) - - def group_norm(x, y, z): ttnn.group_norm(x, num_groups=32, epsilon=0.0001, weight=y, bias=x) @@ -2416,7 +2373,7 @@ def primary_moreh_groupnorm_shape_func(input_shape): def rmsnorm(x, y, z): - ttnn.rms_norm(input=x, epsilon=0.0001, weight=y, bias=z) + ttnn.rms_norm(x, epsilon=0.0001, weight=y, bias=z) def addcmul(x, y, z): @@ -2541,11 +2498,6 @@ def linear_shape_func(input_shape): "name": "ttnn.layer_norm", "shape_func": norm_shapes_func, }, - { - "op": primary_layernorm, - "name": "ttnn.layer_norm", - "shape_func": norm_shapes_func, - }, { "op": rmsnorm, "name": "ttnn.rms_norm", @@ -2553,12 +2505,7 @@ def linear_shape_func(input_shape): }, { "op": add_layernorm, - "name": "ttnn.layer_norm", - "shape_func": norm_shapes_func, - }, - { - "op": primary_add_layernorm, - "name": "ttnn.layer_norm", + "name": "ttnn.add_layer_norm", "shape_func": norm_shapes_func, }, { @@ -2607,8 +2554,8 @@ def linear_shape_func(input_shape): "name": "ttnn.add_bw", }, # { - # "op": tt_lib.tensor.embedding_bw, - # "name": "tt_lib.tensor.embedding_bw", + # "op": ttnn.embedding_bw, + # "name": "ttnn.embedding_bw", # }, { "op": where_bw, @@ -2730,19 +2677,11 @@ def linear_shape_func(input_shape): # Gets stuck # { # "op": primary_moreh_groupnorm, -# "name": "tt_lib.operations.primary.moreh_groupnorm", +# "name": "ttnn.operations.moreh.groupnorm", # "shape_func": primary_moreh_groupnorm_shape_func, # }, # { # "op": primary_moreh_groupnorm_backward, -# "name": "tt_lib.operations.primary.moreh_groupnorm_backward", +# "name": "ttnn.operations.moreh.groupnorm_backward", # "shape_func": primary_moreh_groupnorm_backward_shape_func, # } - - -# Seems depricated -# { -# "op": fused_layernorm, -# "name": "tt_lib.fused_ops.layernorm.Layernorm", -# "shape_func": norm_shapes_func, -# }, diff --git a/tests/ttnn/profiling/profile_host_overhead_with_tracy.py b/tests/ttnn/profiling/profile_host_overhead_with_tracy.py index 609a23e53e3..52960df22cc 100644 --- a/tests/ttnn/profiling/profile_host_overhead_with_tracy.py +++ b/tests/ttnn/profiling/profile_host_overhead_with_tracy.py @@ -118,7 +118,11 @@ def profile_host_overhead(output_directory, output_csv, op_to_profile=""): logger.info(f"Analyzing {file}") # Read the csv file - df = pd.read_csv(file) + try: + df = pd.read_csv(file) + except Exception as e: + print(e) + continue # Iterate over the rows in the final dataframe for index, row in 
final_df.iterrows(): diff --git a/tests/ttnn/profiling/reference.txt b/tests/ttnn/profiling/reference.txt index bb4a6bc2123..d537690928a 100644 --- a/tests/ttnn/profiling/reference.txt +++ b/tests/ttnn/profiling/reference.txt @@ -1,13 +1,13 @@ op,count,python min dispatch time (ms),python mean dispatch time(ms),python mean dispatch + sync time (ms),C++ mean dispatch time (ms) tt_lib.fused_ops.softmax.softmax,200,0.179,0.192,0.372,0.103 -tt_lib.operations.primary.moreh_logsoftmax_backward_dim_0,200,0.037,0.031,0.265,0.013 -tt_lib.operations.primary.moreh_logsoftmax_backward_dim_1,200,0.035,0.032,0.293,0.01 -tt_lib.operations.primary.moreh_logsoftmax_backward_dim_2,200,0.035,0.028,0.342,0.011 -tt_lib.operations.primary.moreh_logsoftmax_backward_dim_3,200,0.035,0.028,0.283,0.011 -tt_lib.operations.primary.moreh_logsoftmax_dim_0,200,0.028,0.03,0.291,0.011 -tt_lib.operations.primary.moreh_logsoftmax_dim_1,200,0.028,0.023,0.328,0.01 -tt_lib.operations.primary.moreh_logsoftmax_dim_2,200,0.028,0.023,0.252,0.011 -tt_lib.operations.primary.moreh_logsoftmax_dim_3,200,0.028,0.026,0.224,0.009 +ttnn.operations.moreh.logsoftmax_backward_dim_0,200,0.037,0.031,0.265,0.013 +ttnn.operations.moreh.logsoftmax_backward_dim_1,200,0.035,0.032,0.293,0.01 +ttnn.operations.moreh.logsoftmax_backward_dim_2,200,0.035,0.028,0.342,0.011 +ttnn.operations.moreh.logsoftmax_backward_dim_3,200,0.035,0.028,0.283,0.011 +ttnn.operations.moreh.logsoftmax_dim_0,200,0.028,0.03,0.291,0.011 +ttnn.operations.moreh.logsoftmax_dim_1,200,0.028,0.023,0.328,0.01 +ttnn.operations.moreh.logsoftmax_dim_2,200,0.028,0.023,0.252,0.011 +ttnn.operations.moreh.logsoftmax_dim_3,200,0.028,0.026,0.224,0.009 ttnn.operations.moreh.mean_backward,800,0.032,0.031,0.11,0.012 ttnn.operations.moreh.mean_dims_0,800,0.024,0.026,0.107,0.009 ttnn.operations.moreh.mean_dims_01,800,0.045,0.049,0.122,0.018 @@ -26,22 +26,22 @@ ttnn.operations.moreh.norm_dim_0,200,0.031,0.033,0.572,0.016 ttnn.operations.moreh.norm_dim_1,200,0.032,0.036,0.402,0.018 ttnn.operations.moreh.norm_dim_2,200,0.031,0.034,0.357,0.018 ttnn.operations.moreh.norm_dim_3,200,0.031,0.033,0.357,0.016 -tt_lib.operations.primary.moreh_softmax_backward_dim_0,200,0.043,0.029,0.236,0.011 -tt_lib.operations.primary.moreh_softmax_backward_dim_1,200,0.043,0.028,0.326,0.011 -tt_lib.operations.primary.moreh_softmax_backward_dim_2,200,0.043,0.027,0.262,0.012 -tt_lib.operations.primary.moreh_softmax_backward_dim_3,200,0.043,0.029,0.192,0.012 -tt_lib.operations.primary.moreh_softmax_dim_0,200,0.028,0.029,0.413,0.01 -tt_lib.operations.primary.moreh_softmax_dim_1,200,0.025,0.023,0.42,0.011 -tt_lib.operations.primary.moreh_softmax_dim_2,200,0.026,0.022,0.252,0.01 -tt_lib.operations.primary.moreh_softmax_dim_3,200,0.025,0.023,0.226,0.01 -tt_lib.operations.primary.moreh_softmin_backward_dim_0,200,0.031,0.032,0.236,0.012 -tt_lib.operations.primary.moreh_softmin_backward_dim_1,200,0.031,0.03,0.328,0.011 -tt_lib.operations.primary.moreh_softmin_backward_dim_2,200,0.031,0.027,0.263,0.011 -tt_lib.operations.primary.moreh_softmin_backward_dim_3,200,0.031,0.029,0.192,0.011 -tt_lib.operations.primary.moreh_softmin_dim_0,200,0.025,0.032,0.431,0.012 -tt_lib.operations.primary.moreh_softmin_dim_1,200,0.025,0.027,0.437,0.011 -tt_lib.operations.primary.moreh_softmin_dim_2,200,0.025,0.027,0.263,0.01 -tt_lib.operations.primary_moreh_softmin_dim_3,200,0.025,0.027,0.236,0.009 +ttnn.operations.moreh.softmax_backward_dim_0,200,0.043,0.029,0.236,0.011 +ttnn.operations.moreh.softmax_backward_dim_1,200,0.043,0.028,0.326,0.011 
+ttnn.operations.moreh.softmax_backward_dim_2,200,0.043,0.027,0.262,0.012 +ttnn.operations.moreh.softmax_backward_dim_3,200,0.043,0.029,0.192,0.012 +ttnn.operations.moreh.softmax_dim_0,200,0.028,0.029,0.413,0.01 +ttnn.operations.moreh.softmax_dim_1,200,0.025,0.023,0.42,0.011 +ttnn.operations.moreh.softmax_dim_2,200,0.026,0.022,0.252,0.01 +ttnn.operations.moreh.softmax_dim_3,200,0.025,0.023,0.226,0.01 +ttnn.operations.moreh.softmin_backward_dim_0,200,0.031,0.032,0.236,0.012 +ttnn.operations.moreh.softmin_backward_dim_1,200,0.031,0.03,0.328,0.011 +ttnn.operations.moreh.softmin_backward_dim_2,200,0.031,0.027,0.263,0.011 +ttnn.operations.moreh.softmin_backward_dim_3,200,0.031,0.029,0.192,0.011 +ttnn.operations.moreh.softmin_dim_0,200,0.025,0.032,0.431,0.012 +ttnn.operations.moreh.softmin_dim_1,200,0.025,0.027,0.437,0.011 +ttnn.operations.moreh.softmin_dim_2,200,0.025,0.027,0.263,0.01 +ttnn.operations.moreh.softmin_dim_3,200,0.025,0.027,0.236,0.009 ttnn.addalpha,200,0.131,0.099,0.256,0.061 ttnn.addcdiv,200,2.846,2.244,3.316,0.163 ttnn.addcmul,200,0.126,0.129,0.389,0.074 @@ -71,8 +71,8 @@ ttnn.complex_sub,200,0.064,0.042,0.148,0.015 ttnn.conj,200,0.103,0.109,0.255,0.044 ttnn.conj_bw,200,0.038,0.03,0.102 ttnn.copy,200,0.034,0.024,0.101,0.008 -tt_lib.tensor.fill_ones_rm,200,0.038,0.02,2.028,0.007 -tt_lib.tensor.fill_rm,200,0.039,0.018,2.028,0.006 +ttnn.fill_ones_rm,200,0.038,0.02,2.028,0.007 +ttnn.fill_rm,200,0.039,0.018,2.028,0.006 ttnn.geglu_dim_2,200,0.175,0.111,0.236,0.045 ttnn.geglu_dim_3,200,0.172,0.111,0.236,0.045 ttnn.glu_dim_2,200,0.108,0.115,0.261,0.036 @@ -81,14 +81,16 @@ ttnn.imag,200,0.025,0.027,0.058,0.011 ttnn.imag_bw,200,0.918,0.932,0.955 ttnn.mac,200,0.09,0.068,0.279,0.027 ttnn.mean,200,0.783,0.793,0.777,0.427 -tt_lib.tensor.mean_hw,200,0.71,0.029,0.08,0.012 -tt_lib.tensor.moreh_norm_backward,200,0.036,0.038,0.667,0.017 +ttnn.operations.moreh.norm_backward,200,0.036,0.038,0.667,0.017 ttnn.mse_loss,200,0.789,0.911,0.992,0.55 ttnn.normalize_global,200,0.334,0.262,56.404,0.154 ttnn.normalize_hw,200,0.369,0.242,0.67,0.145 +ttnn.layer_norm,800,0.061,0.068,0.146 +ttnn.rms_norm,800,0.058,0.066,0.141 +ttnn.add_layer_norm,800,0.074,0.083,0.188 ttnn.polar,200,0.118,0.121,0.751,0.052 -tt_lib.tensor.pow_float,200,0.382,0.342,1.268,0.186 -tt_lib.tensor.pow_int,200,0.047,0.028,0.102,0.01 +ttnn.pow_float,200,0.382,0.342,1.268,0.186 +ttnn.pow_int,200,0.047,0.028,0.102,0.01 ttnn.real,200,0.027,0.029,0.06,0.012 ttnn.real_bw,200,0.934,0.827,0.847 ttnn.reglu_dim_2,200,0.132,0.107,0.245,0.045 From 132a066574b4354ca43aba72510f3e82f0dcbb45 Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Fri, 7 Feb 2025 14:11:37 +0000 Subject: [PATCH 159/316] Add pcc checks to conv2d sweep local runs --- tests/sweep_framework/sweep_utils/conv2d_common.py | 5 ++++- .../sweeps/conv2d/short/conv2d_short_sweep.py | 10 ++++++---- .../sweeps/conv2d/short/conv2d_ttforge_sweep.py | 10 ++++++---- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/sweep_framework/sweep_utils/conv2d_common.py b/tests/sweep_framework/sweep_utils/conv2d_common.py index 0d60dcdf947..eb3eb3056f2 100644 --- a/tests/sweep_framework/sweep_utils/conv2d_common.py +++ b/tests/sweep_framework/sweep_utils/conv2d_common.py @@ -260,6 +260,9 @@ def run_conv2d_short_sweep( input_layout = ttnn.Layout(input_layout) input_dtype = ttnn.DataType(input_dtype) input_memory_config = ttnn.DRAM_MEMORY_CONFIG if input_buffer_type == "dram" else ttnn.L1_MEMORY_CONFIG + torch_input_tensor = torch.reshape( + torch_input_tensor, (1, 1, batch_size * 
input_height * input_width, input_channels) + ) tt_input_tensor = ttnn.from_torch( torch_input_tensor, dtype=input_dtype, layout=input_layout, device=device, memory_config=input_memory_config ) @@ -315,7 +318,7 @@ def run_conv2d_short_sweep( torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2)) print("End of test case") - return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf] + return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.985), e2e_perf] def run_conv1d_short_sweep( diff --git a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py index f1589328a94..9fc169355cc 100644 --- a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py +++ b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py @@ -1608,10 +1608,11 @@ def run( @pytest.mark.parametrize("input_spec", parameters["short_sweep_suite_conv2d"]["input_specs"]) @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_conv2d_localrun(device, input_spec): - run_conv2d_short_sweep( + pcc, messsage = run_conv2d_short_sweep( input_spec, device, - ) + )[0] + assert pcc, messsage failing_parameters = [ @@ -1630,7 +1631,8 @@ def test_conv2d_localrun(device, input_spec): @pytest.mark.parametrize("input_spec", failing_parameters) @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_conv2d_localrun_fail_only(device, input_spec): - run_conv2d_short_sweep( + pcc, messsage = run_conv2d_short_sweep( input_spec, device, - ) + )[0] + assert pcc, messsage diff --git a/tests/sweep_framework/sweeps/conv2d/short/conv2d_ttforge_sweep.py b/tests/sweep_framework/sweeps/conv2d/short/conv2d_ttforge_sweep.py index bfe24371e10..5c3ec3109c6 100644 --- a/tests/sweep_framework/sweeps/conv2d/short/conv2d_ttforge_sweep.py +++ b/tests/sweep_framework/sweeps/conv2d/short/conv2d_ttforge_sweep.py @@ -407,10 +407,11 @@ def run( @pytest.mark.parametrize("input_spec", parameters["ttforge_sweep_conv2d"]["input_specs"]) @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_conv2d_localrun(device, input_spec): - run_conv2d_short_sweep( + pcc, messsage = run_conv2d_short_sweep( input_spec, device, - ) + )[0] + assert pcc, messsage # fmt: off @@ -433,7 +434,8 @@ def test_conv2d_localrun(device, input_spec): @pytest.mark.parametrize("input_spec", failing_parameters) @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_conv2d_localrun_fail_only(device, input_spec): - run_conv2d_short_sweep( + pcc, messsage = run_conv2d_short_sweep( input_spec, device, - ) + )[0] + assert pcc, messsage From 96b80d00fd830b23063446f4fe0316c10a2fd159 Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Mon, 17 Feb 2025 20:50:17 +0000 Subject: [PATCH 160/316] #17662: Conv2d fix split reader In cases where amount of data that needs to be read is uneven, between first and second reader and in case there are multiple blocks to be read (case when act_block_h_override is used) conv2d would fail with pcc issues. Problem was that offsets for readers between blocks didn't account for potentially different amount of data being read by the other reader. 
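The offset arithmetic behind that bug is easier to see in isolation. Below is a minimal, self-contained sketch under simplified assumptions: plain row counts instead of the kernels' packed index reads, and illustrative names (`first_rows`/`second_rows` stand in for the first/second reader's share of each block). It shows why advancing both readers by half a block between blocks only works when the split is even.

```cpp
#include <cassert>
#include <cstdint>

// Start row of each reader inside block `b`: reader 0 owns the first `first_rows`
// rows of a block, reader 1 owns the rest.
uint32_t reader0_start(uint32_t b, uint32_t rows_per_block) { return b * rows_per_block; }
uint32_t reader1_start(uint32_t b, uint32_t rows_per_block, uint32_t first_rows) {
    return b * rows_per_block + first_rows;
}

int main() {
    const uint32_t rows_per_block = 96;
    const uint32_t first_rows = 64;                            // uneven split
    const uint32_t second_rows = rows_per_block - first_rows;  // 32
    const uint32_t num_blocks = 4;

    uint32_t r0 = 0, r1 = first_rows;          // cursors using the fixed advance
    uint32_t r0_old = 0, r1_old = first_rows;  // cursors using the old advance

    for (uint32_t b = 0; b < num_blocks; ++b) {
        // With the fix, both cursors always point at their reader's slice of block b.
        assert(r0 == reader0_start(b, rows_per_block));
        assert(r1 == reader1_start(b, rows_per_block, first_rows));

        // Fixed advance: read your own rows, then skip the *other* reader's rows.
        r0 += first_rows + second_rows;
        r1 += second_rows + first_rows;

        // Old advance: read your own rows, then skip half a block regardless of the split.
        r0_old += first_rows + rows_per_block / 2;
        r1_old += second_rows + rows_per_block / 2;
    }

    // After a few blocks the old cursors have drifted off their slices, which is the
    // kind of misread that surfaces as PCC mismatches.
    assert(r0_old != reader0_start(num_blocks, rows_per_block));
    assert(r1_old != reader1_start(num_blocks, rows_per_block, first_rows));
    return 0;
}
```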
--- .../tt/ttnn_functional_resnet50.py | 4 +- .../unit_tests/operations/test_new_conv2d.py | 51 ++++++++++++++++++- .../operations/conv/conv2d/conv2d_utils.cpp | 8 ++- .../conv2d_op_sharded_program_factory.cpp | 3 +- ...ations_padded_with_halo_3x3_weights_v2.cpp | 4 +- ...er_conv_weights_tiled_col_to_rm_blocks.cpp | 4 +- ...er_conv_weights_tiled_col_to_rm_blocks.cpp | 5 +- 7 files changed, 69 insertions(+), 10 deletions(-) diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50.py index 2150dfc7d1d..fd982c479e9 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50.py @@ -698,9 +698,7 @@ def __init__( if type(device) == ttnn.MeshDevice and device.get_num_devices() > 8: self.conv1_config.act_block_h_override = 64 else: - # Todo: restore after issue #16895 is fixed - # self.conv1_config.act_block_h_override = 49 * 32 - self.conv1_config.act_block_h_override = 2 * 32 + self.conv1_config.act_block_h_override = 49 * 32 if is_blackhole(): # self.conv1_config.act_block_h_override = 7 * 32 # self.conv1_config.act_block_h_override = 2 * 32 diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 7c49616a514..6364fa7e51f 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -71,6 +71,7 @@ def run_conv( input_mesh_mapper=None, weight_mesh_mapper=None, output_mesh_composer=None, + enable_split_reader=False, ): if isinstance(device, ttnn.MeshDevice): assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh" @@ -130,7 +131,7 @@ def run_conv( input_channels_alignment=8 if use_shallow_conv_variant and not auto_shard else 32, deallocate_activation=deallocate_activation, enable_act_double_buffer=False, - enable_split_reader=False, + enable_split_reader=enable_split_reader, enable_subblock_padding=False, output_layout=output_layout, ) @@ -2917,3 +2918,51 @@ def test_dram_input_mm_conv(device, tiled_input, input_on_device): passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_output_tensor, torch_out_golden_tensor, pcc=0.99) logger.info(f"PCC = {pcc_msg}. 
Threshold = 0.99") assert passing + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +@pytest.mark.parametrize( + "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override", + ((16, 64, 16, 115, 115, 4, 4, 1, 1, 0, 0, HS, {"act_block_h": 32 * 49}),), +) +def test_split_reader_regression( + device, + torch_tensor_map, + use_program_cache, + batch_size, + output_channels, + input_channels, + input_height, + input_width, + filter_height, + filter_width, + stride_h, + stride_w, + pad_h, + pad_w, + shard_layout, + config_override, +): + run_conv( + device, + torch_tensor_map, + ttnn.MathFidelity.LoFi, + ttnn.bfloat8_b, + ttnn.bfloat8_b, + batch_size, + output_channels, + input_channels, + input_height, + input_width, + filter_height, + filter_width, + stride_h, + stride_w, + pad_h, + pad_w, + config_override=config_override, + use_shallow_conv_variant=True, + has_bias=False, + shard_layout=shard_layout, + enable_split_reader=True, + ) diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp index 32fa50b9b63..30b36c2ca5c 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp @@ -314,7 +314,13 @@ static std::pair determine_largest_subblock_size( break; } } - TT_ASSERT(subblock_h > 0 && subblock_w > 0); + TT_FATAL( + subblock_h > 0 && subblock_w > 0, + "Could not find valid subblock size for block size {}x{}, split_reader_enabled: {}, fp32_accum: {}", + block_height, + block_width, + split_reader_enabled, + fp32_accum); return {subblock_h, subblock_w}; } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index a70d7093bf3..d0e917aee50 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -1299,7 +1299,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( (uint32_t)act_mcast_receiver_semaphore_id, (uint32_t)in0_block_num_tiles * tilized_act_tile_size, // act_mcast_sender_size_bytes (uint32_t)(transpose_mcast ? 1 : 0), - (uint32_t)act_block_h_datums_last_block}; + (uint32_t)act_block_h_datums_last_block, + (uint32_t)act_block_h_datums_split_last}; // define for bias std::map writer_defines; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_padded_with_halo_3x3_weights_v2.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_padded_with_halo_3x3_weights_v2.cpp index 0e6f3012741..dc9ea03e78c 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_padded_with_halo_3x3_weights_v2.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_padded_with_halo_3x3_weights_v2.cpp @@ -31,6 +31,8 @@ void kernel_main() { constexpr uint32_t act_block_h_datums_read_last_block = act_block_h_datums_last_block > act_block_h_datums ? 
act_block_h_datums / 2 : act_block_h_datums_last_block / 2; + constexpr uint32_t act_block_h_datums_second_reader = get_compile_time_arg_val(26); + constexpr uint32_t act_block_h_datums_second_reader_read = act_block_h_datums_second_reader / 2; uint32_t i = 0; uint32_t noop = get_arg_val(i); @@ -150,7 +152,7 @@ void kernel_main() { start_reader_idx = reader_idx; #ifdef SPLIT_READER - start_reader_idx += act_block_h_datums_read; + start_reader_idx += act_block_h_datums_second_reader_read; #endif } } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp index c8f0630e3b8..a88ed27882a 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp @@ -62,6 +62,8 @@ void kernel_main() { constexpr uint32_t total_weight_num_tiles = weight_block_height_num_outer * num_blocks_weight_h * weight_block_num_tiles; + constexpr uint32_t act_block_h_datums_first_reader_read = act_block_h_datums_first_reader / 2; + uint32_t i = 0; i += 19; uint32_t out_start_tile_id = get_arg_val(i); @@ -254,7 +256,7 @@ void kernel_main() { out_block_h_start_tile_id_h += out_block_height_num_tiles; #endif - start_reader_idx = reader_idx + act_block_h_datums_read; + start_reader_idx = reader_idx + act_block_h_datums_first_reader_read; } // out_num_blocks_h out_block_w_start_tile_id += out_next_block_stride_w; out_block_w_start_tile_id_w += weight_block_width_ntiles; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp index 41d71a0a4e7..0a3d1bad892 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp @@ -107,6 +107,8 @@ void kernel_main() { constexpr uint32_t cb_id_act_second_reader = 7; constexpr uint32_t cb_id_sharded_act = 3; constexpr uint32_t act_block_h_datums_read = act_block_h_datums / 2; // Extra /2 because of packed uint16 reads + constexpr uint32_t act_block_h_datums_first_reader_read = + act_block_h_datums_first_reader / 2; // Extra /2 because of packed uint16 reads constexpr uint32_t act_block_num_tiles_read = act_block_num_tiles; constexpr uint32_t cb_reader_indices = tt::CBIndex::c_4; @@ -401,8 +403,7 @@ void kernel_main() { out_block_h_start_tile_id += out_next_block_stride_h; out_block_h_start_tile_id_h += out_block_height_num_tiles; #endif - - start_reader_idx = reader_idx + act_block_h_datums_read; + start_reader_idx = reader_idx + act_block_h_datums_first_reader_read; } // out_num_blocks_h out_block_w_start_tile_id += out_next_block_stride_w; out_block_w_start_tile_id_w += weight_block_width_ntiles; From 1aba0a5b786c1a76f0708bd2b41395315177266d Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Sun, 16 Feb 2025 18:42:15 +0000 Subject: [PATCH 161/316] Current auto-shard heuristic is based on minimising circular 
buffer size allocation per core, and output tensor buffer size per core. In this case number of input channels is small, but number of output channles is large. Since Width sharding can decouple input and output parallel config and input is based input channels and output is based on output channels, output buffer size per core is small. Problem is currently we ignore the the input tensor size per core. Width sharding input parallel core can only use a single core which means that input tensor and halo output have to go to a single core and we run out of memory. Use approx input size per core as factor for auto-shard heuristic. Ideally we would use halo output size for this, but using approx input tensor size is a good enough proxy for now. --- .../unit_tests/operations/test_new_conv2d.py | 32 +++++++++++++++++++ .../operations/conv/conv2d/conv2d_utils.cpp | 13 ++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 6364fa7e51f..610cd0ef6e3 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -2966,3 +2966,35 @@ def test_split_reader_regression( shard_layout=shard_layout, enable_split_reader=True, ) + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +def test_small_in_large_out_channels_auto_shard(device, torch_tensor_map): + batch_size = 2 + in_channels = 16 + out_channels = 1536 + kernel_size = (2, 2) + stride = (2, 2) + padding = (0, 0) + height = 128 + width = 128 + run_conv( + device, + torch_tensor_map, + ttnn.MathFidelity.LoFi, + ttnn.bfloat16, + ttnn.bfloat16, + batch_size, + out_channels, + in_channels, + height, + width, + kernel_size[0], + kernel_size[1], + stride[0], + stride[1], + padding[0], + padding[1], + None, + auto_shard=True, + ) diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp index 30b36c2ca5c..959acd36d04 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp @@ -845,7 +845,7 @@ Conv2dConfig determine_conv_config_for_auto_shard( conv_config.act_block_w_div = tt::div_up(in_channels, width_sharded_num_cores * constants::TILE_WIDTH); } - const conv_op_l1_usage l1_usage = calculate_L1_usage( + conv_op_l1_usage l1_usage = calculate_L1_usage( compute_config, opt_conv_op_block_config, opt_conv_op_parallel_config, @@ -856,6 +856,16 @@ Conv2dConfig determine_conv_config_for_auto_shard( enable_bias, use_non_tile_height, conv_is_1d_deptwise); + + // Since we don't have L1 usage for halo output (input to conv2d) + // use approx input tensor size per core as a proxy. + uint32_t input_nhw = tt::div_up(batch_size * input_height * input_width, tt::constants::TILE_HEIGHT); + uint32_t input_c = tt::div_up(in_channels_padded, tt::constants::TILE_WIDTH); + uint32_t approx_input_size = + input_nhw * input_c * tt::tile_size(datatype_to_dataformat_converter(conv_config.dtype)); + uint32_t approx_input_size_per_core = approx_input_size / input_parallel_config.grid.num_cores(); + + l1_usage.tensor_allocation_size += approx_input_size_per_core; log_debug( tt::LogOp, "L1 usage for {}: {}, {}", @@ -907,7 +917,6 @@ std::tuple kernel_size, const CoreCoord& compute_grid) { - auto elem_size = conv_config.weights_dtype == DataType::BFLOAT8_B ? 
1 : 2; bool is_non_tile_mul_width = check_non_tile_mul_width(compute_grid, conv_config, in_channels); const bool use_non_tile_height = check_non_tile_height(conv_config, out_channels); From c69f5c0ad12fb8cca043c2a35471596e531eb595 Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Sun, 16 Feb 2025 22:12:18 +0000 Subject: [PATCH 162/316] Remove non tile multiple width/height from conv2d These two features are non critical for conv2d meaning they don't contribute to enabling any model perf on any model or improve pass rate on any sweep. Problem with these features is that they kick in in very unpredictable conditions for both users and developers as they have many limits/conditions. They are adding to conv2d test matrix, but they are hard to test for as deriving tests that will trigger them on multiple hw platforms is not easy. Moreover they are source of bugs like #17647, and it's often non obvious that bugs originate from these features and when faced with a bug in conv2d first thing is to go to the code and manually disable them to check for that. For the reasons above these will get removed, and by removing them #17647 will be fixed. --- .../ttnn_functional_resnetblock2d_new_conv.py | 4 - .../unit_tests/operations/test_new_conv2d.py | 202 ------------------ .../ttnn/operations/conv/conv2d/conv2d.cpp | 36 ++-- .../operations/conv/conv2d/conv2d_pybind.cpp | 15 +- .../operations/conv/conv2d/conv2d_utils.cpp | 174 ++++----------- .../operations/conv/conv2d/conv2d_utils.hpp | 17 +- .../conv/conv2d/device/conv2d_op.cpp | 50 ++--- .../conv/conv2d/device/conv2d_op.hpp | 23 +- .../conv2d_op_sharded_program_factory.cpp | 98 ++------- ...onv2d_op_width_sharded_program_factory.cpp | 28 +-- .../conv_bmm_tilize_col_major_out_blocks.cpp | 33 +-- ...er_conv_weights_tiled_col_to_rm_blocks.cpp | 28 --- ...er_conv_weights_tiled_col_to_rm_blocks.cpp | 28 --- .../conv/conv2d/prepare_conv2d_weights.cpp | 82 +++---- .../conv/conv2d/prepare_conv2d_weights.hpp | 3 +- .../conv_transpose2d/conv_transpose2d.cpp | 27 ++- 16 files changed, 163 insertions(+), 685 deletions(-) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py index 691081f1952..58f3ab618b0 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py @@ -182,10 +182,6 @@ def __init__( self.conv2_config_override = {} if (out_channels, out_channels, input_height, input_width) in config_override: self.conv2_config_override = config_override[(out_channels, out_channels, input_height, input_width)] - # if use_in_shortcut: - # self.conv2_config_override["grid_size"] = self.conv_shortcut.conv.grid_size - # self.conv2_config_override["per_core_out_matrix_height"] = self.conv_shortcut.conv.per_core_out_matrix_height - # self.conv2_config_override["per_core_weight_matrix_width"] = self.conv_shortcut.conv.per_core_out_matrix_width self.conv2_input_height = conv2_input_height self.conv2_input_width = conv2_input_width diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 610cd0ef6e3..082cb3c90fa 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -384,9 +384,6 @@ def test_conv_features( if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == 
ttnn.bfloat8_b: pytest.skip("Row major layout not compatible with bfloat8_b") - if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat16 and packer_l1_acc and fp32_accum: - pytest.skip("skipping due to pack_untilize_dst issue!") - run_conv( device, torch_tensor_map, @@ -2592,205 +2589,6 @@ def test_conv_for_vanilla_unet( ) -@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) -@pytest.mark.parametrize( - "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override", - ( - # unique convs in rn50 (complete list) - # first conv post folding and input_channels padding to tile width - (16, 64, 64, 14, 14, 3, 3, 1, 1, 1, 1, HS, None), - # rn50 layer1 - (8, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, HS, None), - (16, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, HS, None), - (20, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, HS, None), - # rn50 layer2 - (8, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, HS, None), - (16, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, HS, None), - (20, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, HS, None), - (8, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, HS, None), - (16, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, HS, None), - (20, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, HS, None), - (1, 32, 32, 240, 320, 3, 3, 1, 1, 1, 1, HS, None), - (1, 64, 32, 240, 320, 3, 3, 1, 1, 1, 1, HS, None), - ), -) -@pytest.mark.parametrize( - "weights_dtype", - [ttnn.bfloat8_b, ttnn.bfloat16], -) -@pytest.mark.parametrize( - "activations_dtype", - [ttnn.bfloat16, ttnn.float32], -) -@pytest.mark.parametrize("fp32_accum", [False, True], ids=["no_fp32_accum", "fp32_accum"]) -@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi]) -@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"]) -@pytest.mark.parametrize("has_bias", [True, False], ids=["with_bias", "no_bias"]) -def test_non_tile_multiple_height_conv_wh( - device, - torch_tensor_map, - use_program_cache, - math_fidelity, - activations_dtype, - weights_dtype, - batch_size, - output_channels, - input_channels, - input_height, - input_width, - filter_height, - filter_width, - stride_h, - stride_w, - pad_h, - pad_w, - shard_layout, - config_override, - fp32_accum, - packer_l1_acc, - has_bias, -): - if device.core_grid.y == 7: - pytest.skip("Issue #6992: Statically allocated circular buffers in program clash with L1 buffers on core range") - - if ( - is_grayskull() - and activations_dtype == ttnn.bfloat16 - and batch_size == 20 - and ( - output_channels == 64 - or ( - stride_h == 2 - and (output_channels == 256 or (output_channels == 128 and weights_dtype == ttnn.bfloat16)) - ) - ) - ): - pytest.skip("Skipping test because it won't fit in L1!") - - if activations_dtype == ttnn.float32 and (batch_size >= 16 or (output_channels == 64 or input_height >= 240)): - pytest.skip("Skipping test because it won't fit in L1!") - - if ( - (weights_dtype == ttnn.bfloat16 and batch_size == 20 and output_channels == 128 and input_height == 56) - or (weights_dtype == ttnn.bfloat16 and batch_size == 20 and output_channels == 64) - or (weights_dtype == ttnn.bfloat8_b and batch_size == 20 and output_channels == 128 and input_height == 56) - ): - pytest.skip("Skipping test because it won't fit in L1!") - - if has_bias and packer_l1_acc and (fp32_accum or activations_dtype is ttnn.float32): - pytest.skip("skipping due to pack_untilize_dst issue! 
--> #14236") - - use_shallow_conv_variant = (input_channels == 16) and device.arch() != ttnn.device.Arch.WORMHOLE_B0 - run_conv( - device, - torch_tensor_map, - math_fidelity, - activations_dtype, - weights_dtype, - batch_size, - output_channels, - input_channels, - input_height, - input_width, - filter_height, - filter_width, - stride_h, - stride_w, - pad_h, - pad_w, - config_override=config_override, - shard_layout=shard_layout, - use_shallow_conv_variant=use_shallow_conv_variant, - packer_l1_acc=packer_l1_acc, - fp32_accum=fp32_accum, - has_bias=has_bias, - output_layout=ttnn.ROW_MAJOR_LAYOUT, - ) - - -@skip_for_grayskull() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) -@pytest.mark.parametrize( - "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override", - ( - (1, 64, 64, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 64, 128, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 64, 192, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 64, 256, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 64, 320, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 64, 384, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 64, 448, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 64, 512, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 64, 576, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 64, 640, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 64, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 128, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 192, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 256, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 320, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 384, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 448, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 512, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 576, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 128, 640, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 320, 320, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, BS, None), - ), -) -@pytest.mark.parametrize( - "weights_dtype", - [ttnn.bfloat16, ttnn.bfloat8_b], -) -@pytest.mark.parametrize( - "activations_dtype", - [ttnn.bfloat16], -) -@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi]) -def test_non_tile_multiple_width_conv_wh( - device, - torch_tensor_map, - use_program_cache, - math_fidelity, - activations_dtype, - weights_dtype, - batch_size, - output_channels, - input_channels, - input_height, - input_width, - filter_height, - filter_width, - stride_h, - stride_w, - pad_h, - pad_w, - shard_layout, - config_override, -): - run_conv( - device, - torch_tensor_map, - math_fidelity, - activations_dtype, - weights_dtype, - batch_size, - output_channels, - input_channels, - input_height, - input_width, - filter_height, - filter_width, - stride_h, - stride_w, - pad_h, - pad_w, - config_override, - shard_layout=shard_layout, - use_shallow_conv_variant=(input_channels == 16), - output_layout=ttnn.ROW_MAJOR_LAYOUT, - ) - - @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_shallow_conv_with_tiled_input(device): diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp index 50b5c017a41..a3928a36629 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp @@ -90,21 +90,18 @@ Result conv2d( ShardOrientation shard_orientation = 
conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR; - bool is_non_tile_mul_width = check_non_tile_mul_width(compute_grid_size, conv_config, in_channels); - auto [input_tensor_post_tm, parallel_config, output_parallel_config, use_non_tile_height] = - shard_or_reshard_tensor_if_required( - device, - input_tensor, - conv_config, - batch_size, - output_height, - output_width, - in_channels, - out_channels, - mm_conv, - auto_shard, - is_non_tile_mul_width); + auto [input_tensor_post_tm, parallel_config, output_parallel_config] = shard_or_reshard_tensor_if_required( + device, + input_tensor, + conv_config, + batch_size, + output_height, + output_width, + in_channels, + out_channels, + mm_conv, + auto_shard); auto [opt_conv_op_parallel_config, opt_conv_op_block_config, conv_out_memory_config] = get_conv_configs( conv_config, @@ -137,8 +134,7 @@ Result conv2d( groups, opt_conv_op_block_config.act_block_h_ntiles, input_width, - true, - is_non_tile_mul_width); + true); } // if 1x1 conv w/ stride 1, convert input tensor to tile layout if required if (mm_conv) { @@ -160,7 +156,7 @@ Result conv2d( .dilation_hw = {dilation[0], dilation[1]}, .num_cores_nhw = opt_conv_op_parallel_config.num_cores_nhw, .core_range_set = input_tensor_post_tm.memory_config().shard_spec.value().grid, - .snap_to_tile = !use_non_tile_height, + .snap_to_tile = true, }; bool bypass_halo = @@ -185,7 +181,7 @@ Result conv2d( parallel_config.shard_orientation == ShardOrientation::COL_MAJOR, 0, input_tensor_post_tm.memory_config(), - !use_non_tile_height); + true); if (conv_config.deallocate_activation) { input_tensor_post_tm.deallocate(/*force*/ true); @@ -217,9 +213,7 @@ Result conv2d( compute_config, conv_config.enable_act_double_buffer, conv_config.enable_weights_double_buffer, - conv_config.enable_split_reader, - conv_config.enable_subblock_padding, - use_non_tile_height); + conv_config.enable_split_reader); if (memory_config.has_value() && memory_config.value() != conv_output.memory_config()) { conv_output = ttnn::to_memory_config(conv_output, memory_config.value(), std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp index ef664e12add..0591ed02d0c 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp @@ -295,8 +295,7 @@ void py_bind_conv2d(py::module& module) { compute_grid_size, block_shard_orientation, enable_channels_padding, - is_out_tiled, - false); + is_out_tiled); }, py::arg("shard_layout"), py::arg("batch_size"), @@ -384,16 +383,16 @@ void py_bind_conv2d(py::module& module) { py::arg("grid_size"), py::arg("num_cores_nhw") = 1, py::arg("num_cores_c") = 1, - py::arg("per_core_out_matrix_height").noconvert(), - py::arg("per_core_out_matrix_width").noconvert()) + py::arg("per_core_out_matrix_height_ntiles").noconvert(), + py::arg("per_core_out_matrix_width_ntiles").noconvert()) .def_property_readonly("grid_size", [](const OptimizedConvParallelizationConfig& c) { return c.grid_size; }) .def_property_readonly( "num_cores_nhw", [](const OptimizedConvParallelizationConfig& c) { return c.num_cores_nhw; }) .def_property_readonly( - "per_core_out_matrix_height", - [](const OptimizedConvParallelizationConfig& c) { return c.per_core_out_matrix_height; }) - .def_property_readonly("per_core_out_matrix_width", [](const OptimizedConvParallelizationConfig& c) { - return c.per_core_out_matrix_width; + "per_core_out_matrix_height_ntiles", + 
[](const OptimizedConvParallelizationConfig& c) { return c.per_core_out_matrix_height_ntile; }) + .def_property_readonly("per_core_out_matrix_width_ntiles", [](const OptimizedConvParallelizationConfig& c) { + return c.per_core_out_matrix_width_ntile; }); py::class_(module, "OptimizedConvBlockConfig") diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp index 959acd36d04..6f67fb238a6 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp @@ -10,6 +10,7 @@ #include "conv2d_utils.hpp" #include +#include "tt-metalium/constants.hpp" #include "tt-metalium/hal.hpp" #include "ttnn/operations/conv/conv2d/device/conv2d_op.hpp" #include "ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp" @@ -80,28 +81,6 @@ uint32_t find_closest_largest_divisor_with_num_padding(uint32_t num1, uint32_t n return divisor; } -bool check_non_tile_mul_width( - const CoreCoord& compute_grid, const Conv2dConfig& conv_config, const uint32_t in_channels) { - auto num_cores_c = conv_config.transpose_shards ? compute_grid.y : compute_grid.x; - auto elem_size = conv_config.weights_dtype == DataType::BFLOAT8_B ? 1 : 2; - bool is_non_tile_mul_width = - (conv_config.shard_layout.has_value() && conv_config.shard_layout == TensorMemoryLayout::BLOCK_SHARDED) && - conv_config.act_block_h_override == 0 && - (conv_config.weights_dtype == DataType::BFLOAT8_B || conv_config.weights_dtype == DataType::BFLOAT16) && - conv_config.output_layout == Layout::ROW_MAJOR && ((elem_size * in_channels) % (16 * num_cores_c)) == 0; - return is_non_tile_mul_width; -} - -bool check_non_tile_height(const Conv2dConfig& conv_config, const uint32_t out_channels) { - bool use_non_tile_height = (conv_config.shard_layout.has_value() && - conv_config.shard_layout.value() == TensorMemoryLayout::HEIGHT_SHARDED) && - out_channels <= 256 && conv_config.act_block_h_override == 0 && - (conv_config.dtype == DataType::BFLOAT16 || conv_config.dtype == DataType::FLOAT32) && - conv_config.output_layout == Layout::ROW_MAJOR; - use_non_tile_height = use_non_tile_height && conv_config.input_channels_alignment != 16; - return use_non_tile_height; -} - ParallelConfig determine_parallel_config( const TensorMemoryLayout shard_layout, uint32_t batch_size, @@ -113,17 +92,9 @@ ParallelConfig determine_parallel_config( ShardOrientation block_shard_orientation, bool enable_channels_padding, bool is_out_tiled, - bool is_non_tile_mul_shard_width, uint32_t act_block_h_override) { uint32_t effective_tile_height = is_out_tiled ? tt::constants::TILE_HEIGHT : 1; uint32_t effective_tile_width = is_out_tiled ? tt::constants::TILE_WIDTH : 1; - // If the shard is not tile-multiplicatively along the width dimension, - // set the effective tile width to 1 and disable channel padding. - // Required(if any) paddings are added while creating the matrices. 
- if (is_non_tile_mul_shard_width) { - effective_tile_width = 1; - enable_channels_padding = false; - } uint32_t out_nhw_ntiles = tt::round_up(batch_size * output_height * output_width, tt::constants::TILE_HEIGHT) / effective_tile_height; uint32_t input_channles_ntiles = tt::div_up(input_channels, effective_tile_width); @@ -277,13 +248,12 @@ OptimizedConvParallelizationConfig determine_conv_op_parallel_config_from_conv_o TT_ASSERT(conv_output_mem_config.shard_spec.has_value()); const auto& shard_spec = conv_output_mem_config.shard_spec.value(); const auto& shard_shape = shard_spec.shape; - uint32_t per_core_out_matrix_height_ntiles = div_up(shard_shape[0], 32); return { .grid_size = shard_spec.grid.bounding_box().grid_size(), .num_cores_nhw = num_cores_nhw, .num_cores_c = num_cores_c, - .per_core_out_matrix_height = shard_shape[0], - .per_core_out_matrix_width = shard_shape[1], + .per_core_out_matrix_height_ntile = div_up(shard_shape[0], tt::constants::TILE_HEIGHT), + .per_core_out_matrix_width_ntile = div_up(shard_shape[1], tt::constants::TILE_WIDTH), }; } @@ -341,8 +311,7 @@ OptimizedConvBlockConfig determine_per_core_conv_block_config( "Config Error: act_block_h_override must be a multiple of 32 (tile height)."); } - uint32_t act_block_h_ntiles = - div_up(conv_op_parallel_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT); + uint32_t act_block_h_ntiles = conv_op_parallel_config.per_core_out_matrix_height_ntile; if (act_block_h_override > 0) { uint32_t act_block_h_override_ntiles = act_block_h_override / constants::TILE_HEIGHT; @@ -379,10 +348,8 @@ OptimizedConvBlockConfig determine_per_core_conv_block_config( } TT_ASSERT(act_block_w % 32 == 0); uint32_t act_block_w_ntiles = act_block_w / 32; - uint32_t out_block_h_ntiles = - div_up(conv_op_parallel_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT); - uint32_t weight_block_w_ntiles = - div_up(conv_op_parallel_config.per_core_out_matrix_width, tt::constants::TILE_WIDTH); + uint32_t out_block_h_ntiles = conv_op_parallel_config.per_core_out_matrix_height_ntile; + uint32_t weight_block_w_ntiles = conv_op_parallel_config.per_core_out_matrix_width_ntile; auto [out_subblock_h_ntiles, out_subblock_w_ntiles] = determine_largest_subblock_size(act_block_h_ntiles, weight_block_w_ntiles, fp32_accum, split_reader_enabled); return { @@ -418,7 +385,7 @@ DeviceComputeKernelConfig get_conv_default_compute_kernel_config(DeviceType* dev } template -static std::tuple get_conv_padded_input_shape_and_mem_config( +static std::tuple get_conv_padded_input_shape_and_mem_config( T* device, const ttnn::Tensor& input_tensor_, const Conv2dConfig& conv_config, @@ -427,8 +394,7 @@ static std::tuple get_conv_padded_i uint32_t width, uint32_t in_channels, uint32_t out_channels, - bool is_mm_conv, - bool is_non_tile_mul_width) { + bool is_mm_conv) { ttnn::Tensor input_tensor = input_tensor_; // tensor to return bool input_tensor_on_device = ttnn::is_tensor_on_device_or_multidevice(input_tensor_); bool needs_shard_or_reshard = false; @@ -494,11 +460,6 @@ static std::tuple get_conv_padded_i } } - // shallow conv variriant not supported - // out_channels <= 256 incorrect output from pack_untilize_dst if output > 256 Tracking --> #14236 - // bf8 not supported due to limation of sharding dim multipl of 32 - const bool use_non_tile_height = check_non_tile_height(conv_config, out_channels); - ParallelConfig parallel_config = input_tensor_parallel_config; if (conv_config.reshard_if_not_optimal || needs_shard_or_reshard) { auto block_shard_orientation = @@ 
-513,8 +474,7 @@ static std::tuple get_conv_padded_i device->compute_with_storage_grid_size(), block_shard_orientation, !is_mm_conv, - !use_non_tile_height, - is_non_tile_mul_width, + true, conv_config.act_block_h_override); if (conv_config.override_sharding_config) { @@ -541,18 +501,13 @@ static std::tuple get_conv_padded_i const auto& input_shape = input_tensor.get_logical_shape(); uint32_t tensor_height = input_shape[0] * input_shape[1] * input_shape[2]; uint32_t round_up_size = tt::constants::TILE_HEIGHT; - if ((use_non_tile_height || shard_layout == TensorMemoryLayout::WIDTH_SHARDED) && - input_tensor_.layout() == Layout::ROW_MAJOR) { + if (shard_layout == TensorMemoryLayout::WIDTH_SHARDED && input_tensor_.layout() == Layout::ROW_MAJOR) { round_up_size = 1; } uint32_t input_tensor_height_snapped_to_tile = tt::round_up(tensor_height, input_num_cores_nhw * round_up_size); TT_ASSERT(input_tensor_height_snapped_to_tile >= tensor_height); uint32_t input_tensor_width_snapped_to_channels_alignment = tt::round_up(input_shape[3], input_num_cores_c * conv_config.input_channels_alignment); - if (is_non_tile_mul_width) { - input_tensor_width_snapped_to_channels_alignment = - tt::round_up(input_shape[3], conv_config.input_channels_alignment); - } auto input_padded_shape = ttnn::Shape( {1, @@ -566,13 +521,9 @@ static std::tuple get_conv_padded_i parallel_config, round_up_size); - return {input_padded_shape, input_tensor_sharded_memory_config, needs_shard_or_reshard, use_non_tile_height}; + return {input_padded_shape, input_tensor_sharded_memory_config, needs_shard_or_reshard}; } else { - return { - input_tensor.get_logical_shape(), - input_tensor.memory_config(), - needs_shard_or_reshard, - use_non_tile_height}; + return {input_tensor.get_logical_shape(), input_tensor.memory_config(), needs_shard_or_reshard}; } } @@ -584,7 +535,7 @@ static ttnn::Shape flatten_4d_shape(const ttnn::Shape& input_shape) { } template -std::tuple shard_or_reshard_tensor_if_required( +std::tuple shard_or_reshard_tensor_if_required( T* device, const ttnn::Tensor& input_tensor_, const Conv2dConfig& conv_config, @@ -594,24 +545,14 @@ std::tuple shard_or_reshard_ uint32_t in_channels, uint32_t out_channels, bool is_mm_conv, - bool auto_shard, - bool is_non_tile_mul_width) { + bool auto_shard) { ttnn::Tensor input_tensor = input_tensor_; // tensor to return bool input_tensor_on_device = ttnn::is_tensor_on_device_or_multidevice(input_tensor_); auto compute_grid_size = device->compute_with_storage_grid_size(); - auto [input_padded_shape, input_tensor_sharded_memory_config, needs_shard_or_reshard, use_non_tile_height] = + auto [input_padded_shape, input_tensor_sharded_memory_config, needs_shard_or_reshard] = get_conv_padded_input_shape_and_mem_config( - device, - input_tensor_, - conv_config, - batch_size, - height, - width, - in_channels, - out_channels, - is_mm_conv, - is_non_tile_mul_width); + device, input_tensor_, conv_config, batch_size, height, width, in_channels, out_channels, is_mm_conv); ParallelConfig parallel_config = { .grid = input_tensor_sharded_memory_config.shard_spec.value().grid, .shard_scheme = input_tensor_sharded_memory_config.memory_layout, @@ -675,7 +616,7 @@ std::tuple shard_or_reshard_ input_tensor, device, (auto_shard_mm ? 
ttnn::DRAM_MEMORY_CONFIG : input_tensor_sharded_memory_config)); } } - return {input_tensor, parallel_config, output_parallel_config, use_non_tile_height}; + return {input_tensor, parallel_config, output_parallel_config}; } void validate_weight_and_bias_tensors( @@ -707,10 +648,10 @@ ttnn::operations::matmul::MatmulProgramConfig determine_matmul_op_config_from_co .in0_block_w = conv_blocking_config.act_block_w_ntiles, .out_subblock_h = conv_blocking_config.out_subblock_h_ntiles, .out_subblock_w = conv_blocking_config.out_subblock_w_ntiles, - .out_block_h = div_up(conv_parallelization_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT), - .out_block_w = div_up(conv_parallelization_config.per_core_out_matrix_width, tt::constants::TILE_WIDTH), - .per_core_M = div_up(conv_parallelization_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT), - .per_core_N = div_up(conv_parallelization_config.per_core_out_matrix_width, tt::constants::TILE_WIDTH), + .out_block_h = conv_parallelization_config.per_core_out_matrix_height_ntile, + .out_block_w = conv_parallelization_config.per_core_out_matrix_width_ntile, + .per_core_M = conv_parallelization_config.per_core_out_matrix_height_ntile, + .per_core_N = conv_parallelization_config.per_core_out_matrix_width_ntile, .fuse_batch = true, .mcast_in0 = false}; if (activation != "") { @@ -723,10 +664,10 @@ ttnn::operations::matmul::MatmulProgramConfig determine_matmul_op_config_from_co .in0_block_w = conv_blocking_config.act_block_w_ntiles, .out_subblock_h = conv_blocking_config.out_subblock_h_ntiles, .out_subblock_w = conv_blocking_config.out_subblock_w_ntiles, - .out_block_h = div_up(conv_parallelization_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT), - .out_block_w = div_up(conv_parallelization_config.per_core_out_matrix_width, tt::constants::TILE_WIDTH), - .per_core_M = div_up(conv_parallelization_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT), - .per_core_N = div_up(conv_parallelization_config.per_core_out_matrix_width, tt::constants::TILE_WIDTH), + .out_block_h = conv_parallelization_config.per_core_out_matrix_height_ntile, + .out_block_w = conv_parallelization_config.per_core_out_matrix_width_ntile, + .per_core_M = conv_parallelization_config.per_core_out_matrix_height_ntile, + .per_core_N = conv_parallelization_config.per_core_out_matrix_width_ntile, .transpose_mcast = transpose_mcast}; if (activation != "") { matmul_config.fused_activation = ttnn::operations::unary::utils::string_to_unary_with_param(activation); @@ -795,9 +736,6 @@ Conv2dConfig determine_conv_config_for_auto_shard( conv_config.act_block_h_override = constants::TILE_HEIGHT; } - const bool is_non_tile_shard_width = check_non_tile_mul_width(compute_grid_size, conv_config, in_channels); - const bool use_non_tile_height = check_non_tile_height(conv_config, out_channels); - const uint32_t in_channels_padded = round_up(in_channels, conv_config.input_channels_alignment); const uint32_t output_channels_padded = round_up(out_channels, constants::TILE_WIDTH); // Note: These are not exact shapes for weights as prepare_conv_weights will pad the weights depending on the @@ -816,7 +754,6 @@ Conv2dConfig determine_conv_config_for_auto_shard( shard_orientation, !is_mm_conv, is_out_tiled, - is_non_tile_shard_width, conv_config.act_block_h_override); const ParallelConfig output_parallel_config = determine_output_parallel_config( @@ -854,7 +791,6 @@ Conv2dConfig determine_conv_config_for_auto_shard( conv_config, conv_out_memory_config, enable_bias, - 
use_non_tile_height, conv_is_1d_deptwise); // Since we don't have L1 usage for halo output (input to conv2d) @@ -917,16 +853,10 @@ std::tuple kernel_size, const CoreCoord& compute_grid) { - bool is_non_tile_mul_width = check_non_tile_mul_width(compute_grid, conv_config, in_channels); - const bool use_non_tile_height = check_non_tile_height(conv_config, out_channels); - - uint32_t round_up_size = !use_non_tile_height ? tt::constants::TILE_HEIGHT : 1; + uint32_t round_up_size = tt::constants::TILE_HEIGHT; uint32_t nhw_out = batch_size * output_height * output_width; uint32_t out_channels_padded = tt::round_up( out_channels, get_num_cores_channels_from_parallel_config(output_parallel_config) * tt::constants::TILE_WIDTH); - if (is_non_tile_mul_width) { - out_channels_padded = tt::round_up(out_channels, 32); - } MemoryConfig conv_out_memory_config = create_sharded_memory_config_from_parallel_config( ttnn::Shape({1, 1, nhw_out, out_channels_padded}), output_parallel_config, round_up_size); ParallelConfig largest_parallel_config = @@ -942,9 +872,6 @@ std::tuple 1) || (in0_num_blocks_w > 2)); } -template std::tuple get_conv_padded_input_shape_and_mem_config( +template std::tuple get_conv_padded_input_shape_and_mem_config( IDevice* device, const ttnn::Tensor& input_tensor_, const Conv2dConfig& conv_config, @@ -1289,10 +1201,9 @@ template std::tuple get_conv_padded uint32_t width, uint32_t in_channels, uint32_t out_channels, - bool is_mm_conv, - bool is_non_tile_mul_width); + bool is_mm_conv); -template std::tuple get_conv_padded_input_shape_and_mem_config( +template std::tuple get_conv_padded_input_shape_and_mem_config( MeshDevice* device, const ttnn::Tensor& input_tensor_, const Conv2dConfig& conv_config, @@ -1301,10 +1212,9 @@ template std::tuple get_conv_padded uint32_t width, uint32_t in_channels, uint32_t out_channels, - bool is_mm_conv, - bool is_non_tile_mul_width); + bool is_mm_conv); -template std::tuple shard_or_reshard_tensor_if_required( +template std::tuple shard_or_reshard_tensor_if_required( IDevice* device, const ttnn::Tensor& input_tensor_, const Conv2dConfig& conv_config, @@ -1314,10 +1224,9 @@ template std::tuple shard_or uint32_t in_channels, uint32_t out_channels, bool is_mm_conv, - bool auto_shard, - bool is_non_tile_mul_width); + bool auto_shard); -template std::tuple shard_or_reshard_tensor_if_required( +template std::tuple shard_or_reshard_tensor_if_required( MeshDevice* device, const ttnn::Tensor& input_tensor_, const Conv2dConfig& conv_config, @@ -1327,8 +1236,7 @@ template std::tuple shard_or uint32_t in_channels, uint32_t out_channel, bool is_mm_conv, - bool auto_shard, - bool is_non_tile_mul_width); + bool auto_shard); template DeviceComputeKernelConfig get_conv_default_compute_kernel_config( tt::tt_metal::IDevice* device); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp index b3d4a0b5553..440521121d5 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp @@ -37,12 +37,6 @@ bool use_matmul_for_1x1_conv( bool is_1d_deptwise_conv( uint32_t groups, uint32_t input_channels, uint32_t output_channels, uint32_t kernel_width, uint32_t image_width); - -bool check_non_tile_mul_width( - const CoreCoord& compute_grid, const Conv2dConfig& conv_config, const uint32_t in_channels); - -bool check_non_tile_height(const Conv2dConfig& conv_config, const uint32_t out_channels); - sliding_window::ParallelConfig determine_parallel_config( const 
TensorMemoryLayout shard_layout, uint32_t batch_size, @@ -54,7 +48,6 @@ sliding_window::ParallelConfig determine_parallel_config( ShardOrientation block_shard_orientation, bool enable_channels_padding, bool is_out_tiled = true, - bool is_non_tile_mul_shard_width = false, uint32_t act_block_h_override = 0); sliding_window::ParallelConfig determine_output_parallel_config( @@ -113,7 +106,7 @@ std::tuple -static std::tuple get_conv_padded_input_shape_and_mem_config( +static std::tuple get_conv_padded_input_shape_and_mem_config( T* device, const ttnn::Tensor& input_tensor_, const Conv2dConfig& conv_config, @@ -122,8 +115,7 @@ static std::tuple get_conv_padded_i uint32_t width, uint32_t in_channels, uint32_t out_channels, - bool is_mm_conv, - bool is_non_tile_mul_width = false); + bool is_mm_conv); template DeviceComputeKernelConfig get_conv_default_compute_kernel_config(DeviceType* device); @@ -148,7 +140,7 @@ Conv2dConfig determine_conv_config_for_auto_shard( const DeviceComputeKernelConfig& compute_config); template -std::tuple +std::tuple shard_or_reshard_tensor_if_required( T* device, const ttnn::Tensor& input_tensor_, @@ -159,8 +151,7 @@ shard_or_reshard_tensor_if_required( uint32_t in_channels, uint32_t out_channels, bool is_mm_conv, - bool auto_shard, - bool is_non_tile_mul_width = false); + bool auto_shard); std::ostream& operator<<(std::ostream& os, const Conv2dConfig& config); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp index a7f1c2a774a..249fab4d7c3 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp @@ -71,8 +71,7 @@ Tensor optimized_conv_new( bool enable_act_double_buffer, bool enable_weights_double_buffer, bool enable_split_reader, - bool enable_subblock_padding, - bool use_non_tile_height) { + bool enable_subblock_padding) { std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({a, b}))}; operation::launch_op( @@ -91,8 +90,7 @@ Tensor optimized_conv_new( enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, - enable_subblock_padding, - use_non_tile_height]( + enable_subblock_padding]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { @@ -138,8 +136,7 @@ Tensor optimized_conv_new( enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, - enable_subblock_padding, - use_non_tile_height); + enable_subblock_padding); IDevice* device = a.device(); optimized_conv_op.pre_op_l1_allocation_size_bytes = @@ -163,10 +160,8 @@ void OptimizedConvNew::validate( TT_FATAL((this->dtype == DataType::BFLOAT16) || (this->dtype == DataType::FLOAT32), "Error"); } if (this->memory_config.is_sharded()) { - uint32_t out_block_h_ntiles = - optimized_conv_op_utils::div_up(parallelization_config.per_core_out_matrix_height, TILE_HEIGHT); - uint32_t per_core_out_matrix_width_ntiles = - optimized_conv_op_utils::div_up(parallelization_config.per_core_out_matrix_width, TILE_WIDTH); + uint32_t out_block_h_ntiles = parallelization_config.per_core_out_matrix_height_ntile; + uint32_t per_core_out_matrix_width_ntiles = parallelization_config.per_core_out_matrix_width_ntile; auto [act_matrix_shape, act_matrix_shape_unpadded] = optimized_conv_op_utils::compute_opt_conv_activation_as_mm_shape( input_tensor_a.get_padded_shape(), @@ -207,10 +202,8 @@ std::vector 
OptimizedConvNew::compute_output_specs(const std::vector // Tiled output shape is padded shape. Padded to tile shape. auto shape_w = batch_size * conv_output_h * conv_output_w; auto shape_c = output_channels; - auto padded_shape_w = this->use_non_tile_height - ? parallelization_config.num_cores_nhw * parallelization_config.per_core_out_matrix_height - : parallelization_config.num_cores_nhw * - tt::round_up(parallelization_config.per_core_out_matrix_height, TILE_HEIGHT); + auto padded_shape_w = + parallelization_config.num_cores_nhw * parallelization_config.per_core_out_matrix_height_ntile * TILE_HEIGHT; auto padded_shape_c = tt::round_up(this->output_channels, TILE_WIDTH); ttnn::Shape output_shape({1, 1, shape_w, shape_c}); ttnn::Shape padded_output_shape({1, 1, padded_shape_w, padded_shape_c}); @@ -219,24 +212,9 @@ std::vector OptimizedConvNew::compute_output_specs(const std::vector if (this->memory_config.is_sharded()) { if (this->memory_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) { uint32_t total_height_tiles = padded_output_shape.volume() / padded_output_shape[-1] / TILE_HEIGHT; - uint32_t num_cores; - std::array shard_shape; - if (this->use_non_tile_height) { - num_cores = this->parallelization_config.num_cores_nhw; - uint32_t total_height = padded_output_shape.volume() / padded_output_shape[-1]; - shard_shape = {(uint32_t)(total_height / num_cores), padded_output_shape[-1]}; - } else { - num_cores = total_height_tiles / - tt::div_up(this->parallelization_config.per_core_out_matrix_height, TILE_HEIGHT); - CoreRangeSet shard_grid = - tt::tt_metal::num_cores_to_corerangeset(num_cores, this->parallelization_config.grid_size, true); - - shard_shape = { - optimized_conv_op_utils::div_up( - this->parallelization_config.per_core_out_matrix_height, TILE_HEIGHT) * - TILE_HEIGHT, - padded_output_shape[-1]}; - } + uint32_t num_cores = total_height_tiles / this->parallelization_config.per_core_out_matrix_height_ntile; + std::array shard_shape = { + this->parallelization_config.per_core_out_matrix_height_ntile * TILE_HEIGHT, padded_output_shape[-1]}; CoreRangeSet shard_grid = tt::tt_metal::num_cores_to_corerangeset(num_cores, this->parallelization_config.grid_size, true); auto shard_spec = ShardSpec{shard_grid, shard_shape, ShardOrientation::ROW_MAJOR}; @@ -249,8 +227,8 @@ std::vector OptimizedConvNew::compute_output_specs(const std::vector } else if (this->memory_config.memory_layout == TensorMemoryLayout::WIDTH_SHARDED) { uint32_t total_height_tiles = padded_output_shape.volume() / padded_output_shape[-1] / TILE_HEIGHT; std::array shard_shape = { - tt::div_up(this->parallelization_config.per_core_out_matrix_height, TILE_HEIGHT) * TILE_HEIGHT, - tt::div_up(this->parallelization_config.per_core_out_matrix_width, TILE_WIDTH) * TILE_WIDTH}; + this->parallelization_config.per_core_out_matrix_height_ntile * TILE_HEIGHT, + this->parallelization_config.per_core_out_matrix_width_ntile * TILE_WIDTH}; auto shard_grid = this->memory_config.shard_spec.value().grid; auto shard_spec = ShardSpec{shard_grid, shard_shape, this->memory_config.shard_spec.value().orientation}; auto mem_config = this->memory_config; @@ -314,8 +292,7 @@ operation::ProgramWithCallbacks OptimizedConvNew::create_program( enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, - enable_subblock_padding, - use_non_tile_height); + enable_subblock_padding); const uint32_t post_op_l1_allocation_size = device->allocator()->get_statistics(tt::tt_metal::BufferType::L1).total_allocated_bytes; @@ -340,7 +317,6 
@@ operation::ProgramWithCallbacks OptimizedConvNew::create_program( .enable_subblock_padding = enable_subblock_padding}, this->memory_config, has_bias, - use_non_tile_height, is_1d_deptwise_conv( groups, input_tensor_shape[3], diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp index 6f804922950..04557524b76 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp @@ -122,13 +122,8 @@ struct OptimizedConvParallelizationConfig { CoreCoord grid_size; // (x,y) uint32_t num_cores_nhw = 1; uint32_t num_cores_c = 1; - uint32_t per_core_out_matrix_height = 1; - uint32_t per_core_out_matrix_width = 1; - // std::size_t in0_block_w; - // std::size_t out_subblock_h; - // std::size_t out_subblock_w; - // std::size_t per_core_M; - // std::size_t per_core_N; + uint32_t per_core_out_matrix_height_ntile = 1; + uint32_t per_core_out_matrix_width_ntile = 1; CoreCoord get_grid_size() const { return this->grid_size; } }; @@ -159,8 +154,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_ bool enable_act_double_buffer, bool enable_weights_double_buffer, bool enable_split_reader, - bool enable_subblock_padding, - bool use_non_tile_height); + bool enable_subblock_padding); // new micro op struct OptimizedConvNew { @@ -179,7 +173,6 @@ struct OptimizedConvNew { bool enable_weights_double_buffer; bool enable_split_reader; bool enable_subblock_padding; - bool use_non_tile_height; uint32_t pre_op_l1_allocation_size_bytes; OptimizedConvNew( const sliding_window::SlidingWindowConfig& sliding_window_config, @@ -198,8 +191,7 @@ struct OptimizedConvNew { bool enable_act_double_buffer, bool enable_weights_double_buffer, bool enable_split_reader, - bool enable_subblock_padding, - bool use_non_tile_height) : + bool enable_subblock_padding) : output_channels(output_channels), groups(groups), sliding_window_config(sliding_window_config), @@ -216,8 +208,7 @@ struct OptimizedConvNew { enable_act_double_buffer(enable_act_double_buffer), enable_weights_double_buffer(enable_weights_double_buffer), enable_split_reader(enable_split_reader), - enable_subblock_padding(enable_subblock_padding), - use_non_tile_height(use_non_tile_height) {} + enable_subblock_padding(enable_subblock_padding) {} void validate( const std::vector& input_tensors, @@ -290,8 +281,7 @@ Tensor optimized_conv_new( bool enable_act_double_buffer = false, bool enable_weights_double_buffer = false, bool enable_split_reader = false, - bool enable_subblock_padding = false, - bool use_non_tile_height = false); + bool enable_subblock_padding = false); // Only enable packer l1 accumulation when there are in0_num_blocks_w > 2, otherwise // unnecessary overhead for reconfigs are added. 
Last iteration of l1 accumulation @@ -317,7 +307,6 @@ conv_op_l1_usage calculate_L1_usage( const Conv2dConfig& conv_config, const MemoryConfig& output_memory_config, bool enable_bias, - bool use_non_tile_height, bool is_1d_depthwise_conv); } // namespace conv2d diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index d0e917aee50..32fd24971e8 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -33,7 +33,6 @@ const uint32_t tilize_mode_tilized_act_cb = CBIndex::c_25; const uint32_t untilize_mode_reblock_cb = CBIndex::c_26; const uint32_t out0_cb = CBIndex::c_16; const uint32_t temp_sum_cb = CBIndex::c_27; -const uint32_t untilized_padded_out_cb = CBIndex::c_28; } // namespace CMAKE_UNIQUE_NAMESPACE } // namespace @@ -84,8 +83,7 @@ std::tuple create_CBs_for_sharded_input_v2( bool with_bias, bool split_reader, bool fp32_dest_acc_en, - bool packer_l1_acc_en, - bool use_non_tile_height) { + bool packer_l1_acc_en) { using namespace CMAKE_UNIQUE_NAMESPACE; tt::DataFormat interm0_df = @@ -199,42 +197,15 @@ std::tuple create_CBs_for_sharded_input_v2( bool need_unpad_after_untilize = output_shard_shape[1] * output_shard_shape[0] < num_writer_output_tiles * TILE_HW; - // If only width is non-tile multiple - if (need_unpad_after_untilize && !use_non_tile_height && weight_width_sliced) { - uint32_t num_bytes_for_df = datum_size(out_df); - CircularBufferConfig compute_cb_output_config = - CircularBufferConfig(num_writer_output_tiles * out_tile_size, {{untilized_padded_out_cb, out_df}}) - .set_page_size(untilized_padded_out_cb, out_tile_size); - auto compute_cb_output = tt_metal::CreateCircularBuffer(program, core, compute_cb_output_config); - log_debug( - LogOp, - "untilized padded out CB(shard width non-tile multiple): {}, npages: {}, pagesize: {}", - untilized_padded_out_cb, - num_writer_output_tiles, - out_tile_size); - CircularBufferConfig cb_output_config = - CircularBufferConfig( - num_bytes_for_df * output_shard_shape[0] * output_shard_shape[1], {{out0_cb, out_df}}) - .set_page_size(out0_cb, output_shard_shape[1] * num_bytes_for_df); - cb_output_config = cb_output_config.set_globally_allocated_address(*output.buffer()); - cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); - log_debug( - LogOp, - "output CB(shard widht non-tile multiple): {}, npages: {}, pagesize: {}", - out0_cb, - output_shard_shape[0], - output_shard_shape[1] * num_bytes_for_df); - } else { - auto shard_shape = output.shard_spec().value().shape; - uint32_t aligned_output_stick_nbytes = - use_non_tile_height ? shard_shape[1] * output.element_size() : out_tile_size; - uint32_t aligned_output_num_pages = use_non_tile_height ? 
shard_shape[0] : num_writer_output_tiles; - CircularBufferConfig cb_output_config = - CircularBufferConfig(aligned_output_num_pages * aligned_output_stick_nbytes, {{out0_cb, out_df}}) - .set_page_size(out0_cb, aligned_output_stick_nbytes); - cb_output_config = cb_output_config.set_globally_allocated_address(*output.buffer()); - cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); - } + + auto shard_shape = output.shard_spec().value().shape; + uint32_t aligned_output_stick_nbytes = out_tile_size; + uint32_t aligned_output_num_pages = num_writer_output_tiles; + CircularBufferConfig cb_output_config = + CircularBufferConfig(aligned_output_num_pages * aligned_output_stick_nbytes, {{out0_cb, out_df}}) + .set_page_size(out0_cb, aligned_output_stick_nbytes); + cb_output_config = cb_output_config.set_globally_allocated_address(*output.buffer()); + cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); } else { // Share buffer if same data format if (interm0_df == out_df) { @@ -425,8 +396,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( bool enable_act_double_buffer, bool enable_weights_double_buffer, bool enable_split_reader, - bool enable_subblock_padding, - bool use_non_tile_height) { + bool enable_subblock_padding) { using namespace CMAKE_UNIQUE_NAMESPACE; bool pass = true; tt_metal::IDevice* device = a.device(); @@ -435,8 +405,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( TT_FATAL(output_channels <= b.get_padded_shape()[3], "Invalid weight shape. Incorrect weight tensor."); uint32_t act_block_h_ntiles = block_config.act_block_h_ntiles; uint32_t act_block_w_ntiles = block_config.act_block_w_ntiles; - uint32_t weight_block_w_ntiles = div_up(parallelization_config.per_core_out_matrix_width, TILE_WIDTH); - uint32_t out_block_h_ntiles = div_up(parallelization_config.per_core_out_matrix_height, TILE_HEIGHT); + uint32_t weight_block_w_ntiles = parallelization_config.per_core_out_matrix_width_ntile; + uint32_t out_block_h_ntiles = parallelization_config.per_core_out_matrix_height_ntile; uint32_t out_subblock_h_ntiles = block_config.out_subblock_h_ntiles; uint32_t out_subblock_w_ntiles = block_config.out_subblock_w_ntiles; @@ -535,8 +505,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t num_cores_y = p_config.grid_size.y; uint32_t total_num_cores = num_cores_x * num_cores_y; - uint32_t per_core_out_matrix_width_ntiles = div_up(parallelization_config.per_core_out_matrix_width, TILE_WIDTH); - uint32_t per_core_out_matrix_height_ntiles = div_up(parallelization_config.per_core_out_matrix_height, TILE_HEIGHT); + uint32_t per_core_out_matrix_width_ntiles = parallelization_config.per_core_out_matrix_width_ntile; + uint32_t per_core_out_matrix_height_ntiles = parallelization_config.per_core_out_matrix_height_ntile; bool block_sharded = a.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED; bool height_sharded = a.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED; @@ -919,14 +889,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( log_debug(LogOp, "num_blocks_out_h_per_core: {}", num_blocks_out_h_per_core); TT_FATAL(act_matrix_height_ntiles % per_core_out_matrix_height_ntiles == 0, "Error"); - uint32_t total_active_num_cores_per_weight_slice; - if (use_non_tile_height) { - total_active_num_cores_per_weight_slice = - tt::round_up(act_matrix_height_unpadded, parallelization_config.num_cores_nhw) / - 
parallelization_config.per_core_out_matrix_height; - } else { - total_active_num_cores_per_weight_slice = act_matrix_height_ntiles / per_core_out_matrix_height_ntiles; - } + uint32_t total_active_num_cores_per_weight_slice = act_matrix_height_ntiles / per_core_out_matrix_height_ntiles; TT_FATAL(total_active_num_cores_per_weight_slice <= total_num_cores_per_weight_slice, "Error"); uint32_t total_noop_cores = total_num_cores_per_weight_slice - total_active_num_cores_per_weight_slice; uint32_t total_active_num_cores = total_active_num_cores_per_weight_slice * num_weight_slices_width; @@ -1074,8 +1037,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t output_block_num_tiles = enable_subblock_padding ? (act_block_h_ntiles_padded * weight_block_w_ntiles) : writer_output_block_num_tiles; - uint32_t aligned_output_num_pages = - use_non_tile_height ? output.shard_spec().value().shape[0] : writer_output_block_num_tiles; + uint32_t aligned_output_num_pages = writer_output_block_num_tiles; std::vector reader_rt_args; std::vector reader_compile_time_args; @@ -1157,8 +1119,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( has_bias, split_reader, fp32_dest_acc_en, - packer_l1_acc_en, - use_non_tile_height); + packer_l1_acc_en); } CBHandle cb_sharded_act = std::get<0>(input_output_cbs); CBHandle cb_output = std::get<1>(input_output_cbs); @@ -1391,20 +1352,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( writer_compile_time_args.insert( writer_compile_time_args.end(), split_reader_args.begin(), split_reader_args.end()); } - bool need_unpad_after_untilize = - parallelization_config.per_core_out_matrix_width < per_core_out_matrix_width_ntiles * TILE_WIDTH; - if (need_unpad_after_untilize) { - TT_FATAL(block_sharded, "Need to handle this case for non-sliced weights"); - TT_FATAL(untilize_out, "Cannot support non-tile multiple shard width with tilized output"); - writer_compile_time_args.push_back(per_core_out_matrix_width_ntiles); - writer_compile_time_args.push_back(per_core_out_matrix_width_ntiles * TILE_WIDTH * 2); - writer_compile_time_args.push_back(parallelization_config.per_core_out_matrix_width * 2); - writer_compile_time_args.push_back(untilized_padded_out_cb); - writer_defines["UNPAD_UNTILIZE_OUT"] = 1; - writer_mcast_sender_defines["UNPAD_UNTILIZE_OUT"] = 1; - } - uint32_t compute_output_cb = need_unpad_after_untilize ? untilized_padded_out_cb : out0_cb; std::vector compute_kernel_args = { in0_block_w, act_num_subblocks, @@ -1428,9 +1376,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( untilize_out, bias_ntiles_per_core, - compute_output_cb, - aligned_output_num_pages, - use_non_tile_height}; + out0_cb}; auto writer_mcast_noc = NOC::NOC_0; auto reader_noc = writer_mcast_noc == NOC::NOC_0 ? 
NOC::NOC_1 : NOC::NOC_0; @@ -1816,8 +1762,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new( bool enable_act_double_buffer, bool enable_weights_double_buffer, bool enable_split_reader, - bool enable_subblock_padding, - bool use_non_tile_height) { + bool enable_subblock_padding) { tt_metal::Program program = tt_metal::CreateProgram(); ttnn::operations::sliding_window::ParallelConfig parallel_config; @@ -1889,8 +1834,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new( enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, - enable_subblock_padding, - use_non_tile_height); + enable_subblock_padding); } } // namespace conv2d diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp index 3ed850823b9..84d7bc017aa 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp @@ -62,9 +62,8 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh TT_FATAL(output_channels <= b.get_padded_shape()[3], "Invalid weight shape. Incorrect weight tensor."); uint32_t act_block_h_ntiles = block_config.act_block_h_ntiles; uint32_t act_block_w_ntiles = block_config.act_block_w_ntiles; - uint32_t weight_block_w_ntiles = - div_up(parallelization_config.per_core_out_matrix_width, tt::constants::TILE_WIDTH); - uint32_t out_block_h_ntiles = div_up(parallelization_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT); + uint32_t weight_block_w_ntiles = parallelization_config.per_core_out_matrix_width_ntile; + uint32_t out_block_h_ntiles = parallelization_config.per_core_out_matrix_height_ntile; uint32_t out_subblock_h_ntiles = block_config.out_subblock_h_ntiles; uint32_t out_subblock_w_ntiles = block_config.out_subblock_w_ntiles; @@ -168,12 +167,10 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh const auto& p_config = parallelization_config; uint32_t num_cores_x = p_config.grid_size.x; uint32_t num_cores_y = p_config.grid_size.y; - uint32_t per_core_out_matrix_height_ntiles = - div_up(p_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT); - uint32_t per_core_out_matrix_width_ntiles = div_up(p_config.per_core_out_matrix_width, tt::constants::TILE_WIDTH); + uint32_t per_core_out_matrix_height_ntiles = p_config.per_core_out_matrix_height_ntile; // weight_width_sliced determines is 1d-sysarr-conv or 2d-sysarr-conv - bool weight_width_sliced = per_core_out_matrix_width_ntiles < weight_matrix_width_ntiles; - // uint32_t conv_act_c_blocks = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles; + bool weight_width_sliced = p_config.per_core_out_matrix_width_ntile < weight_matrix_width_ntiles; + // uint32_t conv_act_c_blocks = weight_matrix_width_ntiles / p_config.per_core_out_matrix_width_ntile; uint32_t input_channels_padded = shard_shape[1] * input_num_cores; // TT_FATAL(conv_act_c_blocks == p_config.num_cores_c, "Error"); TT_FATAL(input_channels_padded >= ashape[3], "Incorrect padding of input channels!"); @@ -443,10 +440,10 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh bias_in_dram = bias_buffer->buffer_type() == BufferType::DRAM; } - uint32_t num_weight_slices_width = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles; + uint32_t 
num_weight_slices_width = weight_matrix_width_ntiles / p_config.per_core_out_matrix_width_ntile; uint32_t num_blocks_act_h_per_core = - (per_core_out_matrix_height_ntiles + act_block_h_ntiles - 1) / act_block_h_ntiles; - uint32_t num_blocks_weight_w_per_core = per_core_out_matrix_width_ntiles / weight_block_w_ntiles; + (p_config.per_core_out_matrix_height_ntile + act_block_h_ntiles - 1) / act_block_h_ntiles; + uint32_t num_blocks_weight_w_per_core = p_config.per_core_out_matrix_width_ntile / weight_block_w_ntiles; uint32_t bias_ntiles_per_core = bias_ntiles / num_weight_slices_width; auto output_shape = sliding_window_config.get_output_shape(); @@ -511,8 +508,8 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh log_debug(LogOp, "act_matrix_height_ntiles: {}", act_matrix_height_ntiles); log_debug(LogOp, "act_matrix_width_ntiles: {}", act_matrix_width_ntiles); log_debug(LogOp, "weight_matrix_width_ntiles: {}", weight_matrix_width_ntiles); - log_debug(LogOp, "per_core_out_matrix_height_ntiles: {}", per_core_out_matrix_height_ntiles); - log_debug(LogOp, "per_core_out_matrix_width_ntiles: {}", per_core_out_matrix_width_ntiles); + log_debug(LogOp, "per_core_out_matrix_height_ntiles: {}", p_config.per_core_out_matrix_height_ntile); + log_debug(LogOp, "per_core_out_matrix_width_ntiles: {}", p_config.per_core_out_matrix_width_ntile); log_debug(LogOp, "per_core_num_blocks_act_w: {}", per_core_num_blocks_act_w); log_debug(LogOp, "num_blocks_act_h: {}", num_blocks_act_h); @@ -648,8 +645,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh if (packer_l1_acc) { compute_defines["PACKER_L1_ACC"] = "1"; } - uint32_t num_output_tiles = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles; - uint32_t use_non_tile_height = false; + uint32_t num_output_tiles = per_core_out_matrix_height_ntiles * p_config.per_core_out_matrix_width_ntile; compute_kernel_args = { act_block_w_ntiles, // in0_block_w act_num_subblocks, // in0_num_sublocks @@ -675,8 +671,6 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh bias_ntiles, out0_cb, - num_output_tiles, - use_non_tile_height, input_num_cores, // in0_nblocks_w_tilize. Repeat tilize after all cores have done one round of MCAST. }; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp index 94ea5310615..94545fc3704 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp @@ -40,24 +40,19 @@ inline void tilize_in( tilize_uninit(in_cb_id, out_cb_id); } // tilize_in() -template +template inline void reblock_and_untilize( uint32_t num_out_subblocks_in_col, uint32_t out_subblock_num_tiles, uint32_t out_subblock_h, - uint32_t output_rows_h, uint32_t interm_cb_id, uint32_t out_cb_id) { - constexpr bool is_non_tile_height_ = is_non_tile_height; - uint32_t TILE_SIZE = is_non_tile_height_ ? 32 : out_block_w; uint32_t num_tiles_in_row_of_subblocks = mulsi3(out_subblock_num_tiles, num_out_subblocks_in_col); cb_wait_front(interm_cb_id, num_tiles_in_row_of_subblocks); uint32_t within_block_index = 0; for (uint32_t h = 0; h < out_subblock_h; h++) { uint32_t block_offset = 0; - uint32_t out_sub_block_rows_h = output_rows_h <= TILE_SIZE ? 
output_rows_h : TILE_SIZE; - uint32_t rows_to_copy = is_non_tile_height_ ? out_sub_block_rows_h : 16; - cb_reserve_back(out_cb_id, out_sub_block_rows_h); + cb_reserve_back(out_cb_id, out_block_w); for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) { tile_regs_acquire(); for (uint32_t w = 0; w < out_subblock_w; w++) { @@ -66,12 +61,11 @@ inline void reblock_and_untilize( } tile_regs_commit(); tile_regs_wait(); - pack_untilize_dst(out_cb_id, 1, n, rows_to_copy); + pack_untilize_dst(out_cb_id, 1, n); tile_regs_release(); block_offset += out_subblock_num_tiles; } - cb_push_back(out_cb_id, out_sub_block_rows_h); - output_rows_h -= out_sub_block_rows_h; + cb_push_back(out_cb_id, out_block_w); within_block_index += out_subblock_w; } cb_pop_front(interm_cb_id, num_tiles_in_row_of_subblocks); @@ -100,11 +94,9 @@ void MAIN { constexpr bool tilize_in0 = get_compile_time_arg_val(14); constexpr bool untilize_out = get_compile_time_arg_val(15); constexpr uint32_t out_cb_id = get_compile_time_arg_val(17); - uint32_t output_rows_h = get_compile_time_arg_val(18); - constexpr bool is_non_tile_height = get_compile_time_arg_val(19); #ifdef WIDTH_SHARDED - constexpr uint32_t in0_nblocks_w_tilize = get_compile_time_arg_val(20); + constexpr uint32_t in0_nblocks_w_tilize = get_compile_time_arg_val(18); #endif constexpr uint32_t out_block_num_tiles = in0_num_subblocks * in1_num_subblocks * out_subblock_num_tiles; @@ -118,7 +110,6 @@ void MAIN { constexpr uint32_t in0_cb_second_reader_id = tt::CBIndex::c_7; constexpr uint32_t matmul_partials_cb = tt::CBIndex::c_24; constexpr uint32_t tilized_in0_cb_id = tt::CBIndex::c_25; - // constexpr uint32_t untilize_mode_reblock_cb = tt::CBIndex::c_26; constexpr uint32_t untilize_mode_out_cb_id = untilize_out ? matmul_partials_cb : out_cb_id; @@ -439,19 +430,9 @@ void MAIN { #endif pack_untilize_dst_init_short(out_cb_id); copy_tile_to_dst_init_short(matmul_partials_cb); - uint32_t curr_tile_output_rows_h = 0; - uint32_t TILE_SIZE = is_non_tile_height ? 32 : out_block_w; - TILE_SIZE = TILE_SIZE * out_subblock_h; for (uint32_t in0_subblock_i = 0; in0_subblock_i < in0_num_subblocks; ++in0_subblock_i) { - curr_tile_output_rows_h = output_rows_h < TILE_SIZE ? 
output_rows_h : TILE_SIZE; - reblock_and_untilize( - in1_num_subblocks, - out_subblock_num_tiles, - out_subblock_h, - curr_tile_output_rows_h, - matmul_partials_cb, - out_cb_id); - output_rows_h -= curr_tile_output_rows_h; + reblock_and_untilize( + in1_num_subblocks, out_subblock_num_tiles, out_subblock_h, matmul_partials_cb, out_cb_id); } pack_untilize_uninit(matmul_partials_cb); } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/writer_tiled_out_2d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/writer_tiled_out_2d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp index b4760a862f5..37c8edb7701 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/writer_tiled_out_2d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/writer_tiled_out_2d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp @@ -48,12 +48,6 @@ void kernel_main() { constexpr uint32_t out_addr = get_compile_time_arg_val(29); -#ifdef UNPAD_UNTILIZE_OUT - constexpr uint32_t out_block_width_ntiles = get_compile_time_arg_val(33); - constexpr uint32_t out_block_width_padded_bytes = get_compile_time_arg_val(34); - constexpr uint32_t out_block_width_bytes = get_compile_time_arg_val(35); - constexpr uint32_t untilized_padded_out_cb = get_compile_time_arg_val(36); -#endif uint32_t i = 0; i += 19; uint32_t out_start_tile_id = get_arg_val(i); @@ -194,30 +188,8 @@ void kernel_main() { } // out_num_blocks_w #ifdef SHARDED_OUT -#ifdef UNPAD_UNTILIZE_OUT - uint32_t dst_cb_addr = get_write_ptr(cb_id_out0); - - uint32_t src_cb_addr = get_read_ptr(untilized_padded_out_cb); - for (uint32_t nbw = 0; nbw < out_num_blocks_w; nbw++) { - for (uint32_t nbh = 0; nbh < out_num_blocks_h; nbh++) { - for (uint32_t bh = 0; bh < out_block_height_num_tiles; bh++) { - cb_wait_front(untilized_padded_out_cb, out_block_width_ntiles); - uint32_t src_cb_addr = get_read_ptr(untilized_padded_out_cb); - for (uint32_t r = 0; r < 32; r++) { - noc_async_read(get_noc_addr(src_cb_addr), dst_cb_addr, out_block_width_bytes); - noc_async_read_barrier(); - src_cb_addr += out_block_width_padded_bytes; - - dst_cb_addr += out_aligned_page_size; - } - cb_pop_front(untilized_padded_out_cb, out_block_width_ntiles); - } - } - } -#else cb_wait_front( cb_id_out0, out_subblock_tile_count * out_num_subblocks_h * out_num_subblocks_w * out_num_blocks_w * out_num_blocks_h); #endif -#endif } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/writer_tiled_out_2d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/writer_tiled_out_2d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp index 0053e2c68d2..88744e90369 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/writer_tiled_out_2d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/writer_tiled_out_2d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp @@ -49,12 +49,6 @@ void kernel_main() { constexpr uint32_t out_addr = get_compile_time_arg_val(29); -#ifdef UNPAD_UNTILIZE_OUT - constexpr uint32_t out_block_width_ntiles = get_compile_time_arg_val(33); - constexpr uint32_t out_block_width_padded_bytes = get_compile_time_arg_val(34); - constexpr uint32_t out_block_width_bytes = get_compile_time_arg_val(35); - constexpr uint32_t untilized_padded_out_cb = get_compile_time_arg_val(36); -#endif uint32_t i = 0; i += 1; const uint32_t 
weight_addr_dram_base = get_arg_val(i); @@ -337,30 +331,8 @@ void kernel_main() { weight_start_tile_id += weight_next_block_stride_w; } // out_num_blocks_w #ifdef SHARDED_OUT -#ifdef UNPAD_UNTILIZE_OUT - uint32_t dst_cb_addr = get_write_ptr(cb_id_out0); - - uint32_t src_cb_addr = get_read_ptr(untilized_padded_out_cb); - for (uint32_t nbw = 0; nbw < out_num_blocks_w; nbw++) { - for (uint32_t nbh = 0; nbh < out_num_blocks_h; nbh++) { - for (uint32_t bh = 0; bh < out_block_height_num_tiles; bh++) { - cb_wait_front(untilized_padded_out_cb, out_block_width_ntiles); - uint32_t src_cb_addr = get_read_ptr(untilized_padded_out_cb); - for (uint32_t r = 0; r < 32; r++) { - noc_async_read(get_noc_addr(src_cb_addr), dst_cb_addr, out_block_width_bytes); - noc_async_read_barrier(); - src_cb_addr += out_block_width_padded_bytes; - - dst_cb_addr += out_aligned_page_size; - } - cb_pop_front(untilized_padded_out_cb, out_block_width_ntiles); - } - } - } -#else cb_wait_front( cb_id_out0, out_subblock_tile_count * out_num_subblocks_h * out_num_subblocks_w * out_num_blocks_w * out_num_blocks_h); #endif -#endif } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp index 2678a4ce2af..2f7b82a170e 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp @@ -504,24 +504,17 @@ ttnn::Tensor conv_bias_layout_convert( uint32_t weight_block_w_ntiles, const ParallelConfig& parallel_config, T* device, - uint32_t out_channels, - bool is_non_tile_mul_width) { + uint32_t out_channels) { ttnn::Tensor bias_tensor_ = bias_tensor; validate_bias_tensor(bias_tensor_); - if (!is_non_tile_mul_width) { - const auto& bias_shape = bias_tensor_.get_logical_shape(); - TT_FATAL(bias_shape[0] == 1 && bias_shape[1] == 1 && bias_shape[2] == 1, "bias shape is not correct"); - ttnn::Shape bias_channels_padded_shape({1, 1, 32, round_up(out_channels, weight_block_w_ntiles * 32)}); - bias_tensor_ = - ttnn::pad(bias_tensor_, bias_channels_padded_shape.to_array_4D(), tt::tt_metal::Array4D{0, 0, 0, 0}, 0); - bias_tensor_ = ttnn::to_layout(bias_tensor_, Layout::TILE, std::nullopt, std::nullopt, (T*)nullptr); - if (bias_tensor_.get_dtype() != bias_dtype) { - bias_tensor_ = ttnn::to_dtype(bias_tensor_, bias_dtype); - } - } else { - uint32_t num_cores_channels = get_num_cores_channels_from_parallel_config(parallel_config); - bias_tensor_ = - convert_conv_bias_tensor_to_tiled_layout_block_sharded(bias_tensor_, num_cores_channels, bias_dtype); + const auto& bias_shape = bias_tensor_.get_logical_shape(); + TT_FATAL(bias_shape[0] == 1 && bias_shape[1] == 1 && bias_shape[2] == 1, "bias shape is not correct"); + ttnn::Shape bias_channels_padded_shape({1, 1, 32, round_up(out_channels, weight_block_w_ntiles * 32)}); + bias_tensor_ = + ttnn::pad(bias_tensor_, bias_channels_padded_shape.to_array_4D(), tt::tt_metal::Array4D{0, 0, 0, 0}, 0); + bias_tensor_ = ttnn::to_layout(bias_tensor_, Layout::TILE, std::nullopt, std::nullopt, (T*)nullptr); + if (bias_tensor_.get_dtype() != bias_dtype) { + bias_tensor_ = ttnn::to_dtype(bias_tensor_, bias_dtype); } return bias_tensor_; } @@ -569,10 +562,6 @@ static OptimizedConvBlockConfig get_opt_block_config( ShardOrientation shard_orientation = conv_config.transpose_shards ? 
ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR; - const bool use_non_tile_height = check_non_tile_height(conv_config, out_channels); - - bool is_non_tile_mul_width = check_non_tile_mul_width(compute_grid_size, conv_config, in_channels); - if (input_memory_config.is_sharded() && !conv_config.reshard_if_not_optimal) { conv_config.shard_layout = input_memory_config.memory_layout; } @@ -593,8 +582,7 @@ static OptimizedConvBlockConfig get_opt_block_config( compute_grid_size, shard_orientation, !mm_conv, - !use_non_tile_height, - is_non_tile_mul_width, + true, conv_config.act_block_h_override); } auto output_parallel_config = parallel_config; @@ -610,11 +598,11 @@ static OptimizedConvBlockConfig get_opt_block_config( log_debug(tt::LogOp, "Changing width sharded output grid to {}", output_parallel_config.grid); } - uint32_t round_up_size = !use_non_tile_height ? tt::constants::TILE_HEIGHT : 1; auto conv_out_memory_config = create_sharded_memory_config_from_parallel_config( - ttnn::Shape({1, 1, batch_size * output_height * output_width, tt::round_up(out_channels, 32)}), + ttnn::Shape( + {1, 1, batch_size * output_height * output_width, tt::round_up(out_channels, tt::constants::TILE_WIDTH)}), output_parallel_config, - round_up_size); + tt::constants::TILE_HEIGHT); auto largest_parallel_config = output_parallel_config.grid.num_cores() > parallel_config.grid.num_cores() ? output_parallel_config : parallel_config; @@ -657,8 +645,7 @@ std::pair> prepare_conv_weights_biases uint32_t groups, uint32_t act_block_h_ntiles, uint32_t input_width, - const bool parameters_on_device, - bool is_non_tile_mul_width) { + const bool parameters_on_device) { validate_weight_tensor(weight_tensor); ttnn::Tensor weight_tensor_; // tensor to return ttnn::Tensor bias_tensor_; @@ -701,11 +688,7 @@ std::pair> prepare_conv_weights_biases uint32_t out_channel_padding = out_channels_padded - out_channels; ttnn::Shape weights_channels_padded_shape({out_channels_padded, in_channels_padded, window_h, window_w}); - if (is_non_tile_mul_width) { - weights_channels_padded_shape = ttnn::Shape( - {round_up(out_channels, 32), round_up(in_channels, input_channels_alignment), window_h, window_w}); - out_channels_padded = tt::round_up(out_channels, 32); - } + if (weights_bias_dtype == DataType::BFLOAT8_B) { TT_ASSERT(weight_tensor_.get_dtype() == DataType::FLOAT32); if (bias_tensor.has_value()) { @@ -757,8 +740,7 @@ std::pair> prepare_conv_weights_biases weight_block_w_ntiles, output_parallel_config, device, - out_channels_padded, - is_non_tile_mul_width); + out_channels_padded); bias_tensor_ = ttnn::operations::core::to_device(bias_tensor_, device, std::nullopt); } } @@ -819,10 +801,6 @@ ttnn::Tensor prepare_conv_weights( ShardOrientation shard_orientation = conv_config.transpose_shards ? 
ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR; - const bool use_non_tile_height = check_non_tile_height(conv_config, out_channels); - bool is_non_tile_mul_width = - check_non_tile_mul_width(device->compute_with_storage_grid_size(), conv_config, in_channels); - if (input_memory_config.is_sharded() && !conv_config.reshard_if_not_optimal) { conv_config.shard_layout = input_memory_config.memory_layout; } @@ -844,8 +822,7 @@ ttnn::Tensor prepare_conv_weights( device->compute_with_storage_grid_size(), shard_orientation, !mm_conv, - !use_non_tile_height, - is_non_tile_mul_width, + true, conv_config.act_block_h_override); } @@ -867,9 +844,7 @@ ttnn::Tensor prepare_conv_weights( device, groups, opt_conv_op_block_config.act_block_h_ntiles, - input_width, - false, - is_non_tile_mul_width); + input_width); return weight_tensor_on_device; } @@ -928,13 +903,10 @@ ttnn::Tensor prepare_conv_bias( ShardOrientation shard_orientation = conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR; - const bool use_non_tile_height = check_non_tile_height(conv_config, out_channels); - if (input_memory_config.is_sharded() && !conv_config.reshard_if_not_optimal) { conv_config.shard_layout = input_memory_config.memory_layout; } CoreCoord compute_grid = device->compute_with_storage_grid_size(); - bool is_non_tile_mul_width = check_non_tile_mul_width(compute_grid, conv_config, in_channels); ParallelConfig parallel_config; if (input_memory_config.shard_spec.has_value() && !conv_config.reshard_if_not_optimal) { parallel_config = { @@ -952,8 +924,7 @@ ttnn::Tensor prepare_conv_bias( compute_grid, shard_orientation, !mm_conv, - !use_non_tile_height, - is_non_tile_mul_width, + true, conv_config.act_block_h_override); } @@ -970,8 +941,7 @@ ttnn::Tensor prepare_conv_bias( weight_block_w_ntiles, output_parallel_config, device, - out_channels, - is_non_tile_mul_width); + out_channels); return bias_tensor_; } @@ -1028,8 +998,7 @@ template std::pair> prepare_conv_weigh uint32_t groups, uint32_t act_block_h_ntiles, uint32_t input_width, - const bool parameters_on_device, - bool is_non_tile_mul_width); + const bool parameters_on_device); template std::pair> prepare_conv_weights_biases_and_move_to_device( @@ -1045,8 +1014,7 @@ prepare_conv_weights_biases_and_move_to_device( uint32_t groups, uint32_t act_block_h_ntiles, uint32_t input_width, - const bool parameters_on_device, - bool is_non_tile_mul_width); + const bool parameters_on_device); template ttnn::Tensor prepare_conv_bias( const ttnn::Tensor& bias_tensor, @@ -1091,8 +1059,7 @@ template ttnn::Tensor conv_bias_layout_convert( uint32_t weight_block_w_ntiles, const sliding_window::ParallelConfig& parallel_config, IDevice* device, - uint32_t out_channels, - bool is_non_tile_mul_width); + uint32_t out_channels); template ttnn::Tensor conv_bias_layout_convert( const ttnn::Tensor& bias_tensor, @@ -1101,8 +1068,7 @@ template ttnn::Tensor conv_bias_layout_convert( uint32_t weight_block_w_ntiles, const sliding_window::ParallelConfig& parallel_config, MeshDevice* device, - uint32_t out_channels, - bool is_non_tile_mul_width); + uint32_t out_channels); } // namespace conv2d } // namespace operations::conv diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp index d1951b8bb33..5377a62a345 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp @@ -117,8 +117,7 @@ 
std::pair> prepare_conv_weights_biases uint32_t groups, uint32_t act_block_h_ntiles, uint32_t input_width, - const bool parameters_on_device = true, - bool is_non_tile_mul_width = false); + const bool parameters_on_device = true); } // namespace conv2d } // namespace operations::conv diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp index 7c5ab221a0e..d9e4f831fb5 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp @@ -202,26 +202,25 @@ Result conv_transpose2d( } // Call Halo Transpose - auto [input_tensor_post_tm, parallel_config, output_parallel_config, use_non_tile_height] = - shard_or_reshard_tensor_if_required( - device, - input_tensor, - conv_config, - batch_size, - output_height, - output_width, - in_channels, - out_channels, - mm_conv, - auto_shard); + auto [input_tensor_post_tm, parallel_config, output_parallel_config] = shard_or_reshard_tensor_if_required( + device, + input_tensor, + conv_config, + batch_size, + output_height, + output_width, + in_channels, + out_channels, + mm_conv, + auto_shard); - uint32_t round_up_size = !use_non_tile_height ? tt::constants::TILE_HEIGHT : 1; + uint32_t round_up_size = tt::constants::TILE_HEIGHT; Tensor halo_output; if (!mm_conv) { sliding_window_config.num_cores_nhw = get_num_cores_nhw_from_parallel_config(parallel_config); sliding_window_config.core_range_set = input_tensor_post_tm.memory_config().shard_spec.value().grid; - sliding_window_config.snap_to_tile = !use_non_tile_height; + sliding_window_config.snap_to_tile = true; halo_output = ttnn::halo( DefaultQueueId, From 7193d385658ef86c5d7213006876b3002a2adaaa Mon Sep 17 00:00:00 2001 From: Joseph Chu Date: Tue, 18 Feb 2025 23:37:25 +0000 Subject: [PATCH 163/316] #0: Fix ttnn.distribute(..) 
API for dtype=bfloat8_b --- tests/ttnn/unit_tests/test_multi_device.py | 11 +++++++++++ ttnn/ttnn/operations/core.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/ttnn/unit_tests/test_multi_device.py b/tests/ttnn/unit_tests/test_multi_device.py index 71ccbbceddc..845ab31c894 100644 --- a/tests/ttnn/unit_tests/test_multi_device.py +++ b/tests/ttnn/unit_tests/test_multi_device.py @@ -718,3 +718,14 @@ def test_line_all_gather_after_reshape(mesh_device): mesh_device=mesh_device, topology=ttnn.Topology.Linear, ) + + +def test_distribute_api(mesh_device): + torch_hidden_states = torch.rand((1, 1, 32, 32), dtype=torch.bfloat16) + with ttnn.distribute(ttnn.ReplicateTensorToMesh(mesh_device)): + hidden_states = ttnn.from_torch( + torch_hidden_states, + dtype=ttnn.bfloat8_b, + layout=ttnn.TILE_LAYOUT, + device=mesh_device, + ) diff --git a/ttnn/ttnn/operations/core.py b/ttnn/ttnn/operations/core.py index 179cb169384..39db661f28e 100644 --- a/ttnn/ttnn/operations/core.py +++ b/ttnn/ttnn/operations/core.py @@ -200,7 +200,7 @@ def from_torch( if layout != ttnn.TILE_LAYOUT: raise RuntimeError("ttnn.from_torch: bfloat8_b/bfloat4_b requires TILE_LAYOUT!") # Tilize tensor - tensor = ttnn.from_torch(tensor, layout=ttnn.TILE_LAYOUT, tile=tile, pad_value=pad_value) + tensor = ttnn.from_torch(tensor, layout=ttnn.TILE_LAYOUT, tile=tile, pad_value=pad_value, mesh_mapper=None) logical_shape = tensor.shape padded_shape = tensor.padded_shape tensor = tensor.reshape(tensor.padded_shape) From 9b5f53aee16cc16257737fe712ef529e9ede8e9f Mon Sep 17 00:00:00 2001 From: Kalaivani Baskar <156762498+KalaivaniMCW@users.noreply.github.com> Date: Wed, 19 Feb 2025 19:01:22 +0530 Subject: [PATCH 164/316] #0: binary_ng scalar - fix const qualifier (#17932) ### Ticket Link to Github Issue ### Problem description Adding tests for binary_ng - scalar input ### What's changed - fix const qualifier - added new tests ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13407843503 - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../operations/eltwise/test_binaryng_fp32.py | 25 +++++++++++++++++++ .../dataflow/writer_interleaved_scalar.cpp | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binaryng_fp32.py b/tests/ttnn/unit_tests/operations/eltwise/test_binaryng_fp32.py index 7cbe875449a..b1d6396789c 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binaryng_fp32.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binaryng_fp32.py @@ -571,3 +571,28 @@ def test_bitwise_right_shift(device, ttnn_function): status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + 
ttnn.experimental.sub, + ttnn.experimental.add, + ttnn.experimental.rsub, + ttnn.experimental.mul, + ttnn.experimental.div, + ], +) +def test_ng_scalar_fp32(device, ttnn_function): + x_torch = torch.tensor([[1]], dtype=torch.float32) + y_torch = 0.00030171126 + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = y_torch + z_tt_out = ttnn_function(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = torch.allclose(z_torch, tt_out, atol=1e-10, rtol=1e-5, equal_nan=False) + assert status diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar.cpp index 17a5ec998c1..649ebeea682 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar.cpp @@ -41,7 +41,7 @@ void kernel_main() { // we only need to fill a tile with the scalar value once cb_reserve_back(cb_id_src, onetile); #ifdef FILL_WITH_VALUE_FLOAT - float* float_ptr = reinterpret_cast(&packed_scalar); + const auto float_ptr = reinterpret_cast(&packed_scalar); FILL_WITH_VALUE_FLOAT(cb_id_src, *float_ptr); #endif #ifdef FILL_WITH_VALUE From e54804e504c9e2a758360ff8e386928bf0e8afb2 Mon Sep 17 00:00:00 2001 From: Jason Davies Date: Tue, 18 Feb 2025 12:01:25 +0000 Subject: [PATCH 165/316] Ensure get_profiler_artifacts_dir respects TT_METAL_HOME by default. Fixes #17939. --- tt_metal/api/tt-metalium/common.hpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tt_metal/api/tt-metalium/common.hpp b/tt_metal/api/tt-metalium/common.hpp index c6e39034226..d1828386ba6 100644 --- a/tt_metal/api/tt-metalium/common.hpp +++ b/tt_metal/api/tt-metalium/common.hpp @@ -15,15 +15,20 @@ constexpr std::string_view PROFILER_RUNTIME_ROOT_DIR = "generated/profiler/"; constexpr std::string_view PROFILER_LOGS_DIR_NAME = ".logs/"; inline std::string get_profiler_artifacts_dir() { - std::string artifactDir = string(PROFILER_RUNTIME_ROOT_DIR); - const auto PROFILER_ARTIFACTS_DIR = std::getenv("TT_METAL_PROFILER_DIR"); - if (PROFILER_ARTIFACTS_DIR != nullptr) { - artifactDir = string(PROFILER_ARTIFACTS_DIR) + "/"; + std::string artifacts_dir; + if (std::getenv("TT_METAL_PROFILER_DIR")) { + artifacts_dir = std::string(std::getenv("TT_METAL_PROFILER_DIR")) + "/"; + } else { + std::string prefix; + if (std::getenv("TT_METAL_HOME")) { + prefix = std::string(std::getenv("TT_METAL_HOME")) + "/"; + } + artifacts_dir = prefix + std::string(PROFILER_RUNTIME_ROOT_DIR); } - return artifactDir; + return artifacts_dir; } -inline std::string get_profiler_logs_dir() { return get_profiler_artifacts_dir() + string(PROFILER_LOGS_DIR_NAME); } +inline std::string get_profiler_logs_dir() { return get_profiler_artifacts_dir() + std::string(PROFILER_LOGS_DIR_NAME); } inline std::string PROFILER_ZONE_SRC_LOCATIONS_LOG = get_profiler_logs_dir() + "zone_src_locations.log"; } // namespace tt_metal From b26e037495846e03357d54b6d89848611e36096e Mon Sep 17 00:00:00 2001 From: Ata Tuzuner Date: Wed, 19 Feb 2025 09:52:20 -0500 Subject: [PATCH 166/316] Replacing L1 base address increment instructions with CFGSHIFTMASK (#17723) ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-llk-bh/issues/4) ### Problem 
description Blackhole has a new `CFGSHIFTMASK` instruction that can update addresses for the unpacker instructions inside the MOP/replay buffers. If an operation is unpacker-bound, using this instruction should increase performance. ### What's changed Replaced the L1 base address increment code, which used CFG read/write and TDMA GPR operations, with the new `CFGSHIFTMASK` instruction in the unpack AB matmul LLK API. This replacement saves 6 instructions in the MOP/replay buffer; no notable performance improvement was observed. This change only affects Blackhole (BH) and addresses an issue in the BH third-party repo. ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) [CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13399863311) - [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) [CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13399865409) (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- tt_metal/third_party/tt_llk_blackhole | 2 +- .../compute/bmm_large_block_zm_fused_bias_activation.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tt_metal/third_party/tt_llk_blackhole b/tt_metal/third_party/tt_llk_blackhole index 76b5357a75b..8c25441b351 160000 --- a/tt_metal/third_party/tt_llk_blackhole +++ b/tt_metal/third_party/tt_llk_blackhole @@ -1 +1 @@ -Subproject commit 76b5357a75bfed7dac22a7b0417bb5589c2e0c5b +Subproject commit 8c25441b351646046d8de3fd6b8d895b7c87135d diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp index 73ef8d67cfb..f3275fe122f 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp @@ -401,9 +401,6 @@ void MAIN { pack_untilize_uninit(mm_partials_cb_id); } if constexpr (batch > 1 || num_blocks_w_dim > 1 || num_blocks_h_dim > 1) { - // reconfigure init for matmul - mm_block_init_short( - in0_cb_id, in1_cb_id, in1_transpose_tile, out_subblock_w, out_subblock_h, in0_block_w); #ifdef FUSE_BIAS // reconfigure unpacker df for src A and src B reconfig_data_format(mm_partials_cb_id, in1_cb_id, bias_cb_id, in0_cb_id); @@ -411,6 +408,9 @@ void MAIN { // reconfigure unpacker df for src A reconfig_data_format_srca(mm_partials_cb_id, in1_cb_id); #endif + // reconfigure init for matmul + mm_block_init_short( + in0_cb_id, in1_cb_id, in1_transpose_tile, out_subblock_w, out_subblock_h, in0_block_w); } } } From e820e8d177f36bcf53187ba295fe9cd4cb66e75a Mon Sep 17 00:00:00 2001 From: William Ly Date: Wed, 19 Feb 2025 10:34:22 -0500 Subject: [PATCH 167/316] #17731: Upload gtest testcase data to superset (#17950) ### Ticket #17731 ### Problem description The produce_data Python script doesn't support gtest-generated XML files.
As a result, gtest data isn't uploaded to superset. ### What's changed Add gtest support: - update xml utils to handle both pytest and gtest xml files - add unit tests - add model constraint validation to ensure test-specific table (`sw_test.cicd_test`) constraints are not violated (job_id, full_test_name, test_start_ts) ### Checklist - [x] New/Existing tests provide coverage for changes https://github.com/tenstorrent/tt-metal/actions/runs/13399311809 --- infra/data_collection/cicd.py | 22 +- infra/data_collection/github/workflows.py | 85 +- infra/data_collection/junit_xml_utils.py | 44 +- infra/data_collection/pydantic_models.py | 16 +- .../unit_tests_device.xml | 112 + .../most_recent_tests.xml | 51 + .../unit_tests_api_grayskull.xml | 339 ++ .../unit_tests_debug_tools_wormhole_b0.xml | 6 + .../unit_tests_debug_tools_wormhole_b0_1.xml | 6 + .../unit_tests_debug_tools_wormhole_b0_2.xml | 6 + .../most_recent_tests.xml | 1 + .../13315815702/logs/37190213375.log | 3112 +++++++++++ .../logs/37190213375_annotations.json | 1 + .../13315815702/logs/37190219113.log | 2178 ++++++++ .../13315815702/logs/37190230023.log | 4710 +++++++++++++++++ .../13315815702/logs/37190251054.log | 690 +++ .../logs/37190251054_annotations.json | 1 + .../13315815702/logs/37190252200.log | 568 ++ .../workflow.json | 1 + .../workflow_jobs.json | 657 +++ infra/tests/data_collection/test_cicd.py | 50 + 21 files changed, 12599 insertions(+), 57 deletions(-) create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_304c9db0-c03c-4ec4-8503-cd90ed1e264c/unit_tests_device.xml create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_36168e86-3fe3-4807-94c7-1f22471b0c56/most_recent_tests.xml create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_a9d3638c-0b51-4a1d-b6bf-002b85a02892/unit_tests_api_grayskull.xml create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0.xml create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0_1.xml create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0_2.xml create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_e6d768c7-44f2-4bd8-a96a-4277c643d4a3/most_recent_tests.xml create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190213375.log create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190213375_annotations.json create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190219113.log create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190230023.log create mode 100644 
infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190251054.log create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190251054_annotations.json create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190252200.log create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow.json create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow_jobs.json diff --git a/infra/data_collection/cicd.py b/infra/data_collection/cicd.py index f499d82209a..48a8fc1b76a 100644 --- a/infra/data_collection/cicd.py +++ b/infra/data_collection/cicd.py @@ -65,19 +65,25 @@ def create_cicd_json_for_data_analysis( test_report_exists = github_job_id in github_job_id_to_test_reports if test_report_exists: - test_report_path = github_job_id_to_test_reports[github_job_id] - tests = get_tests_from_test_report_path(test_report_path) + tests = [] + test_reports = github_job_id_to_test_reports[github_job_id] + for test_report_path in test_reports: + logger.info(f"Job id:{github_job_id} Analyzing test report {test_report_path}") + tests += get_tests_from_test_report_path(test_report_path) else: tests = [] logger.info(f"Found {len(tests)} tests for job {github_job_id}") - job = pydantic_models.Job( - **raw_job, - tests=tests, - ) - - jobs.append(job) + try: + job = pydantic_models.Job( + **raw_job, + tests=tests, + ) + except ValueError as e: + logger.warning(f"Skipping insert for job {github_job_id}, model validation failed: {e}") + else: + jobs.append(job) pipeline = pydantic_models.Pipeline( **raw_pipeline, diff --git a/infra/data_collection/github/workflows.py b/infra/data_collection/github/workflows.py index 0fc9a823a5a..be5fbe661c6 100644 --- a/infra/data_collection/github/workflows.py +++ b/infra/data_collection/github/workflows.py @@ -4,7 +4,7 @@ import pathlib import json -from datetime import datetime +from datetime import datetime, timedelta from functools import partial from typing import List @@ -26,11 +26,10 @@ def get_workflow_run_uuids_to_test_reports_paths_(workflow_outputs_dir, workflow test_report_uuid = test_report_dir.name.replace("test_reports_", "") try: - xml_file_paths = (test_report_dir / "most_recent_tests.xml").resolve(strict=True) + # read all *.xml in test_report_dir (gtest can have one xml files per test executable) + xml_file_paths = [file.resolve(strict=True) for file in list(test_report_dir.glob("*.xml"))] except FileNotFoundError as e: - logger.warning( - f"no pytest xml file found matching most_recent_tests.xml (likely gtest xml) in {test_report_dir}" - ) + logger.warning(f"No pytest or gtest xml file found in {test_report_dir}, skipping directory.") else: workflow_run_test_reports_path[test_report_uuid] = xml_file_paths @@ -134,48 +133,60 @@ def get_github_job_id_to_annotations(workflow_outputs_dir, workflow_run_id: int) return github_job_ids_to_annotation_jsons -def get_pydantic_test_from_pytest_testcase_(testcase, default_timestamp=datetime.now()): - skipped = junit_xml_utils.get_pytest_testcase_is_skipped(testcase) - failed = junit_xml_utils.get_pytest_testcase_is_failed(testcase) - error = junit_xml_utils.get_pytest_testcase_is_error(testcase) +def get_pydantic_test_from_testcase_(testcase, default_timestamp=datetime.now(), is_pytest=True, testsuite_name=None): + skipped = 
junit_xml_utils.get_testcase_is_skipped(testcase) + failed = junit_xml_utils.get_testcase_is_failed(testcase) + error = junit_xml_utils.get_testcase_is_error(testcase) success = not (failed or error) error_message = None # Error is a scarier thing than failure because it means there's an infra error, expose that first if failed: - error_message = junit_xml_utils.get_pytest_failure_message(testcase) + error_message = junit_xml_utils.get_test_failure_message(testcase) if error: - error_message = junit_xml_utils.get_pytest_error_message(testcase) + error_message = junit_xml_utils.get_test_error_message(testcase) # Error at the beginning of a test can prevent pytest from recording timestamps at all if not (skipped or error): - properties = junit_xml_utils.get_pytest_testcase_properties(testcase) - # Check if properties is none to see if pytest recorded the timestamps - if properties is not None: - test_start_ts = datetime.strptime(properties["start_timestamp"], "%Y-%m-%dT%H:%M:%S") - test_end_ts = datetime.strptime(properties["end_timestamp"], "%Y-%m-%dT%H:%M:%S") + if is_pytest: + properties = junit_xml_utils.get_pytest_testcase_properties(testcase) + # Check if properties is none to see if pytest recorded the timestamps + if properties is not None: + test_start_ts = datetime.strptime(properties["start_timestamp"], "%Y-%m-%dT%H:%M:%S") + test_end_ts = datetime.strptime(properties["end_timestamp"], "%Y-%m-%dT%H:%M:%S") + else: + test_start_ts = default_timestamp + test_end_ts = default_timestamp else: test_start_ts = default_timestamp - test_end_ts = default_timestamp + # gtest stores elapsed time for the test in the time attribute + gtest_elapsed_time = float(testcase.attrib["time"]) + test_end_ts = default_timestamp + timedelta(seconds=gtest_elapsed_time) else: test_start_ts = default_timestamp test_end_ts = default_timestamp test_case_name = testcase.attrib["name"].split("[")[0] - filepath_no_ext = testcase.attrib["classname"].replace(".", "/") - filepath = f"{filepath_no_ext}.py" + if is_pytest: + filepath_no_ext = testcase.attrib["classname"].replace(".", "/") + filepath = f"{filepath_no_ext}.py" + else: + filepath = testcase.attrib["file"] + if filepath.startswith("/work/"): + filepath = filepath.lstrip("/work/") - def get_category_from_pytest_testcase_(testcase_): + def get_category_from_testcase_(testcase_, is_pytest=True): categories = ["models", "ttnn", "tt_eager", "tt_metal"] for category in categories: - if category in testcase_.attrib["classname"]: + identifier_attrib = "classname" if is_pytest else "file" + if category in testcase_.attrib[identifier_attrib]: return category return "other" - category = get_category_from_pytest_testcase_(testcase) + category = get_category_from_testcase_(testcase, is_pytest=is_pytest) # leaving empty for now group = None @@ -183,7 +194,10 @@ def get_category_from_pytest_testcase_(testcase_): # leaving empty for now owner = None - full_test_name = f"{filepath}::{testcase.attrib['name']}" + if testsuite_name: + full_test_name = f"{filepath}::{testsuite_name}::{testcase.attrib['name']}" + else: + full_test_name = f"{filepath}::{testcase.attrib['name']}" # to be populated with [] if available config = None @@ -229,17 +243,24 @@ def get_tests_from_test_report_path(test_report_path): report_root = report_root_tree.getroot() is_pytest = junit_xml_utils.is_pytest_junit_xml(report_root) + is_gtest = junit_xml_utils.is_gtest_xml(report_root) - if is_pytest: - testsuite = report_root[0] - default_timestamp = datetime.strptime(testsuite.attrib["timestamp"], 
"%Y-%m-%dT%H:%M:%S.%f") - - get_pydantic_test = partial(get_pydantic_test_from_pytest_testcase_, default_timestamp=default_timestamp) - + if is_pytest or is_gtest: + logger.info(f"Found {len(report_root)} testsuites") tests = [] - for testcase in testsuite: - if is_valid_testcase_(testcase): - tests.append(get_pydantic_test(testcase)) + for i in range(len(report_root)): + testsuite = report_root[i] + testsuite_name = testsuite.attrib.get("name") if is_gtest else None + default_timestamp = datetime.strptime(testsuite.attrib["timestamp"], "%Y-%m-%dT%H:%M:%S.%f") + get_pydantic_test = partial( + get_pydantic_test_from_testcase_, + default_timestamp=default_timestamp, + is_pytest=is_pytest, + testsuite_name=testsuite_name, + ) + for testcase in testsuite: + if is_valid_testcase_(testcase): + tests.append(get_pydantic_test(testcase)) return tests else: diff --git a/infra/data_collection/junit_xml_utils.py b/infra/data_collection/junit_xml_utils.py index 33a08039bad..310c5d74a6b 100644 --- a/infra/data_collection/junit_xml_utils.py +++ b/infra/data_collection/junit_xml_utils.py @@ -18,13 +18,15 @@ def get_xml_file_root_element_tree(filepath): return root_element_tree -def sanity_check_pytest_junit_xml_(root_element): +def sanity_check_test_xml_(root_element, is_pytest=True): testsuite_count = len(root_element) - assert testsuite_count == 1, f"{len(root_element)}" - - logger.debug("Asserted pytest junit xml") - + if is_pytest: + assert testsuite_count == 1, f"{len(root_element)}" + logger.debug("Asserted pytest junit xml") + else: + assert testsuite_count >= 1, f"{len(root_element)}" + logger.debug("Asserted gtest xml") return root_element @@ -32,19 +34,29 @@ def is_pytest_junit_xml(root_element): is_pytest = root_element[0].get("name") == "pytest" if is_pytest: - sanity_check_pytest_junit_xml_(root_element) + sanity_check_test_xml_(root_element) return is_pytest +def is_gtest_xml(root_element): + is_gtest = root_element[0].get("name") != "pytest" + + if is_gtest: + sanity_check_test_xml_(root_element, is_pytest=False) + + return is_gtest + + def get_at_most_one_single_child_element_(element, tag_name): is_expected = lambda child_: child_.tag == tag_name potential_expected_blocks = list(filter(is_expected, element)) - assert ( - len(potential_expected_blocks) <= 1 - ), f"{len(potential_expected_blocks)} is not exactly 1 for tag name {tag_name}" + # downgrade assert to warning + if len(potential_expected_blocks) > 1: + element_name = element.attrib.get("name", "unknown_name") + logger.warning(f"{element_name} : {len(potential_expected_blocks)} is greater than 1 for tag name {tag_name}") return potential_expected_blocks[0] if len(potential_expected_blocks) else None @@ -73,31 +85,31 @@ def get_optional_child_element_exists_(parent_element, tag_name): return get_at_most_one_single_child_element_(parent_element, tag_name) != None -def get_pytest_testcase_is_skipped(testcase_element): +def get_testcase_is_skipped(testcase_element): return get_optional_child_element_exists_(testcase_element, "skipped") -def get_pytest_testcase_is_failed(testcase_element): +def get_testcase_is_failed(testcase_element): return get_optional_child_element_exists_(testcase_element, "failure") -def get_pytest_testcase_is_error(testcase_element): +def get_testcase_is_error(testcase_element): return get_optional_child_element_exists_(testcase_element, "error") # opportunity for less copy-pasta -def get_pytest_failure_message(testcase_element): - assert get_pytest_testcase_is_failed(testcase_element) +def 
get_test_failure_message(testcase_element): + assert get_testcase_is_failed(testcase_element) failure_element = get_at_most_one_single_child_element_(testcase_element, "failure") return failure_element.attrib["message"] -def get_pytest_error_message(testcase_element): - assert get_pytest_testcase_is_error(testcase_element) +def get_test_error_message(testcase_element): + assert get_testcase_is_error(testcase_element) error_element = get_at_most_one_single_child_element_(testcase_element, "error") diff --git a/infra/data_collection/pydantic_models.py b/infra/data_collection/pydantic_models.py index 0c5ca96870c..4972e446d62 100644 --- a/infra/data_collection/pydantic_models.py +++ b/infra/data_collection/pydantic_models.py @@ -9,7 +9,7 @@ from datetime import datetime from typing import List, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator class Test(BaseModel): @@ -74,6 +74,20 @@ class Job(BaseModel): failure_description: Optional[str] = Field(None, description="Failure description.") tests: List[Test] = [] + # Model validator to check the unique combination constraint + @model_validator(mode="before") + def check_unique_tests(cls, values): + tests = values.get("tests", []) + seen_combinations = set() + + for test in tests: + # for each job, the test constraint is full_test_name, test_start_ts + test_combination = (test.full_test_name, test.test_start_ts) + if test_combination in seen_combinations: + raise ValueError(f"Duplicate test combination found: {test_combination}") + seen_combinations.add(test_combination) + return values + class Pipeline(BaseModel): """ diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_304c9db0-c03c-4ec4-8503-cd90ed1e264c/unit_tests_device.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_304c9db0-c03c-4ec4-8503-cd90ed1e264c/unit_tests_device.xml new file mode 100644 index 00000000000..e9e12828b54 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_304c9db0-c03c-4ec4-8503-cd90ed1e264c/unit_tests_device.xml @@ -0,0 +1,112 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_36168e86-3fe3-4807-94c7-1f22471b0c56/most_recent_tests.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_36168e86-3fe3-4807-94c7-1f22471b0c56/most_recent_tests.xml new file mode 100644 index 00000000000..25dc24ed6e2 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_36168e86-3fe3-4807-94c7-1f22471b0c56/most_recent_tests.xml @@ -0,0 +1,51 @@ +device = <ttnn._ttnn.device.Device object at 0x7ff1e7e1d2f0> +shape = (1, 1, 32, 131072), on_device = True, from_layout = <Layout.TILE: 1> +to_layout = <Layout.ROW_MAJOR: 0> + + @pytest.mark.parametrize( + "shape", + [(1, 1, 32, 128 * 1024), (1, 1, 128, 5120), (1, 1, 512, 5120), (1, 1, 128, 128 * 1024)], + ) + @pytest.mark.parametrize("on_device", [True]) + @pytest.mark.parametrize("from_layout", [ttnn.TILE_LAYOUT]) + @pytest.mark.parametrize("to_layout", 
[ttnn.ROW_MAJOR_LAYOUT]) + def test_to_layout_wide_tensor(device, shape, on_device, from_layout, to_layout): + torch.manual_seed(0) + torch_input_tensor = torch.rand(shape, dtype=torch.bfloat16) + input_tensor = ttnn.from_torch(torch_input_tensor) + assert input_tensor.layout == ttnn.ROW_MAJOR_LAYOUT + input_tensor = ttnn.to_layout(input_tensor, from_layout) + assert input_tensor.layout == from_layout + + if on_device: + input_tensor = ttnn.to_device(input_tensor, device) + assert ttnn.is_tensor_storage_on_device(input_tensor) + + output_tensor = ttnn.to_layout(input_tensor, to_layout) + assert output_tensor.layout == to_layout + + if on_device: + assert ttnn.is_tensor_storage_on_device(output_tensor) + output_tensor = ttnn.from_device(output_tensor) + assert not ttnn.is_tensor_storage_on_device(output_tensor) + + output_tensor = ttnn.to_torch(output_tensor) + +> assert_with_pcc(torch_input_tensor, output_tensor) + +tests/ttnn/unit_tests/test_to_layout.py:94: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +expected_pytorch_result = tensor([[[[0.6719, 0.1836, 0.4570, 0.7500, 0.2617, 0.9805, 0.7617, 0.4023, 0.0352, 0.8242, 0.0820, 0.9453, 0.1406, 0.3...7500, 0.8477, 0.6641, 0.8164, 0.6797, 0.3945, 0.3555, 0.7070, 0.1367, 0.8203, 0.7070, 0.3750]]]], dtype=torch.bfloat16) +actual_pytorch_result = TorchTensor([[[[ 0.6719, 0.1836, 0.4570, 0.7500, 0.2617, 0.9805, 0.7617, 0.4023...0, -0.5859, 0.0000, -0.6797, 0.0000, -0.0723, 0.0000, 0.4766]]]], dtype=torch.bfloat16) +pcc = 0.9999 + + def assert_with_pcc(expected_pytorch_result, actual_pytorch_result, pcc=0.9999): + assert list(expected_pytorch_result.shape) == list( + actual_pytorch_result.shape + ), f"list(expected_pytorch_result.shape)={list(expected_pytorch_result.shape)} vs list(actual_pytorch_result.shape)={list(actual_pytorch_result.shape)}" + pcc_passed, pcc_message = comp_pcc(expected_pytorch_result, actual_pytorch_result, pcc) +> assert pcc_passed, construct_pcc_assert_message(pcc_message, expected_pytorch_result, actual_pytorch_result) +E AssertionError: 0.24432213315356766 + +tests/ttnn/utils_for_testing.py:57: AssertionError diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_a9d3638c-0b51-4a1d-b6bf-002b85a02892/unit_tests_api_grayskull.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_a9d3638c-0b51-4a1d-b6bf-002b85a02892/unit_tests_api_grayskull.xml new file mode 100644 index 00000000000..2606ebe2c3a --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_a9d3638c-0b51-4a1d-b6bf-002b85a02892/unit_tests_api_grayskull.xml @@ -0,0 +1,339 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + devices_.at(id), test_config, false, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, false, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, false, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, false, true) + Actual: false +Expected: true]]> + + + + + + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, 
true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + + + + + + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + devices_.at(id), test_config, true, true) + Actual: false +Expected: true]]> + + + + devices_.at(id), test_config) + Actual: false +Expected: true]]> + devices_.at(id), test_config) + Actual: false +Expected: true]]> + devices_.at(id), test_config) + Actual: false +Expected: true]]> + devices_.at(id), test_config) + Actual: false +Expected: true]]> + devices_.at(id), test_config) + Actual: false +Expected: true]]> + + + devices_.at(id), test_config) + Actual: false +Expected: true]]> + devices_.at(id), test_config) + Actual: false +Expected: true]]> + devices_.at(id), test_config) + Actual: false +Expected: true]]> + devices_.at(id), test_config) + Actual: false +Expected: true]]> + devices_.at(id), test_config) + Actual: false +Expected: true]]> + devices_.at(id), test_config) + Actual: false +Expected: true]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0.xml new file mode 100644 index 00000000000..16de92c293b --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0_1.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0_1.xml new file mode 100644 index 00000000000..3ce6b6443e2 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0_1.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0_2.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0_2.xml new file mode 100644 index 
00000000000..efd876edcd3 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6/unit_tests_debug_tools_wormhole_b0_2.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_e6d768c7-44f2-4bd8-a96a-4277c643d4a3/most_recent_tests.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_e6d768c7-44f2-4bd8-a96a-4277c643d4a3/most_recent_tests.xml new file mode 100644 index 00000000000..a25eb1e66e2 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/artifacts/test_reports_e6d768c7-44f2-4bd8-a96a-4277c643d4a3/most_recent_tests.xml @@ -0,0 +1 @@ +/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:16: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:16: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for 
/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull
/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/tests/ttnn/unit_tests/operations/test_batch_norm.py:168: Unsupported dtype for Grayskull
diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190213375.log b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190213375.log
new file mode 100644
index 00000000000..fb010e2c9b4
--- /dev/null
+++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190213375.log
@@ -0,0 +1,3112 @@
+2025-02-13T20:00:51.2580903Z Current runner version: '2.322.0'
+2025-02-13T20:00:51.2590387Z Runner name: 'tt-metal-ci-vm-160'
+2025-02-13T20:00:51.2591645Z Runner group name: 'Default'
+2025-02-13T20:00:51.2593073Z Machine name: 'tt-metal-ci-vm-160'
+2025-02-13T20:00:51.2598655Z ##[group]GITHUB_TOKEN Permissions
+2025-02-13T20:00:51.2602478Z Actions: read
+2025-02-13T20:00:51.2603523Z Contents: write
+2025-02-13T20:00:51.2604793Z Metadata: read
+2025-02-13T20:00:51.2605694Z Packages: write
+2025-02-13T20:00:51.2606569Z Pages: write
+2025-02-13T20:00:51.2607427Z PullRequests: write
+2025-02-13T20:00:51.2608323Z ##[endgroup]
+2025-02-13T20:00:51.2612534Z Secret source: Actions
+2025-02-13T20:00:51.2613679Z Prepare workflow directory
+2025-02-13T20:00:51.5191575Z Prepare all required actions
+2025-02-13T20:00:51.5258363Z Getting action download info
+2025-02-13T20:00:51.6994940Z Download action repository 'tenstorrent/tt-metal@main' (SHA:ac426de3d4a9c274964843fdae6aa83ea3960a30)
+2025-02-13T20:00:58.0946917Z Getting action download info
+2025-02-13T20:00:58.2597279Z Download action repository 'actions/checkout@v4' (SHA:11bd71901bbe5b1630ceea73d27597364c9af683)
+2025-02-13T20:00:58.8564327Z Uses: tenstorrent/tt-metal/.github/workflows/build-and-unit-tests.yaml@refs/heads/sagarwal/multi_page_buffer (ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70)
+2025-02-13T20:00:58.8567015Z ##[group] Inputs
+2025-02-13T20:00:58.8567515Z build-type: Release +2025-02-13T20:00:58.8568357Z with-retries: false +2025-02-13T20:00:58.8568811Z arch: grayskull +2025-02-13T20:00:58.8569256Z runner-label: E150 +2025-02-13T20:00:58.8570241Z timeout: 35 +2025-02-13T20:00:58.8570688Z os: ubuntu-20.04 +2025-02-13T20:00:58.8571123Z ##[endgroup] +2025-02-13T20:00:58.8571732Z Complete job name: sd-unit-tests (grayskull, E150) / grayskull E150 api +2025-02-13T20:00:58.9255104Z A job started hook has been configured by the self-hosted runner administrator +2025-02-13T20:00:58.9395034Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/reset.sh' +2025-02-13T20:00:58.9413384Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:00:58.9414324Z ##[endgroup] +2025-02-13T20:00:58.9573831Z ++ date +2025-02-13T20:00:58.9574693Z + echo Current date / time is Thu Feb 13 20:00:58 UTC 2025 +2025-02-13T20:00:58.9575337Z + set_e_was_enabled=false +2025-02-13T20:00:58.9575892Z + [[ ehxB == *e* ]] +2025-02-13T20:00:58.9576367Z + set_e_was_enabled=true +2025-02-13T20:00:58.9576833Z + set +e +2025-02-13T20:00:58.9577250Z + docker image prune +2025-02-13T20:00:58.9579185Z Current date / time is Thu Feb 13 20:00:58 UTC 2025 +2025-02-13T20:00:58.9698133Z WARNING! This will remove all dangling images. +2025-02-13T20:00:58.9733489Z ++ df +2025-02-13T20:00:58.9736132Z ++ awk '{print $5}' +2025-02-13T20:00:58.9736624Z ++ sed s/%// +2025-02-13T20:00:58.9737892Z +++ findmnt -n -o SOURCE / +2025-02-13T20:00:58.9767183Z ++ grep -w '^/dev/vda3' +2025-02-13T20:00:58.9788334Z + disk_usage_before=59 +2025-02-13T20:00:58.9802805Z Are you sure you want to continue? [y/N] ::notice title=disk-usage-before-startup::Disk usage is 59 % +2025-02-13T20:00:58.9804343Z + echo '::notice title=disk-usage-before-startup::Disk usage is 59 %' +2025-02-13T20:00:58.9805308Z + '[' 59 -ge 90 ']' +2025-02-13T20:00:58.9805748Z ++ df +2025-02-13T20:00:58.9806361Z ++ awk '{print $5}' +2025-02-13T20:00:58.9806784Z ++ sed s/%// +2025-02-13T20:00:58.9807200Z +++ findmnt -n -o SOURCE / +2025-02-13T20:00:58.9827157Z ++ grep -w '^/dev/vda3' +2025-02-13T20:00:58.9845999Z + disk_usage_after=59 +2025-02-13T20:00:58.9846697Z + echo '::notice title=disk-usage-after-startup::Disk usage is 59 %' +2025-02-13T20:00:58.9847340Z + '[' 59 -ge 90 ']' +2025-02-13T20:00:58.9874557Z ##[notice]Disk usage is 59 % +2025-02-13T20:00:58.9888493Z ++ lsmod +2025-02-13T20:00:58.9888997Z + lsmod_output='Module Size Used by +2025-02-13T20:00:58.9889620Z veth 28672 0 +2025-02-13T20:00:58.9890152Z wekafsio 70086656 2 +2025-02-13T20:00:58.9890690Z wekafsgw 40960 8 wekafsio +2025-02-13T20:00:58.9891247Z uio_pci_generic 16384 0 +2025-02-13T20:00:58.9892218Z igb_uio 20480 0 +2025-02-13T20:00:58.9892771Z uio 20480 2 igb_uio,uio_pci_generic +2025-02-13T20:00:58.9893348Z xt_conntrack 16384 1 +2025-02-13T20:00:58.9893845Z xt_MASQUERADE 20480 1 +2025-02-13T20:00:58.9894412Z nf_conntrack_netlink 45056 0 +2025-02-13T20:00:58.9894999Z nfnetlink 16384 2 nf_conntrack_netlink +2025-02-13T20:00:58.9895607Z xfrm_user 36864 1 +2025-02-13T20:00:58.9896141Z xfrm_algo 16384 1 xfrm_user +2025-02-13T20:00:58.9897895Z iptable_nat 16384 1 +2025-02-13T20:00:58.9898497Z nf_nat 45056 2 iptable_nat,xt_MASQUERADE +2025-02-13T20:00:58.9899323Z nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE +2025-02-13T20:00:58.9900075Z nf_defrag_ipv6 24576 1 nf_conntrack +2025-02-13T20:00:58.9900842Z nf_defrag_ipv4 16384 1 nf_conntrack +2025-02-13T20:00:58.9901420Z xt_addrtype 16384 2 
+2025-02-13T20:00:58.9901947Z iptable_filter 16384 1 +2025-02-13T20:00:58.9903176Z bpfilter 32768 0 +2025-02-13T20:00:58.9903722Z br_netfilter 28672 0 +2025-02-13T20:00:58.9904262Z bridge 176128 1 br_netfilter +2025-02-13T20:00:58.9905023Z stp 16384 1 bridge +2025-02-13T20:00:58.9905720Z llc 16384 2 bridge,stp +2025-02-13T20:00:58.9906401Z aufs 262144 0 +2025-02-13T20:00:58.9906938Z xfs 1286144 2 +2025-02-13T20:00:58.9907476Z overlay 118784 0 +2025-02-13T20:00:58.9907987Z rdma_ucm 28672 0 +2025-02-13T20:00:58.9908535Z rdma_cm 110592 1 rdma_ucm +2025-02-13T20:00:58.9909080Z iw_cm 49152 1 rdma_cm +2025-02-13T20:00:58.9909876Z ib_ipoib 131072 0 +2025-02-13T20:00:58.9910439Z ib_cm 114688 2 rdma_cm,ib_ipoib +2025-02-13T20:00:58.9911049Z ib_umad 28672 8 +2025-02-13T20:00:58.9911578Z nls_iso8859_1 16384 1 +2025-02-13T20:00:58.9912125Z dm_multipath 32768 0 +2025-02-13T20:00:58.9912660Z scsi_dh_rdac 16384 0 +2025-02-13T20:00:58.9913184Z scsi_dh_emc 16384 0 +2025-02-13T20:00:58.9913705Z scsi_dh_alua 20480 0 +2025-02-13T20:00:58.9914349Z mlx5_ib 397312 0 +2025-02-13T20:00:58.9914978Z kvm_amd 98304 0 +2025-02-13T20:00:58.9915572Z ib_uverbs 139264 24 rdma_ucm,mlx5_ib +2025-02-13T20:00:58.9916166Z ccp 90112 1 kvm_amd +2025-02-13T20:00:58.9916716Z input_leds 16384 0 +2025-02-13T20:00:58.9917255Z kvm 667648 1 kvm_amd +2025-02-13T20:00:58.9917818Z joydev 24576 0 +2025-02-13T20:00:58.9918335Z serio_raw 20480 0 +2025-02-13T20:00:58.9919087Z ib_core 348160 10 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm +2025-02-13T20:00:58.9920051Z tenstorrent 49152 0 +2025-02-13T20:00:58.9920581Z sch_fq_codel 20480 45 +2025-02-13T20:00:58.9921157Z binfmt_misc 24576 1 +2025-02-13T20:00:58.9921669Z msr 16384 0 +2025-02-13T20:00:58.9922178Z efi_pstore 16384 0 +2025-02-13T20:00:58.9922686Z virtio_rng 16384 0 +2025-02-13T20:00:58.9923256Z ip_tables 32768 2 iptable_filter,iptable_nat +2025-02-13T20:00:58.9925153Z x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE +2025-02-13T20:00:58.9925931Z autofs4 45056 2 +2025-02-13T20:00:58.9926485Z btrfs 1269760 0 +2025-02-13T20:00:58.9927009Z zstd_compress 167936 1 btrfs +2025-02-13T20:00:58.9927556Z raid10 61440 0 +2025-02-13T20:00:58.9928198Z raid456 155648 0 +2025-02-13T20:00:58.9928759Z async_raid6_recov 24576 1 raid456 +2025-02-13T20:00:58.9996490Z async_memcpy 20480 2 raid456,async_raid6_recov +2025-02-13T20:00:58.9997285Z async_pq 24576 2 raid456,async_raid6_recov +2025-02-13T20:00:58.9998211Z async_xor 20480 3 async_pq,raid456,async_raid6_recov +2025-02-13T20:00:58.9999021Z async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov +2025-02-13T20:00:58.9999948Z xor 24576 2 async_xor,btrfs +2025-02-13T20:00:59.0000672Z raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov +2025-02-13T20:00:59.0001484Z libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 +2025-02-13T20:00:59.0002155Z raid1 45056 0 +2025-02-13T20:00:59.0002686Z raid0 24576 0 +2025-02-13T20:00:59.0003210Z multipath 20480 0 +2025-02-13T20:00:59.0003748Z linear 20480 0 +2025-02-13T20:00:59.0004406Z hid_generic 16384 0 +2025-02-13T20:00:59.0005123Z crct10dif_pclmul 16384 1 +2025-02-13T20:00:59.0005641Z usbhid 57344 0 +2025-02-13T20:00:59.0006151Z crc32_pclmul 16384 0 +2025-02-13T20:00:59.0006672Z ghash_clmulni_intel 16384 0 +2025-02-13T20:00:59.0007249Z hid 131072 2 usbhid,hid_generic +2025-02-13T20:00:59.0007860Z mlx5_core 1626112 1 mlx5_ib +2025-02-13T20:00:59.0008411Z cirrus 16384 0 +2025-02-13T20:00:59.0008942Z drm_kms_helper 184320 3 cirrus 
+2025-02-13T20:00:59.0009500Z pci_hyperv_intf 16384 1 mlx5_core +2025-02-13T20:00:59.0010117Z syscopyarea 16384 1 drm_kms_helper +2025-02-13T20:00:59.0010737Z mlxdevm 172032 1 mlx5_core +2025-02-13T20:00:59.0011370Z sysfillrect 16384 1 drm_kms_helper +2025-02-13T20:00:59.0012008Z sysimgblt 16384 1 drm_kms_helper +2025-02-13T20:00:59.0012648Z auxiliary 16384 2 mlx5_ib,mlx5_core +2025-02-13T20:00:59.0013437Z fb_sys_fops 16384 1 drm_kms_helper +2025-02-13T20:00:59.0014005Z aesni_intel 372736 0 +2025-02-13T20:00:59.0014562Z crypto_simd 16384 1 aesni_intel +2025-02-13T20:00:59.0015566Z mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core +2025-02-13T20:00:59.0016473Z ahci 40960 0 +2025-02-13T20:00:59.0016989Z tls 73728 1 mlx5_core +2025-02-13T20:00:59.0017635Z cryptd 24576 2 crypto_simd,ghash_clmulni_intel +2025-02-13T20:00:59.0027108Z libahci 36864 1 ahci +2025-02-13T20:00:59.0027742Z drm 495616 3 drm_kms_helper,cirrus +2025-02-13T20:00:59.0028371Z glue_helper 16384 1 aesni_intel +2025-02-13T20:00:59.0028980Z psmouse 155648 0 +2025-02-13T20:00:59.0029534Z mlxfw 32768 1 mlx5_core +2025-02-13T20:00:59.0030080Z psample 20480 1 mlx5_core +2025-02-13T20:00:59.0030661Z virtio_blk 20480 3' +2025-02-13T20:00:59.0031181Z + grep -q tenstorrent +2025-02-13T20:00:59.0045107Z + echo Module Size Used by veth 28672 0 wekafsio 70086656 2 wekafsgw 40960 8 wekafsio uio_pci_generic 16384 0 igb_uio 20480 0 uio 20480 2 igb_uio,uio_pci_generic xt_conntrack 16384 1 xt_MASQUERADE 20480 1 nf_conntrack_netlink 45056 0 nfnetlink 16384 2 nf_conntrack_netlink xfrm_user 36864 1 xfrm_algo 16384 1 xfrm_user iptable_nat 16384 1 nf_nat 45056 2 iptable_nat,xt_MASQUERADE nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE nf_defrag_ipv6 24576 1 nf_conntrack nf_defrag_ipv4 16384 1 nf_conntrack xt_addrtype 16384 2 iptable_filter 16384 1 bpfilter 32768 0 br_netfilter 28672 0 bridge 176128 1 br_netfilter stp 16384 1 bridge llc 16384 2 bridge,stp aufs 262144 0 xfs 1286144 2 overlay 118784 0 rdma_ucm 28672 0 rdma_cm 110592 1 rdma_ucm iw_cm 49152 1 rdma_cm ib_ipoib 131072 0 ib_cm 114688 2 rdma_cm,ib_ipoib ib_umad 28672 8 nls_iso8859_1 16384 1 dm_multipath 32768 0 scsi_dh_rdac 16384 0 scsi_dh_emc 16384 0 scsi_dh_alua 20480 0 mlx5_ib 397312 0 kvm_amd 98304 0 ib_uverbs 139264 24 rdma_ucm,mlx5_ib ccp 90112 1 kvm_amd input_leds 16384 0 kvm 667648 1 kvm_amd joydev 24576 0 serio_raw 20480 0 ib_core 348160 10 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm tenstorrent 49152 0 sch_fq_codel 20480 45 binfmt_misc 24576 1 msr 16384 0 efi_pstore 16384 0 virtio_rng 16384 0 ip_tables 32768 2 iptable_filter,iptable_nat x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE autofs4 45056 2 btrfs 1269760 0 zstd_compress 167936 1 btrfs raid10 61440 0 raid456 155648 0 async_raid6_recov 24576 1 raid456 async_memcpy 20480 2 raid456,async_raid6_recov async_pq 24576 2 raid456,async_raid6_recov async_xor 20480 3 async_pq,raid456,async_raid6_recov async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov xor 24576 2 async_xor,btrfs raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 raid1 45056 0 raid0 24576 0 multipath 20480 0 linear 20480 0 hid_generic 16384 0 crct10dif_pclmul 16384 1 usbhid 57344 0 crc32_pclmul 16384 0 ghash_clmulni_intel 16384 0 hid 131072 2 usbhid,hid_generic mlx5_core 1626112 1 mlx5_ib cirrus 16384 0 drm_kms_helper 184320 3 
cirrus pci_hyperv_intf 16384 1 mlx5_core syscopyarea 16384 1 drm_kms_helper mlxdevm 172032 1 mlx5_core sysfillrect 16384 1 drm_kms_helper sysimgblt 16384 1 drm_kms_helper auxiliary 16384 2 mlx5_ib,mlx5_core fb_sys_fops 16384 1 drm_kms_helper aesni_intel 372736 0 crypto_simd 16384 1 aesni_intel mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core ahci 40960 0 tls 73728 1 mlx5_core cryptd 24576 2 crypto_simd,ghash_clmulni_intel libahci 36864 1 ahci drm 495616 3 drm_kms_helper,cirrus glue_helper 16384 1 aesni_intel psmouse 155648 0 mlxfw 32768 1 mlx5_core psample 20480 1 mlx5_core virtio_blk 20480 3 +2025-02-13T20:00:59.0063891Z + [[ 0 -ne 0 ]] +2025-02-13T20:00:59.0064404Z ++ lsof -w /dev/tenstorrent/0 +2025-02-13T20:00:59.1334523Z + lsof_output= +2025-02-13T20:00:59.1335359Z + '[' -n '' ']' +2025-02-13T20:00:59.1335877Z + i=0 +2025-02-13T20:00:59.1336362Z + iter_limit=10 +2025-02-13T20:00:59.1337226Z + echo '::notice title=printing-smi-info-startup::Touching and printing out SMI info' +2025-02-13T20:00:59.1338199Z + sleep 20 +2025-02-13T20:00:59.1341181Z ##[notice]Touching and printing out SMI info +2025-02-13T20:01:19.1350446Z + sudo touch /opt/tt_metal_infra/smi.log +2025-02-13T20:01:19.1610009Z + sudo chown ubuntu /opt/tt_metal_infra/smi.log +2025-02-13T20:01:19.1885615Z + tt-smi-metal -s -f /opt/tt_metal_infra/smi.log +2025-02-13T20:01:19.6070285Z +2025-02-13T20:01:19.6072076Z  Detected Chips: 1 +2025-02-13T20:01:19.6107684Z  +2025-02-13T20:01:19.6109492Z  Detected Chips: 1 +2025-02-13T20:01:19.6109783Z +2025-02-13T20:01:19.6110017Z  Detecting ARC: | +2025-02-13T20:01:19.6110344Z +2025-02-13T20:01:19.6110576Z  Detecting DRAM: | +2025-02-13T20:01:19.6110886Z +2025-02-13T20:01:19.6111152Z [] ETH: | +2025-02-13T20:01:19.6164540Z Gathering Information ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 +2025-02-13T20:01:19.6188050Z  Saved tt-smi log to: /opt/tt_metal_infra/smi.log  +2025-02-13T20:01:19.6956933Z + cat /opt/tt_metal_infra/smi.log +2025-02-13T20:01:19.6963739Z { +2025-02-13T20:01:19.6965271Z + echo '::notice title=attempting-reset-startup::Attempting to reset card(s). 
Sleeping first' +2025-02-13T20:01:19.6966545Z + sleep 30 +2025-02-13T20:01:19.6966957Z "time": "2025-02-13T20:01:19.610808", +2025-02-13T20:01:19.6968879Z "host_info": { +2025-02-13T20:01:19.6969306Z "OS": "Linux", +2025-02-13T20:01:19.6969709Z "Distro": "Ubuntu 20.04.6 LTS", +2025-02-13T20:01:19.6970184Z "Kernel": "5.4.0-205-generic", +2025-02-13T20:01:19.6970671Z "Hostname": "tt-metal-ci-vm-160", +2025-02-13T20:01:19.6971130Z "Platform": "x86_64", +2025-02-13T20:01:19.6971587Z "Python": "3.8.10", +2025-02-13T20:01:19.6972039Z "Memory": "47.14 GB", +2025-02-13T20:01:19.6972466Z "Driver": "TTKMD 1.29" +2025-02-13T20:01:19.6973024Z }, +2025-02-13T20:01:19.6973423Z "device_info": [ +2025-02-13T20:01:19.6973810Z { +2025-02-13T20:01:19.6974166Z "smbus_telem": { +2025-02-13T20:01:19.6975019Z "BOARD_ID": "0x10000361152e045", +2025-02-13T20:01:19.6975716Z "ENUM_VERSION": "0xba5e0001", +2025-02-13T20:01:19.6976216Z "DEVICE_ID": "0xfaca1e52", +2025-02-13T20:01:19.6976770Z "ASIC_RO": null, +2025-02-13T20:01:19.6977226Z "ASIC_IDD": null, +2025-02-13T20:01:19.6977691Z "BOARD_ID_HIGH": "0x1000036", +2025-02-13T20:01:19.6978192Z "BOARD_ID_LOW": "0x1152e045", +2025-02-13T20:01:19.6978697Z "ARC0_FW_VERSION": "0x1070000", +2025-02-13T20:01:19.6979195Z "ARC1_FW_VERSION": "0x1070000", +2025-02-13T20:01:19.6979714Z "ARC2_FW_VERSION": null, +2025-02-13T20:01:19.6980241Z "ARC3_FW_VERSION": "0x1070000", +2025-02-13T20:01:19.6980771Z "SPIBOOTROM_FW_VERSION": null, +2025-02-13T20:01:19.6981262Z "ETH_FW_VERSION": null, +2025-02-13T20:01:19.6981759Z "M3_BL_FW_VERSION": null, +2025-02-13T20:01:19.6982265Z "M3_APP_FW_VERSION": null, +2025-02-13T20:01:19.6982774Z "DDR_SPEED": "0xe74", +2025-02-13T20:01:19.6983274Z "DDR_STATUS": "0x111111", +2025-02-13T20:01:19.6983734Z "ETH_STATUS0": null, +2025-02-13T20:01:19.6984343Z "ETH_STATUS1": null, +2025-02-13T20:01:19.6985006Z "PCIE_STATUS": "0x11040042", +2025-02-13T20:01:19.6985489Z "FAULTS": null, +2025-02-13T20:01:19.6985945Z "ARC0_HEALTH": "0x39e74a28", +2025-02-13T20:01:19.6986423Z "ARC1_HEALTH": null, +2025-02-13T20:01:19.6986904Z "ARC2_HEALTH": null, +2025-02-13T20:01:19.6987623Z "ARC3_HEALTH": null, +2025-02-13T20:01:19.6988088Z "FAN_SPEED": "0xff", +2025-02-13T20:01:19.6988552Z "AICLK": "0x4b200fa", +2025-02-13T20:01:19.6989005Z "AXICLK": "0x384", +2025-02-13T20:01:19.6989465Z "ARCCLK": "0x21c", +2025-02-13T20:01:19.6989924Z "THROTTLER": null, +2025-02-13T20:01:19.6990386Z "VCORE": "0x2e4", +2025-02-13T20:01:19.6990853Z "ASIC_TEMPERATURE": "0x2f60210", +2025-02-13T20:01:19.6991369Z "VREG_TEMPERATURE": null, +2025-02-13T20:01:19.6991851Z "BOARD_TEMPERATURE": null, +2025-02-13T20:01:19.6992331Z "TDP": "0xaa0011", +2025-02-13T20:01:19.6992763Z "TDC": "0x12c0015", +2025-02-13T20:01:19.6993233Z "VDD_LIMITS": "0x3a202e4", +2025-02-13T20:01:19.6993726Z "THM_LIMITS": "0x53004b", +2025-02-13T20:01:19.6994247Z "WH_FW_DATE": "0x45011317", +2025-02-13T20:01:19.6994758Z "ASIC_TMON0": "0x21212222", +2025-02-13T20:01:19.6995238Z "ASIC_TMON1": "0x2121", +2025-02-13T20:01:19.6995727Z "MVDDQ_POWER": null, +2025-02-13T20:01:19.6996211Z "GDDR_TRAIN_TEMP0": null, +2025-02-13T20:01:19.6996750Z "GDDR_TRAIN_TEMP1": null, +2025-02-13T20:01:19.6997318Z "BOOT_DATE": "0x5208110b", +2025-02-13T20:01:19.6997900Z "RT_SECONDS": null, +2025-02-13T20:01:19.6998458Z "AUX_STATUS": null, +2025-02-13T20:01:19.6998977Z "ETH_DEBUG_STATUS0": null, +2025-02-13T20:01:19.6999475Z "ETH_DEBUG_STATUS1": null, +2025-02-13T20:01:19.7000334Z "TT_FLASH_VERSION": "0x30100", +2025-02-13T20:01:19.7000861Z 
"FW_BUNDLE_VERSION": "0x50090000" +2025-02-13T20:01:19.7001345Z }, +2025-02-13T20:01:19.7001698Z "board_info": { +2025-02-13T20:01:19.7002141Z "bus_id": "0000:07:00.0", +2025-02-13T20:01:19.7002608Z "board_type": "e150", +2025-02-13T20:01:19.7003096Z "board_id": "10000361152e045", +2025-02-13T20:01:19.7003583Z "coords": "N/A", +2025-02-13T20:01:19.7004042Z "dram_status": true, +2025-02-13T20:01:19.7004532Z "dram_speed": "3700", +2025-02-13T20:01:19.7005202Z "pcie_speed": 4, +2025-02-13T20:01:19.7005637Z "pcie_width": "16" +2025-02-13T20:01:19.7006162Z }, +2025-02-13T20:01:19.7006613Z "telemetry": { +2025-02-13T20:01:19.7007126Z "voltage": "0.74", +2025-02-13T20:01:19.7007574Z "current": " 21.0", +2025-02-13T20:01:19.7008023Z "power": " 17.0", +2025-02-13T20:01:19.7008470Z "aiclk": " 250", +2025-02-13T20:01:19.7008933Z "asic_temperature": "33.0" +2025-02-13T20:01:19.7009363Z }, +2025-02-13T20:01:19.7009743Z "firmwares": { +2025-02-13T20:01:19.7010191Z "fw_bundle_version": "80.9.0.0", +2025-02-13T20:01:19.7010710Z "tt_flash_version": "0.3.1.0", +2025-02-13T20:01:19.7011201Z "cm_fw": "1.7.0.0", +2025-02-13T20:01:19.7011671Z "cm_fw_date": "2024-05-01", +2025-02-13T20:01:19.7012147Z "eth_fw": "N/A", +2025-02-13T20:01:19.7012606Z "bm_bl_fw": "N/A", +2025-02-13T20:01:19.7013024Z "bm_app_fw": "N/A" +2025-02-13T20:01:19.7013445Z }, +2025-02-13T20:01:19.7013814Z "limits": { +2025-02-13T20:01:19.7014211Z "vdd_min": "0.74", +2025-02-13T20:01:19.7014723Z "vdd_max": "0.93", +2025-02-13T20:01:19.7015186Z "tdp_limit": "170", +2025-02-13T20:01:19.7015672Z "tdc_limit": "300", +2025-02-13T20:01:19.7016133Z "asic_fmax": "1202", +2025-02-13T20:01:19.7016589Z "therm_trip_l1_limit": "83", +2025-02-13T20:01:19.7017076Z "thm_limit": "75", +2025-02-13T20:01:19.7017582Z "bus_peak_limit": null +2025-02-13T20:01:19.7018163Z } +2025-02-13T20:01:19.7018486Z } +2025-02-13T20:01:19.7018829Z ] +2025-02-13T20:01:19.7019428Z }::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first +2025-02-13T20:01:49.6981544Z + '[' 0 -lt 10 ']' +2025-02-13T20:01:49.6982080Z + (( i++ )) +2025-02-13T20:01:49.6983311Z ++ tt-smi-metal -r 0 +2025-02-13T20:01:50.2494436Z + reset_output=' Starting Tensix reset on GS board at PCI index 0  +2025-02-13T20:01:50.2495342Z  Lowering clks to safe value...  +2025-02-13T20:01:50.2496031Z  Beginning reset sequence...  +2025-02-13T20:01:50.2496689Z  Finishing reset sequence...  +2025-02-13T20:01:50.2497421Z  Returning clks to original values...  +2025-02-13T20:01:50.2498196Z  Finished Tensix reset on GS board at PCI index 0 +2025-02-13T20:01:50.2498926Z  +2025-02-13T20:01:50.2499455Z  Re-initializing boards after reset....  +2025-02-13T20:01:50.2499966Z +2025-02-13T20:01:50.2500343Z  Detected Chips: 1 +2025-02-13T20:01:50.2501006Z  +2025-02-13T20:01:50.2501577Z  Detected Chips: 1 +2025-02-13T20:01:50.2501940Z +2025-02-13T20:01:50.2502294Z  Detecting ARC: | +2025-02-13T20:01:50.2502627Z +2025-02-13T20:01:50.2502881Z  Detecting DRAM: | +2025-02-13T20:01:50.2503227Z +2025-02-13T20:01:50.2503484Z [] ETH: |' +2025-02-13T20:01:50.2503952Z + [[ 0 -ne 0 ]] +2025-02-13T20:01:50.2504628Z + [[  Starting Tensix reset on GS board at PCI index 0  +2025-02-13T20:01:50.2505409Z  Lowering clks to safe value...  +2025-02-13T20:01:50.2506075Z  Beginning reset sequence...  +2025-02-13T20:01:50.2506732Z  Finishing reset sequence...  +2025-02-13T20:01:50.2507422Z  Returning clks to original values...  
+2025-02-13T20:01:50.2508199Z  Finished Tensix reset on GS board at PCI index 0 +2025-02-13T20:01:50.2508835Z  +2025-02-13T20:01:50.2509531Z  Re-initializing boards after reset....  +2025-02-13T20:01:50.2510086Z +2025-02-13T20:01:50.2510364Z  Detected Chips: 1 +2025-02-13T20:01:50.2510892Z  +2025-02-13T20:01:50.2511356Z  Detected Chips: 1 +2025-02-13T20:01:50.2511683Z +2025-02-13T20:01:50.2511959Z  Detecting ARC: | +2025-02-13T20:01:50.2512765Z +2025-02-13T20:01:50.2513047Z  Detecting DRAM: | +2025-02-13T20:01:50.2513369Z +2025-02-13T20:01:50.2513794Z [] ETH: | == *\N\o\ \c\h\i\p\s\ \d\e\t\e\c\t\e\d* ]] +2025-02-13T20:01:50.2514433Z + break +2025-02-13T20:01:50.2514791Z + '[' 1 -eq 10 ']' +2025-02-13T20:01:50.2515597Z + echo '::notice title=reset-successful-startup::tt-smi reset was successful' +2025-02-13T20:01:50.2516491Z + check_hugepages_service_status=0 +2025-02-13T20:01:50.2517132Z + sudo systemctl status tenstorrent-hugepages.service +2025-02-13T20:01:50.2519010Z ##[notice]tt-smi reset was successful +2025-02-13T20:01:50.2822738Z ● tenstorrent-hugepages.service - Script that configures hugepages for Tenstorrent ASICs +2025-02-13T20:01:50.2824173Z Loaded: loaded (/lib/systemd/system/tenstorrent-hugepages.service; enabled; vendor preset: enabled) +2025-02-13T20:01:50.2825394Z Active: inactive (dead) since Thu 2025-02-13 19:53:53 UTC; 7min ago +2025-02-13T20:01:50.2826521Z Process: 639514 ExecStart=/opt/tenstorrent/bin/hugepages-setup.sh (code=exited, status=0/SUCCESS) +2025-02-13T20:01:50.2827548Z Main PID: 639514 (code=exited, status=0/SUCCESS) +2025-02-13T20:01:50.2827968Z +2025-02-13T20:01:50.2828572Z Feb 13 19:53:53 tt-metal-ci-vm-160 systemd[1]: Started Script that configures hugepages for Tenstorrent ASICs. +2025-02-13T20:01:50.2829769Z Feb 13 19:53:53 tt-metal-ci-vm-160 hugepages-setup.sh[639514]: Node 0 hugepages before: 1 +2025-02-13T20:01:50.2830924Z Feb 13 19:53:53 tt-metal-ci-vm-160 hugepages-setup.sh[639514]: Node 0 hugepages needed: 1 +2025-02-13T20:01:50.2831976Z Feb 13 19:53:53 tt-metal-ci-vm-160 hugepages-setup.sh[639514]: Node 0 hugepages after: 1 +2025-02-13T20:01:50.2833366Z Feb 13 19:53:53 tt-metal-ci-vm-160 hugepages-setup.sh[639514]: Completed hugepage setup +2025-02-13T20:01:50.2834445Z Feb 13 19:53:53 tt-metal-ci-vm-160 systemd[1]: tenstorrent-hugepages.service: Succeeded. +2025-02-13T20:01:50.2835317Z + check_hugepages_service_status=3 +2025-02-13T20:01:50.2835842Z + '[' 3 -eq 4 ']' +2025-02-13T20:01:50.2837190Z + echo '::notice title=hugepages-service-found-startup::Hugepages service found. Command returned with exit code 3. Restarting it so we can ensure hugepages are available' +2025-02-13T20:01:50.2838733Z + sudo systemctl restart tenstorrent-hugepages.service +2025-02-13T20:01:50.2842028Z ##[notice]Hugepages service found. Command returned with exit code 3. Restarting it so we can ensure hugepages are available +2025-02-13T20:01:50.3135621Z ++ date +%s +2025-02-13T20:01:50.3165351Z + hugepages_check_start=1739476910 +2025-02-13T20:01:50.3170088Z + hugepages_check_timeout=60 +2025-02-13T20:01:50.3195006Z ++ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages +2025-02-13T20:01:50.3197338Z ##[notice]Hugepages is now setup. +2025-02-13T20:01:50.3199150Z + [[ 1 -eq 0 ]] +2025-02-13T20:01:50.3200031Z + echo '::notice title=hugepages-setup-success-startup::Hugepages is now setup.' +2025-02-13T20:01:50.3200835Z + echo 'Printing out cpu information...' +2025-02-13T20:01:50.3201358Z + lscpu +2025-02-13T20:01:50.3201753Z Printing out cpu information... 
+2025-02-13T20:01:50.3215625Z Architecture: x86_64 +2025-02-13T20:01:50.3216300Z CPU op-mode(s): 32-bit, 64-bit +2025-02-13T20:01:50.3216873Z Byte Order: Little Endian +2025-02-13T20:01:50.3217448Z Address sizes: 40 bits physical, 48 bits virtual +2025-02-13T20:01:50.3217997Z CPU(s): 14 +2025-02-13T20:01:50.3218453Z On-line CPU(s) list: 0-13 +2025-02-13T20:01:50.3218959Z Thread(s) per core: 1 +2025-02-13T20:01:50.3219502Z Core(s) per socket: 1 +2025-02-13T20:01:50.3220014Z Socket(s): 14 +2025-02-13T20:01:50.3220485Z NUMA node(s): 2 +2025-02-13T20:01:50.3220950Z Vendor ID: AuthenticAMD +2025-02-13T20:01:50.3221532Z CPU family: 23 +2025-02-13T20:01:50.3222158Z Model: 49 +2025-02-13T20:01:50.3223110Z Model name: AMD EPYC-Rome Processor +2025-02-13T20:01:50.3223699Z Stepping: 0 +2025-02-13T20:01:50.3224155Z CPU MHz: 2299.998 +2025-02-13T20:01:50.3224622Z BogoMIPS: 4599.99 +2025-02-13T20:01:50.3225131Z Virtualization: AMD-V +2025-02-13T20:01:50.3225600Z Hypervisor vendor: KVM +2025-02-13T20:01:50.3226246Z Virtualization type: full +2025-02-13T20:01:50.3226727Z L1d cache: 448 KiB +2025-02-13T20:01:50.3227180Z L1i cache: 448 KiB +2025-02-13T20:01:50.3227629Z L2 cache: 7 MiB +2025-02-13T20:01:50.3228102Z L3 cache: 224 MiB +2025-02-13T20:01:50.3228560Z NUMA node0 CPU(s): 0-6 +2025-02-13T20:01:50.3229010Z NUMA node1 CPU(s): 7-13 +2025-02-13T20:01:50.3229515Z Vulnerability Gather data sampling: Not affected +2025-02-13T20:01:50.3230053Z Vulnerability Itlb multihit: Not affected +2025-02-13T20:01:50.3230580Z Vulnerability L1tf: Not affected +2025-02-13T20:01:50.3231097Z Vulnerability Mds: Not affected +2025-02-13T20:01:50.3231622Z Vulnerability Meltdown: Not affected +2025-02-13T20:01:50.3232151Z Vulnerability Mmio stale data: Not affected +2025-02-13T20:01:50.3232688Z Vulnerability Retbleed: Vulnerable +2025-02-13T20:01:50.3233821Z Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +2025-02-13T20:01:50.3234885Z Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +2025-02-13T20:01:50.3236385Z Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +2025-02-13T20:01:50.3237375Z Vulnerability Srbds: Not affected +2025-02-13T20:01:50.3237885Z Vulnerability Tsx async abort: Not affected +2025-02-13T20:01:50.3241398Z Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr wbnoinvd arat npt nrip_save umip rdpid +2025-02-13T20:01:50.3478989Z ##[group]Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main +2025-02-13T20:01:50.3479848Z with: +2025-02-13T20:01:50.3480351Z token: *** +2025-02-13T20:01:50.3480701Z fetch-depth: 1 +2025-02-13T20:01:50.3481078Z env: +2025-02-13T20:01:50.3481395Z ARCH_NAME: grayskull +2025-02-13T20:01:50.3481800Z LOGURU_LEVEL: INFO +2025-02-13T20:01:50.3482159Z ##[endgroup] +2025-02-13T20:01:50.3591953Z ##[group]Run set -x +2025-02-13T20:01:50.3592334Z set -x +2025-02-13T20:01:50.3592668Z ls -al 
+2025-02-13T20:01:50.3593076Z if [ -f "semicolon_delimited_script" ]; then +2025-02-13T20:01:50.3593616Z  file semicolon_delimited_script +2025-02-13T20:01:50.3594111Z  head semicolon_delimited_script +2025-02-13T20:01:50.3594563Z fi +2025-02-13T20:01:50.3594899Z sudo rm -rf deleteme +2025-02-13T20:01:50.3595318Z sudo rm -rf docker-job +2025-02-13T20:01:50.3595738Z if [ -d ".git" ]; then +2025-02-13T20:01:50.3596211Z  echo 'Cleaning repo' +2025-02-13T20:01:50.3596613Z  git clean -xffd +2025-02-13T20:01:50.3597022Z  echo 'Done git clean -xffd' +2025-02-13T20:01:50.3597509Z  echo 'Attempting to delete any lock files' +2025-02-13T20:01:50.3598052Z  find .git -type f -iname '*.lock' -delete +2025-02-13T20:01:50.3598804Z  echo 'Done deleting lock files' +2025-02-13T20:01:50.3599298Z  echo 'De-init-ing submodules' +2025-02-13T20:01:50.3599985Z  git submodule deinit -f --all +2025-02-13T20:01:50.3600457Z  echo 'Done de-initing submodules' +2025-02-13T20:01:50.3600966Z fi +2025-02-13T20:01:50.3620528Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:01:50.3621058Z env: +2025-02-13T20:01:50.3621428Z ARCH_NAME: grayskull +2025-02-13T20:01:50.3621791Z LOGURU_LEVEL: INFO +2025-02-13T20:01:50.3622144Z ##[endgroup] +2025-02-13T20:01:50.3663031Z + ls -al +2025-02-13T20:01:50.3680922Z total 699996 +2025-02-13T20:01:50.3681522Z drwxr-xr-x 26 ubuntu ubuntu 4096 Feb 13 19:55 . +2025-02-13T20:01:50.3682079Z drwxr-xr-x 3 ubuntu ubuntu 4096 Jan 13 23:55 .. +2025-02-13T20:01:50.3682648Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 13 19:54 .cache +2025-02-13T20:01:50.3683225Z -rw-r--r-- 1 ubuntu ubuntu 3966 Jan 27 06:06 .clang-format +2025-02-13T20:01:50.3683919Z -rw-r--r-- 1 ubuntu ubuntu 6268 Jan 27 06:06 .clang-format-ignore +2025-02-13T20:01:50.3684537Z -rw-r--r-- 1 ubuntu ubuntu 6374 Jan 27 06:06 .clang-tidy +2025-02-13T20:01:50.3685137Z -rw-r--r-- 1 ubuntu ubuntu 43 Jan 27 06:06 .clangd +2025-02-13T20:01:50.3685704Z -rw-r--r-- 1 ubuntu ubuntu 222 Jan 27 06:06 .gersemirc +2025-02-13T20:01:50.3686270Z drwxr-xr-x 9 ubuntu ubuntu 4096 Feb 13 19:57 .git +2025-02-13T20:01:50.3686909Z -rw-r--r-- 1 ubuntu ubuntu 239 Jan 27 06:06 .git-blame-ignore-revs +2025-02-13T20:01:50.3687540Z -rw-r--r-- 1 ubuntu ubuntu 35 Jan 27 06:06 .gitattributes +2025-02-13T20:01:50.3688114Z drwxr-xr-x 6 ubuntu ubuntu 4096 Feb 13 05:41 .github +2025-02-13T20:01:50.3688677Z -rw-r--r-- 1 ubuntu ubuntu 1730 Jan 27 06:06 .gitignore +2025-02-13T20:01:50.3689231Z -rw-r--r-- 1 ubuntu ubuntu 991 Feb 5 14:57 .gitmodules +2025-02-13T20:01:50.3689847Z drwx------ 6 ubuntu ubuntu 4096 Feb 13 19:55 .local +2025-02-13T20:01:50.3690469Z -rw-r--r-- 1 ubuntu ubuntu 932 Jan 27 06:06 .pre-commit-config.yaml +2025-02-13T20:01:50.3691177Z -rw-r--r-- 1 ubuntu ubuntu 15813574 Feb 13 05:41 .test_durations +2025-02-13T20:01:50.3691747Z -rw-r--r-- 1 ubuntu ubuntu 213 Jan 27 06:06 .yamllint +2025-02-13T20:01:50.3692315Z -rw-r--r-- 1 ubuntu ubuntu 11086 Feb 13 05:41 CMakeLists.txt +2025-02-13T20:01:50.3692917Z -rw-r--r-- 1 ubuntu ubuntu 2231 Feb 5 14:57 CMakePresets.json +2025-02-13T20:01:50.3693505Z -rw-r--r-- 1 ubuntu ubuntu 11478 Feb 13 05:41 CODEOWNERS +2025-02-13T20:01:50.3694090Z -rw-r--r-- 1 ubuntu ubuntu 5253 Jan 27 06:06 CODE_OF_CONDUCT.md +2025-02-13T20:01:50.3694949Z -rw-r--r-- 1 ubuntu ubuntu 36527 Jan 27 06:06 CONTRIBUTING.md +2025-02-13T20:01:50.3695525Z -rw-r--r-- 1 ubuntu ubuntu 126373 Jan 27 06:06 Doxyfile +2025-02-13T20:01:50.3696100Z -rw-r--r-- 1 ubuntu ubuntu 6046 Feb 5 14:57 INSTALLING.md +2025-02-13T20:01:50.3696659Z -rw-r--r-- 1 
ubuntu ubuntu 11825 Jan 27 06:06 LICENSE +2025-02-13T20:01:50.3697213Z -rw-r--r-- 1 ubuntu ubuntu 1562 Jan 27 06:06 MANIFEST.in +2025-02-13T20:01:50.3697812Z -rw-r--r-- 1 ubuntu ubuntu 18372 Feb 13 05:41 METALIUM_GUIDE.md +2025-02-13T20:01:50.3698389Z -rw-r--r-- 1 ubuntu ubuntu 15526 Feb 13 05:41 README.md +2025-02-13T20:01:50.3698932Z drwxr-xr-x 7 ubuntu ubuntu 4096 Feb 13 19:54 build +2025-02-13T20:01:50.3699495Z -rwxr-xr-x 1 ubuntu ubuntu 11097 Feb 13 05:41 build_metal.sh +2025-02-13T20:01:50.3700084Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 13 19:55 built +2025-02-13T20:01:50.3700689Z -rw-r--r-- 1 ubuntu ubuntu 1438 Jan 27 06:06 check_copyright_config.yaml +2025-02-13T20:01:50.3701309Z -rw-r--r-- 1 ubuntu ubuntu 1821 Jan 27 06:06 cloc.sh +2025-02-13T20:01:50.3701840Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 05:41 cmake +2025-02-13T20:01:50.3702386Z -rw-r--r-- 1 ubuntu ubuntu 23178 Feb 13 05:41 conftest.py +2025-02-13T20:01:50.3702966Z drwxr-xr-x 2 ubuntu ubuntu 4096 Jan 27 06:06 contributing +2025-02-13T20:01:50.3703703Z -rwxr-xr-x 1 ubuntu ubuntu 1420 Jan 27 06:06 create_venv.sh +2025-02-13T20:01:50.3704274Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 13 19:44 data +2025-02-13T20:01:50.3704845Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 13 05:41 dependencies +2025-02-13T20:01:50.3705447Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 13 05:41 dockerfile +2025-02-13T20:01:50.3706007Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 7 18:22 docs +2025-02-13T20:01:50.3706553Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 19:55 generated +2025-02-13T20:01:50.3707108Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 5 14:57 infra +2025-02-13T20:01:50.3707729Z -rwxr-xr-x 1 ubuntu ubuntu 6885 Feb 13 05:41 install_dependencies.sh +2025-02-13T20:01:50.3708357Z drwxr-xr-x 10 ubuntu ubuntu 4096 Feb 13 19:55 models +2025-02-13T20:01:50.3708931Z -rw-r--r-- 1 ubuntu ubuntu 1042 Jan 27 06:06 pyproject.toml +2025-02-13T20:01:50.3709522Z -rw-r--r-- 1 ubuntu ubuntu 1200 Jan 27 06:06 pytest.ini +2025-02-13T20:01:50.3710118Z drwxr-xr-x 7 ubuntu ubuntu 4096 Feb 13 15:08 python_env +2025-02-13T20:01:50.3710691Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 19:44 runtime +2025-02-13T20:01:50.3711259Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 05:41 scripts +2025-02-13T20:01:50.3711858Z -rw-r--r-- 1 root root 329 Feb 13 19:54 semicolon_delimited_script +2025-02-13T20:01:50.3712486Z -rw-r--r-- 1 ubuntu ubuntu 7551 Feb 5 14:57 setup.py +2025-02-13T20:01:50.3713080Z drwxr-xr-x 24 ubuntu ubuntu 4096 Jan 27 06:06 tech_reports +2025-02-13T20:01:50.3713661Z drwxr-xr-x 11 ubuntu ubuntu 4096 Feb 13 05:41 tests +2025-02-13T20:01:50.3714235Z drwxr-xr-x 11 ubuntu ubuntu 4096 Feb 13 05:41 tt-train +2025-02-13T20:01:50.3714821Z drwxr-xr-x 5 ubuntu ubuntu 4096 Feb 13 19:49 tt_fabric +2025-02-13T20:01:50.3715405Z drwxr-xr-x 22 ubuntu ubuntu 4096 Feb 13 05:46 tt_metal +2025-02-13T20:01:50.3715978Z -rw-r--r-- 1 ubuntu ubuntu 700477440 Feb 13 19:54 ttm_any.tar +2025-02-13T20:01:50.3716519Z drwxr-xr-x 10 ubuntu ubuntu 4096 Feb 13 19:55 ttnn +2025-02-13T20:01:50.3717038Z + '[' -f semicolon_delimited_script ']' +2025-02-13T20:01:50.3717468Z + file semicolon_delimited_script +2025-02-13T20:01:50.3729789Z semicolon_delimited_script: ASCII text +2025-02-13T20:01:50.3731107Z + head semicolon_delimited_script +2025-02-13T20:01:50.3739479Z set -eu +2025-02-13T20:01:50.3739666Z +2025-02-13T20:01:50.3740331Z install_wheel=false +2025-02-13T20:01:50.3740773Z if [ "${install_wheel,,}" == "true" ]; then +2025-02-13T20:01:50.3741302Z WHEEL_FILENAME=$(ls -1 *.whl) +2025-02-13T20:01:50.3741738Z pip3 install 
"$WHEEL_FILENAME" +2025-02-13T20:01:50.3742166Z fi +2025-02-13T20:01:50.3742859Z +2025-02-13T20:01:50.3743073Z pip install --force-reinstall pip==21.2.4 +2025-02-13T20:01:50.3743628Z pip install -r tt_metal/python_env/requirements-dev.txt +2025-02-13T20:01:50.3747671Z + sudo rm -rf deleteme +2025-02-13T20:01:50.4014417Z + sudo rm -rf docker-job +2025-02-13T20:01:50.4234055Z + '[' -d .git ']' +2025-02-13T20:01:50.4234684Z Cleaning repo +2025-02-13T20:01:50.4235101Z + echo 'Cleaning repo' +2025-02-13T20:01:50.4235555Z + git clean -xffd +2025-02-13T20:01:53.3522659Z Removing .cache/ +2025-02-13T20:01:53.3523200Z Removing .local/ +2025-02-13T20:01:53.3523599Z Removing build/ +2025-02-13T20:01:53.3524059Z Removing built/ +2025-02-13T20:01:53.3524416Z Removing data/ +2025-02-13T20:01:53.3524795Z Removing generated/ +2025-02-13T20:01:53.3525205Z Removing models/__pycache__/ +2025-02-13T20:01:53.3525687Z Removing python_env/ +2025-02-13T20:01:53.3526089Z Removing runtime/ +2025-02-13T20:01:53.3526500Z Removing semicolon_delimited_script +2025-02-13T20:01:53.3527007Z Removing tests/scripts/__pycache__/ +2025-02-13T20:01:53.3527533Z Removing ttm_any.tar +2025-02-13T20:01:53.3527934Z Removing ttnn/tt_lib/__pycache__/ +2025-02-13T20:01:53.3528438Z Removing ttnn/tt_lib/fused_ops/__pycache__/ +2025-02-13T20:01:53.3528972Z Removing ttnn/ttnn.egg-info/ +2025-02-13T20:01:53.3529425Z Removing ttnn/ttnn/__pycache__/ +2025-02-13T20:01:53.3529983Z Removing ttnn/ttnn/_ttnn.so +2025-02-13T20:01:53.3531288Z Removing ttnn/ttnn/distributed/__pycache__/ +2025-02-13T20:01:53.3531857Z Removing ttnn/ttnn/experimental_loader/__pycache__/ +2025-02-13T20:01:53.3532476Z Removing ttnn/ttnn/operations/__pycache__/ +2025-02-13T20:01:53.3545566Z + echo 'Done git clean -xffd' +2025-02-13T20:01:53.3546047Z + echo 'Attempting to delete any lock files' +2025-02-13T20:01:53.3546564Z + find .git -type f -iname '*.lock' -delete +2025-02-13T20:01:53.3547038Z Done git clean -xffd +2025-02-13T20:01:53.3547450Z Attempting to delete any lock files +2025-02-13T20:01:53.3826680Z + echo 'Done deleting lock files' +2025-02-13T20:01:53.3827097Z Done deleting lock files +2025-02-13T20:01:53.3827511Z + echo 'De-init-ing submodules' +2025-02-13T20:01:53.3827905Z De-init-ing submodules +2025-02-13T20:01:53.3828280Z + git submodule deinit -f --all +2025-02-13T20:01:53.4094074Z Cleared directory 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:01:53.4130599Z Submodule 'models/demos/t3000/llama2_70b/reference/llama' (https://github.com/tenstorrent-metal/llama.git) unregistered for path 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:01:53.4131899Z Cleared directory 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:01:53.4297561Z Submodule '3rd_party/wandb-cpp' (https://github.com/yhisaki/wandb-cpp) unregistered for path 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:01:53.4298482Z Cleared directory 'tt_metal/third_party/tracy' +2025-02-13T20:01:53.4332278Z Submodule 'tt_metal/third_party/tracy' (https://github.com/tenstorrent-metal/tracy.git) unregistered for path 'tt_metal/third_party/tracy' +2025-02-13T20:01:53.4333328Z Cleared directory 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:01:53.4367034Z Submodule 'tt_metal/third_party/tt_llk_blackhole' (https://github.com/tenstorrent/tt-llk-bh.git) unregistered for path 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:01:53.4368150Z Cleared directory 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:01:53.4400280Z Submodule 
'tt_metal/third_party/tt_llk_grayskull' (https://github.com/tenstorrent/tt-llk-gs.git) unregistered for path 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:01:53.4401500Z Cleared directory 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:01:53.4569836Z Submodule 'tt_metal/third_party/tt_llk_wormhole_b0' (https://github.com/tenstorrent/tt-llk-wh-b0.git) unregistered for path 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:01:53.4570941Z Cleared directory 'tt_metal/third_party/umd' +2025-02-13T20:01:53.4585120Z Submodule 'tt_metal/third_party/umd' (https://github.com/tenstorrent/tt-umd.git) unregistered for path 'tt_metal/third_party/umd' +2025-02-13T20:01:53.4594256Z + echo 'Done de-initing submodules' +2025-02-13T20:01:53.4594734Z Done de-initing submodules +2025-02-13T20:01:53.4711557Z ##[group]Run actions/checkout@v4 +2025-02-13T20:01:53.4712088Z with: +2025-02-13T20:01:53.4712812Z token: *** +2025-02-13T20:01:53.4713227Z fetch-depth: 1 +2025-02-13T20:01:53.4713652Z lfs: false +2025-02-13T20:01:53.4714111Z submodules: recursive +2025-02-13T20:01:53.4714584Z clean: true +2025-02-13T20:01:53.4715043Z repository: tenstorrent/tt-metal +2025-02-13T20:01:53.4715605Z ssh-strict: true +2025-02-13T20:01:53.4716039Z ssh-user: git +2025-02-13T20:01:53.4716478Z persist-credentials: true +2025-02-13T20:01:53.4717051Z sparse-checkout-cone-mode: true +2025-02-13T20:01:53.4717617Z fetch-tags: false +2025-02-13T20:01:53.4718077Z show-progress: true +2025-02-13T20:01:53.4718573Z set-safe-directory: true +2025-02-13T20:01:53.4719051Z env: +2025-02-13T20:01:53.4719427Z ARCH_NAME: grayskull +2025-02-13T20:01:53.4720054Z LOGURU_LEVEL: INFO +2025-02-13T20:01:53.4720467Z ##[endgroup] +2025-02-13T20:01:53.6035249Z Syncing repository: tenstorrent/tt-metal +2025-02-13T20:01:53.6037126Z ##[group]Getting Git version info +2025-02-13T20:01:53.6037927Z Working directory is '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal' +2025-02-13T20:01:53.6039047Z [command]/usr/bin/git version +2025-02-13T20:01:53.6039542Z git version 2.25.1 +2025-02-13T20:01:53.6044660Z ##[endgroup] +2025-02-13T20:01:53.6056866Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/daca7d6a-6d2a-41a2-a2d1-0aeab7f115e4/.gitconfig' +2025-02-13T20:01:53.6072783Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/daca7d6a-6d2a-41a2-a2d1-0aeab7f115e4' before making global git config changes +2025-02-13T20:01:53.6074398Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:01:53.6078223Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:01:53.6129235Z [command]/usr/bin/git config --local --get remote.origin.url +2025-02-13T20:01:53.6150024Z https://github.com/tenstorrent/tt-metal +2025-02-13T20:01:53.6167713Z ##[group]Removing previously created refs, to avoid conflicts +2025-02-13T20:01:53.6171919Z [command]/usr/bin/git rev-parse --symbolic-full-name --verify --quiet HEAD +2025-02-13T20:01:53.6206590Z refs/heads/smanoj/conv_device_weights +2025-02-13T20:01:53.6216182Z [command]/usr/bin/git checkout --detach +2025-02-13T20:01:53.6679879Z HEAD is now at 68e85df3 #0: Skip weights bfloat8 on grayskull +2025-02-13T20:01:53.6735051Z [command]/usr/bin/git branch --delete --force smanoj/conv_device_weights +2025-02-13T20:01:53.6767246Z Deleted branch smanoj/conv_device_weights (was 68e85df3). 
+2025-02-13T20:01:53.6891556Z ##[endgroup] +2025-02-13T20:01:53.6895651Z [command]/usr/bin/git submodule status +2025-02-13T20:01:53.7155681Z -29125b7ad8b5513eeaa4417ed92892bf39c8bd74 models/demos/t3000/llama2_70b/reference/llama +2025-02-13T20:01:53.7157038Z -368cd07f89f497df20a66936fbfae3956f151af4 tt-train/3rd_party/wandb-cpp +2025-02-13T20:01:53.7158179Z -71d4c8d378b52af7da7012b9b595a61e9304f0bb tt_metal/third_party/tracy +2025-02-13T20:01:53.7159360Z -9fd3e2d93d1532373f52e11e963de40c1cdf9a55 tt_metal/third_party/tt_llk_blackhole +2025-02-13T20:01:53.7160925Z -0c04db64275a4bd36a7e14d3c533855cb33f6a20 tt_metal/third_party/tt_llk_grayskull +2025-02-13T20:01:53.7162254Z -0ec3177bfc262f7edf6cfc19531ecb8f669895d2 tt_metal/third_party/tt_llk_wormhole_b0 +2025-02-13T20:01:53.7163450Z -5de287e9c5b2fa3d55fbfd53e9bc59e2050f32fb tt_metal/third_party/umd +2025-02-13T20:01:53.7165947Z ##[group]Cleaning the repository +2025-02-13T20:01:53.7170513Z [command]/usr/bin/git clean -ffdx +2025-02-13T20:01:53.7442376Z [command]/usr/bin/git reset --hard HEAD +2025-02-13T20:01:53.7945862Z HEAD is now at 68e85df3 #0: Skip weights bfloat8 on grayskull +2025-02-13T20:01:53.7956672Z ##[endgroup] +2025-02-13T20:01:53.7959628Z ##[group]Disabling automatic garbage collection +2025-02-13T20:01:53.7964586Z [command]/usr/bin/git config --local gc.auto 0 +2025-02-13T20:01:53.7996906Z ##[endgroup] +2025-02-13T20:01:53.7997784Z ##[group]Setting up auth +2025-02-13T20:01:53.8003508Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:01:53.8032794Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:01:53.8304860Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:01:53.8334314Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:01:53.8604467Z [command]/usr/bin/git config --local http.https://github.com/.extraheader AUTHORIZATION: basic *** +2025-02-13T20:01:53.8643122Z ##[endgroup] +2025-02-13T20:01:53.8643904Z ##[group]Fetching the repository +2025-02-13T20:01:53.8652455Z [command]/usr/bin/git -c protocol.version=2 fetch --no-tags --prune --no-recurse-submodules --depth=1 origin +ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70:refs/remotes/origin/sagarwal/multi_page_buffer +2025-02-13T20:01:54.2696539Z From https://github.com/tenstorrent/tt-metal +2025-02-13T20:01:54.2697739Z + 6d399963...ac8ce51f ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70 -> origin/sagarwal/multi_page_buffer (forced update) +2025-02-13T20:01:54.2720322Z ##[endgroup] +2025-02-13T20:01:54.2721130Z ##[group]Determining the checkout info +2025-02-13T20:01:54.2723252Z ##[endgroup] +2025-02-13T20:01:54.2723949Z ##[group]Checking out the ref +2025-02-13T20:01:54.2729416Z [command]/usr/bin/git checkout --progress --force -B sagarwal/multi_page_buffer refs/remotes/origin/sagarwal/multi_page_buffer +2025-02-13T20:01:54.3402530Z Previous HEAD position was 68e85df3 #0: Skip weights bfloat8 on grayskull +2025-02-13T20:01:54.3589254Z Switched to a new branch 'sagarwal/multi_page_buffer' +2025-02-13T20:01:54.3590428Z Branch 'sagarwal/multi_page_buffer' set up to track remote branch 'sagarwal/multi_page_buffer' from 'origin'. 
ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.2.0) +2025-02-13T20:02:58.5461565Z Requirement already satisfied: decorator in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (5.1.1) +2025-02-13T20:02:58.5479853Z Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (3.0.50) +2025-02-13T20:02:58.5497541Z Requirement already satisfied: pickleshare in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.7.5) +2025-02-13T20:02:58.5512518Z Requirement already satisfied: plumbum in /usr/local/lib/python3.8/dist-packages (from pandoc==2.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 10)) (1.9.0) +2025-02-13T20:02:58.5670474Z Requirement already satisfied: ply in /usr/local/lib/python3.8/dist-packages (from pandoc==2.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 10)) (3.11) +2025-02-13T20:02:58.5682148Z Requirement already satisfied: markdown-it-py~=3.0 in /usr/local/lib/python3.8/dist-packages (from myst-parser==3.0.0->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 12)) (3.0.0) +2025-02-13T20:02:58.5882521Z Requirement already satisfied: mdit-py-plugins~=0.4 in /usr/local/lib/python3.8/dist-packages (from myst-parser==3.0.0->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 12)) (0.4.2) +2025-02-13T20:02:58.5951962Z Requirement already satisfied: elastic-transport<9,>=8.15.1 in /usr/local/lib/python3.8/dist-packages (from elasticsearch->-r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 1)) (8.17.0) +2025-02-13T20:02:58.6086636Z Requirement already satisfied: wcwidth in /usr/local/lib/python3.8/dist-packages (from beautifultable->-r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 3)) (0.2.13) +2025-02-13T20:02:58.6109965Z Requirement already satisfied: cython>=0.29 in /usr/local/lib/python3.8/dist-packages (from faster-fifo->-r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 4)) (3.0.11) +2025-02-13T20:02:58.6125633Z Requirement already satisfied: setuptools>=45.2.0 in /usr/lib/python3/dist-packages (from faster-fifo->-r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 4)) (45.2.0) +2025-02-13T20:02:58.6138069Z Requirement already satisfied: pluggy<2.0,>=0.12 in /usr/local/lib/python3.8/dist-packages (from pytest==7.2.2->-r tt_metal/python_env/requirements-dev.txt (line 24)) (1.5.0) +2025-02-13T20:02:58.6176758Z Requirement already satisfied: iniconfig in /usr/local/lib/python3.8/dist-packages (from pytest==7.2.2->-r tt_metal/python_env/requirements-dev.txt (line 24)) (2.0.0) +2025-02-13T20:02:58.6190522Z Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.8/dist-packages (from pytest==7.2.2->-r tt_metal/python_env/requirements-dev.txt (line 24)) (25.1.0) +2025-02-13T20:02:58.6627230Z Requirement already satisfied: exceptiongroup>=1.0.0rc8; python_version < "3.11" in /usr/local/lib/python3.8/dist-packages (from pytest==7.2.2->-r tt_metal/python_env/requirements-dev.txt (line 24)) (1.2.2) +2025-02-13T20:02:58.6648111Z Requirement already satisfied: execnet>=2.1 in /usr/local/lib/python3.8/dist-packages (from pytest-xdist==3.6.1->-r 
tt_metal/python_env/requirements-dev.txt (line 27)) (2.1.1) +2025-02-13T20:02:58.6686925Z Requirement already satisfied: editorconfig>=0.12.2 in /usr/local/lib/python3.8/dist-packages (from jsbeautifier==1.14.7->-r tt_metal/python_env/requirements-dev.txt (line 28)) (0.17.0) +2025-02-13T20:02:58.6698800Z Requirement already satisfied: six>=1.13.0 in /usr/lib/python3/dist-packages (from jsbeautifier==1.14.7->-r tt_metal/python_env/requirements-dev.txt (line 28)) (1.14.0) +2025-02-13T20:02:58.6711680Z Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (0.18.0) +2025-02-13T20:02:58.6801996Z Requirement already satisfied: xxhash in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (3.5.0) +2025-02-13T20:02:58.6818150Z Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (2.0.3) +2025-02-13T20:02:58.7567553Z Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (3.10.11) +2025-02-13T20:02:58.7932435Z Requirement already satisfied: dill<0.3.7 in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (0.3.6) +2025-02-13T20:02:58.7958643Z Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (17.0.0) +2025-02-13T20:02:58.8007604Z Requirement already satisfied: sympy in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1.0+cpu->-r tt_metal/python_env/requirements-dev.txt (line 30)) (1.13.3) +2025-02-13T20:02:58.8043460Z Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1.0+cpu->-r tt_metal/python_env/requirements-dev.txt (line 30)) (3.16.1) +2025-02-13T20:02:58.8162957Z Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.8/dist-packages (from torchvision==0.17.1+cpu->-r tt_metal/python_env/requirements-dev.txt (line 32)) (10.4.0) +2025-02-13T20:02:58.8326439Z Requirement already satisfied: lightning-utilities>=0.8.0 in /usr/local/lib/python3.8/dist-packages (from torchmetrics==1.3.1->-r tt_metal/python_env/requirements-dev.txt (line 33)) (0.11.9) +2025-02-13T20:02:58.8391007Z Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from torch-fidelity==0.3.0->-r tt_metal/python_env/requirements-dev.txt (line 34)) (1.10.1) +2025-02-13T20:02:58.8595503Z Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers==4.38.0->-r tt_metal/python_env/requirements-dev.txt (line 35)) (2024.11.6) +2025-02-13T20:02:58.8624151Z Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.8/dist-packages (from transformers==4.38.0->-r tt_metal/python_env/requirements-dev.txt (line 35)) (0.15.2) +2025-02-13T20:02:58.8705242Z Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.8/dist-packages (from transformers==4.38.0->-r tt_metal/python_env/requirements-dev.txt (line 35)) (0.5.2) +2025-02-13T20:02:58.8983920Z Requirement already satisfied: blessed>=1.17.7 in /usr/local/lib/python3.8/dist-packages (from enlighten==1.12.4->-r tt_metal/python_env/requirements-dev.txt 
(line 39)) (1.20.0) +2025-02-13T20:02:58.9033918Z Requirement already satisfied: prefixed>=0.3.2 in /usr/local/lib/python3.8/dist-packages (from enlighten==1.12.4->-r tt_metal/python_env/requirements-dev.txt (line 39)) (0.9.0) +2025-02-13T20:02:58.9051529Z Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /usr/local/lib/python3.8/dist-packages (from numba>=0.58.1->-r tt_metal/python_env/requirements-dev.txt (line 41)) (0.41.1) +2025-02-13T20:02:58.9066408Z Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.3.2) +2025-02-13T20:02:58.9351056Z Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (0.3.7) +2025-02-13T20:02:58.9402788Z Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (3.0.1) +2025-02-13T20:02:58.9424957Z Requirement already satisfied: soundfile>=0.12.1 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (0.13.1) +2025-02-13T20:02:58.9454472Z Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.4.2) +2025-02-13T20:02:58.9469323Z Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.1.0) +2025-02-13T20:02:58.9485240Z Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (0.4) +2025-02-13T20:02:58.9541547Z Requirement already satisfied: pooch>=1.0 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.8.2) +2025-02-13T20:02:58.9595383Z Requirement already satisfied: psutil in /usr/local/lib/python3.8/dist-packages (from accelerate==0.27.2->-r tt_metal/python_env/requirements-dev.txt (line 46)) (6.1.1) +2025-02-13T20:02:58.9760956Z Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.8/dist-packages (from gitpython==3.1.41->-r tt_metal/python_env/requirements-dev.txt (line 48)) (4.0.12) +2025-02-13T20:02:58.9779441Z Requirement already satisfied: matplotlib in /usr/local/lib/python3.8/dist-packages (from bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (3.7.5) +2025-02-13T20:02:58.9862915Z Requirement already satisfied: pycryptodomex~=3.8 in /usr/local/lib/python3.8/dist-packages (from blobfile==2.1.1->-r tt_metal/python_env/requirements-dev.txt (line 59)) (3.21.0) +2025-02-13T20:02:58.9877898Z Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.8/dist-packages (from pydantic==2.9.2->-r tt_metal/python_env/requirements-dev.txt (line 62)) (0.7.0) +2025-02-13T20:02:58.9913508Z Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.8/dist-packages (from pydantic==2.9.2->-r tt_metal/python_env/requirements-dev.txt (line 62)) (2.23.4) +2025-02-13T20:02:58.9936173Z Requirement already satisfied: distlib<1,>=0.3.7 in /usr/local/lib/python3.8/dist-packages (from virtualenv>=20.10.0->pre-commit==3.0.4->-r tt_metal/python_env/requirements-dev.txt (line 9)) (0.3.9) 
+2025-02-13T20:02:58.9950607Z Requirement already satisfied: nh3>=0.2.14 in /usr/local/lib/python3.8/dist-packages (from readme-renderer>=35.0->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (0.2.20) +2025-02-13T20:02:58.9965092Z Requirement already satisfied: importlib-resources; python_version < "3.9" in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (6.4.5) +2025-02-13T20:02:59.0109369Z Requirement already satisfied: SecretStorage>=3.2; sys_platform == "linux" in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (3.3.3) +2025-02-13T20:02:59.0133648Z Requirement already satisfied: jaraco.classes in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (3.4.0) +2025-02-13T20:02:59.0239402Z Requirement already satisfied: jeepney>=0.4.2; sys_platform == "linux" in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (0.8.0) +2025-02-13T20:02:59.0308736Z Requirement already satisfied: jaraco.context in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (6.0.1) +2025-02-13T20:02:59.0433124Z Requirement already satisfied: jaraco.functools in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (4.1.0) +2025-02-13T20:02:59.0554206Z Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.20->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (2.8) +2025-02-13T20:02:59.0575202Z Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.8/dist-packages (from requests>=2.20->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (3.4.1) +2025-02-13T20:02:59.0598732Z Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests>=2.20->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (2019.11.28) +2025-02-13T20:02:59.0608943Z Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.8/dist-packages (from importlib-metadata>=3.6->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (3.20.2) +2025-02-13T20:02:59.0773440Z Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.8/dist-packages (from Jinja2>=3.0->sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (2.1.5) +2025-02-13T20:02:59.0791459Z Requirement already satisfied: pytz>=2015.7; python_version < "3.9" in /usr/local/lib/python3.8/dist-packages (from babel>=2.9->sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (2025.1) +2025-02-13T20:02:59.0811146Z Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.8/dist-packages (from nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (4.23.0) +2025-02-13T20:02:59.0992409Z Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /usr/local/lib/python3.8/dist-packages (from nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (5.7.2) +2025-02-13T20:02:59.1100457Z Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.8/dist-packages (from 
nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (2.21.1) +2025-02-13T20:02:59.1162525Z Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.10.1) +2025-02-13T20:02:59.1394839Z Requirement already satisfied: bleach[css]!=5.0.0 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (6.1.0) +2025-02-13T20:02:59.1450952Z Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.3.0) +2025-02-13T20:02:59.1463987Z Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (1.5.1) +2025-02-13T20:02:59.1482858Z Requirement already satisfied: mistune<4,>=2.0.3 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (3.1.1) +2025-02-13T20:02:59.1508095Z Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (4.13.3) +2025-02-13T20:02:59.1565085Z Requirement already satisfied: defusedxml in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.7.1) +2025-02-13T20:02:59.1593193Z Requirement already satisfied: parso<0.9.0,>=0.8.4 in /usr/local/lib/python3.8/dist-packages (from jedi>=0.16->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.8.4) +2025-02-13T20:02:59.1641691Z Requirement already satisfied: pure-eval in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.2.3) +2025-02-13T20:02:59.1662955Z Requirement already satisfied: executing>=1.2.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (2.2.0) +2025-02-13T20:02:59.1729162Z Requirement already satisfied: asttokens>=2.1.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (3.0.0) +2025-02-13T20:02:59.1793324Z Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.8/dist-packages (from pexpect>4.3; sys_platform != "win32"->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.7.0) +2025-02-13T20:02:59.1804857Z Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.8/dist-packages (from markdown-it-py~=3.0->myst-parser==3.0.0->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 12)) (0.1.2) +2025-02-13T20:02:59.1815743Z Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (2.9.0.post0) +2025-02-13T20:02:59.1837546Z Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets==2.9.0->-r 
tt_metal/python_env/requirements-dev.txt (line 29)) (2025.1) +2025-02-13T20:02:59.1855273Z Requirement already satisfied: async-timeout<6.0,>=4.0; python_version < "3.11" in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (5.0.1) +2025-02-13T20:02:59.1867448Z Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (2.4.4) +2025-02-13T20:02:59.1881380Z Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (6.1.0) +2025-02-13T20:02:59.1911221Z Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (1.3.1) +2025-02-13T20:02:59.1929235Z Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (1.5.0) +2025-02-13T20:02:59.1949790Z Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (1.15.2) +2025-02-13T20:02:59.2005777Z Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.8/dist-packages (from sympy->torch==2.2.1.0+cpu->-r tt_metal/python_env/requirements-dev.txt (line 30)) (1.3.0) +2025-02-13T20:02:59.2081475Z Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.20.0->librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (3.5.0) +2025-02-13T20:02:59.2098759Z Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.8/dist-packages (from soundfile>=0.12.1->librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.17.1) +2025-02-13T20:02:59.2116985Z Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.8/dist-packages (from gitdb<5,>=4.0.1->gitpython==3.1.41->-r tt_metal/python_env/requirements-dev.txt (line 48)) (5.0.2) +2025-02-13T20:02:59.2128947Z Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (3.1.4) +2025-02-13T20:02:59.2154806Z Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (4.55.8) +2025-02-13T20:02:59.2510723Z Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (1.4.7) +2025-02-13T20:02:59.2528488Z Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (1.1.1) +2025-02-13T20:02:59.2671298Z Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (0.12.1) +2025-02-13T20:02:59.2732292Z Requirement already satisfied: cryptography>=2.0 in /usr/local/lib/python3.8/dist-packages (from SecretStorage>=3.2; sys_platform 
== "linux"->keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (44.0.0) +2025-02-13T20:02:59.2940713Z Requirement already satisfied: more-itertools in /usr/local/lib/python3.8/dist-packages (from jaraco.classes->keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (10.5.0) +2025-02-13T20:02:59.2958958Z Requirement already satisfied: backports.tarfile; python_version < "3.12" in /usr/local/lib/python3.8/dist-packages (from jaraco.context->keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (1.2.0) +2025-02-13T20:02:59.3051809Z Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=2.6->nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (2023.12.1) +2025-02-13T20:02:59.3084075Z Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=2.6->nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.20.1) +2025-02-13T20:02:59.3103009Z Requirement already satisfied: pkgutil-resolve-name>=1.3.10; python_version < "3.9" in /usr/local/lib/python3.8/dist-packages (from jsonschema>=2.6->nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (1.3.10) +2025-02-13T20:02:59.3112386Z Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=2.6->nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.35.1) +2025-02-13T20:02:59.3137525Z Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.8/dist-packages (from nbclient>=0.5.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (8.6.3) +2025-02-13T20:02:59.3314578Z Requirement already satisfied: webencodings in /usr/local/lib/python3.8/dist-packages (from bleach[css]!=5.0.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.5.1) +2025-02-13T20:02:59.3332616Z Requirement already satisfied: tinycss2<1.3,>=1.1.0; extra == "css" in /usr/local/lib/python3.8/dist-packages (from bleach[css]!=5.0.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (1.2.1) +2025-02-13T20:02:59.3384292Z Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.8/dist-packages (from beautifulsoup4->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (2.6) +2025-02-13T20:02:59.3395989Z Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.8/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (0.2.0) +2025-02-13T20:02:59.3408705Z Requirement already satisfied: pycparser in /usr/local/lib/python3.8/dist-packages (from cffi>=1.0->soundfile>=0.12.1->librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (2.22) +2025-02-13T20:02:59.3423002Z Requirement already satisfied: pyzmq>=23.0 in /usr/local/lib/python3.8/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (26.2.1) +2025-02-13T20:02:59.3442138Z Requirement already satisfied: tornado>=6.2 in /usr/local/lib/python3.8/dist-packages (from 
jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (6.4.2) +2025-02-13T20:03:00.4277417Z Obtaining file:///home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:03:00.4306360Z Installing build dependencies: started +2025-02-13T20:03:05.4527681Z Installing build dependencies: finished with status 'done' +2025-02-13T20:03:05.4528355Z Getting requirements to build wheel: started +2025-02-13T20:03:09.1152747Z Getting requirements to build wheel: finished with status 'done' +2025-02-13T20:03:09.1182291Z Preparing wheel metadata: started +2025-02-13T20:03:12.2039600Z Preparing wheel metadata: finished with status 'done' +2025-02-13T20:03:12.4797744Z Collecting seaborn==0.13.2 +2025-02-13T20:03:12.5095120Z Downloading seaborn-0.13.2-py3-none-any.whl (294 kB) +2025-02-13T20:03:12.9858217Z Collecting jupyterlab==4.2.5 +2025-02-13T20:03:12.9941751Z Downloading jupyterlab-4.2.5-py3-none-any.whl (11.6 MB) +2025-02-13T20:03:13.5915123Z Collecting click==8.1.7 +2025-02-13T20:03:13.5995726Z Downloading click-8.1.7-py3-none-any.whl (97 kB) +2025-02-13T20:03:13.7675593Z Collecting pyyaml>=5.4 +2025-02-13T20:03:13.7755069Z Downloading PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (746 kB) +2025-02-13T20:03:14.4484590Z Collecting matplotlib==3.7.1 +2025-02-13T20:03:14.4566854Z Downloading matplotlib-3.7.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.2 MB) +2025-02-13T20:03:14.9373650Z Collecting bokeh==3.1.1 +2025-02-13T20:03:14.9450193Z Downloading bokeh-3.1.1-py3-none-any.whl (8.3 MB) +2025-02-13T20:03:16.2527767Z Collecting Pillow==10.3.0 +2025-02-13T20:03:16.2610832Z Downloading pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB) +2025-02-13T20:03:16.5047508Z Collecting toolz==0.12.0 +2025-02-13T20:03:16.5121068Z Downloading toolz-0.12.0-py3-none-any.whl (55 kB) +2025-02-13T20:03:16.6269057Z Collecting graphviz==0.20.3 +2025-02-13T20:03:16.6349592Z Downloading graphviz-0.20.3-py3-none-any.whl (47 kB) +2025-02-13T20:03:16.6824283Z Requirement already satisfied: pandas==2.0.3 in /usr/local/lib/python3.8/dist-packages (from ttnn==0.0.dev1+any) (2.0.3) +2025-02-13T20:03:16.7900194Z Requirement already satisfied: networkx==3.1 in /usr/local/lib/python3.8/dist-packages (from ttnn==0.0.dev1+any) (3.1) +2025-02-13T20:03:16.8068013Z Requirement already satisfied: numpy<2,>=1.24.4 in /usr/local/lib/python3.8/dist-packages (from ttnn==0.0.dev1+any) (1.24.4) +2025-02-13T20:03:16.9248040Z Collecting ipywidgets==8.1.1 +2025-02-13T20:03:16.9330056Z Downloading ipywidgets-8.1.1-py3-none-any.whl (139 kB) +2025-02-13T20:03:17.0815816Z Collecting dash==2.15.0 +2025-02-13T20:03:17.0896524Z Downloading dash-2.15.0-py3-none-any.whl (10.2 MB) +2025-02-13T20:03:17.6795328Z Collecting plotly==5.18.0 +2025-02-13T20:03:17.6902247Z Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB) +2025-02-13T20:03:19.7124961Z Requirement already satisfied: torch==2.2.1+cpu in /usr/local/lib/python3.8/dist-packages (from ttnn==0.0.dev1+any) (2.2.1+cpu) +2025-02-13T20:03:19.7855276Z Collecting loguru==0.6.0 +2025-02-13T20:03:19.7937258Z Downloading loguru-0.6.0-py3-none-any.whl (58 kB) +2025-02-13T20:03:19.9928773Z Collecting jupyter-server<3,>=2.4.0 +2025-02-13T20:03:20.0011907Z Downloading jupyter_server-2.14.2-py3-none-any.whl (383 kB) +2025-02-13T20:03:20.0899877Z Requirement already satisfied: setuptools>=40.1.0 in /usr/lib/python3/dist-packages (from 
jupyterlab==4.2.5->ttnn==0.0.dev1+any) (45.2.0) +2025-02-13T20:03:20.0918148Z Requirement already satisfied: importlib-resources>=1.4; python_version < "3.9" in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (6.4.5) +2025-02-13T20:03:20.1068248Z Requirement already satisfied: tomli>=1.2.2; python_version < "3.11" in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.2.1) +2025-02-13T20:03:20.1691541Z Collecting jupyter-lsp>=2.0.0 +2025-02-13T20:03:20.1769834Z Downloading jupyter_lsp-2.2.5-py3-none-any.whl (69 kB) +2025-02-13T20:03:20.2215581Z Requirement already satisfied: tornado>=6.2.0 in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (6.4.2) +2025-02-13T20:03:20.3625164Z Collecting ipykernel>=6.5.0 +2025-02-13T20:03:20.3704670Z Downloading ipykernel-6.29.5-py3-none-any.whl (117 kB) +2025-02-13T20:03:20.5092004Z Collecting async-lru>=1.0.0 +2025-02-13T20:03:20.5172930Z Downloading async_lru-2.0.4-py3-none-any.whl (6.1 kB) +2025-02-13T20:03:20.5518307Z Requirement already satisfied: traitlets in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (5.14.3) +2025-02-13T20:03:20.5596905Z Requirement already satisfied: jinja2>=3.0.3 in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (3.1.5) +2025-02-13T20:03:20.5620705Z Requirement already satisfied: packaging in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (24.2) +2025-02-13T20:03:20.5631154Z Requirement already satisfied: jupyter-core in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (5.7.2) +2025-02-13T20:03:20.6217457Z Collecting notebook-shim>=0.2 +2025-02-13T20:03:20.6294592Z Downloading notebook_shim-0.2.4-py3-none-any.whl (13 kB) +2025-02-13T20:03:20.7808919Z Collecting jupyterlab-server<3,>=2.27.1 +2025-02-13T20:03:20.7887221Z Downloading jupyterlab_server-2.27.3-py3-none-any.whl (59 kB) +2025-02-13T20:03:20.9357562Z Collecting httpx>=0.25.0 +2025-02-13T20:03:20.9433300Z Downloading httpx-0.28.1-py3-none-any.whl (73 kB) +2025-02-13T20:03:20.9914643Z Requirement already satisfied: importlib-metadata>=4.8.3; python_version < "3.10" in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (8.5.0) +2025-02-13T20:03:21.0104202Z Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (4.55.8) +2025-02-13T20:03:21.0473723Z Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (2.9.0.post0) +2025-02-13T20:03:21.0497068Z Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (1.4.7) +2025-02-13T20:03:21.0511542Z Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (0.12.1) +2025-02-13T20:03:21.0570928Z Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (3.1.4) +2025-02-13T20:03:21.0597425Z Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (1.1.1) +2025-02-13T20:03:21.1358008Z Collecting xyzservices>=2021.09.1 +2025-02-13T20:03:21.1437828Z Downloading 
xyzservices-2025.1.0-py3-none-any.whl (88 kB) +2025-02-13T20:03:21.1834855Z Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.8/dist-packages (from pandas==2.0.3->ttnn==0.0.dev1+any) (2025.1) +2025-02-13T20:03:21.1847003Z Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.8/dist-packages (from pandas==2.0.3->ttnn==0.0.dev1+any) (2025.1) +2025-02-13T20:03:21.2983621Z Collecting widgetsnbextension~=4.0.9 +2025-02-13T20:03:21.3060453Z Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB) +2025-02-13T20:03:21.4646341Z Collecting comm>=0.1.3 +2025-02-13T20:03:21.4722356Z Downloading comm-0.2.2-py3-none-any.whl (7.2 kB) +2025-02-13T20:03:21.5067132Z Requirement already satisfied: ipython>=6.1.0 in /usr/local/lib/python3.8/dist-packages (from ipywidgets==8.1.1->ttnn==0.0.dev1+any) (8.12.3) +2025-02-13T20:03:21.6729613Z Collecting jupyterlab-widgets~=3.0.9 +2025-02-13T20:03:21.6809447Z Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB) +2025-02-13T20:03:21.8358268Z Collecting Werkzeug<3.1 +2025-02-13T20:03:21.8437138Z Downloading werkzeug-3.0.6-py3-none-any.whl (227 kB) +2025-02-13T20:03:21.9803078Z Collecting dash-core-components==2.0.0 +2025-02-13T20:03:21.9884280Z Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB) +2025-02-13T20:03:22.0763462Z Collecting dash-html-components==2.0.0 +2025-02-13T20:03:22.0876207Z Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB) +2025-02-13T20:03:22.1910552Z Collecting nest-asyncio +2025-02-13T20:03:22.1990080Z Downloading nest_asyncio-1.6.0-py3-none-any.whl (5.2 kB) +2025-02-13T20:03:22.2308551Z Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from dash==2.15.0->ttnn==0.0.dev1+any) (2.32.3) +2025-02-13T20:03:22.2815324Z Collecting retrying +2025-02-13T20:03:22.2897646Z Downloading retrying-1.3.4-py3-none-any.whl (11 kB) +2025-02-13T20:03:22.4008934Z Collecting Flask<3.1,>=1.0.4 +2025-02-13T20:03:22.4090661Z Downloading flask-3.0.3-py3-none-any.whl (101 kB) +2025-02-13T20:03:22.5047431Z Collecting dash-table==5.0.0 +2025-02-13T20:03:22.5124228Z Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB) +2025-02-13T20:03:22.5453224Z Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.8/dist-packages (from dash==2.15.0->ttnn==0.0.dev1+any) (4.12.2) +2025-02-13T20:03:22.6180648Z Collecting tenacity>=6.2.0 +2025-02-13T20:03:22.6273937Z Downloading tenacity-9.0.0-py3-none-any.whl (28 kB) +2025-02-13T20:03:22.6661648Z Requirement already satisfied: sympy in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1+cpu->ttnn==0.0.dev1+any) (1.13.3) +2025-02-13T20:03:22.6702519Z Requirement already satisfied: fsspec in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1+cpu->ttnn==0.0.dev1+any) (2023.9.2) +2025-02-13T20:03:22.7090347Z Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1+cpu->ttnn==0.0.dev1+any) (3.16.1) +2025-02-13T20:03:22.7723483Z Collecting jupyter-server-terminals>=0.4.4 +2025-02-13T20:03:22.7803385Z Downloading jupyter_server_terminals-0.5.3-py3-none-any.whl (13 kB) +2025-02-13T20:03:22.9650326Z Collecting argon2-cffi>=21.1 +2025-02-13T20:03:22.9734148Z Downloading argon2_cffi-23.1.0-py3-none-any.whl (15 kB) +2025-02-13T20:03:23.1419209Z Collecting overrides>=5.0 +2025-02-13T20:03:23.1511512Z Downloading overrides-7.7.0-py3-none-any.whl (17 kB) +2025-02-13T20:03:23.1951804Z Requirement already satisfied: nbformat>=5.3.0 in 
/usr/local/lib/python3.8/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (5.10.4) +2025-02-13T20:03:23.3325192Z Collecting websocket-client>=1.7 +2025-02-13T20:03:23.3407722Z Downloading websocket_client-1.8.0-py3-none-any.whl (58 kB) +2025-02-13T20:03:23.3937787Z Requirement already satisfied: nbconvert>=6.4.4 in /usr/local/lib/python3.8/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (7.16.6) +2025-02-13T20:03:23.5169833Z Collecting terminado>=0.8.3 +2025-02-13T20:03:23.5252553Z Downloading terminado-0.18.1-py3-none-any.whl (14 kB) +2025-02-13T20:03:23.6395186Z Collecting send2trash>=1.8.2 +2025-02-13T20:03:23.6475171Z Downloading Send2Trash-1.8.3-py3-none-any.whl (18 kB) +2025-02-13T20:03:23.7990080Z Collecting anyio>=3.1.0 +2025-02-13T20:03:23.8071339Z Downloading anyio-4.5.2-py3-none-any.whl (89 kB) +2025-02-13T20:03:23.9432946Z Collecting jupyter-events>=0.9.0 +2025-02-13T20:03:23.9516860Z Downloading jupyter_events-0.10.0-py3-none-any.whl (18 kB) +2025-02-13T20:03:24.0112934Z Requirement already satisfied: pyzmq>=24 in /usr/local/lib/python3.8/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (26.2.1) +2025-02-13T20:03:24.0930503Z Collecting prometheus-client>=0.9 +2025-02-13T20:03:24.1011659Z Downloading prometheus_client-0.21.1-py3-none-any.whl (54 kB) +2025-02-13T20:03:24.1494879Z Requirement already satisfied: jupyter-client>=7.4.4 in /usr/local/lib/python3.8/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (8.6.3) +2025-02-13T20:03:24.1724886Z Requirement already satisfied: zipp>=3.1.0; python_version < "3.10" in /usr/local/lib/python3.8/dist-packages (from importlib-resources>=1.4; python_version < "3.9"->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (3.20.2) +2025-02-13T20:03:24.8199643Z Collecting debugpy>=1.6.5 +2025-02-13T20:03:24.8288735Z Downloading debugpy-1.8.12-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB) +2025-02-13T20:03:25.0557693Z Requirement already satisfied: matplotlib-inline>=0.1 in /usr/local/lib/python3.8/dist-packages (from ipykernel>=6.5.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.1.7) +2025-02-13T20:03:25.0579532Z Requirement already satisfied: psutil in /usr/local/lib/python3.8/dist-packages (from ipykernel>=6.5.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (6.1.1) +2025-02-13T20:03:25.0788236Z Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.8/dist-packages (from jinja2>=3.0.3->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.1.5) +2025-02-13T20:03:25.0804185Z Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.8/dist-packages (from jupyter-core->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (3.11.0) +2025-02-13T20:03:25.1827230Z Collecting json5>=0.9.0 +2025-02-13T20:03:25.1914782Z Downloading json5-0.10.0-py3-none-any.whl (34 kB) +2025-02-13T20:03:25.2459466Z Requirement already satisfied: jsonschema>=4.18.0 in /usr/local/lib/python3.8/dist-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (4.23.0) +2025-02-13T20:03:25.2686110Z Requirement already satisfied: babel>=2.10 in /usr/local/lib/python3.8/dist-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.17.0) +2025-02-13T20:03:25.3784774Z Collecting httpcore==1.* +2025-02-13T20:03:25.3864489Z Downloading httpcore-1.0.7-py3-none-any.whl (78 kB) +2025-02-13T20:03:25.4419548Z Requirement already satisfied: idna 
in /usr/lib/python3/dist-packages (from httpx>=0.25.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.8) +2025-02-13T20:03:25.4436539Z Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from httpx>=0.25.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2019.11.28) +2025-02-13T20:03:25.4448968Z Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7->matplotlib==3.7.1->ttnn==0.0.dev1+any) (1.14.0) +2025-02-13T20:03:25.4459756Z Requirement already satisfied: backcall in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.2.0) +2025-02-13T20:03:25.4472911Z Requirement already satisfied: stack-data in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.6.3) +2025-02-13T20:03:25.4551594Z Requirement already satisfied: decorator in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (5.1.1) +2025-02-13T20:03:25.4568943Z Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.19.2) +2025-02-13T20:03:25.4908228Z Requirement already satisfied: pexpect>4.3; sys_platform != "win32" in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (4.9.0) +2025-02-13T20:03:25.4935721Z Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (3.0.50) +2025-02-13T20:03:25.4954629Z Requirement already satisfied: pickleshare in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.7.5) +2025-02-13T20:03:25.4979523Z Requirement already satisfied: pygments>=2.4.0 in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (2.19.1) +2025-02-13T20:03:25.5007195Z Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.8/dist-packages (from requests->dash==2.15.0->ttnn==0.0.dev1+any) (3.4.1) +2025-02-13T20:03:25.5038467Z Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->dash==2.15.0->ttnn==0.0.dev1+any) (2.2.3) +2025-02-13T20:03:25.5741629Z Collecting itsdangerous>=2.1.2 +2025-02-13T20:03:25.5820847Z Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB) +2025-02-13T20:03:25.6844234Z Collecting blinker>=1.6.2 +2025-02-13T20:03:25.6923986Z Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB) +2025-02-13T20:03:25.7330578Z Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.8/dist-packages (from sympy->torch==2.2.1+cpu->ttnn==0.0.dev1+any) (1.3.0) +2025-02-13T20:03:25.8076170Z Collecting argon2-cffi-bindings +2025-02-13T20:03:25.8161575Z Downloading argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (86 kB) +2025-02-13T20:03:25.8649468Z Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.8/dist-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.21.1) +2025-02-13T20:03:25.8733753Z Requirement already satisfied: bleach[css]!=5.0.0 in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (6.1.0) +2025-02-13T20:03:25.8805109Z Requirement already satisfied: pandocfilters>=1.4.1 in 
/usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.5.1) +2025-02-13T20:03:25.8825003Z Requirement already satisfied: defusedxml in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.7.1) +2025-02-13T20:03:25.8855761Z Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (4.13.3) +2025-02-13T20:03:25.8928909Z Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.10.1) +2025-02-13T20:03:25.9213675Z Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.3.0) +2025-02-13T20:03:25.9230081Z Requirement already satisfied: mistune<4,>=2.0.3 in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (3.1.1) +2025-02-13T20:03:25.9266376Z Requirement already satisfied: ptyprocess; os_name != "nt" in /usr/local/lib/python3.8/dist-packages (from terminado>=0.8.3->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.7.0) +2025-02-13T20:03:25.9823727Z Collecting sniffio>=1.1 +2025-02-13T20:03:25.9921885Z Downloading sniffio-1.3.1-py3-none-any.whl (10 kB) +2025-02-13T20:03:26.0328479Z Requirement already satisfied: exceptiongroup>=1.0.2; python_version < "3.11" in /usr/local/lib/python3.8/dist-packages (from anyio>=3.1.0->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.2.2) +2025-02-13T20:03:26.0352085Z Requirement already satisfied: referencing in /usr/local/lib/python3.8/dist-packages (from jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.35.1) +2025-02-13T20:03:26.0940876Z Collecting rfc3339-validator +2025-02-13T20:03:26.1025902Z Downloading rfc3339_validator-0.1.4-py2.py3-none-any.whl (3.5 kB) +2025-02-13T20:03:26.2129699Z Collecting python-json-logger>=2.0.4 +2025-02-13T20:03:26.2211888Z Downloading python_json_logger-3.2.1-py3-none-any.whl (14 kB) +2025-02-13T20:03:26.3380191Z Collecting rfc3986-validator>=0.1.1 +2025-02-13T20:03:26.3464013Z Downloading rfc3986_validator-0.1.1-py2.py3-none-any.whl (4.2 kB) +2025-02-13T20:03:26.3870278Z Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2023.12.1) +2025-02-13T20:03:26.3914619Z Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.20.1) +2025-02-13T20:03:26.3936291Z Requirement already satisfied: pkgutil-resolve-name>=1.3.10; python_version < "3.9" in /usr/local/lib/python3.8/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.3.10) +2025-02-13T20:03:26.3950407Z Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (25.1.0) +2025-02-13T20:03:26.5547932Z Collecting h11<0.15,>=0.13 +2025-02-13T20:03:26.5631057Z 
Downloading h11-0.14.0-py3-none-any.whl (58 kB) +2025-02-13T20:03:26.6119082Z Requirement already satisfied: executing>=1.2.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (2.2.0) +2025-02-13T20:03:26.6199335Z Requirement already satisfied: asttokens>=2.1.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (3.0.0) +2025-02-13T20:03:26.6261162Z Requirement already satisfied: pure-eval in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.2.3) +2025-02-13T20:03:26.6288932Z Requirement already satisfied: parso<0.9.0,>=0.8.4 in /usr/local/lib/python3.8/dist-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.8.4) +2025-02-13T20:03:26.6348215Z Requirement already satisfied: wcwidth in /usr/local/lib/python3.8/dist-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.2.13) +2025-02-13T20:03:26.6380786Z Requirement already satisfied: cffi>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.17.1) +2025-02-13T20:03:26.6397488Z Requirement already satisfied: webencodings in /usr/local/lib/python3.8/dist-packages (from bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.5.1) +2025-02-13T20:03:26.6420836Z Requirement already satisfied: tinycss2<1.3,>=1.1.0; extra == "css" in /usr/local/lib/python3.8/dist-packages (from bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.2.1) +2025-02-13T20:03:26.6484655Z Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.8/dist-packages (from beautifulsoup4->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.6) +2025-02-13T20:03:26.6498120Z Requirement already satisfied: pycparser in /usr/local/lib/python3.8/dist-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.22) +2025-02-13T20:03:29.0511456Z Installing collected packages: Pillow, matplotlib, seaborn, terminado, jupyter-server-terminals, argon2-cffi-bindings, argon2-cffi, overrides, websocket-client, send2trash, sniffio, anyio, rfc3339-validator, python-json-logger, pyyaml, rfc3986-validator, jupyter-events, prometheus-client, jupyter-server, jupyter-lsp, comm, debugpy, nest-asyncio, ipykernel, async-lru, notebook-shim, json5, jupyterlab-server, h11, httpcore, httpx, jupyterlab, click, xyzservices, bokeh, toolz, graphviz, widgetsnbextension, jupyterlab-widgets, ipywidgets, Werkzeug, dash-core-components, tenacity, plotly, dash-html-components, retrying, itsdangerous, blinker, Flask, dash-table, dash, loguru, ttnn +2025-02-13T20:03:31.2249579Z WARNING: The script wsdump is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:31.2251269Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:31.2406116Z WARNING: The script send2trash is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. 
+2025-02-13T20:03:31.2408245Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:31.4176315Z WARNING: The script jupyter-events is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:31.4177984Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:31.6123682Z WARNING: The script jupyter-server is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:31.6125392Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:32.3894266Z WARNING: The script debugpy is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:32.3895914Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:32.5210326Z WARNING: The script pyjson5 is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:32.5211980Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:32.7050908Z WARNING: The script httpx is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:32.7052554Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:33.2679064Z WARNING: The scripts jlpm, jupyter-lab, jupyter-labextension and jupyter-labhub are installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:33.2681239Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:34.2256982Z WARNING: The script bokeh is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:34.2258609Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:45.1265238Z WARNING: The script flask is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:45.1266503Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:03:45.6235201Z WARNING: The scripts dash-generate-components, dash-update-components and renderer are installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:03:45.6237203Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. 
+2025-02-13T20:03:45.6764751Z Running setup.py develop for ttnn
+2025-02-13T20:03:48.8645764Z Successfully installed Flask-3.0.3 Pillow-10.3.0 Werkzeug-3.0.6 anyio-4.5.2 argon2-cffi-23.1.0 argon2-cffi-bindings-21.2.0 async-lru-2.0.4 blinker-1.8.2 bokeh-3.1.1 click-8.1.7 comm-0.2.2 dash-2.15.0 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 debugpy-1.8.12 graphviz-0.20.3 h11-0.14.0 httpcore-1.0.7 httpx-0.28.1 ipykernel-6.29.5 ipywidgets-8.1.1 itsdangerous-2.2.0 json5-0.10.0 jupyter-events-0.10.0 jupyter-lsp-2.2.5 jupyter-server-2.14.2 jupyter-server-terminals-0.5.3 jupyterlab-4.2.5 jupyterlab-server-2.27.3 jupyterlab-widgets-3.0.13 loguru-0.6.0 matplotlib-3.7.1 nest-asyncio-1.6.0 notebook-shim-0.2.4 overrides-7.7.0 plotly-5.18.0 prometheus-client-0.21.1 python-json-logger-3.2.1 pyyaml-6.0.2 retrying-1.3.4 rfc3339-validator-0.1.4 rfc3986-validator-0.1.1 seaborn-0.13.2 send2trash-1.8.3 sniffio-1.3.1 tenacity-9.0.0 terminado-0.18.1 toolz-0.12.0 ttnn websocket-client-1.8.0 widgetsnbextension-4.0.13 xyzservices-2025.1.0
+2025-02-13T20:03:49.6315968Z Running main() from gmock_main.cc
+2025-02-13T20:03:49.6316567Z [==========] Running 166 tests from 14 test suites.
+2025-02-13T20:03:49.6317197Z [----------] Global test environment set-up.
+2025-02-13T20:03:49.6317782Z [----------] 12 tests from Host
+2025-02-13T20:03:49.6318359Z [ RUN ] Host.TestTilizeAndThenUntilizeBfloat16
+2025-02-13T20:03:49.9101996Z [ OK ] Host.TestTilizeAndThenUntilizeBfloat16 (278 ms)
+2025-02-13T20:03:49.9102849Z [ RUN ] Host.TestTilizeThrowErrorForNonBfloat16DataType
+2025-02-13T20:03:49.9104309Z Always | FATAL | Invalid type passed into tilize
+2025-02-13T20:03:49.9106900Z [ OK ] Host.TestTilizeThrowErrorForNonBfloat16DataType (0 ms)
+2025-02-13T20:03:49.9107760Z [ RUN ] Host.TestTilizeThrowErrorForInvalidTileMandN
+2025-02-13T20:03:49.9109263Z Always | FATAL | m and n must be divisible by 32
+2025-02-13T20:03:49.9110560Z Always | FATAL | None of the input size, m, nor n can be 0
+2025-02-13T20:03:49.9112186Z Always | FATAL | None of the input size, m, nor n can be 0
+2025-02-13T20:03:49.9113674Z Always | FATAL | None of the input size, m, nor n can be 0
+2025-02-13T20:03:49.9114845Z [ OK ] Host.TestTilizeThrowErrorForInvalidTileMandN (0 ms)
+2025-02-13T20:03:49.9115670Z [ RUN ] Host.TestTilizeThrowErrorForInvalidVectorShape
+2025-02-13T20:03:49.9116827Z Always | FATAL | Input size must be divisible by m and n
+2025-02-13T20:03:49.9118155Z Always | FATAL | None of the input size, m, nor n can be 0
+2025-02-13T20:03:49.9119172Z [ OK ] Host.TestTilizeThrowErrorForInvalidVectorShape (0 ms)
+2025-02-13T20:03:49.9120205Z [ RUN ] Host.TestUntilizeThrowErrorForNonBfloat16DataType
+2025-02-13T20:03:49.9121444Z Always | FATAL | Invalid type passed into untilize
+2025-02-13T20:03:49.9122436Z [ OK ] Host.TestUntilizeThrowErrorForNonBfloat16DataType (0 ms)
+2025-02-13T20:03:49.9123308Z [ RUN ] Host.TestUntilizeThrowErrorForInvalidTileMandN
+2025-02-13T20:03:49.9124395Z Always | FATAL | m and n must be divisible by 32
+2025-02-13T20:03:49.9125669Z Always | FATAL | None of the input size, m, nor n can be 0
+2025-02-13T20:03:49.9127337Z Always | FATAL | None of the input size, m, nor n can be 0
+2025-02-13T20:03:49.9128649Z Always | FATAL | None of the input size, m, nor n can be 0
+2025-02-13T20:03:49.9129649Z [ OK ] Host.TestUntilizeThrowErrorForInvalidTileMandN (0 ms)
+2025-02-13T20:03:49.9130513Z [ RUN ] Host.TestUntilizeThrowErrorForInvalidVectorShape
+2025-02-13T20:03:49.9131662Z Always | FATAL | Input size must be divisible by m and n
+2025-02-13T20:03:49.9132981Z Always | FATAL | None of the input size, m, nor n can be 0
+2025-02-13T20:03:49.9134000Z [ OK ] Host.TestUntilizeThrowErrorForInvalidVectorShape (0 ms)
+2025-02-13T20:03:49.9134801Z [ RUN ] Host.TestUntilizeAndThenTilizeBfloat16
+2025-02-13T20:03:50.1758438Z [ OK ] Host.TestUntilizeAndThenTilizeBfloat16 (263 ms)
+2025-02-13T20:03:50.1759123Z [ RUN ] Host.ExtractBitArray
+2025-02-13T20:03:50.1759626Z [ OK ] Host.ExtractBitArray (0 ms)
+2025-02-13T20:03:50.1760626Z [ RUN ] Host.PackBitArray
+2025-02-13T20:03:50.1761537Z [ OK ] Host.PackBitArray (0 ms)
+2025-02-13T20:03:50.1761979Z [ RUN ] Host.PackExtractBitArray
+2025-02-13T20:03:50.1762418Z [ OK ] Host.PackExtractBitArray (0 ms)
+2025-02-13T20:03:50.1762872Z [ RUN ] Host.ExtractPackBitArray
+2025-02-13T20:03:50.1763310Z [ OK ] Host.ExtractPackBitArray (0 ms)
+2025-02-13T20:03:50.1763770Z [----------] 12 tests from Host (544 ms total)
+2025-02-13T20:03:50.1764067Z
+2025-02-13T20:03:50.1764231Z [----------] 6 tests from WorkerConfigBuffer
+2025-02-13T20:03:50.1764723Z [ RUN ] WorkerConfigBuffer.MarkCompletelyFull
+2025-02-13T20:03:50.1765390Z [ OK ] WorkerConfigBuffer.MarkCompletelyFull (0 ms)
+2025-02-13T20:03:50.1765926Z [ RUN ] WorkerConfigBuffer.SmallSize
+2025-02-13T20:03:50.1766427Z [ OK ] WorkerConfigBuffer.SmallSize (0 ms)
+2025-02-13T20:03:50.1766890Z [ RUN ] WorkerConfigBuffer.SizeOne
+2025-02-13T20:03:50.1767355Z [ OK ] WorkerConfigBuffer.SizeOne (0 ms)
+2025-02-13T20:03:50.1767841Z [ RUN ] WorkerConfigBuffer.LoopAround
+2025-02-13T20:03:50.1768324Z [ OK ] WorkerConfigBuffer.LoopAround (0 ms)
+2025-02-13T20:03:50.1768813Z [ RUN ] WorkerConfigBuffer.Randomized
+2025-02-13T20:03:50.1769792Z Test | INFO | Using seed: 1739477030
+2025-02-13T20:03:50.1770392Z [ OK ] WorkerConfigBuffer.Randomized (0 ms)
+2025-02-13T20:03:50.1770868Z [ RUN ] WorkerConfigBuffer.VeryBasic
+2025-02-13T20:03:50.1771334Z [ OK ] WorkerConfigBuffer.VeryBasic (0 ms)
+2025-02-13T20:03:50.1771836Z [----------] 6 tests from WorkerConfigBuffer (0 ms total)
+2025-02-13T20:03:50.1772178Z
+2025-02-13T20:03:50.1772309Z [----------] 3 tests from NOC
+2025-02-13T20:03:50.1772750Z [ RUN ] NOC.TensixSingleDeviceHarvestingPrints
+2025-02-13T20:03:50.1831753Z
+2025-02-13T20:03:50.1974398Z Device | INFO | Opening user mode device driver
+2025-02-13T20:03:50.2009646Z 2025-02-13 20:03:50.200 | INFO | SiliconDriver - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled
+2025-02-13T20:03:50.2020132Z 2025-02-13 20:03:50.201 | INFO | SiliconDriver - Detected PCI devices: [0]
+2025-02-13T20:03:50.2021461Z 2025-02-13 20:03:50.201 | INFO | SiliconDriver - Using local chip ids: {0} and remote chip ids {}
+2025-02-13T20:03:50.2170760Z 2025-02-13 20:03:50.216 | WARNING | SiliconDriver - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0)
+2025-02-13T20:03:50.2173111Z 2025-02-13 20:03:50.216 | WARNING | SiliconDriver - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0)
+2025-02-13T20:03:50.2183116Z 2025-02-13 20:03:50.217 | WARNING | SiliconDriver - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind.
+2025-02-13T20:03:50.2188540Z 2025-02-13 20:03:50.217 | WARNING | SiliconDriver - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893).
+2025-02-13T20:03:50.2263141Z Metal | INFO | Initializing device 0. Program cache is NOT enabled
+2025-02-13T20:03:50.2267768Z BuildKernels | INFO | Skipping deleting built cache
+2025-02-13T20:03:50.2325328Z Metal | INFO | AI CLK for device 0 is: 1202 MHz
+2025-02-13T20:03:50.6921899Z Always | INFO | Harvesting Disabled in SW
+2025-02-13T20:03:50.6923220Z Always | INFO | Logical -- Virtual Mapping
+2025-02-13T20:03:50.6924468Z Always | INFO | [Logical <-> Virtual] Coordinates
+2025-02-13T20:03:50.6927329Z Always | INFO | {L[x0-y0]:V[x1-y1]}, {L[x1-y0]:V[x2-y1]}, {L[x2-y0]:V[x3-y1]}, {L[x3-y0]:V[x4-y1]}, {L[x4-y0]:V[x5-y1]}, {L[x5-y0]:V[x6-y1]}, {L[x6-y0]:V[x7-y1]}, {L[x7-y0]:V[x8-y1]}, {L[x8-y0]:V[x9-y1]}, {L[x9-y0]:V[x10-y1]}, {L[x10-y0]:V[x11-y1]}, {L[x11-y0]:V[x12-y1]},
+2025-02-13T20:03:50.6930331Z Always | INFO | {L[x0-y1]:V[x1-y2]}, {L[x1-y1]:V[x2-y2]}, {L[x2-y1]:V[x3-y2]}, {L[x3-y1]:V[x4-y2]}, {L[x4-y1]:V[x5-y2]}, {L[x5-y1]:V[x6-y2]}, {L[x6-y1]:V[x7-y2]}, {L[x7-y1]:V[x8-y2]}, {L[x8-y1]:V[x9-y2]}, {L[x9-y1]:V[x10-y2]}, {L[x10-y1]:V[x11-y2]}, {L[x11-y1]:V[x12-y2]},
+2025-02-13T20:03:50.6933099Z Always | INFO | {L[x0-y2]:V[x1-y3]}, {L[x1-y2]:V[x2-y3]}, {L[x2-y2]:V[x3-y3]}, {L[x3-y2]:V[x4-y3]}, {L[x4-y2]:V[x5-y3]}, {L[x5-y2]:V[x6-y3]}, {L[x6-y2]:V[x7-y3]}, {L[x7-y2]:V[x8-y3]}, {L[x8-y2]:V[x9-y3]}, {L[x9-y2]:V[x10-y3]}, {L[x10-y2]:V[x11-y3]}, {L[x11-y2]:V[x12-y3]},
+2025-02-13T20:03:50.6935674Z Always | INFO | {L[x0-y3]:V[x1-y4]}, {L[x1-y3]:V[x2-y4]}, {L[x2-y3]:V[x3-y4]}, {L[x3-y3]:V[x4-y4]}, {L[x4-y3]:V[x5-y4]}, {L[x5-y3]:V[x6-y4]}, {L[x6-y3]:V[x7-y4]}, {L[x7-y3]:V[x8-y4]}, {L[x8-y3]:V[x9-y4]}, {L[x9-y3]:V[x10-y4]}, {L[x10-y3]:V[x11-y4]}, {L[x11-y3]:V[x12-y4]},
+2025-02-13T20:03:50.6938258Z Always | INFO | {L[x0-y4]:V[x1-y5]}, {L[x1-y4]:V[x2-y5]}, {L[x2-y4]:V[x3-y5]}, {L[x3-y4]:V[x4-y5]}, {L[x4-y4]:V[x5-y5]}, {L[x5-y4]:V[x6-y5]}, {L[x6-y4]:V[x7-y5]}, {L[x7-y4]:V[x8-y5]}, {L[x8-y4]:V[x9-y5]}, {L[x9-y4]:V[x10-y5]}, {L[x10-y4]:V[x11-y5]}, {L[x11-y4]:V[x12-y5]},
+2025-02-13T20:03:50.6940829Z Always | INFO | {L[x0-y5]:V[x1-y7]}, {L[x1-y5]:V[x2-y7]}, {L[x2-y5]:V[x3-y7]}, {L[x3-y5]:V[x4-y7]}, {L[x4-y5]:V[x5-y7]}, {L[x5-y5]:V[x6-y7]}, {L[x6-y5]:V[x7-y7]}, {L[x7-y5]:V[x8-y7]}, {L[x8-y5]:V[x9-y7]}, {L[x9-y5]:V[x10-y7]}, {L[x10-y5]:V[x11-y7]}, {L[x11-y5]:V[x12-y7]},
+2025-02-13T20:03:50.6943406Z Always | INFO | {L[x0-y6]:V[x1-y8]}, {L[x1-y6]:V[x2-y8]}, {L[x2-y6]:V[x3-y8]}, {L[x3-y6]:V[x4-y8]}, {L[x4-y6]:V[x5-y8]}, {L[x5-y6]:V[x6-y8]}, {L[x6-y6]:V[x7-y8]}, {L[x7-y6]:V[x8-y8]}, {L[x8-y6]:V[x9-y8]}, {L[x9-y6]:V[x10-y8]}, {L[x10-y6]:V[x11-y8]}, {L[x11-y6]:V[x12-y8]},
+2025-02-13T20:03:50.6946459Z Always | INFO | {L[x0-y7]:V[x1-y9]}, {L[x1-y7]:V[x2-y9]}, {L[x2-y7]:V[x3-y9]}, {L[x3-y7]:V[x4-y9]}, {L[x4-y7]:V[x5-y9]}, {L[x5-y7]:V[x6-y9]}, {L[x6-y7]:V[x7-y9]}, {L[x7-y7]:V[x8-y9]}, {L[x8-y7]:V[x9-y9]}, {L[x9-y7]:V[x10-y9]}, {L[x10-y7]:V[x11-y9]}, {L[x11-y7]:V[x12-y9]},
+2025-02-13T20:03:50.6949461Z Always | INFO | {L[x0-y8]:V[x1-y10]}, {L[x1-y8]:V[x2-y10]}, {L[x2-y8]:V[x3-y10]}, {L[x3-y8]:V[x4-y10]}, {L[x4-y8]:V[x5-y10]}, {L[x5-y8]:V[x6-y10]}, {L[x6-y8]:V[x7-y10]}, {L[x7-y8]:V[x8-y10]}, {L[x8-y8]:V[x9-y10]}, {L[x9-y8]:V[x10-y10]}, {L[x10-y8]:V[x11-y10]}, {L[x11-y8]:V[x12-y10]},
+2025-02-13T20:03:50.6952498Z Always | INFO | {L[x0-y9]:V[x1-y11]}, {L[x1-y9]:V[x2-y11]}, {L[x2-y9]:V[x3-y11]}, {L[x3-y9]:V[x4-y11]}, {L[x4-y9]:V[x5-y11]}, {L[x5-y9]:V[x6-y11]},
{L[x6-y9]:V[x7-y11]}, {L[x7-y9]:V[x8-y11]}, {L[x8-y9]:V[x9-y11]}, {L[x9-y9]:V[x10-y11]}, {L[x10-y9]:V[x11-y11]}, {L[x11-y9]:V[x12-y11]}, +2025-02-13T20:03:50.6954286Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:50.6955555Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:50.6956554Z [ OK ] NOC.TensixSingleDeviceHarvestingPrints (516 ms) +2025-02-13T20:03:50.6957245Z [ RUN ] NOC.TensixVerifyNocNodeIDs +2025-02-13T20:03:50.6958367Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:50.6959891Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:50.7710270Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:50.7720221Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:50.7722303Z [ OK ] NOC.TensixVerifyNocNodeIDs (79 ms) +2025-02-13T20:03:50.7723008Z [ RUN ] NOC.TensixVerifyNocIdentityTranslationTable +2025-02-13T20:03:50.7723789Z /work/tests/tt_metal/tt_metal/api/test_noc.cpp:143: Skipped +2025-02-13T20:03:50.7724835Z +2025-02-13T20:03:50.7725205Z [ SKIPPED ] NOC.TensixVerifyNocIdentityTranslationTable (0 ms) +2025-02-13T20:03:50.7725961Z [----------] 3 tests from NOC (596 ms total) +2025-02-13T20:03:50.7726347Z +2025-02-13T20:03:50.7726544Z [----------] 67 tests from DeviceFixture +2025-02-13T20:03:50.7727224Z [ RUN ] DeviceFixture.TensixDirectedStreamRegWriteRead +2025-02-13T20:03:50.7728488Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:50.7758594Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:51.1240635Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:51.1249778Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:51.1250849Z [ OK ] DeviceFixture.TensixDirectedStreamRegWriteRead (352 ms) +2025-02-13T20:03:51.1251793Z [ RUN ] DeviceFixture.TensixLegallyModifyRTArgsDataMovement +2025-02-13T20:03:51.1253071Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:51.1294176Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:51.6798673Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:51.6807874Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:51.6809006Z [ OK ] DeviceFixture.TensixLegallyModifyRTArgsDataMovement (555 ms) +2025-02-13T20:03:51.6809929Z [ RUN ] DeviceFixture.TensixLegallyModifyRTArgsCompute +2025-02-13T20:03:51.6811120Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:51.6850170Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:52.1616779Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:52.1623248Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:52.1624747Z [ OK ] DeviceFixture.TensixLegallyModifyRTArgsCompute (481 ms) +2025-02-13T20:03:52.1625500Z [ RUN ] DeviceFixture.TensixSetRuntimeArgsSubsetOfCoresCompute +2025-02-13T20:03:52.1629600Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:52.1630632Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:52.1793786Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:52.1805220Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:52.1806314Z [ OK ] DeviceFixture.TensixSetRuntimeArgsSubsetOfCoresCompute (18 ms) +2025-02-13T20:03:52.1807247Z [ RUN ] DeviceFixture.TensixSetRuntimeArgsUniqueValuesCompute +2025-02-13T20:03:52.1810245Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:52.1811256Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:52.2029032Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:52.2039008Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:52.2040144Z [ OK ] DeviceFixture.TensixSetRuntimeArgsUniqueValuesCompute (23 ms) +2025-02-13T20:03:52.2041044Z [ RUN ] DeviceFixture.TensixSetRuntimeArgsVaryingLengthPerCore +2025-02-13T20:03:52.2044124Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:52.2102542Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:52.6234904Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:52.6240603Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:52.6241575Z [ OK ] DeviceFixture.TensixSetRuntimeArgsVaryingLengthPerCore (420 ms) +2025-02-13T20:03:52.6242356Z [ RUN ] DeviceFixture.TensixIllegalTooManyRuntimeArgs +2025-02-13T20:03:52.6246243Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:52.6249905Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:52.6306492Z  Metal | WARNING  | Too many runtime args, unique: 100 common: 300 on COMPUTE +2025-02-13T20:03:52.6308303Z  Always | FATAL  | 400 unique+common runtime args targeting kernel increment_runtime_arg on (x=1,y=1) are too large. Max allowable is 256 +2025-02-13T20:03:52.6313511Z  Always | FATAL  | Illegal Runtime Args on (x=1,y=1): Number of runtime args cannot be modified from 100 to 300! +2025-02-13T20:03:52.6317113Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:52.6325561Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:52.6326408Z [ OK ] DeviceFixture.TensixIllegalTooManyRuntimeArgs (8 ms) +2025-02-13T20:03:52.6327040Z [ RUN ] DeviceFixture.TensixIllegallyModifyRTArgs +2025-02-13T20:03:52.6328271Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:52.6345086Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:52.6490002Z  Always | FATAL  | Illegal Runtime Args on (x=0,y=0): Number of runtime args cannot be modified from 2 to 3! +2025-02-13T20:03:52.6495170Z  Always | FATAL  | Illegal Common Runtime Args: Can only set common runtime args once. Get and modify args in place instead. +2025-02-13T20:03:52.6496399Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:52.6505758Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:52.6506618Z [ OK ] DeviceFixture.TensixIllegallyModifyRTArgs (18 ms) +2025-02-13T20:03:52.6507281Z [ RUN ] DeviceFixture.TensixInitializeLegalSemaphores +2025-02-13T20:03:52.6508256Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:52.6546487Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:52.6606687Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. 
This will hurt dispatch performance. Non-contiguous indices: 16. First unused index: 1. Kernels: reader_unary_push_4 +2025-02-13T20:03:53.1915645Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 16. First unused index: 1. Kernels: writer_unary, reader_unary_push_4, eltwise_copy_3m +2025-02-13T20:03:53.2100181Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.2111337Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.2112436Z [ OK ] DeviceFixture.TensixInitializeLegalSemaphores (560 ms) +2025-02-13T20:03:53.2114075Z [ RUN ] DeviceFixture.TensixInitializeIllegalSemaphores +2025-02-13T20:03:53.2115342Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.2117413Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.2367448Z  Always | FATAL  | Cannot add semaphore on core (x=0,y=0). Max number of semaphores (8) reached! +2025-02-13T20:03:53.2369838Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.2380663Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.2381750Z [ OK ] DeviceFixture.TensixInitializeIllegalSemaphores (26 ms) +2025-02-13T20:03:53.2383407Z [ RUN ] DeviceFixture.TensixCreateMultipleSemaphoresOnSameCore +2025-02-13T20:03:53.2384737Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.2405130Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.2463420Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.2474138Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.2475314Z [ OK ] DeviceFixture.TensixCreateMultipleSemaphoresOnSameCore (9 ms) +2025-02-13T20:03:53.2476233Z [ RUN ] DeviceFixture.TestInterleavedReadWrite +2025-02-13T20:03:53.2477424Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.2506316Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.2596628Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.2606789Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.2608585Z [ OK ] DeviceFixture.TestInterleavedReadWrite (13 ms) +2025-02-13T20:03:53.2609345Z [ RUN ] DeviceFixture.TestHeightShardReadWrite +2025-02-13T20:03:53.2611052Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.2612581Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.2692179Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.2702726Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.2703860Z [ OK ] DeviceFixture.TestHeightShardReadWrite (9 ms) +2025-02-13T20:03:53.2704618Z [ RUN ] DeviceFixture.TestWidthShardReadWrite +2025-02-13T20:03:53.2705792Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.2707580Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.2788795Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.2798544Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.2799838Z [ OK ] DeviceFixture.TestWidthShardReadWrite (9 ms) +2025-02-13T20:03:53.2800658Z [ RUN ] DeviceFixture.TestUnorderedHeightShardReadWrite +2025-02-13T20:03:53.2802007Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:53.2809257Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.2965668Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.2974959Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.2976268Z [ OK ] DeviceFixture.TestUnorderedHeightShardReadWrite (17 ms) +2025-02-13T20:03:53.2977147Z [ RUN ] DeviceFixture.TestSimpleDramBufferReadOnlyLo +2025-02-13T20:03:53.2978382Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.3011365Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.3070000Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3071690Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=4 +2025-02-13T20:03:53.3073411Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3074829Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=8 +2025-02-13T20:03:53.3076210Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3078350Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=16 +2025-02-13T20:03:53.3079904Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3081400Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=32 +2025-02-13T20:03:53.3082897Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3084330Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=1024 +2025-02-13T20:03:53.3085757Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3087201Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=16384 +2025-02-13T20:03:53.3094650Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.3104824Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.3105890Z [ OK ] DeviceFixture.TestSimpleDramBufferReadOnlyLo (12 ms) +2025-02-13T20:03:53.3106745Z [ RUN ] DeviceFixture.TestSimpleDramBufferReadOnlyHi +2025-02-13T20:03:53.3107960Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:53.3113210Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.3172654Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3174699Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=4 +2025-02-13T20:03:53.3176728Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3178368Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=8 +2025-02-13T20:03:53.3179854Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3181335Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=16 +2025-02-13T20:03:53.3182834Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3184325Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=32 +2025-02-13T20:03:53.3185811Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3187431Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=1024 +2025-02-13T20:03:53.3188909Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3191099Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=16384 +2025-02-13T20:03:53.3192438Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.3201321Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.3202369Z [ OK ] DeviceFixture.TestSimpleDramBufferReadOnlyHi (9 ms) +2025-02-13T20:03:53.3203232Z [ RUN ] DeviceFixture.TestSimpleDramBufferWriteOnlyLo +2025-02-13T20:03:53.3204450Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.3214536Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.3272338Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3274219Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=4 +2025-02-13T20:03:53.3276236Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3277626Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=8 +2025-02-13T20:03:53.3279005Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3280589Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=16 +2025-02-13T20:03:53.3282591Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3284101Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=32 +2025-02-13T20:03:53.3285545Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3287003Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=1024 +2025-02-13T20:03:53.3288638Z  Always | INFO  | writeDramBackdoor -- channel=0 address=32 +2025-02-13T20:03:53.3290065Z  Always | INFO  | readDramBackdoor -- channel=0 address=32 byte_size=16384 +2025-02-13T20:03:53.3293998Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.3304603Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.3305677Z [ OK ] DeviceFixture.TestSimpleDramBufferWriteOnlyLo (10 ms) +2025-02-13T20:03:53.3306552Z [ RUN ] DeviceFixture.TestSimpleDramBufferWriteOnlyHi +2025-02-13T20:03:53.3307804Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:53.3314820Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.3374744Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3376883Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=4 +2025-02-13T20:03:53.3378379Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3380284Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=8 +2025-02-13T20:03:53.3422229Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3423831Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=16 +2025-02-13T20:03:53.3425339Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3426847Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=32 +2025-02-13T20:03:53.3428344Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3429867Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=1024 +2025-02-13T20:03:53.3431378Z  Always | INFO  | writeDramBackdoor -- channel=0 address=1073725440 +2025-02-13T20:03:53.3432939Z  Always | INFO  | readDramBackdoor -- channel=0 address=1073725440 byte_size=16384 +2025-02-13T20:03:53.3473242Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.3474491Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.3475569Z [ OK ] DeviceFixture.TestSimpleDramBufferWriteOnlyHi (10 ms) +2025-02-13T20:03:53.3476417Z [ RUN ] DeviceFixture.TestSimpleL1BufferReadOnlyLo +2025-02-13T20:03:53.3477619Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.3479536Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.3481105Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3482822Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=4 +2025-02-13T20:03:53.3484307Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3486383Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=8 +2025-02-13T20:03:53.3487869Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3489349Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=16 +2025-02-13T20:03:53.3491566Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3493162Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=32 +2025-02-13T20:03:53.3498677Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3500196Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=1024 +2025-02-13T20:03:53.3507490Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3509004Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=16384 +2025-02-13T20:03:53.3517685Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.3529177Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.3530457Z [ OK ] DeviceFixture.TestSimpleL1BufferReadOnlyLo (12 ms) +2025-02-13T20:03:53.3531283Z [ RUN ] DeviceFixture.TestSimpleL1BufferReadOnlyHi +2025-02-13T20:03:53.3532491Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:53.3533809Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.3590349Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3591890Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=4 +2025-02-13T20:03:53.3595732Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3597239Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=8 +2025-02-13T20:03:53.3601348Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3602858Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=16 +2025-02-13T20:03:53.3607709Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3609202Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=32 +2025-02-13T20:03:53.3613879Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3615418Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=1024 +2025-02-13T20:03:53.3621142Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3622722Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=16384 +2025-02-13T20:03:53.3636294Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.3646325Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.3647381Z [ OK ] DeviceFixture.TestSimpleL1BufferReadOnlyHi (11 ms) +2025-02-13T20:03:53.3648752Z [ RUN ] DeviceFixture.TestSimpleL1BufferWriteOnlyLo +2025-02-13T20:03:53.3650096Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.3719973Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.3777635Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3779136Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=4 +2025-02-13T20:03:53.3783509Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3785011Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=8 +2025-02-13T20:03:53.3789380Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3790883Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=16 +2025-02-13T20:03:53.3795452Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3796967Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=32 +2025-02-13T20:03:53.3801851Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3803385Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=1024 +2025-02-13T20:03:53.3809182Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=100384 +2025-02-13T20:03:53.3810740Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=100384 byte_size=16384 +2025-02-13T20:03:53.3821119Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.3831765Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.3833015Z [ OK ] DeviceFixture.TestSimpleL1BufferWriteOnlyLo (18 ms) +2025-02-13T20:03:53.3833848Z [ RUN ] DeviceFixture.TestSimpleL1BufferWriteOnlyHi +2025-02-13T20:03:53.3835224Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:53.3836557Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.3893529Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3895036Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=4 +2025-02-13T20:03:53.3899410Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3900887Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=8 +2025-02-13T20:03:53.3905051Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3906726Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=16 +2025-02-13T20:03:53.3911040Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3912583Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=32 +2025-02-13T20:03:53.3917127Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3918656Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=1024 +2025-02-13T20:03:53.3924456Z  Always | INFO  | writeL1Backdoor -- coord=(x=5,y=4) address=1032192 +2025-02-13T20:03:53.3925981Z  Always | INFO  | readL1Backdoor -- coord=(x=5,y=4) address=1032192 byte_size=16384 +2025-02-13T20:03:53.3939433Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.3949083Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.3950528Z [ OK ] DeviceFixture.TestSimpleL1BufferWriteOnlyHi (11 ms) +2025-02-13T20:03:53.3951379Z [ RUN ] DeviceFixture.TensixTestSimpleL1ReadWriteTileLo +2025-02-13T20:03:53.3952617Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.4021783Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.4080020Z  Always | INFO  | writeL1Backdoor -- coord=(x=0,y=0) address=794624 +2025-02-13T20:03:53.7090894Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=794624 byte_size=2048 +2025-02-13T20:03:53.7092896Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=802816 byte_size=2048 +2025-02-13T20:03:53.7100091Z  Always | INFO  | writeL1Backdoor -- coord=(x=0,y=0) address=794624 +2025-02-13T20:03:53.7110127Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=794624 byte_size=4096 +2025-02-13T20:03:53.7117316Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=802816 byte_size=4096 +2025-02-13T20:03:53.7126046Z  Always | INFO  | writeL1Backdoor -- coord=(x=0,y=0) address=794624 +2025-02-13T20:03:53.7135780Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=794624 byte_size=6144 +2025-02-13T20:03:53.7143434Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=802816 byte_size=6144 +2025-02-13T20:03:53.7152423Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.7162799Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.7163704Z [ OK ] DeviceFixture.TensixTestSimpleL1ReadWriteTileLo (321 ms) +2025-02-13T20:03:53.7164413Z [ RUN ] DeviceFixture.TensixTestSimpleL1ReadWriteTileHi +2025-02-13T20:03:53.7165765Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:53.7166878Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.7225497Z  Always | INFO  | writeL1Backdoor -- coord=(x=0,y=0) address=1032192 +2025-02-13T20:03:53.7235387Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=1032192 byte_size=2048 +2025-02-13T20:03:53.7241491Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=1040384 byte_size=2048 +2025-02-13T20:03:53.7249093Z  Always | INFO  | writeL1Backdoor -- coord=(x=0,y=0) address=1032192 +2025-02-13T20:03:53.7260824Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=1032192 byte_size=4096 +2025-02-13T20:03:53.7267450Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=1040384 byte_size=4096 +2025-02-13T20:03:53.7276024Z  Always | INFO  | writeL1Backdoor -- coord=(x=0,y=0) address=1032192 +2025-02-13T20:03:53.7286998Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=1032192 byte_size=6144 +2025-02-13T20:03:53.7294409Z  Always | INFO  | readL1Backdoor -- coord=(x=0,y=0) address=1040384 byte_size=6144 +2025-02-13T20:03:53.7303192Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.7313080Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.7313962Z [ OK ] DeviceFixture.TensixTestSimpleL1ReadWriteTileHi (15 ms) +2025-02-13T20:03:53.7314681Z [ RUN ] DeviceFixture.TensixTestSimpleL1ReadWritex2y2TileLo +2025-02-13T20:03:53.7315715Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.7355287Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.7413709Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=794624 +2025-02-13T20:03:53.7424792Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=794624 byte_size=2048 +2025-02-13T20:03:53.7430840Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=802816 byte_size=2048 +2025-02-13T20:03:53.7438002Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=794624 +2025-02-13T20:03:53.7448316Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=794624 byte_size=4096 +2025-02-13T20:03:53.7455710Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=802816 byte_size=4096 +2025-02-13T20:03:53.7463556Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=794624 +2025-02-13T20:03:53.7474348Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=794624 byte_size=6144 +2025-02-13T20:03:53.7482852Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=802816 byte_size=6144 +2025-02-13T20:03:53.7490420Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.7500518Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.7501634Z [ OK ] DeviceFixture.TensixTestSimpleL1ReadWritex2y2TileLo (18 ms) +2025-02-13T20:03:53.7502643Z [ RUN ] DeviceFixture.TensixTestSimpleL1ReadWritex2y2TileHi +2025-02-13T20:03:53.7503724Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:53.7557871Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.7615858Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=1032192 +2025-02-13T20:03:53.7626706Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1032192 byte_size=2048 +2025-02-13T20:03:53.7632544Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1040384 byte_size=2048 +2025-02-13T20:03:53.7639066Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=1032192 +2025-02-13T20:03:53.7650261Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1032192 byte_size=4096 +2025-02-13T20:03:53.7658109Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1040384 byte_size=4096 +2025-02-13T20:03:53.7665224Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=1032192 +2025-02-13T20:03:53.7675869Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1032192 byte_size=6144 +2025-02-13T20:03:53.7684219Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1040384 byte_size=6144 +2025-02-13T20:03:53.7691782Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.7702196Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.7703424Z [ OK ] DeviceFixture.TensixTestSimpleL1ReadWritex2y2TileHi (20 ms) +2025-02-13T20:03:53.7704189Z [ RUN ] DeviceFixture.TensixTestBufferL1ReadWriteTileLo +2025-02-13T20:03:53.7705237Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.7760424Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.7817876Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=794624 +2025-02-13T20:03:53.7828250Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=794624 byte_size=2048 +2025-02-13T20:03:53.7834457Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=802816 byte_size=2048 +2025-02-13T20:03:53.7842964Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=794624 +2025-02-13T20:03:53.7854278Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=794624 byte_size=4096 +2025-02-13T20:03:53.7860586Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=802816 byte_size=4096 +2025-02-13T20:03:53.7868740Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=794624 +2025-02-13T20:03:53.7879080Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=794624 byte_size=6144 +2025-02-13T20:03:53.7886585Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=802816 byte_size=6144 +2025-02-13T20:03:53.7895164Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.7905668Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.7907424Z [ OK ] DeviceFixture.TensixTestBufferL1ReadWriteTileLo (20 ms) +2025-02-13T20:03:53.7908327Z [ RUN ] DeviceFixture.TensixTestBufferL1ReadWriteTileHi +2025-02-13T20:03:53.7909589Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:53.7961277Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:53.8019581Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=1032192 +2025-02-13T20:03:53.8029666Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1032192 byte_size=2048 +2025-02-13T20:03:53.8035844Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1040384 byte_size=2048 +2025-02-13T20:03:53.8043270Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=1032192 +2025-02-13T20:03:53.8053843Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1032192 byte_size=4096 +2025-02-13T20:03:53.8060987Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1040384 byte_size=4096 +2025-02-13T20:03:53.8069296Z  Always | INFO  | writeL1Backdoor -- coord=(x=2,y=2) address=1032192 +2025-02-13T20:03:53.8081437Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1032192 byte_size=6144 +2025-02-13T20:03:53.8088781Z  Always | INFO  | readL1Backdoor -- coord=(x=2,y=2) address=1040384 byte_size=6144 +2025-02-13T20:03:53.8097283Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:53.8109128Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:53.8110218Z [ OK ] DeviceFixture.TensixTestBufferL1ReadWriteTileHi (20 ms) +2025-02-13T20:03:53.8111131Z [ RUN ] DeviceFixture.TensixSingleCoreDirectDramReaderOnly +2025-02-13T20:03:53.8112432Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:53.8163897Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:54.1031266Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:54.1041132Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:54.1042313Z [ OK ] DeviceFixture.TensixSingleCoreDirectDramReaderOnly (293 ms) +2025-02-13T20:03:54.1043293Z [ RUN ] DeviceFixture.TensixSingleCoreDirectDramWriterOnly +2025-02-13T20:03:54.1044584Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:54.1092943Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:54.3911191Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:54.3919917Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:54.3921128Z [ OK ] DeviceFixture.TensixSingleCoreDirectDramWriterOnly (287 ms) +2025-02-13T20:03:54.3922124Z [ RUN ] DeviceFixture.TensixSingleCoreDirectDramReaderWriter +2025-02-13T20:03:54.3924715Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:54.3926070Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:54.7006928Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:54.7018523Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:54.7019697Z [ OK ] DeviceFixture.TensixSingleCoreDirectDramReaderWriter (309 ms) +2025-02-13T20:03:54.7020797Z [ RUN ] DeviceFixture.TensixSingleCoreDirectDramReaderDatacopyWriter +2025-02-13T20:03:54.7022228Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:54.7053155Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:54.7112126Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 16. First unused index: 1. Kernels: direct_reader_unary +2025-02-13T20:03:55.1371732Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. 
This will hurt dispatch performance. Non-contiguous indices: 16. First unused index: 1. Kernels: direct_writer_unary, direct_reader_unary, eltwise_copy +2025-02-13T20:03:55.9662214Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:55.9671925Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:55.9673162Z [ OK ] DeviceFixture.TensixSingleCoreDirectDramReaderDatacopyWriter (1265 ms) +2025-02-13T20:03:55.9674330Z [ RUN ] DeviceFixture.TensixTestSingleCoreSingleTileBankedL1ReaderOnly +2025-02-13T20:03:55.9675689Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:55.9679645Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:56.3245640Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:56.3253987Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:56.3255699Z [ OK ] DeviceFixture.TensixTestSingleCoreSingleTileBankedL1ReaderOnly (358 ms) +2025-02-13T20:03:56.3256923Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderOnly +2025-02-13T20:03:56.3258328Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:56.3315971Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:57.2984169Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:57.2992419Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:57.2993699Z [ OK ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderOnly (973 ms) +2025-02-13T20:03:57.2994956Z [ RUN ] DeviceFixture.TensixTestSingleCoreSingleTileBankedDramReaderOnly +2025-02-13T20:03:57.2996355Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:57.3012335Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:57.6447440Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:57.6455767Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:57.6457025Z [ OK ] DeviceFixture.TensixTestSingleCoreSingleTileBankedDramReaderOnly (346 ms) +2025-02-13T20:03:57.6458260Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderOnly +2025-02-13T20:03:57.6460518Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:57.6461884Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:59.4109633Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:59.4117463Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:59.4118744Z [ OK ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderOnly (1766 ms) +2025-02-13T20:03:59.4120130Z [ RUN ] DeviceFixture.TensixTestSingleCoreSingleTileBankedL1WriterOnly +2025-02-13T20:03:59.4121582Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:03:59.4124248Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:03:59.7323160Z  Metal | INFO  | Closing device 0 +2025-02-13T20:03:59.7330001Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:03:59.7331266Z [ OK ] DeviceFixture.TensixTestSingleCoreSingleTileBankedL1WriterOnly (321 ms) +2025-02-13T20:03:59.7332501Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1WriterOnly +2025-02-13T20:03:59.7333905Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:03:59.7356106Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:00.7639400Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:00.7647909Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:00.7649189Z [ OK ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1WriterOnly (1031 ms) +2025-02-13T20:04:00.7650396Z [ RUN ] DeviceFixture.TensixTestSingleCoreSingleTileBankedDramWriterOnly +2025-02-13T20:04:00.7651813Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:00.7657738Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:01.1075332Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:01.1086543Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:01.1087834Z [ OK ] DeviceFixture.TensixTestSingleCoreSingleTileBankedDramWriterOnly (343 ms) +2025-02-13T20:04:01.1089095Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramWriterOnly +2025-02-13T20:04:01.1090508Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:01.1092735Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:01.9756816Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:375: Failure +2025-02-13T20:04:01.9757914Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, false, true) +2025-02-13T20:04:01.9758815Z Actual: false +2025-02-13T20:04:01.9759200Z Expected: true +2025-02-13T20:04:02.2642268Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:375: Failure +2025-02-13T20:04:02.2643340Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, false, true) +2025-02-13T20:04:02.2644228Z Actual: false +2025-02-13T20:04:02.2644641Z Expected: true +2025-02-13T20:04:02.6027720Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:375: Failure +2025-02-13T20:04:02.6028887Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, false, true) +2025-02-13T20:04:02.6029777Z Actual: false +2025-02-13T20:04:02.6030189Z Expected: true +2025-02-13T20:04:02.9462981Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:375: Failure +2025-02-13T20:04:02.9464053Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, false, true) +2025-02-13T20:04:02.9464944Z Actual: false +2025-02-13T20:04:02.9465347Z Expected: true +2025-02-13T20:04:02.9466391Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:02.9472522Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:02.9473821Z [ FAILED ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramWriterOnly (1838 ms) +2025-02-13T20:04:02.9475903Z [ RUN ] DeviceFixture.TensixTestSingleCoreSingleTileBankedL1ReaderAndWriter +2025-02-13T20:04:02.9477329Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:02.9478655Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:02.9560918Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:02.9572188Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:02.9573546Z [ OK ] DeviceFixture.TensixTestSingleCoreSingleTileBankedL1ReaderAndWriter (9 ms) +2025-02-13T20:04:02.9574818Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderAndWriter +2025-02-13T20:04:02.9577346Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:02.9578708Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:03.4745808Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:03.4756034Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:03.4758104Z [ OK ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderAndWriter (518 ms) +2025-02-13T20:04:03.4759397Z [ RUN ] DeviceFixture.TensixTestSingleCoreSingleTileBankedDramReaderAndWriter +2025-02-13T20:04:03.4761105Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:03.4762454Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:03.4837758Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:03.4848452Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:03.4849728Z [ OK ] DeviceFixture.TensixTestSingleCoreSingleTileBankedDramReaderAndWriter (9 ms) +2025-02-13T20:04:03.4851008Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderAndWriter +2025-02-13T20:04:03.4853316Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:03.4854728Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:03.5749931Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:429: Failure +2025-02-13T20:04:03.5750995Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:03.5751849Z Actual: false +2025-02-13T20:04:03.5752241Z Expected: true +2025-02-13T20:04:03.6994836Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:429: Failure +2025-02-13T20:04:03.6995891Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:03.6996749Z Actual: false +2025-02-13T20:04:03.6997139Z Expected: true +2025-02-13T20:04:03.8650729Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:429: Failure +2025-02-13T20:04:03.8651769Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:03.8652876Z Actual: false +2025-02-13T20:04:03.8653269Z Expected: true +2025-02-13T20:04:04.0717762Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:429: Failure +2025-02-13T20:04:04.0718835Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:04.0719889Z Actual: false +2025-02-13T20:04:04.0720325Z Expected: true +2025-02-13T20:04:04.3193586Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:429: Failure +2025-02-13T20:04:04.3194655Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:04.3195550Z Actual: false +2025-02-13T20:04:04.3195972Z Expected: true +2025-02-13T20:04:04.6086222Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:429: Failure +2025-02-13T20:04:04.6087305Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:04.6088208Z Actual: false +2025-02-13T20:04:04.6088626Z Expected: true +2025-02-13T20:04:04.6089492Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:04.6096776Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:04.6098384Z [ FAILED ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderAndWriter (1124 ms) +2025-02-13T20:04:04.6099752Z [ RUN ] DeviceFixture.TensixTestSingleCoreSingleTileBankedDramReaderAndL1Writer +2025-02-13T20:04:04.6101270Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:04.6143893Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:04.6224001Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:04.6234568Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:04.6235937Z [ OK ] DeviceFixture.TensixTestSingleCoreSingleTileBankedDramReaderAndL1Writer (13 ms) +2025-02-13T20:04:04.6237348Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderAndL1Writer +2025-02-13T20:04:04.6238834Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:04.6244275Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:05.4494254Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:05.4504155Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:05.4505516Z [ OK ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderAndL1Writer (826 ms) +2025-02-13T20:04:05.4506939Z [ RUN ] DeviceFixture.TensixTestSingleCoreSingleTileBankedL1ReaderAndDramWriter +2025-02-13T20:04:05.4508446Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:05.4527196Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:05.4608370Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:05.4618982Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:05.4620330Z [ OK ] DeviceFixture.TensixTestSingleCoreSingleTileBankedL1ReaderAndDramWriter (11 ms) +2025-02-13T20:04:05.4621695Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderAndDramWriter +2025-02-13T20:04:05.4623392Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:05.4628276Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:05.5271588Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:485: Failure +2025-02-13T20:04:05.5272675Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:05.5273595Z Actual: false +2025-02-13T20:04:05.5274036Z Expected: true +2025-02-13T20:04:05.6138406Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:485: Failure +2025-02-13T20:04:05.6139496Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:05.6140411Z Actual: false +2025-02-13T20:04:05.6140826Z Expected: true +2025-02-13T20:04:05.7288293Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:485: Failure +2025-02-13T20:04:05.7289394Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:05.7290281Z Actual: false +2025-02-13T20:04:05.7290693Z Expected: true +2025-02-13T20:04:05.8720663Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:485: Failure +2025-02-13T20:04:05.8721823Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:05.8722721Z Actual: false +2025-02-13T20:04:05.8723146Z Expected: true +2025-02-13T20:04:06.0435644Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:485: Failure +2025-02-13T20:04:06.0436732Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true) +2025-02-13T20:04:06.0437630Z Actual: false +2025-02-13T20:04:06.0438033Z Expected: true +2025-02-13T20:04:06.2436645Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:485: Failure +2025-02-13T20:04:06.2437708Z Value of: local_test_functions::reader_cb_writer(this->devices_.at(id), 
test_config, true, true) +2025-02-13T20:04:06.2438625Z Actual: false +2025-02-13T20:04:06.2439053Z Expected: true +2025-02-13T20:04:06.2440045Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:06.2447620Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:06.2449002Z [ FAILED ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderAndDramWriter (782 ms) +2025-02-13T20:04:06.2450429Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyL1Writer +2025-02-13T20:04:06.2451938Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:06.2507519Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:06.2566742Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 16. First unused index: 1. Kernels: banked_reader +2025-02-13T20:04:06.7340335Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 16. First unused index: 1. Kernels: banked_writer, banked_reader, eltwise_copy +2025-02-13T20:04:09.2579696Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:09.2588700Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:09.2590123Z [ OK ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyL1Writer (3014 ms) +2025-02-13T20:04:09.2591635Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter +2025-02-13T20:04:09.2593717Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:09.2608683Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:10.0699373Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:524: Failure +2025-02-13T20:04:10.0700487Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:10.0701370Z Actual: false +2025-02-13T20:04:10.0701806Z Expected: true +2025-02-13T20:04:10.5658887Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:524: Failure +2025-02-13T20:04:10.5660022Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:10.5660898Z Actual: false +2025-02-13T20:04:10.5661329Z Expected: true +2025-02-13T20:04:11.0685296Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:524: Failure +2025-02-13T20:04:11.0686206Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:11.0686874Z Actual: false +2025-02-13T20:04:11.0687203Z Expected: true +2025-02-13T20:04:11.5762948Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:524: Failure +2025-02-13T20:04:11.5764380Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:11.5765448Z Actual: false +2025-02-13T20:04:11.5767042Z Expected: true +2025-02-13T20:04:11.9977803Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:524: Failure +2025-02-13T20:04:11.9979216Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:11.9980330Z Actual: false +2025-02-13T20:04:11.9982080Z Expected: true +2025-02-13T20:04:11.9983431Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:11.9991796Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:11.9992967Z [ FAILED ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter (2740 ms) 
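For reference, the repeated "Actual: false / Expected: true" blocks above are the standard googletest report for an EXPECT_TRUE assertion whose argument evaluated to false. A minimal sketch of that assertion shape is shown below; the stub helper is hypothetical and only mimics the boolean contract of local_test_functions::reader_cb_writer / reader_datacopy_writer in tests/tt_metal/tt_metal/api/test_banked.cpp, it is not the real implementation.

// Minimal googletest sketch (link against gtest_main to get a main()).
// Illustrates how an EXPECT_TRUE on a false result produces the
// "Value of: ... / Actual: false / Expected: true" lines seen in the log.
#include <gtest/gtest.h>

namespace local_test_functions {
// Hypothetical stand-in: the real helper drives a banked DRAM/L1 reader and
// writer kernel pair and returns true only if the read-back data matches
// what was written.
bool reader_cb_writer_stub() { return false; }
}  // namespace local_test_functions

TEST(BankedSketch, ReaderCbWriterReturnsTrue) {
    // Fails and prints the expression text, "Actual: false", "Expected: true".
    EXPECT_TRUE(local_test_functions::reader_cb_writer_stub());
}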
+2025-02-13T20:04:11.9994301Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter +2025-02-13T20:04:11.9995746Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:11.9999345Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:12.0479216Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:545: Failure +2025-02-13T20:04:12.0480525Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:12.0481384Z Actual: false +2025-02-13T20:04:12.0481811Z Expected: true +2025-02-13T20:04:12.1099119Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:545: Failure +2025-02-13T20:04:12.1102845Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:12.1103890Z Actual: false +2025-02-13T20:04:12.1104406Z Expected: true +2025-02-13T20:04:12.1917009Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:545: Failure +2025-02-13T20:04:12.1918093Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:12.1918971Z Actual: false +2025-02-13T20:04:12.1919395Z Expected: true +2025-02-13T20:04:12.3219846Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:545: Failure +2025-02-13T20:04:12.3220902Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:12.3221768Z Actual: false +2025-02-13T20:04:12.3222173Z Expected: true +2025-02-13T20:04:12.4780173Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:545: Failure +2025-02-13T20:04:12.4781698Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:12.4782547Z Actual: false +2025-02-13T20:04:12.4782957Z Expected: true +2025-02-13T20:04:12.6229119Z /work/tests/tt_metal/tt_metal/api/test_banked.cpp:545: Failure +2025-02-13T20:04:12.6230221Z Value of: local_test_functions::reader_datacopy_writer(this->devices_.at(id), test_config) +2025-02-13T20:04:12.6231090Z Actual: false +2025-02-13T20:04:12.6231517Z Expected: true +2025-02-13T20:04:12.6232689Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:12.6242921Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:12.6244387Z [ FAILED ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter (624 ms) +2025-02-13T20:04:12.6245971Z [ RUN ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderDataCopyL1Writer +2025-02-13T20:04:12.6248558Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:12.6250292Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:12.8529765Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:12.8542265Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:12.8543774Z [ OK ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderDataCopyL1Writer (229 ms) +2025-02-13T20:04:12.8545073Z [ RUN ] DeviceFixture.TensixTestCircularBuffersSequentiallyPlaced +2025-02-13T20:04:12.8546502Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:12.8567836Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:13.3514008Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:13.3523696Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:13.3524628Z [ OK ] DeviceFixture.TensixTestCircularBuffersSequentiallyPlaced (498 ms) +2025-02-13T20:04:13.3527280Z [ RUN ] DeviceFixture.TensixTestCircularBufferSequentialAcrossAllCores +2025-02-13T20:04:13.3528345Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:13.3530908Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:13.3594685Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 31. First unused index: 3. Kernels: blank +2025-02-13T20:04:13.3597203Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 31. First unused index: 0. Kernels: blank +2025-02-13T20:04:13.3599410Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 31. First unused index: 5. Kernels: blank +2025-02-13T20:04:13.6911181Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 31. First unused index: 3. Kernels: blank, blank, blank +2025-02-13T20:04:13.6914157Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 31. First unused index: 0. Kernels: blank, blank, blank +2025-02-13T20:04:13.6916507Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 31. First unused index: 5. Kernels: blank, blank, blank +2025-02-13T20:04:13.6941740Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:13.6956902Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:13.6957875Z [ OK ] DeviceFixture.TensixTestCircularBufferSequentialAcrossAllCores (343 ms) +2025-02-13T20:04:13.6958711Z [ RUN ] DeviceFixture.TensixTestValidCircularBufferAddress +2025-02-13T20:04:13.6962968Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:13.6967828Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:13.7031604Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 16,24. First unused index: 0. Kernels: blank +2025-02-13T20:04:14.0315357Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 16,24. First unused index: 0. Kernels: blank, blank, blank +2025-02-13T20:04:14.0344324Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.0358140Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.0359275Z [ OK ] DeviceFixture.TensixTestValidCircularBufferAddress (340 ms) +2025-02-13T20:04:14.0360479Z [ RUN ] DeviceFixture.TensixTestCircularBuffersAndL1BuffersCollision +2025-02-13T20:04:14.0364342Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:14.0386207Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.0456960Z  Always | FATAL  | Statically allocated circular buffers in program 119 clash with L1 buffers on core range [(x=5,y=4) - (x=5,y=4)]. L1 buffer allocated at 786432 and static circular buffer region ends at 821280 +2025-02-13T20:04:14.0463659Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.0477420Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.0478619Z [ OK ] DeviceFixture.TensixTestCircularBuffersAndL1BuffersCollision (11 ms) +2025-02-13T20:04:14.0479833Z [ RUN ] DeviceFixture.TensixTestValidUpdateCircularBufferSize +2025-02-13T20:04:14.0482781Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:14.0491975Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.3545175Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.3557575Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.3560328Z [ OK ] DeviceFixture.TensixTestValidUpdateCircularBufferSize (307 ms) +2025-02-13T20:04:14.3561161Z [ RUN ] DeviceFixture.TensixTestInvalidUpdateCircularBufferSize +2025-02-13T20:04:14.3562190Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:14.3618081Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.3699727Z  Always | FATAL  | Total circular buffer size 1024 B must be divisible by page size 2048 B +2025-02-13T20:04:14.3704214Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.3718190Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.3719257Z [ OK ] DeviceFixture.TensixTestInvalidUpdateCircularBufferSize (15 ms) +2025-02-13T20:04:14.3721141Z [ RUN ] DeviceFixture.TensixTestUpdateCircularBufferAddress +2025-02-13T20:04:14.3723187Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:14.3724190Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.3819791Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.3831273Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.3834136Z [ OK ] DeviceFixture.TensixTestUpdateCircularBufferAddress (11 ms) +2025-02-13T20:04:14.3836102Z [ RUN ] DeviceFixture.TensixTestUpdateCircularBufferPageSize +2025-02-13T20:04:14.3837126Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:14.3838141Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.3931710Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.3943783Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.3945171Z [ OK ] DeviceFixture.TensixTestUpdateCircularBufferPageSize (11 ms) +2025-02-13T20:04:14.3946006Z [ RUN ] DeviceFixture.TensixTestDataCopyWithUpdatedCircularBufferConfig +2025-02-13T20:04:14.3947735Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:14.3948974Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.4061678Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.4073815Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.4075839Z [ OK ] DeviceFixture.TensixTestDataCopyWithUpdatedCircularBufferConfig (12 ms) +2025-02-13T20:04:14.4076737Z [ RUN ] DeviceFixture.TensixTestCreateCircularBufferAtValidIndices +2025-02-13T20:04:14.4077820Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:14.4123754Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.7074312Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 2,16,24. First unused index: 1. Kernels: blank, blank, blank +2025-02-13T20:04:14.7087938Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.7103454Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.7104414Z [ OK ] DeviceFixture.TensixTestCreateCircularBufferAtValidIndices (303 ms) +2025-02-13T20:04:14.7105212Z [ RUN ] DeviceFixture.TestCreateCircularBufferAtInvalidIndex +2025-02-13T20:04:14.7108269Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:14.7153909Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.7216774Z  Always | FATAL  | Buffer index (32) exceeds max number of circular buffers per core (32) +2025-02-13T20:04:14.7222111Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.7235192Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.7236124Z [ OK ] DeviceFixture.TestCreateCircularBufferAtInvalidIndex (12 ms) +2025-02-13T20:04:14.7236904Z [ RUN ] DeviceFixture.TestCreateCircularBufferWithMismatchingConfig +2025-02-13T20:04:14.7237947Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:14.7255346Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.7316874Z  Always | FATAL  | Illegal circular buffer index 1. Page size can only be specified for buffer indices configured during config creation +2025-02-13T20:04:14.7320921Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.7331651Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.7332883Z [ OK ] DeviceFixture.TestCreateCircularBufferWithMismatchingConfig (9 ms) +2025-02-13T20:04:14.7334046Z [ RUN ] DeviceFixture.TensixTestCreateCircularBufferAtOverlappingIndex +2025-02-13T20:04:14.7335457Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:14.7355366Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:14.7418119Z  Always | FATAL  | Invalid circular buffer index: Cannot add circular buffer at index 16, another circular buffer already exists +2025-02-13T20:04:14.7421312Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:14.7433331Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:14.7436511Z [ OK ] DeviceFixture.TensixTestCreateCircularBufferAtOverlappingIndex (10 ms) +2025-02-13T20:04:14.7437540Z [ RUN ] DeviceFixture.TensixTestCircularBufferNonBlockingAPIs +2025-02-13T20:04:14.7439139Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:14.7457366Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.1055943Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.1068174Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.1069111Z [ OK ] DeviceFixture.TensixTestCircularBufferNonBlockingAPIs (363 ms) +2025-02-13T20:04:15.1069801Z [----------] 67 tests from DeviceFixture (24334 ms total) +2025-02-13T20:04:15.1070175Z +2025-02-13T20:04:15.1070381Z [----------] 3 tests from TensorShapeBaseTests +2025-02-13T20:04:15.1070909Z [ RUN ] TensorShapeBaseTests.General4D +2025-02-13T20:04:15.1071929Z  Always | FATAL  | ShapeBase[] index out of range. 4 not in [-4, 4) +2025-02-13T20:04:15.1073430Z  Always | FATAL  | ShapeBase[] index out of range. -5 not in [-4, 4) +2025-02-13T20:04:15.1078242Z [ OK ] TensorShapeBaseTests.General4D (1 ms) +2025-02-13T20:04:15.1079150Z [ RUN ] TensorShapeBaseTests.Empty +2025-02-13T20:04:15.1080255Z  Always | FATAL  | ShapeBase[] index out of range. 0 not in [-4, 0) +2025-02-13T20:04:15.1082604Z  Always | FATAL  | ShapeBase[] index out of range. 1 not in [-4, 0) +2025-02-13T20:04:15.1086264Z  Always | FATAL  | ShapeBase[] index out of range. 2 not in [-4, 0) +2025-02-13T20:04:15.1090193Z  Always | FATAL  | ShapeBase[] index out of range. 3 not in [-4, 0) +2025-02-13T20:04:15.1093588Z  Always | FATAL  | ShapeBase[] index out of range. 4 not in [-4, 0) +2025-02-13T20:04:15.1097092Z  Always | FATAL  | ShapeBase[] index out of range. -5 not in [-4, 0) +2025-02-13T20:04:15.1099960Z [ OK ] TensorShapeBaseTests.Empty (2 ms) +2025-02-13T20:04:15.1100521Z [ RUN ] TensorShapeBaseTests.TwoElements +2025-02-13T20:04:15.1101395Z  Always | FATAL  | ShapeBase[] index out of range. 2 not in [-4, 2) +2025-02-13T20:04:15.1104320Z  Always | FATAL  | ShapeBase[] index out of range. -5 not in [-4, 2) +2025-02-13T20:04:15.1107866Z [ OK ] TensorShapeBaseTests.TwoElements (0 ms) +2025-02-13T20:04:15.1108462Z [----------] 3 tests from TensorShapeBaseTests (4 ms total) +2025-02-13T20:04:15.1108844Z +2025-02-13T20:04:15.1109036Z [----------] 2 tests from TensorVectorBaseTests +2025-02-13T20:04:15.1109546Z [ RUN ] TensorVectorBaseTests.General5D +2025-02-13T20:04:15.1110419Z  Always | FATAL  | ShapeBase[] index out of range. 5 not in [-5, 5) +2025-02-13T20:04:15.1111520Z  Always | FATAL  | ShapeBase[] index out of range. -6 not in [-5, 5) +2025-02-13T20:04:15.1114762Z [ OK ] TensorVectorBaseTests.General5D (0 ms) +2025-02-13T20:04:15.1115360Z [ RUN ] TensorVectorBaseTests.SingleElement +2025-02-13T20:04:15.1116285Z  Always | FATAL  | ShapeBase[] index out of range. 1 not in [-4, 1) +2025-02-13T20:04:15.1118414Z  Always | FATAL  | ShapeBase[] index out of range. -5 not in [-4, 1) +2025-02-13T20:04:15.1122097Z [ OK ] TensorVectorBaseTests.SingleElement (0 ms) +2025-02-13T20:04:15.1122919Z [----------] 2 tests from TensorVectorBaseTests (1 ms total) +2025-02-13T20:04:15.1123274Z +2025-02-13T20:04:15.1123426Z [----------] 1 test from SOC +2025-02-13T20:04:15.1124058Z [ RUN ] SOC.TensixValidateLogicalToPhysicalCoreCoordHostMapping +2025-02-13T20:04:15.1127262Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.1193497Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.1256471Z  Test | INFO  | Device 0 harvesting mask 0 +2025-02-13T20:04:15.1257625Z  Test | INFO  | Device 0 has 0 harvested rows. 
Physical harvested row coordinates are: +2025-02-13T20:04:15.1258696Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.1267897Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.1268958Z [ OK ] SOC.TensixValidateLogicalToPhysicalCoreCoordHostMapping (14 ms) +2025-02-13T20:04:15.1269618Z [----------] 1 test from SOC (14 ms total) +2025-02-13T20:04:15.1269919Z +2025-02-13T20:04:15.1270139Z [----------] 6 tests from DeviceSingleCardBufferFixture +2025-02-13T20:04:15.1270782Z [ RUN ] DeviceSingleCardBufferFixture.TestInvalidBufferRegion +2025-02-13T20:04:15.1271806Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.1294638Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.1355058Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.1367903Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.1368806Z [ OK ] DeviceSingleCardBufferFixture.TestInvalidBufferRegion (9 ms) +2025-02-13T20:04:15.1369584Z [ RUN ] DeviceSingleCardBufferFixture.TestValidBufferRegion +2025-02-13T20:04:15.1370579Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.1396784Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.1457509Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.1469136Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.1470022Z [ OK ] DeviceSingleCardBufferFixture.TestValidBufferRegion (10 ms) +2025-02-13T20:04:15.1470798Z [ RUN ] DeviceSingleCardBufferFixture.TestPartialBufferRegion +2025-02-13T20:04:15.1471851Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.1497515Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.1557991Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.1569529Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.1570435Z [ OK ] DeviceSingleCardBufferFixture.TestPartialBufferRegion (10 ms) +2025-02-13T20:04:15.1571183Z [ RUN ] DeviceSingleCardBufferFixture.TestFullBufferRegion +2025-02-13T20:04:15.1572370Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.1597533Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.1658849Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.1670202Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.1671067Z [ OK ] DeviceSingleCardBufferFixture.TestFullBufferRegion (10 ms) +2025-02-13T20:04:15.1671869Z [ RUN ] DeviceSingleCardBufferFixture.TestL1BuffersAllocatedTopDown +2025-02-13T20:04:15.1673740Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.1699164Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.1760538Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.1772324Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.1773565Z [ OK ] DeviceSingleCardBufferFixture.TestL1BuffersAllocatedTopDown (10 ms) +2025-02-13T20:04:15.1774783Z [ RUN ] DeviceSingleCardBufferFixture.TestL1BuffersDoNotGrowBeyondBankSize +2025-02-13T20:04:15.1776202Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.1800688Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.1861797Z  Always | FATAL  | Out of Memory: Cannot allocate at an address below 524304. 
Allocation at 524224 +2025-02-13T20:04:15.1866554Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.1878388Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.1879654Z [ OK ] DeviceSingleCardBufferFixture.TestL1BuffersDoNotGrowBeyondBankSize (10 ms) +2025-02-13T20:04:15.1880943Z [----------] 6 tests from DeviceSingleCardBufferFixture (61 ms total) +2025-02-13T20:04:15.1881466Z +2025-02-13T20:04:15.1881671Z [----------] 15 tests from DispatchFixture +2025-02-13T20:04:15.1882312Z [ RUN ] DispatchFixture.TensixDRAMtoL1Multicast +2025-02-13T20:04:15.1883361Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:15.1884731Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.1901395Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.5765995Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.5775805Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.5776658Z [ OK ] DispatchFixture.TensixDRAMtoL1Multicast (389 ms) +2025-02-13T20:04:15.5777361Z [ RUN ] DispatchFixture.TensixDRAMtoL1MulticastLoopbackSrc +2025-02-13T20:04:15.5778280Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:15.5779363Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.5841696Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.9637854Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.9650496Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.9651399Z [ OK ] DispatchFixture.TensixDRAMtoL1MulticastLoopbackSrc (387 ms) +2025-02-13T20:04:15.9652223Z [ RUN ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionUpLeft +2025-02-13T20:04:15.9653181Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:15.9655838Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.9679885Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.9742035Z  Test | INFO  | This test is only supported on Blackhole +2025-02-13T20:04:15.9743190Z /work/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp:162: Skipped +2025-02-13T20:04:15.9743675Z +2025-02-13T20:04:15.9744134Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.9753837Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.9755826Z [ SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionUpLeft (10 ms) +2025-02-13T20:04:15.9757037Z [ RUN ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionUpRight +2025-02-13T20:04:15.9793543Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:15.9795360Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:15.9797197Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.9840186Z  Test | INFO  | This test is only supported on Blackhole +2025-02-13T20:04:15.9841299Z /work/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp:179: Skipped +2025-02-13T20:04:15.9841981Z +2025-02-13T20:04:15.9842550Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.9851580Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.9852852Z [ SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionUpRight (9 ms) +2025-02-13T20:04:15.9854037Z [ RUN ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionDownLeft +2025-02-13T20:04:15.9855431Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:15.9856766Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.9881196Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:15.9941312Z  Test | INFO  | This test is only supported on Blackhole +2025-02-13T20:04:15.9942417Z /work/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp:196: Skipped +2025-02-13T20:04:15.9943177Z +2025-02-13T20:04:15.9943801Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:15.9953909Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:15.9955205Z [ SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionDownLeft (10 ms) +2025-02-13T20:04:15.9956546Z [ RUN ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionDownRight +2025-02-13T20:04:15.9957786Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:15.9959109Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:15.9982761Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:16.0043532Z  Test | INFO  | This test is only supported on Blackhole +2025-02-13T20:04:16.0044860Z /work/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp:213: Skipped +2025-02-13T20:04:16.0045474Z +2025-02-13T20:04:16.0046009Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:16.0054186Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:16.0055809Z [ SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionDownRight (10 ms) +2025-02-13T20:04:16.0056951Z [ RUN ] DispatchFixture.TensixDRAMLoopbackSingleCore +2025-02-13T20:04:16.0058070Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:16.0059488Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:16.0083445Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:16.3023646Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:16.3037370Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:16.3039329Z [ OK ] DispatchFixture.TensixDRAMLoopbackSingleCore (298 ms) +2025-02-13T20:04:16.3040508Z [ RUN ] DispatchFixture.TensixDRAMLoopbackSingleCorePreAllocated +2025-02-13T20:04:16.3041757Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:16.3043150Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:16.3113834Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:16.3238334Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:16.3250438Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:16.3251697Z [ OK ] DispatchFixture.TensixDRAMLoopbackSingleCorePreAllocated (21 ms) +2025-02-13T20:04:16.3252653Z [ RUN ] DispatchFixture.TensixDRAMLoopbackSingleCoreDB +2025-02-13T20:04:16.3253806Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:16.3255154Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:16.3315944Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:16.7392154Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:16.7400849Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:16.7402105Z [ OK ] DispatchFixture.TensixDRAMLoopbackSingleCoreDB (415 ms) +2025-02-13T20:04:16.7403034Z [ RUN ] DispatchFixture.TensixCreateGlobalCircularBuffers +2025-02-13T20:04:16.7404434Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:16.7405815Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:16.7457619Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:16.7525877Z  Always | FATAL  | Duplicate cores found +2025-02-13T20:04:16.7527219Z  Always | FATAL  | Duplicate receiver cores found +2025-02-13T20:04:16.7529326Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:16.7542061Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:16.7543220Z [ OK ] DispatchFixture.TensixCreateGlobalCircularBuffers (14 ms) +2025-02-13T20:04:16.7544139Z [ RUN ] DispatchFixture.TensixProgramGlobalCircularBuffers +2025-02-13T20:04:16.7545586Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:16.7547067Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:16.7558015Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:16.7631678Z  Always | FATAL  | Can only specify one remote buffer index per config +2025-02-13T20:04:16.7633335Z  Always | FATAL  | Specified cores are not contained in associated GlobalCircularBuffer +2025-02-13T20:04:17.0300733Z  Always | FATAL  | Specified cores are not contained in associated GlobalCircularBuffer +2025-02-13T20:04:17.2962071Z  Metal | WARNING  | Circular buffer indices are not contiguous starting at 0. This will hurt dispatch performance. Non-contiguous indices: 17. First unused index: 0. Kernels: blank +2025-02-13T20:04:17.2964860Z  Always | FATAL  | Circular buffer indices overlap for KernelGroup 0 on programmable core type 0. Local end index 18, Remote start index 16 +2025-02-13T20:04:17.2966575Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:17.2981821Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:17.2983013Z [ OK ] DispatchFixture.TensixProgramGlobalCircularBuffers (543 ms) +2025-02-13T20:04:17.2984059Z [ RUN ] DispatchFixture.InitializeGlobalSemaphores +2025-02-13T20:04:17.2985290Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:17.2986636Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:17.3013018Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:17.3087895Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:17.3099361Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:17.3100968Z [ OK ] DispatchFixture.InitializeGlobalSemaphores (11 ms) +2025-02-13T20:04:17.3102136Z [ RUN ] DispatchFixture.CreateMultipleGlobalSemaphoresOnSameCore +2025-02-13T20:04:17.3103588Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:17.3105225Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:17.3113382Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:17.3193964Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:17.3205803Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:17.3208046Z [ OK ] DispatchFixture.CreateMultipleGlobalSemaphoresOnSameCore (10 ms) +2025-02-13T20:04:17.3209575Z [ RUN ] DispatchFixture.ResetGlobalSemaphores +2025-02-13T20:04:17.3210925Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:17.3213789Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:04:17.3215728Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:17.3293757Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:17.3306842Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:17.3308202Z [ OK ] DispatchFixture.ResetGlobalSemaphores (9 ms) +2025-02-13T20:04:17.3309298Z [ RUN ] DispatchFixture.TensixCreateKernelsOnComputeCores +2025-02-13T20:04:17.3310711Z  Test | INFO  | Running test using Slow Dispatch +2025-02-13T20:04:17.3314612Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:04:17.3316828Z  Metal | INFO  | AI CLK for device 0 is: 1202 MHz +2025-02-13T20:04:17.3375433Z  Metal | INFO  | Closing device 0 +2025-02-13T20:04:17.3399442Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:04:17.3401332Z [ OK ] DispatchFixture.TensixCreateKernelsOnComputeCores (8 ms) +2025-02-13T20:04:17.3406262Z [ DISABLED ] DispatchFixture.DISABLED_TensixCreateKernelsOnStorageCores +2025-02-13T20:04:17.3407531Z [ DISABLED ] DispatchFixture.DISABLED_TensixIdleEthCreateKernelsOnDispatchCores +2025-02-13T20:04:17.3408351Z [----------] 15 tests from DispatchFixture (2150 ms total) +2025-02-13T20:04:17.3408946Z +2025-02-13T20:04:17.3409254Z [----------] 4 tests from CompileProgramWithKernelPathEnvVarFixture +2025-02-13T20:04:17.3410351Z [ RUN ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDir +2025-02-13T20:04:17.3412189Z  Test | INFO  | Skipping test: TT_METAL_KERNEL_PATH must be set +2025-02-13T20:04:17.3413475Z /work/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp:17: Skipped +2025-02-13T20:04:17.3414577Z +2025-02-13T20:04:17.3415127Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDir (0 ms) +2025-02-13T20:04:17.3416857Z [ RUN ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderKernelRootDir +2025-02-13T20:04:17.3418052Z  Test | INFO  | Skipping test: TT_METAL_KERNEL_PATH must be set +2025-02-13T20:04:17.3418994Z /work/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp:17: Skipped +2025-02-13T20:04:17.3419561Z +2025-02-13T20:04:17.3420068Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderKernelRootDir (0 ms) +2025-02-13T20:04:17.3421255Z [ RUN ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDirAndKernelRootDir +2025-02-13T20:04:17.3422453Z  Test | INFO  | Skipping test: TT_METAL_KERNEL_PATH must be set +2025-02-13T20:04:17.3423401Z /work/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp:17: Skipped +2025-02-13T20:04:17.3423964Z +2025-02-13T20:04:17.3424825Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDirAndKernelRootDir (0 ms) +2025-02-13T20:04:17.3425946Z [ RUN ] CompileProgramWithKernelPathEnvVarFixture.TensixNonExistentKernel +2025-02-13T20:04:17.3426987Z  Test | INFO  | Skipping test: TT_METAL_KERNEL_PATH must be set +2025-02-13T20:04:17.3427924Z /work/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp:17: Skipped +2025-02-13T20:04:17.3428498Z +2025-02-13T20:04:17.3428933Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixNonExistentKernel (0 ms) +2025-02-13T20:04:17.3429847Z [----------] 4 tests from CompileProgramWithKernelPathEnvVarFixture (0 ms total) +2025-02-13T20:04:17.3430321Z +2025-02-13T20:04:17.3430490Z [----------] 18 tests from CoreCoordFixture +2025-02-13T20:04:17.3430983Z [ RUN ] CoreCoordFixture.TestCoreRangeIntersects +2025-02-13T20:04:17.3431554Z [ OK ] CoreCoordFixture.TestCoreRangeIntersects (0 ms) +2025-02-13T20:04:17.3432152Z [ RUN ] CoreCoordFixture.TestCoreRangeNotIntersects +2025-02-13T20:04:17.3432748Z [ OK ] CoreCoordFixture.TestCoreRangeNotIntersects (0 ms) +2025-02-13T20:04:17.3433319Z [ RUN ] CoreCoordFixture.TestCoreRangeIterator +2025-02-13T20:04:17.3433869Z [ OK ] CoreCoordFixture.TestCoreRangeIterator (0 ms) +2025-02-13T20:04:17.3434413Z [ RUN ] CoreCoordFixture.TestCoreRangeMerge 
+2025-02-13T20:04:17.3434960Z [ OK ] CoreCoordFixture.TestCoreRangeMerge (0 ms) +2025-02-13T20:04:17.3435594Z [ RUN ] CoreCoordFixture.TestCoreRangeNotMergeable +2025-02-13T20:04:17.3436229Z [ OK ] CoreCoordFixture.TestCoreRangeNotMergeable (0 ms) +2025-02-13T20:04:17.3436880Z [ RUN ] CoreCoordFixture.TestCoreRangeSetValidConstruct +2025-02-13T20:04:17.3437581Z [ OK ] CoreCoordFixture.TestCoreRangeSetValidConstruct (0 ms) +2025-02-13T20:04:17.3438252Z [ RUN ] CoreCoordFixture.TestCoreRangeSetInvalidConstruct +2025-02-13T20:04:17.3439917Z  Always | FATAL  | Cannot create CoreRangeSet with specified core ranges because core ranges [(x=3,y=3) - (x=5,y=4)] and [(x=1,y=2) - (x=3,y=3)] overlap! +2025-02-13T20:04:17.3441838Z  Always | FATAL  | Cannot create CoreRangeSet with specified core ranges because core ranges [(x=1,y=1) - (x=1,y=1)] and [(x=0,y=0) - (x=1,y=1)] overlap! +2025-02-13T20:04:17.3443037Z [ OK ] CoreCoordFixture.TestCoreRangeSetInvalidConstruct (0 ms) +2025-02-13T20:04:17.3443695Z [ RUN ] CoreCoordFixture.TestCoreRangeSetContains +2025-02-13T20:04:17.3444296Z [ OK ] CoreCoordFixture.TestCoreRangeSetContains (0 ms) +2025-02-13T20:04:17.3444912Z [ RUN ] CoreCoordFixture.TestCoreRangeSetNotContains +2025-02-13T20:04:17.3445666Z [ OK ] CoreCoordFixture.TestCoreRangeSetNotContains (0 ms) +2025-02-13T20:04:17.3446297Z [ RUN ] CoreCoordFixture.TestCoreRangeSetIntersects +2025-02-13T20:04:17.3446899Z [ OK ] CoreCoordFixture.TestCoreRangeSetIntersects (0 ms) +2025-02-13T20:04:17.3447564Z [ RUN ] CoreCoordFixture.TestCoreRangeSetNotIntersects +2025-02-13T20:04:17.3448290Z [ OK ] CoreCoordFixture.TestCoreRangeSetNotIntersects (0 ms) +2025-02-13T20:04:17.3448989Z [ RUN ] CoreCoordFixture.TestCoreRangeSetMergeNoSolution +2025-02-13T20:04:17.3449764Z [ OK ] CoreCoordFixture.TestCoreRangeSetMergeNoSolution (0 ms) +2025-02-13T20:04:17.3450984Z [ RUN ] CoreCoordFixture.TestCoreRangeSetMergeCoreCoord +2025-02-13T20:04:17.3451740Z [ OK ] CoreCoordFixture.TestCoreRangeSetMergeCoreCoord (0 ms) +2025-02-13T20:04:17.3452553Z [ RUN ] CoreCoordFixture.TestCoreRangeSetMergeCoreRange +2025-02-13T20:04:17.3453229Z [ OK ] CoreCoordFixture.TestCoreRangeSetMergeCoreRange (0 ms) +2025-02-13T20:04:17.3453864Z [ RUN ] CoreCoordFixture.TestCoreRangeAdjacent +2025-02-13T20:04:17.3454423Z [ OK ] CoreCoordFixture.TestCoreRangeAdjacent (0 ms) +2025-02-13T20:04:17.3454990Z [ RUN ] CoreCoordFixture.TestCoreRangeNotAdjacent +2025-02-13T20:04:17.3455751Z [ OK ] CoreCoordFixture.TestCoreRangeNotAdjacent (0 ms) +2025-02-13T20:04:17.3456338Z [ RUN ] CoreCoordFixture.TestCoreRangeContains +2025-02-13T20:04:17.3456906Z [ OK ] CoreCoordFixture.TestCoreRangeContains (0 ms) +2025-02-13T20:04:17.3457481Z [ RUN ] CoreCoordFixture.TestCoreRangeNotContains +2025-02-13T20:04:17.3458076Z [ OK ] CoreCoordFixture.TestCoreRangeNotContains (0 ms) +2025-02-13T20:04:17.3458652Z [----------] 18 tests from CoreCoordFixture (0 ms total) +2025-02-13T20:04:17.3458999Z +2025-02-13T20:04:17.3459182Z [----------] 3 tests from FreeListAllocator +2025-02-13T20:04:17.3459781Z [ RUN ] FreeListAllocator.TestDirectedSeriesOfAllocDealloc +2025-02-13T20:04:17.3460491Z [ OK ] FreeListAllocator.TestDirectedSeriesOfAllocDealloc (0 ms) +2025-02-13T20:04:17.3461153Z [ RUN ] FreeListAllocator.TestResizeAllocator +2025-02-13T20:04:17.3461743Z [ OK ] FreeListAllocator.TestResizeAllocator (0 ms) +2025-02-13T20:04:17.3462734Z [ RUN ] FreeListAllocator.TestDirectedResizeAllocator +2025-02-13T20:04:17.3463510Z [ OK ] FreeListAllocator.TestDirectedResizeAllocator (0 ms) 
+2025-02-13T20:04:17.3464121Z [----------] 3 tests from FreeListAllocator (0 ms total) +2025-02-13T20:04:17.3464468Z +2025-02-13T20:04:17.3464665Z [----------] 18 tests from FreeListOptTest +2025-02-13T20:04:17.3465127Z [ RUN ] FreeListOptTest.Allocation +2025-02-13T20:04:17.3465632Z [ OK ] FreeListOptTest.Allocation (0 ms) +2025-02-13T20:04:17.3466126Z [ RUN ] FreeListOptTest.Alignment +2025-02-13T20:04:17.3466585Z [ OK ] FreeListOptTest.Alignment (0 ms) +2025-02-13T20:04:17.3467076Z [ RUN ] FreeListOptTest.MinAllocationSize +2025-02-13T20:04:17.3467602Z [ OK ] FreeListOptTest.MinAllocationSize (0 ms) +2025-02-13T20:04:17.3468100Z [ RUN ] FreeListOptTest.Clear +2025-02-13T20:04:17.3468533Z [ OK ] FreeListOptTest.Clear (0 ms) +2025-02-13T20:04:17.3469063Z [ RUN ] FreeListOptTest.AllocationAndDeallocation +2025-02-13T20:04:17.3469668Z [ OK ] FreeListOptTest.AllocationAndDeallocation (0 ms) +2025-02-13T20:04:17.3470215Z [ RUN ] FreeListOptTest.AllocateAtAddress +2025-02-13T20:04:17.3470745Z [ OK ] FreeListOptTest.AllocateAtAddress (0 ms) +2025-02-13T20:04:17.3471312Z [ RUN ] FreeListOptTest.AllocateAtAddressInteractions +2025-02-13T20:04:17.3471940Z [ OK ] FreeListOptTest.AllocateAtAddressInteractions (0 ms) +2025-02-13T20:04:17.3472527Z [ RUN ] FreeListOptTest.ShrinkAndReset +2025-02-13T20:04:17.3473022Z [ OK ] FreeListOptTest.ShrinkAndReset (0 ms) +2025-02-13T20:04:17.3473516Z [ RUN ] FreeListOptTest.Statistics +2025-02-13T20:04:17.3473971Z [ OK ] FreeListOptTest.Statistics (0 ms) +2025-02-13T20:04:17.3474474Z [ RUN ] FreeListOptTest.AllocateFromTop +2025-02-13T20:04:17.3475107Z [ OK ] FreeListOptTest.AllocateFromTop (0 ms) +2025-02-13T20:04:17.3475598Z [ RUN ] FreeListOptTest.Coalescing +2025-02-13T20:04:17.3476083Z [ OK ] FreeListOptTest.Coalescing (0 ms) +2025-02-13T20:04:17.3476645Z [ RUN ] FreeListOptTest.CoalescingAfterResetShrink +2025-02-13T20:04:17.3477264Z [ OK ] FreeListOptTest.CoalescingAfterResetShrink (0 ms) +2025-02-13T20:04:17.3477821Z [ RUN ] FreeListOptTest.OutOfMemory +2025-02-13T20:04:17.3478313Z [ OK ] FreeListOptTest.OutOfMemory (0 ms) +2025-02-13T20:04:17.3478835Z [ RUN ] FreeListOptTest.AvailableAddresses +2025-02-13T20:04:17.3479365Z [ OK ] FreeListOptTest.AvailableAddresses (0 ms) +2025-02-13T20:04:17.3480145Z [ RUN ] FreeListOptTest.LowestOccupiedAddress +2025-02-13T20:04:17.3480718Z [ OK ] FreeListOptTest.LowestOccupiedAddress (0 ms) +2025-02-13T20:04:17.3481378Z [ RUN ] FreeListOptTest.LowestOccupiedAddressWithAllocateAt +2025-02-13T20:04:17.3482121Z [ OK ] FreeListOptTest.LowestOccupiedAddressWithAllocateAt (0 ms) +2025-02-13T20:04:17.3482760Z [ RUN ] FreeListOptTest.FirstFit +2025-02-13T20:04:17.3483234Z [ OK ] FreeListOptTest.FirstFit (0 ms) +2025-02-13T20:04:17.3483843Z [ RUN ] FreeListOptTest.FirstFitAllocateAtAddressInteractions +2025-02-13T20:04:17.3484709Z [ OK ] FreeListOptTest.FirstFitAllocateAtAddressInteractions (0 ms) +2025-02-13T20:04:17.3485384Z [----------] 18 tests from FreeListOptTest (1 ms total) +2025-02-13T20:04:17.3485712Z +2025-02-13T20:04:17.3485982Z [----------] 8 tests from BlockfloatCommonTests/ConvertU32ToBfpTests +2025-02-13T20:04:17.3486815Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/0 +2025-02-13T20:04:17.3487880Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/0 (0 ms) +2025-02-13T20:04:17.3488943Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/1 +2025-02-13T20:04:17.3489991Z [ OK ] 
BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/1 (0 ms) +2025-02-13T20:04:17.3491061Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/2 +2025-02-13T20:04:17.3492132Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/2 (0 ms) +2025-02-13T20:04:17.3493195Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/3 +2025-02-13T20:04:17.3494240Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/3 (0 ms) +2025-02-13T20:04:17.3495324Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/0 +2025-02-13T20:04:17.3496456Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/0 (0 ms) +2025-02-13T20:04:17.3497524Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/1 +2025-02-13T20:04:17.3498573Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/1 (0 ms) +2025-02-13T20:04:17.3499654Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/2 +2025-02-13T20:04:17.3500738Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/2 (0 ms) +2025-02-13T20:04:17.3501808Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/3 +2025-02-13T20:04:17.3502887Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/3 (0 ms) +2025-02-13T20:04:17.3503839Z [----------] 8 tests from BlockfloatCommonTests/ConvertU32ToBfpTests (0 ms total) +2025-02-13T20:04:17.3504360Z +2025-02-13T20:04:17.3504546Z [----------] Global test environment tear-down +2025-02-13T20:04:17.3505091Z [==========] 166 tests from 14 test suites ran. (27710 ms total) +2025-02-13T20:04:17.3505612Z [ PASSED ] 152 tests. 
[  SKIPPED ] 9 tests, listed below:
[  SKIPPED ] NOC.TensixVerifyNocIdentityTranslationTable
[  SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionUpLeft
[  SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionUpRight
[  SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionDownLeft
[  SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionDownRight
[  SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDir
[  SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderKernelRootDir
[  SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDirAndKernelRootDir
[  SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixNonExistentKernel
[  FAILED  ] 5 tests, listed below:
[  FAILED  ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramWriterOnly
[  FAILED  ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderAndWriter
[  FAILED  ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderAndDramWriter
[  FAILED  ] DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter
[  FAILED  ] DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter

 5 FAILED TESTS
  YOU HAVE 2 DISABLED TESTS

Device | INFO | Closing user mode device drivers
Test reports artifact uploaded: https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/artifacts/2588439359
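For local triage of the five banked data-movement failures listed above, one option is to rerun only those tests by passing a GoogleTest name filter to the unit-test binary. The sketch below is a minimal, hedged example: --gtest_filter is a standard GoogleTest flag, but the binary path used here is an assumption and may differ between builds.

    # Minimal sketch: rerun only the failing banked data-movement tests.
    import subprocess

    FAILING_TESTS = [
        "DeviceFixture.TensixTestSingleCoreMultiTileBankedDramWriterOnly",
        "DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderAndWriter",
        "DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderAndDramWriter",
        "DeviceFixture.TensixTestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter",
        "DeviceFixture.TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter",
    ]

    # --gtest_filter accepts a colon-separated list of test name patterns.
    cmd = [
        "./build/test/tt_metal/unit_tests_api",  # assumed binary location; adjust to the local build layout
        "--gtest_filter=" + ":".join(FAILING_TESTS),
    ]
    result = subprocess.run(cmd)
    print("exit code:", result.returncode)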
+2025-02-13T20:04:23.3510558Z [command]/usr/bin/git version +2025-02-13T20:04:23.3552695Z git version 2.25.1 +2025-02-13T20:04:23.3592946Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/64fe6d5f-5f97-41c3-9988-fb1332fb9146/.gitconfig' +2025-02-13T20:04:23.3605248Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/64fe6d5f-5f97-41c3-9988-fb1332fb9146' before making global git config changes +2025-02-13T20:04:23.3606883Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:04:23.3611159Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:04:23.3653268Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:04:23.3686584Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:04:23.3979685Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:04:23.4031855Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:04:23.4080441Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:04:23.4126117Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:04:23.4172984Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:04:23.4219422Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:04:23.4267277Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:04:23.4337152Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:04:23.4369775Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:04:23.4651763Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:04:23.4701265Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:04:23.4751874Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:04:23.4804272Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:04:23.4855329Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:04:23.4906076Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:04:23.4956073Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:04:23.5129241Z A job completed hook has been configured by the self-hosted runner administrator +2025-02-13T20:04:23.5163554Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/cleanup.sh' +2025-02-13T20:04:23.5178872Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:04:23.5179394Z ##[endgroup] +2025-02-13T20:04:23.5237325Z Current date / time is Thu Feb 13 20:04:23 UTC 2025 +2025-02-13T20:04:23.7249887Z Cleaning up orphan processes diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190213375_annotations.json b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190213375_annotations.json new file mode 100644 index 00000000000..fa70e443a72 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190213375_annotations.json @@ -0,0 +1 @@ 
+[{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":103,"start_column":null,"end_line":103,"end_column":null,"annotation_level":"notice","title":"","message":"[DEPRECATION] This action is deprecated. Please migrate to reading the Docker image from the pipeline.","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":113,"start_column":null,"end_line":113,"end_column":null,"annotation_level":"warning","title":"","message":"No files were found with the provided path: ~/run-log/20250213200420_sys_logs.tar. No artifacts will be uploaded.","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":31,"start_column":null,"end_line":31,"end_column":null,"annotation_level":"notice","title":"disk-usage-after-startup","message":"Disk usage is 59 %","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":145,"start_column":null,"end_line":145,"end_column":null,"annotation_level":"notice","title":"printing-smi-info-startup","message":"Touching and printing out SMI info","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":315,"start_column":null,"end_line":315,"end_column":null,"annotation_level":"notice","title":"reset-successful-startup","message":"tt-smi reset was successful","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":332,"start_column":null,"end_line":332,"end_column":null,"annotation_level":"notice","title":"hugepages-service-found-startup","message":"Hugepages service found. Command returned with exit code 3. 
Restarting it so we can ensure hugepages are available","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":337,"start_column":null,"end_line":337,"end_column":null,"annotation_level":"notice","title":"hugepages-setup-success-startup","message":"Hugepages is now setup.","raw_details":""}] diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190219113.log b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190219113.log new file mode 100644 index 00000000000..e560e43cc2d --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190219113.log @@ -0,0 +1,2178 @@ +2025-02-13T20:00:52.7788540Z Current runner version: '2.322.0' +2025-02-13T20:00:52.7796036Z Runner name: 'tt-metal-ci-vm-68' +2025-02-13T20:00:52.7797076Z Runner group name: 'Default' +2025-02-13T20:00:52.7798383Z Machine name: 'tt-metal-ci-vm-68' +2025-02-13T20:00:52.7802653Z ##[group]GITHUB_TOKEN Permissions +2025-02-13T20:00:52.7805322Z Actions: read +2025-02-13T20:00:52.7806056Z Contents: write +2025-02-13T20:00:52.7806784Z Metadata: read +2025-02-13T20:00:52.7807522Z Packages: write +2025-02-13T20:00:52.7808329Z Pages: write +2025-02-13T20:00:52.7809107Z PullRequests: write +2025-02-13T20:00:52.7809874Z ##[endgroup] +2025-02-13T20:00:52.7813322Z Secret source: Actions +2025-02-13T20:00:52.7814305Z Prepare workflow directory +2025-02-13T20:00:53.0392477Z Prepare all required actions +2025-02-13T20:00:53.0441740Z Getting action download info +2025-02-13T20:00:53.2037727Z Download action repository 'tenstorrent/tt-metal@main' (SHA:ac426de3d4a9c274964843fdae6aa83ea3960a30) +2025-02-13T20:00:59.2308829Z Getting action download info +2025-02-13T20:00:59.3802472Z Download action repository 'actions/checkout@v4' (SHA:11bd71901bbe5b1630ceea73d27597364c9af683) +2025-02-13T20:00:59.9810909Z Uses: tenstorrent/tt-metal/.github/workflows/build-and-unit-tests.yaml@refs/heads/sagarwal/multi_page_buffer (ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70) +2025-02-13T20:00:59.9813637Z ##[group] Inputs +2025-02-13T20:00:59.9814102Z build-type: Release +2025-02-13T20:00:59.9815152Z with-retries: false +2025-02-13T20:00:59.9815600Z arch: wormhole_b0 +2025-02-13T20:00:59.9816007Z runner-label: N150 +2025-02-13T20:00:59.9817042Z timeout: 35 +2025-02-13T20:00:59.9817454Z os: ubuntu-20.04 +2025-02-13T20:00:59.9817882Z ##[endgroup] +2025-02-13T20:00:59.9818457Z Complete job name: sd-unit-tests (wormhole_b0, N150) / wormhole_b0 N150 device +2025-02-13T20:01:00.0448285Z A job started hook has been configured by the self-hosted runner administrator +2025-02-13T20:01:00.0599074Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/wormhole_b0/reset.sh' +2025-02-13T20:01:00.0619218Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:01:00.0620130Z ##[endgroup] +2025-02-13T20:01:00.0791615Z ++ date +2025-02-13T20:01:00.0792413Z + echo Current date / time is Thu Feb 13 20:01:00 UTC 2025 +2025-02-13T20:01:00.0793059Z + set_e_was_enabled=false +2025-02-13T20:01:00.0793602Z + [[ ehxB == *e* ]] +2025-02-13T20:01:00.0794045Z + set_e_was_enabled=true +2025-02-13T20:01:00.0794491Z + set +e +2025-02-13T20:01:00.0794912Z + docker image prune +2025-02-13T20:01:00.0797191Z Current date / time is Thu Feb 13 20:01:00 UTC 2025 +2025-02-13T20:01:00.0927148Z WARNING! 
This will remove all dangling images. +2025-02-13T20:01:00.0959913Z ++ df +2025-02-13T20:01:00.0962419Z ++ awk '{print $5}' +2025-02-13T20:01:00.0965445Z ++ sed s/%// +2025-02-13T20:01:00.0966078Z +++ findmnt -n -o SOURCE / +2025-02-13T20:01:00.0999513Z ++ grep -w '^/dev/vda1' +2025-02-13T20:01:00.1017856Z + disk_usage_before=82 +2025-02-13T20:01:00.1031346Z + echo '::notice title=disk-usage-before-startup::Disk usage is 82 %' +2025-02-13T20:01:00.1032092Z + '[' 82 -ge 90 ']' +2025-02-13T20:01:00.1032591Z ++ df +2025-02-13T20:01:00.1032982Z ++ awk '{print $5}' +2025-02-13T20:01:00.1033576Z ++ sed s/%// +2025-02-13T20:01:00.1034003Z +++ findmnt -n -o SOURCE / +2025-02-13T20:01:00.1035102Z Are you sure you want to continue? [y/N] ::notice title=disk-usage-before-startup::Disk usage is 82 % +2025-02-13T20:01:00.1056764Z ++ grep -w '^/dev/vda1' +2025-02-13T20:01:00.1075319Z + disk_usage_after=82 +2025-02-13T20:01:00.1076077Z + echo '::notice title=disk-usage-after-startup::Disk usage is 82 %' +2025-02-13T20:01:00.1076702Z + '[' 82 -ge 90 ']' +2025-02-13T20:01:00.1103508Z ##[notice]Disk usage is 82 % +2025-02-13T20:01:00.1112105Z ++ lsmod +2025-02-13T20:01:00.1137756Z + lsmod_output='Module Size Used by +2025-02-13T20:01:00.1138756Z wekafsio 70086656 1 +2025-02-13T20:01:00.1139389Z wekafsgw 40960 4 wekafsio +2025-02-13T20:01:00.1140077Z uio_pci_generic 16384 0 +2025-02-13T20:01:00.1140890Z igb_uio 20480 0 +2025-02-13T20:01:00.1142182Z uio 20480 2 igb_uio,uio_pci_generic +2025-02-13T20:01:00.1142933Z veth 28672 0 +2025-02-13T20:01:00.1143580Z xt_conntrack 16384 1 +2025-02-13T20:01:00.1144239Z xt_MASQUERADE 20480 1 +2025-02-13T20:01:00.1145085Z nf_conntrack_netlink 45056 0 +2025-02-13T20:01:00.1151871Z nfnetlink 16384 2 nf_conntrack_netlink +2025-02-13T20:01:00.1152645Z xfrm_user 36864 1 +2025-02-13T20:01:00.1153302Z xfrm_algo 16384 1 xfrm_user +2025-02-13T20:01:00.1153979Z iptable_nat 16384 1 +2025-02-13T20:01:00.1155290Z nf_nat 45056 2 iptable_nat,xt_MASQUERADE +2025-02-13T20:01:00.1156960Z nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE +2025-02-13T20:01:00.1158544Z nf_defrag_ipv6 24576 1 nf_conntrack +2025-02-13T20:01:00.1159706Z nf_defrag_ipv4 16384 1 nf_conntrack +2025-02-13T20:01:00.1160415Z xt_addrtype 16384 2 +2025-02-13T20:01:00.1161068Z iptable_filter 16384 1 +2025-02-13T20:01:00.1161695Z bpfilter 32768 0 +2025-02-13T20:01:00.1162325Z br_netfilter 28672 0 +2025-02-13T20:01:00.1162983Z bridge 176128 1 br_netfilter +2025-02-13T20:01:00.1163687Z stp 16384 1 bridge +2025-02-13T20:01:00.1164381Z llc 16384 2 bridge,stp +2025-02-13T20:01:00.1165056Z xfs 1286144 2 +2025-02-13T20:01:00.1165662Z aufs 262144 0 +2025-02-13T20:01:00.1166282Z overlay 118784 0 +2025-02-13T20:01:00.1166906Z rdma_ucm 28672 0 +2025-02-13T20:01:00.1167542Z rdma_cm 110592 1 rdma_ucm +2025-02-13T20:01:00.1168207Z iw_cm 49152 1 rdma_cm +2025-02-13T20:01:00.1169154Z ib_ipoib 131072 0 +2025-02-13T20:01:00.1169783Z ib_cm 114688 2 rdma_cm,ib_ipoib +2025-02-13T20:01:00.1170487Z ib_umad 28672 0 +2025-02-13T20:01:00.1171049Z nls_iso8859_1 16384 1 +2025-02-13T20:01:00.1171644Z dm_multipath 32768 0 +2025-02-13T20:01:00.1172225Z scsi_dh_rdac 16384 0 +2025-02-13T20:01:00.1172782Z scsi_dh_emc 16384 0 +2025-02-13T20:01:00.1173370Z scsi_dh_alua 20480 0 +2025-02-13T20:01:00.1177649Z mlx5_ib 397312 0 +2025-02-13T20:01:00.1178947Z ib_uverbs 139264 2 rdma_ucm,mlx5_ib +2025-02-13T20:01:00.1180006Z kvm_amd 98304 0 +2025-02-13T20:01:00.1180784Z ccp 90112 1 kvm_amd +2025-02-13T20:01:00.1181469Z kvm 667648 1 kvm_amd 
+2025-02-13T20:01:00.1182145Z input_leds 16384 0 +2025-02-13T20:01:00.1182775Z joydev 24576 0 +2025-02-13T20:01:00.1184046Z ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm +2025-02-13T20:01:00.1184999Z serio_raw 20480 0 +2025-02-13T20:01:00.1185740Z tenstorrent 49152 0 +2025-02-13T20:01:00.1186468Z sch_fq_codel 20480 45 +2025-02-13T20:01:00.1187238Z binfmt_misc 24576 1 +2025-02-13T20:01:00.1187943Z msr 16384 0 +2025-02-13T20:01:00.1188565Z efi_pstore 16384 0 +2025-02-13T20:01:00.1189300Z virtio_rng 16384 0 +2025-02-13T20:01:00.1190031Z ip_tables 32768 2 iptable_filter,iptable_nat +2025-02-13T20:01:00.1191199Z x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE +2025-02-13T20:01:00.1192548Z autofs4 45056 2 +2025-02-13T20:01:00.1229915Z btrfs 1269760 0 +2025-02-13T20:01:00.1230648Z zstd_compress 167936 1 btrfs +2025-02-13T20:01:00.1231646Z raid10 61440 0 +2025-02-13T20:01:00.1232690Z raid456 155648 0 +2025-02-13T20:01:00.1234002Z async_raid6_recov 24576 1 raid456 +2025-02-13T20:01:00.1235593Z async_memcpy 20480 2 raid456,async_raid6_recov +2025-02-13T20:01:00.1236884Z async_pq 24576 2 raid456,async_raid6_recov +2025-02-13T20:01:00.1238100Z async_xor 20480 3 async_pq,raid456,async_raid6_recov +2025-02-13T20:01:00.1239138Z async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov +2025-02-13T20:01:00.1240107Z xor 24576 2 async_xor,btrfs +2025-02-13T20:01:00.1240976Z raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov +2025-02-13T20:01:00.1242191Z libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 +2025-02-13T20:01:00.1243024Z raid1 45056 0 +2025-02-13T20:01:00.1243720Z raid0 24576 0 +2025-02-13T20:01:00.1244431Z multipath 20480 0 +2025-02-13T20:01:00.1245048Z linear 20480 0 +2025-02-13T20:01:00.1245688Z hid_generic 16384 0 +2025-02-13T20:01:00.1246332Z crct10dif_pclmul 16384 1 +2025-02-13T20:01:00.1246962Z crc32_pclmul 16384 0 +2025-02-13T20:01:00.1247589Z usbhid 57344 0 +2025-02-13T20:01:00.1248218Z ghash_clmulni_intel 16384 0 +2025-02-13T20:01:00.1248953Z hid 131072 2 usbhid,hid_generic +2025-02-13T20:01:00.1249687Z mlx5_core 1626112 1 mlx5_ib +2025-02-13T20:01:00.1250339Z cirrus 16384 0 +2025-02-13T20:01:00.1250968Z drm_kms_helper 184320 3 cirrus +2025-02-13T20:01:00.1251603Z aesni_intel 372736 0 +2025-02-13T20:01:00.1252296Z syscopyarea 16384 1 drm_kms_helper +2025-02-13T20:01:00.1253076Z sysfillrect 16384 1 drm_kms_helper +2025-02-13T20:01:00.1253900Z sysimgblt 16384 1 drm_kms_helper +2025-02-13T20:01:00.1254828Z pci_hyperv_intf 16384 1 mlx5_core +2025-02-13T20:01:00.1255553Z crypto_simd 16384 1 aesni_intel +2025-02-13T20:01:00.1256494Z fb_sys_fops 16384 1 drm_kms_helper +2025-02-13T20:01:00.1257227Z mlxdevm 172032 1 mlx5_core +2025-02-13T20:01:00.1257974Z auxiliary 16384 2 mlx5_ib,mlx5_core +2025-02-13T20:01:00.1258828Z cryptd 24576 2 crypto_simd,ghash_clmulni_intel +2025-02-13T20:01:00.1260211Z mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core +2025-02-13T20:01:00.1261431Z tls 73728 1 mlx5_core +2025-02-13T20:01:00.1262150Z glue_helper 16384 1 aesni_intel +2025-02-13T20:01:00.1262844Z ahci 40960 0 +2025-02-13T20:01:00.1263508Z drm 495616 3 drm_kms_helper,cirrus +2025-02-13T20:01:00.1264304Z psmouse 155648 0 +2025-02-13T20:01:00.1265011Z mlxfw 32768 1 mlx5_core +2025-02-13T20:01:00.1265727Z libahci 36864 1 ahci +2025-02-13T20:01:00.1266412Z virtio_blk 20480 3 +2025-02-13T20:01:00.1267098Z psample 20480 1 mlx5_core' 
+2025-02-13T20:01:00.1267828Z + grep -q tenstorrent +2025-02-13T20:01:00.1291172Z + echo Module Size Used by wekafsio 70086656 1 wekafsgw 40960 4 wekafsio uio_pci_generic 16384 0 igb_uio 20480 0 uio 20480 2 igb_uio,uio_pci_generic veth 28672 0 xt_conntrack 16384 1 xt_MASQUERADE 20480 1 nf_conntrack_netlink 45056 0 nfnetlink 16384 2 nf_conntrack_netlink xfrm_user 36864 1 xfrm_algo 16384 1 xfrm_user iptable_nat 16384 1 nf_nat 45056 2 iptable_nat,xt_MASQUERADE nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE nf_defrag_ipv6 24576 1 nf_conntrack nf_defrag_ipv4 16384 1 nf_conntrack xt_addrtype 16384 2 iptable_filter 16384 1 bpfilter 32768 0 br_netfilter 28672 0 bridge 176128 1 br_netfilter stp 16384 1 bridge llc 16384 2 bridge,stp xfs 1286144 2 aufs 262144 0 overlay 118784 0 rdma_ucm 28672 0 rdma_cm 110592 1 rdma_ucm iw_cm 49152 1 rdma_cm ib_ipoib 131072 0 ib_cm 114688 2 rdma_cm,ib_ipoib ib_umad 28672 0 nls_iso8859_1 16384 1 dm_multipath 32768 0 scsi_dh_rdac 16384 0 scsi_dh_emc 16384 0 scsi_dh_alua 20480 0 mlx5_ib 397312 0 ib_uverbs 139264 2 rdma_ucm,mlx5_ib kvm_amd 98304 0 ccp 90112 1 kvm_amd kvm 667648 1 kvm_amd input_leds 16384 0 joydev 24576 0 ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm serio_raw 20480 0 tenstorrent 49152 0 sch_fq_codel 20480 45 binfmt_misc 24576 1 msr 16384 0 efi_pstore 16384 0 virtio_rng 16384 0 ip_tables 32768 2 iptable_filter,iptable_nat x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE autofs4 45056 2 btrfs 1269760 0 zstd_compress 167936 1 btrfs raid10 61440 0 raid456 155648 0 async_raid6_recov 24576 1 raid456 async_memcpy 20480 2 raid456,async_raid6_recov async_pq 24576 2 raid456,async_raid6_recov async_xor 20480 3 async_pq,raid456,async_raid6_recov async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov xor 24576 2 async_xor,btrfs raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 raid1 45056 0 raid0 24576 0 multipath 20480 0 linear 20480 0 hid_generic 16384 0 crct10dif_pclmul 16384 1 crc32_pclmul 16384 0 usbhid 57344 0 ghash_clmulni_intel 16384 0 hid 131072 2 usbhid,hid_generic mlx5_core 1626112 1 mlx5_ib cirrus 16384 0 drm_kms_helper 184320 3 cirrus aesni_intel 372736 0 syscopyarea 16384 1 drm_kms_helper sysfillrect 16384 1 drm_kms_helper sysimgblt 16384 1 drm_kms_helper pci_hyperv_intf 16384 1 mlx5_core crypto_simd 16384 1 aesni_intel fb_sys_fops 16384 1 drm_kms_helper mlxdevm 172032 1 mlx5_core auxiliary 16384 2 mlx5_ib,mlx5_core cryptd 24576 2 crypto_simd,ghash_clmulni_intel mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core tls 73728 1 mlx5_core glue_helper 16384 1 aesni_intel ahci 40960 0 drm 495616 3 drm_kms_helper,cirrus psmouse 155648 0 mlxfw 32768 1 mlx5_core libahci 36864 1 ahci virtio_blk 20480 3 psample 20480 1 mlx5_core +2025-02-13T20:01:00.1311828Z + [[ 0 -ne 0 ]] +2025-02-13T20:01:00.1312946Z ++ lsof -w /dev/tenstorrent/0 +2025-02-13T20:01:00.2582315Z + lsof_output= +2025-02-13T20:01:00.2583002Z + '[' -n '' ']' +2025-02-13T20:01:00.2583520Z + i=0 +2025-02-13T20:01:00.2583998Z + iter_limit=10 +2025-02-13T20:01:00.2584989Z + echo '::notice title=printing-smi-info-startup::Touching and printing out SMI info' +2025-02-13T20:01:00.2586025Z + sleep 20 +2025-02-13T20:01:00.2588935Z ##[notice]Touching and printing out SMI info +2025-02-13T20:01:20.2598126Z + sudo touch /opt/tt_metal_infra/smi.log 
+2025-02-13T20:01:20.2858446Z + sudo chown ubuntu /opt/tt_metal_infra/smi.log +2025-02-13T20:01:20.3216292Z + tt-smi-metal -s -f /opt/tt_metal_infra/smi.log +2025-02-13T20:01:20.7169221Z +2025-02-13T20:01:20.7184913Z  Detected Chips: 1 +2025-02-13T20:01:20.7185494Z  +2025-02-13T20:01:20.7185897Z  Detected Chips: 1 +2025-02-13T20:01:20.7186452Z +2025-02-13T20:01:20.7186667Z  Detecting ARC: | +2025-02-13T20:01:20.7186935Z +2025-02-13T20:01:20.7187149Z  Detecting DRAM: | +2025-02-13T20:01:20.7187392Z +2025-02-13T20:01:20.7187695Z [] [16/16] ETH: | +2025-02-13T20:01:20.7252129Z Gathering Information ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 +2025-02-13T20:01:20.7261278Z  Saved tt-smi log to: /opt/tt_metal_infra/smi.log  +2025-02-13T20:01:20.7866833Z + cat /opt/tt_metal_infra/smi.log +2025-02-13T20:01:20.7875444Z { +2025-02-13T20:01:20.7877319Z + echo '::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first' +2025-02-13T20:01:20.7878219Z + sleep 30 +2025-02-13T20:01:20.7878535Z "time": "2025-02-13T20:01:20.718602", +2025-02-13T20:01:20.7878942Z "host_info": { +2025-02-13T20:01:20.7879284Z "OS": "Linux", +2025-02-13T20:01:20.7879646Z "Distro": "Ubuntu 20.04.3 LTS", +2025-02-13T20:01:20.7880067Z "Kernel": "5.4.0-205-generic", +2025-02-13T20:01:20.7880520Z "Hostname": "tt-metal-ci-vm-68", +2025-02-13T20:01:20.7880952Z "Platform": "x86_64", +2025-02-13T20:01:20.7881355Z "Python": "3.8.10", +2025-02-13T20:01:20.7881737Z "Memory": "47.14 GB", +2025-02-13T20:01:20.7882134Z "Driver": "TTKMD 1.29" +2025-02-13T20:01:20.7882688Z }, +2025-02-13T20:01:20.7883036Z "device_info": [ +2025-02-13T20:01:20.7883366Z { +2025-02-13T20:01:20.7884283Z "smbus_telem": { +2025-02-13T20:01:20.7884673Z "BOARD_ID": "0x10001851172b06b", +2025-02-13T20:01:20.7885209Z "SMBUS_TX_ENUM_VERSION": "0xba5e0001", +2025-02-13T20:01:20.7885688Z "SMBUS_TX_DEVICE_ID": "0x401e1e52", +2025-02-13T20:01:20.7886133Z "SMBUS_TX_ASIC_RO": "0x2ec29", +2025-02-13T20:01:20.7886579Z "SMBUS_TX_ASIC_IDD": "0xb96", +2025-02-13T20:01:20.7887028Z "SMBUS_TX_BOARD_ID_HIGH": "0x1000185", +2025-02-13T20:01:20.7887489Z "SMBUS_TX_BOARD_ID_LOW": "0x1172b06b", +2025-02-13T20:01:20.7887957Z "SMBUS_TX_ARC0_FW_VERSION": "0x21d0000", +2025-02-13T20:01:20.7888421Z "SMBUS_TX_ARC1_FW_VERSION": "0x21d0000", +2025-02-13T20:01:20.7888889Z "SMBUS_TX_ARC2_FW_VERSION": null, +2025-02-13T20:01:20.7889334Z "SMBUS_TX_ARC3_FW_VERSION": "0x21d0000", +2025-02-13T20:01:20.7889827Z "SMBUS_TX_SPIBOOTROM_FW_VERSION": "0x30b0000", +2025-02-13T20:01:20.7890339Z "SMBUS_TX_ETH_FW_VERSION": "0x6a000", +2025-02-13T20:01:20.7890808Z "SMBUS_TX_M3_BL_FW_VERSION": "0x81020000", +2025-02-13T20:01:20.7891294Z "SMBUS_TX_M3_APP_FW_VERSION": "0x5090000", +2025-02-13T20:01:20.7891914Z "SMBUS_TX_DDR_SPEED": null, +2025-02-13T20:01:20.7892359Z "SMBUS_TX_DDR_STATUS": "0x2222222", +2025-02-13T20:01:20.7892819Z "SMBUS_TX_ETH_STATUS0": "0x11111111", +2025-02-13T20:01:20.7893279Z "SMBUS_TX_ETH_STATUS1": "0x11111111", +2025-02-13T20:01:20.7893733Z "SMBUS_TX_PCIE_STATUS": "0x11040000", +2025-02-13T20:01:20.7894180Z "SMBUS_TX_FAULTS": null, +2025-02-13T20:01:20.7895108Z "SMBUS_TX_ARC0_HEALTH": "0x1b8369", +2025-02-13T20:01:20.7895568Z "SMBUS_TX_ARC1_HEALTH": "0xa1ac6", +2025-02-13T20:01:20.7896026Z "SMBUS_TX_ARC2_HEALTH": null, +2025-02-13T20:01:20.7896470Z "SMBUS_TX_ARC3_HEALTH": "0x112f", +2025-02-13T20:01:20.7896941Z "SMBUS_TX_FAN_SPEED": "0xffffffff", +2025-02-13T20:01:20.7897394Z "SMBUS_TX_AICLK": "0x3e801f4", +2025-02-13T20:01:20.7897835Z "SMBUS_TX_AXICLK": 
"0x384", +2025-02-13T20:01:20.7898264Z "SMBUS_TX_ARCCLK": "0x21c", +2025-02-13T20:01:20.7898695Z "SMBUS_TX_THROTTLER": null, +2025-02-13T20:01:20.7899128Z "SMBUS_TX_VCORE": "0x2d5", +2025-02-13T20:01:20.7899570Z "SMBUS_TX_ASIC_TEMPERATURE": "0x254022a", +2025-02-13T20:01:20.7900050Z "SMBUS_TX_VREG_TEMPERATURE": null, +2025-02-13T20:01:20.7900528Z "SMBUS_TX_BOARD_TEMPERATURE": "0x212423", +2025-02-13T20:01:20.7901000Z "SMBUS_TX_TDP": "0x64000e", +2025-02-13T20:01:20.7901430Z "SMBUS_TX_TDC": "0xf00012", +2025-02-13T20:01:20.7901859Z "SMBUS_TX_VDD_LIMITS": "0x3e802d0", +2025-02-13T20:01:20.7902309Z "SMBUS_TX_THM_LIMITS": "0x53004b", +2025-02-13T20:01:20.7902766Z "SMBUS_TX_WH_FW_DATE": "0x4b01121f", +2025-02-13T20:01:20.7903213Z "SMBUS_TX_ASIC_TMON0": "0x27262320", +2025-02-13T20:01:20.7903643Z "SMBUS_TX_ASIC_TMON1": "0x251c", +2025-02-13T20:01:20.7904164Z "SMBUS_TX_MVDDQ_POWER": "0x190000", +2025-02-13T20:01:20.7904616Z "SMBUS_TX_GDDR_TRAIN_TEMP0": null, +2025-02-13T20:01:20.7905071Z "SMBUS_TX_GDDR_TRAIN_TEMP1": null, +2025-02-13T20:01:20.7905518Z "SMBUS_TX_BOOT_DATE": "0x520d1335", +2025-02-13T20:01:20.7905965Z "SMBUS_TX_RT_SECONDS": "0x1be", +2025-02-13T20:01:20.7906403Z "SMBUS_TX_AUX_STATUS": null, +2025-02-13T20:01:20.7906877Z "SMBUS_TX_ETH_DEBUG_STATUS0": "0xccddddcc", +2025-02-13T20:01:20.7907347Z "SMBUS_TX_ETH_DEBUG_STATUS1": "0xccdddddd", +2025-02-13T20:01:20.7907819Z "SMBUS_TX_TT_FLASH_VERSION": "0x30100" +2025-02-13T20:01:20.7908244Z }, +2025-02-13T20:01:20.7908687Z "board_info": { +2025-02-13T20:01:20.7909057Z "bus_id": "0000:07:00.0", +2025-02-13T20:01:20.7909479Z "board_type": "n150 L", +2025-02-13T20:01:20.7909901Z "board_id": "010001851172b06b", +2025-02-13T20:01:20.7910325Z "coords": "(0, 0, 0, 0)", +2025-02-13T20:01:20.7910740Z "dram_status": true, +2025-02-13T20:01:20.7911132Z "dram_speed": "12G", +2025-02-13T20:01:20.7911541Z "pcie_speed": 4, +2025-02-13T20:01:20.7911927Z "pcie_width": 16 +2025-02-13T20:01:20.7912297Z }, +2025-02-13T20:01:20.7912619Z "telemetry": { +2025-02-13T20:01:20.7912992Z "voltage": "0.72", +2025-02-13T20:01:20.7913388Z "current": " 18.0", +2025-02-13T20:01:20.7913793Z "power": " 14.0", +2025-02-13T20:01:20.7914170Z "aiclk": " 500", +2025-02-13T20:01:20.7914570Z "asic_temperature": "34.6" +2025-02-13T20:01:20.7914996Z }, +2025-02-13T20:01:20.7915327Z "firmwares": { +2025-02-13T20:01:20.7915701Z "arc_fw": "2.29.0.0", +2025-02-13T20:01:20.7916115Z "arc_fw_date": "2024-11-01", +2025-02-13T20:01:20.7916539Z "eth_fw": "6.10.0", +2025-02-13T20:01:20.7916938Z "m3_bl_fw": "129.2.0.0", +2025-02-13T20:01:20.7917332Z "m3_app_fw": "5.9.0.0", +2025-02-13T20:01:20.7917757Z "tt_flash_version": "0.3.1.0" +2025-02-13T20:01:20.7918170Z }, +2025-02-13T20:01:20.7918490Z "limits": { +2025-02-13T20:01:20.7918844Z "vdd_min": "0.72", +2025-02-13T20:01:20.7919241Z "vdd_max": "1.00", +2025-02-13T20:01:20.7919722Z "tdp_limit": "100", +2025-02-13T20:01:20.7920132Z "tdc_limit": "240", +2025-02-13T20:01:20.7920537Z "asic_fmax": "1000", +2025-02-13T20:01:20.7920957Z "therm_trip_l1_limit": "83", +2025-02-13T20:01:20.7921383Z "thm_limit": "75", +2025-02-13T20:01:20.7921798Z "bus_peak_limit": null +2025-02-13T20:01:20.7922202Z } +2025-02-13T20:01:20.7922511Z } +2025-02-13T20:01:20.7922813Z ] +2025-02-13T20:01:20.7923352Z }::notice title=attempting-reset-startup::Attempting to reset card(s). 
Sleeping first +2025-02-13T20:01:50.7889977Z + '[' 0 -lt 10 ']' +2025-02-13T20:01:50.7890370Z + (( i++ )) +2025-02-13T20:01:50.7895448Z ++ tt-smi-metal -r 0 +2025-02-13T20:02:01.9110636Z + reset_output=' Starting pci link reset on WH devices at pci indices: 0  +2025-02-13T20:02:01.9111681Z  Finishing pci link reset on WH devices at pci indices: 0  +2025-02-13T20:02:01.9112817Z  Re-initializing boards after reset....  +2025-02-13T20:02:01.9113167Z +2025-02-13T20:02:01.9113426Z  Detected Chips: 1 +2025-02-13T20:02:01.9113845Z  +2025-02-13T20:02:01.9114192Z  Detected Chips: 1 +2025-02-13T20:02:01.9114424Z +2025-02-13T20:02:01.9114653Z  Detecting ARC: | +2025-02-13T20:02:01.9114893Z +2025-02-13T20:02:01.9115093Z  Detecting DRAM: | +2025-02-13T20:02:01.9115326Z +2025-02-13T20:02:01.9115699Z [0/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9116318Z  +2025-02-13T20:02:01.9116712Z  Detected Chips: 1 +2025-02-13T20:02:01.9116977Z +2025-02-13T20:02:01.9117247Z  Detecting ARC: / +2025-02-13T20:02:01.9117506Z +2025-02-13T20:02:01.9117705Z  Detecting DRAM: / +2025-02-13T20:02:01.9117939Z +2025-02-13T20:02:01.9118451Z [0/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9119113Z  +2025-02-13T20:02:01.9119550Z  Detected Chips: 1 +2025-02-13T20:02:01.9119839Z +2025-02-13T20:02:01.9120068Z  Detecting ARC: - +2025-02-13T20:02:01.9120324Z +2025-02-13T20:02:01.9120619Z  Detecting DRAM: - +2025-02-13T20:02:01.9120893Z +2025-02-13T20:02:01.9121999Z [0/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9122620Z  +2025-02-13T20:02:01.9123018Z  Detected Chips: 1 +2025-02-13T20:02:01.9123306Z +2025-02-13T20:02:01.9123550Z  Detecting ARC: \ +2025-02-13T20:02:01.9123851Z +2025-02-13T20:02:01.9124072Z  Detecting DRAM: \ +2025-02-13T20:02:01.9124347Z +2025-02-13T20:02:01.9124781Z [0/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9125385Z  +2025-02-13T20:02:01.9125733Z  Detected Chips: 1 +2025-02-13T20:02:01.9125970Z +2025-02-13T20:02:01.9126168Z  Detecting ARC: | +2025-02-13T20:02:01.9126393Z +2025-02-13T20:02:01.9126606Z  Detecting DRAM: | +2025-02-13T20:02:01.9126837Z +2025-02-13T20:02:01.9127196Z [0/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9127742Z  +2025-02-13T20:02:01.9128100Z  Detected Chips: 1 +2025-02-13T20:02:01.9128339Z +2025-02-13T20:02:01.9128528Z  Detecting ARC: / +2025-02-13T20:02:01.9128760Z +2025-02-13T20:02:01.9128952Z  Detecting DRAM: / +2025-02-13T20:02:01.9129189Z +2025-02-13T20:02:01.9129553Z [0/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9130092Z  +2025-02-13T20:02:01.9130439Z  Detected Chips: 1 +2025-02-13T20:02:01.9130671Z +2025-02-13T20:02:01.9130871Z  Detecting ARC: - +2025-02-13T20:02:01.9131098Z +2025-02-13T20:02:01.9131305Z  Detecting DRAM: - +2025-02-13T20:02:01.9131532Z +2025-02-13T20:02:01.9132119Z [0/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9132677Z  +2025-02-13T20:02:01.9133018Z  Detected Chips: 1 +2025-02-13T20:02:01.9133260Z +2025-02-13T20:02:01.9133451Z  Detecting ARC: \ +2025-02-13T20:02:01.9133687Z +2025-02-13T20:02:01.9133887Z  Detecting DRAM: \ +2025-02-13T20:02:01.9134122Z +2025-02-13T20:02:01.9134480Z [0/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9135280Z  +2025-02-13T20:02:01.9135623Z  Detected Chips: 1 +2025-02-13T20:02:01.9135857Z +2025-02-13T20:02:01.9136054Z  Detecting ARC: | 
+2025-02-13T20:02:01.9136278Z +2025-02-13T20:02:01.9136508Z  Detecting DRAM: | +2025-02-13T20:02:01.9136740Z +2025-02-13T20:02:01.9137176Z [0/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9137762Z  +2025-02-13T20:02:01.9138341Z  Detected Chips: 1 +2025-02-13T20:02:01.9138596Z +2025-02-13T20:02:01.9138862Z  Detecting ARC: / +2025-02-13T20:02:01.9139104Z +2025-02-13T20:02:01.9139343Z  Detecting DRAM: / +2025-02-13T20:02:01.9139613Z +2025-02-13T20:02:01.9140088Z [1/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9140697Z  +2025-02-13T20:02:01.9141096Z  Detected Chips: 1 +2025-02-13T20:02:01.9141430Z +2025-02-13T20:02:01.9141651Z  Detecting ARC: - +2025-02-13T20:02:01.9141935Z +2025-02-13T20:02:01.9142151Z  Detecting DRAM: - +2025-02-13T20:02:01.9142422Z +2025-02-13T20:02:01.9142834Z [1/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9143469Z  +2025-02-13T20:02:01.9143889Z  Detected Chips: 1 +2025-02-13T20:02:01.9144141Z +2025-02-13T20:02:01.9144417Z  Detecting ARC: \ +2025-02-13T20:02:01.9144665Z +2025-02-13T20:02:01.9144915Z  Detecting DRAM: \ +2025-02-13T20:02:01.9145170Z +2025-02-13T20:02:01.9145586Z [1/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9146233Z  +2025-02-13T20:02:01.9146664Z  Detected Chips: 1 +2025-02-13T20:02:01.9146935Z +2025-02-13T20:02:01.9170568Z  Detecting ARC: | +2025-02-13T20:02:01.9171033Z +2025-02-13T20:02:01.9171284Z  Detecting DRAM: | +2025-02-13T20:02:01.9171535Z +2025-02-13T20:02:01.9171908Z [1/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9172458Z  +2025-02-13T20:02:01.9172818Z  Detected Chips: 1 +2025-02-13T20:02:01.9173050Z +2025-02-13T20:02:01.9173254Z  Detecting ARC: / +2025-02-13T20:02:01.9173481Z +2025-02-13T20:02:01.9173684Z  Detecting DRAM: / +2025-02-13T20:02:01.9173914Z +2025-02-13T20:02:01.9174289Z [1/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9175049Z  +2025-02-13T20:02:01.9175403Z  Detected Chips: 1 +2025-02-13T20:02:01.9175650Z +2025-02-13T20:02:01.9175843Z  Detecting ARC: - +2025-02-13T20:02:01.9176086Z +2025-02-13T20:02:01.9176279Z  Detecting DRAM: - +2025-02-13T20:02:01.9176522Z +2025-02-13T20:02:01.9176891Z [1/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9177434Z  +2025-02-13T20:02:01.9177783Z  Detected Chips: 1 +2025-02-13T20:02:01.9178016Z +2025-02-13T20:02:01.9178220Z  Detecting ARC: \ +2025-02-13T20:02:01.9178443Z +2025-02-13T20:02:01.9178650Z  Detecting DRAM: \ +2025-02-13T20:02:01.9178880Z +2025-02-13T20:02:01.9179252Z [1/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9179798Z  +2025-02-13T20:02:01.9180141Z  Detected Chips: 1 +2025-02-13T20:02:01.9180390Z +2025-02-13T20:02:01.9180587Z  Detecting ARC: | +2025-02-13T20:02:01.9180825Z +2025-02-13T20:02:01.9181174Z  Detecting DRAM: | +2025-02-13T20:02:01.9181496Z +2025-02-13T20:02:01.9181914Z [1/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9182552Z  +2025-02-13T20:02:01.9182901Z  Detected Chips: 1 +2025-02-13T20:02:01.9183146Z +2025-02-13T20:02:01.9183348Z  Detecting ARC: / +2025-02-13T20:02:01.9183581Z +2025-02-13T20:02:01.9183787Z  Detecting DRAM: / +2025-02-13T20:02:01.9184018Z +2025-02-13T20:02:01.9184387Z [1/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9184924Z  +2025-02-13T20:02:01.9185264Z  Detected Chips: 1 
+2025-02-13T20:02:01.9185509Z +2025-02-13T20:02:01.9185701Z  Detecting ARC: - +2025-02-13T20:02:01.9185941Z +2025-02-13T20:02:01.9186134Z  Detecting DRAM: - +2025-02-13T20:02:01.9186376Z +2025-02-13T20:02:01.9186735Z [1/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9187288Z  +2025-02-13T20:02:01.9187633Z  Detected Chips: 1 +2025-02-13T20:02:01.9187862Z +2025-02-13T20:02:01.9188060Z  Detecting ARC: \ +2025-02-13T20:02:01.9188283Z +2025-02-13T20:02:01.9188487Z  Detecting DRAM: \ +2025-02-13T20:02:01.9188719Z +2025-02-13T20:02:01.9189087Z [2/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9189706Z  +2025-02-13T20:02:01.9190063Z  Detected Chips: 1 +2025-02-13T20:02:01.9190297Z +2025-02-13T20:02:01.9190493Z  Detecting ARC: | +2025-02-13T20:02:01.9190732Z +2025-02-13T20:02:01.9190923Z  Detecting DRAM: | +2025-02-13T20:02:01.9191165Z +2025-02-13T20:02:01.9191526Z [2/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9192070Z  +2025-02-13T20:02:01.9192421Z  Detected Chips: 1 +2025-02-13T20:02:01.9192665Z +2025-02-13T20:02:01.9192863Z  Detecting ARC: / +2025-02-13T20:02:01.9193089Z +2025-02-13T20:02:01.9193296Z  Detecting DRAM: / +2025-02-13T20:02:01.9193526Z +2025-02-13T20:02:01.9193899Z [2/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9194572Z  +2025-02-13T20:02:01.9194921Z  Detected Chips: 1 +2025-02-13T20:02:01.9195146Z +2025-02-13T20:02:01.9195331Z  Detecting ARC: - +2025-02-13T20:02:01.9195554Z +2025-02-13T20:02:01.9195747Z  Detecting DRAM: - +2025-02-13T20:02:01.9195991Z +2025-02-13T20:02:01.9196354Z [2/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9196895Z  +2025-02-13T20:02:01.9197245Z  Detected Chips: 1 +2025-02-13T20:02:01.9197482Z +2025-02-13T20:02:01.9197671Z  Detecting ARC: \ +2025-02-13T20:02:01.9197897Z +2025-02-13T20:02:01.9198097Z  Detecting DRAM: \ +2025-02-13T20:02:01.9198320Z +2025-02-13T20:02:01.9198685Z [2/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9199216Z  +2025-02-13T20:02:01.9199574Z  Detected Chips: 1 +2025-02-13T20:02:01.9199803Z +2025-02-13T20:02:01.9199999Z  Detecting ARC: | +2025-02-13T20:02:01.9200247Z +2025-02-13T20:02:01.9200440Z  Detecting DRAM: | +2025-02-13T20:02:01.9200682Z +2025-02-13T20:02:01.9201042Z [2/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9201626Z  +2025-02-13T20:02:01.9202123Z  Detected Chips: 1 +2025-02-13T20:02:01.9202355Z +2025-02-13T20:02:01.9202560Z  Detecting ARC: / +2025-02-13T20:02:01.9202789Z +2025-02-13T20:02:01.9202988Z  Detecting DRAM: / +2025-02-13T20:02:01.9203230Z +2025-02-13T20:02:01.9203588Z [2/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9204130Z  +2025-02-13T20:02:01.9204476Z  Detected Chips: 1 +2025-02-13T20:02:01.9204811Z +2025-02-13T20:02:01.9205010Z  Detecting ARC: - +2025-02-13T20:02:01.9205244Z +2025-02-13T20:02:01.9205438Z  Detecting DRAM: - +2025-02-13T20:02:01.9205666Z +2025-02-13T20:02:01.9206032Z [2/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9206584Z  +2025-02-13T20:02:01.9206916Z  Detected Chips: 1 +2025-02-13T20:02:01.9207138Z +2025-02-13T20:02:01.9207339Z  Detecting ARC: \ +2025-02-13T20:02:01.9207558Z +2025-02-13T20:02:01.9207744Z  Detecting DRAM: \ +2025-02-13T20:02:01.9207983Z +2025-02-13T20:02:01.9208336Z [2/900] [0/16] ETH: Waiting for initial training to complete: \ 
+2025-02-13T20:02:01.9208883Z  +2025-02-13T20:02:01.9209225Z  Detected Chips: 1 +2025-02-13T20:02:01.9209465Z +2025-02-13T20:02:01.9209655Z  Detecting ARC: | +2025-02-13T20:02:01.9209885Z +2025-02-13T20:02:01.9210090Z  Detecting DRAM: | +2025-02-13T20:02:01.9210323Z +2025-02-13T20:02:01.9210684Z [2/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9211213Z  +2025-02-13T20:02:01.9211535Z  Detected Chips: 1 +2025-02-13T20:02:01.9211769Z +2025-02-13T20:02:01.9211954Z  Detecting ARC: / +2025-02-13T20:02:01.9212175Z +2025-02-13T20:02:01.9212360Z  Detecting DRAM: / +2025-02-13T20:02:01.9212584Z +2025-02-13T20:02:01.9212935Z [3/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9213472Z  +2025-02-13T20:02:01.9213803Z  Detected Chips: 1 +2025-02-13T20:02:01.9214026Z +2025-02-13T20:02:01.9214216Z  Detecting ARC: - +2025-02-13T20:02:01.9214436Z +2025-02-13T20:02:01.9214631Z  Detecting DRAM: - +2025-02-13T20:02:01.9215038Z +2025-02-13T20:02:01.9215407Z [3/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9215940Z  +2025-02-13T20:02:01.9216272Z  Detected Chips: 1 +2025-02-13T20:02:01.9216500Z +2025-02-13T20:02:01.9216691Z  Detecting ARC: \ +2025-02-13T20:02:01.9216915Z +2025-02-13T20:02:01.9217102Z  Detecting DRAM: \ +2025-02-13T20:02:01.9217469Z +2025-02-13T20:02:01.9217834Z [3/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9218359Z  +2025-02-13T20:02:01.9218684Z  Detected Chips: 1 +2025-02-13T20:02:01.9218914Z +2025-02-13T20:02:01.9219118Z  Detecting ARC: | +2025-02-13T20:02:01.9219349Z +2025-02-13T20:02:01.9219561Z  Detecting DRAM: | +2025-02-13T20:02:01.9219783Z +2025-02-13T20:02:01.9220148Z [3/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9220688Z  +2025-02-13T20:02:01.9221035Z  Detected Chips: 1 +2025-02-13T20:02:01.9221266Z +2025-02-13T20:02:01.9221456Z  Detecting ARC: / +2025-02-13T20:02:01.9221689Z +2025-02-13T20:02:01.9221879Z  Detecting DRAM: / +2025-02-13T20:02:01.9222121Z +2025-02-13T20:02:01.9222481Z [3/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9223038Z  +2025-02-13T20:02:01.9223383Z  Detected Chips: 1 +2025-02-13T20:02:01.9223620Z +2025-02-13T20:02:01.9223810Z  Detecting ARC: - +2025-02-13T20:02:01.9224037Z +2025-02-13T20:02:01.9224243Z  Detecting DRAM: - +2025-02-13T20:02:01.9224472Z +2025-02-13T20:02:01.9224843Z [3/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9225385Z  +2025-02-13T20:02:01.9225727Z  Detected Chips: 1 +2025-02-13T20:02:01.9225955Z +2025-02-13T20:02:01.9226148Z  Detecting ARC: \ +2025-02-13T20:02:01.9226382Z +2025-02-13T20:02:01.9226572Z  Detecting DRAM: \ +2025-02-13T20:02:01.9226806Z +2025-02-13T20:02:01.9227270Z [3/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9227814Z  +2025-02-13T20:02:01.9228156Z  Detected Chips: 1 +2025-02-13T20:02:01.9228386Z +2025-02-13T20:02:01.9228571Z  Detecting ARC: | +2025-02-13T20:02:01.9228808Z +2025-02-13T20:02:01.9229008Z  Detecting DRAM: | +2025-02-13T20:02:01.9229238Z +2025-02-13T20:02:01.9229678Z [3/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9230213Z  +2025-02-13T20:02:01.9230552Z  Detected Chips: 1 +2025-02-13T20:02:01.9230777Z +2025-02-13T20:02:01.9230962Z  Detecting ARC: / +2025-02-13T20:02:01.9231195Z +2025-02-13T20:02:01.9231386Z  Detecting DRAM: / +2025-02-13T20:02:01.9231629Z +2025-02-13T20:02:01.9231984Z [3/900] 
[0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9232520Z  +2025-02-13T20:02:01.9232862Z  Detected Chips: 1 +2025-02-13T20:02:01.9233120Z +2025-02-13T20:02:01.9233315Z  Detecting ARC: - +2025-02-13T20:02:01.9233541Z +2025-02-13T20:02:01.9233764Z  Detecting DRAM: - +2025-02-13T20:02:01.9233996Z +2025-02-13T20:02:01.9234371Z [3/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9234929Z  +2025-02-13T20:02:01.9235296Z  Detected Chips: 1 +2025-02-13T20:02:01.9235521Z +2025-02-13T20:02:01.9235722Z  Detecting ARC: \ +2025-02-13T20:02:01.9235949Z +2025-02-13T20:02:01.9236144Z  Detecting DRAM: \ +2025-02-13T20:02:01.9236376Z +2025-02-13T20:02:01.9236728Z [4/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9237280Z  +2025-02-13T20:02:01.9237629Z  Detected Chips: 1 +2025-02-13T20:02:01.9237880Z +2025-02-13T20:02:01.9238077Z  Detecting ARC: | +2025-02-13T20:02:01.9238321Z +2025-02-13T20:02:01.9238521Z  Detecting DRAM: | +2025-02-13T20:02:01.9238763Z +2025-02-13T20:02:01.9239148Z [4/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9239696Z  +2025-02-13T20:02:01.9240061Z  Detected Chips: 1 +2025-02-13T20:02:01.9240414Z +2025-02-13T20:02:01.9240627Z  Detecting ARC: / +2025-02-13T20:02:01.9240858Z +2025-02-13T20:02:01.9241058Z  Detecting DRAM: / +2025-02-13T20:02:01.9241311Z +2025-02-13T20:02:01.9241678Z [4/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9242235Z  +2025-02-13T20:02:01.9242605Z  Detected Chips: 1 +2025-02-13T20:02:01.9242858Z +2025-02-13T20:02:01.9243059Z  Detecting ARC: - +2025-02-13T20:02:01.9243301Z +2025-02-13T20:02:01.9243503Z  Detecting DRAM: - +2025-02-13T20:02:01.9243736Z +2025-02-13T20:02:01.9244120Z [4/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9244685Z  +2025-02-13T20:02:01.9245057Z  Detected Chips: 1 +2025-02-13T20:02:01.9245293Z +2025-02-13T20:02:01.9245504Z  Detecting ARC: \ +2025-02-13T20:02:01.9245734Z +2025-02-13T20:02:01.9245927Z  Detecting DRAM: \ +2025-02-13T20:02:01.9246173Z +2025-02-13T20:02:01.9246528Z [4/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9247068Z  +2025-02-13T20:02:01.9247414Z  Detected Chips: 1 +2025-02-13T20:02:01.9247654Z +2025-02-13T20:02:01.9247844Z  Detecting ARC: | +2025-02-13T20:02:01.9248078Z +2025-02-13T20:02:01.9248269Z  Detecting DRAM: | +2025-02-13T20:02:01.9248498Z +2025-02-13T20:02:01.9248864Z [4/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9249404Z  +2025-02-13T20:02:01.9249746Z  Detected Chips: 1 +2025-02-13T20:02:01.9249975Z +2025-02-13T20:02:01.9250179Z  Detecting ARC: / +2025-02-13T20:02:01.9250523Z +2025-02-13T20:02:01.9250730Z  Detecting DRAM: / +2025-02-13T20:02:01.9250963Z +2025-02-13T20:02:01.9251320Z [4/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9251860Z  +2025-02-13T20:02:01.9252211Z  Detected Chips: 1 +2025-02-13T20:02:01.9252452Z +2025-02-13T20:02:01.9252643Z  Detecting ARC: - +2025-02-13T20:02:01.9252882Z +2025-02-13T20:02:01.9253074Z  Detecting DRAM: - +2025-02-13T20:02:01.9253304Z +2025-02-13T20:02:01.9253680Z [4/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9254225Z  +2025-02-13T20:02:01.9254578Z  Detected Chips: 1 +2025-02-13T20:02:01.9255080Z +2025-02-13T20:02:01.9255300Z  Detecting ARC: \ +2025-02-13T20:02:01.9255535Z +2025-02-13T20:02:01.9255738Z  Detecting DRAM: \ 
+2025-02-13T20:02:01.9255970Z +2025-02-13T20:02:01.9256346Z [4/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9256916Z  +2025-02-13T20:02:01.9257297Z  Detected Chips: 1 +2025-02-13T20:02:01.9257551Z +2025-02-13T20:02:01.9257748Z  Detecting ARC: | +2025-02-13T20:02:01.9258007Z +2025-02-13T20:02:01.9258203Z  Detecting DRAM: | +2025-02-13T20:02:01.9258451Z +2025-02-13T20:02:01.9258822Z [4/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9259386Z  +2025-02-13T20:02:01.9259747Z  Detected Chips: 1 +2025-02-13T20:02:01.9259987Z +2025-02-13T20:02:01.9260198Z  Detecting ARC: / +2025-02-13T20:02:01.9260427Z +2025-02-13T20:02:01.9260638Z  Detecting DRAM: / +2025-02-13T20:02:01.9260868Z +2025-02-13T20:02:01.9261227Z [5/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9261777Z  +2025-02-13T20:02:01.9262137Z  Detected Chips: 1 +2025-02-13T20:02:01.9262394Z +2025-02-13T20:02:01.9262638Z  Detecting ARC: - +2025-02-13T20:02:01.9262881Z +2025-02-13T20:02:01.9263082Z  Detecting DRAM: - +2025-02-13T20:02:01.9263328Z +2025-02-13T20:02:01.9263699Z [5/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9264397Z  +2025-02-13T20:02:01.9264759Z  Detected Chips: 1 +2025-02-13T20:02:01.9264992Z +2025-02-13T20:02:01.9265207Z  Detecting ARC: \ +2025-02-13T20:02:01.9265437Z +2025-02-13T20:02:01.9265653Z  Detecting DRAM: \ +2025-02-13T20:02:01.9265887Z +2025-02-13T20:02:01.9266278Z [5/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9266832Z  +2025-02-13T20:02:01.9267196Z  Detected Chips: 1 +2025-02-13T20:02:01.9267440Z +2025-02-13T20:02:01.9267629Z  Detecting ARC: | +2025-02-13T20:02:01.9267859Z +2025-02-13T20:02:01.9268049Z  Detecting DRAM: | +2025-02-13T20:02:01.9268288Z +2025-02-13T20:02:01.9268658Z [5/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9269192Z  +2025-02-13T20:02:01.9269539Z  Detected Chips: 1 +2025-02-13T20:02:01.9269842Z +2025-02-13T20:02:01.9270093Z  Detecting ARC: / +2025-02-13T20:02:01.9270317Z +2025-02-13T20:02:01.9270515Z  Detecting DRAM: / +2025-02-13T20:02:01.9270740Z +2025-02-13T20:02:01.9271109Z [5/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9271648Z  +2025-02-13T20:02:01.9271995Z  Detected Chips: 1 +2025-02-13T20:02:01.9272235Z +2025-02-13T20:02:01.9272427Z  Detecting ARC: - +2025-02-13T20:02:01.9272672Z +2025-02-13T20:02:01.9272865Z  Detecting DRAM: - +2025-02-13T20:02:01.9273104Z +2025-02-13T20:02:01.9273462Z [5/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9274019Z  +2025-02-13T20:02:01.9274482Z  Detected Chips: 1 +2025-02-13T20:02:01.9274721Z +2025-02-13T20:02:01.9274930Z  Detecting ARC: \ +2025-02-13T20:02:01.9275153Z +2025-02-13T20:02:01.9275358Z  Detecting DRAM: \ +2025-02-13T20:02:01.9275596Z +2025-02-13T20:02:01.9275976Z [5/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9276509Z  +2025-02-13T20:02:01.9276861Z  Detected Chips: 1 +2025-02-13T20:02:01.9277123Z +2025-02-13T20:02:01.9277318Z  Detecting ARC: | +2025-02-13T20:02:01.9277570Z +2025-02-13T20:02:01.9277764Z  Detecting DRAM: | +2025-02-13T20:02:01.9278018Z +2025-02-13T20:02:01.9278383Z [5/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9278943Z  +2025-02-13T20:02:01.9279311Z  Detected Chips: 1 +2025-02-13T20:02:01.9279546Z +2025-02-13T20:02:01.9279764Z  Detecting ARC: / 
+2025-02-13T20:02:01.9279998Z +2025-02-13T20:02:01.9280218Z  Detecting DRAM: / +2025-02-13T20:02:01.9280450Z +2025-02-13T20:02:01.9280828Z [5/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9281368Z  +2025-02-13T20:02:01.9281741Z  Detected Chips: 1 +2025-02-13T20:02:01.9281992Z +2025-02-13T20:02:01.9282189Z  Detecting ARC: - +2025-02-13T20:02:01.9282434Z +2025-02-13T20:02:01.9282635Z  Detecting DRAM: - +2025-02-13T20:02:01.9282893Z +2025-02-13T20:02:01.9283258Z [5/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9283824Z  +2025-02-13T20:02:01.9284192Z  Detected Chips: 1 +2025-02-13T20:02:01.9284431Z +2025-02-13T20:02:01.9284645Z  Detecting ARC: \ +2025-02-13T20:02:01.9284878Z +2025-02-13T20:02:01.9285096Z  Detecting DRAM: \ +2025-02-13T20:02:01.9285331Z +2025-02-13T20:02:01.9285715Z [6/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9286268Z  +2025-02-13T20:02:01.9286631Z  Detected Chips: 1 +2025-02-13T20:02:01.9286888Z +2025-02-13T20:02:01.9287085Z  Detecting ARC: | +2025-02-13T20:02:01.9287330Z +2025-02-13T20:02:01.9287639Z  Detecting DRAM: | +2025-02-13T20:02:01.9287878Z +2025-02-13T20:02:01.9288236Z [6/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9288778Z  +2025-02-13T20:02:01.9289129Z  Detected Chips: 1 +2025-02-13T20:02:01.9289363Z +2025-02-13T20:02:01.9289566Z  Detecting ARC: / +2025-02-13T20:02:01.9289790Z +2025-02-13T20:02:01.9289998Z  Detecting DRAM: / +2025-02-13T20:02:01.9290223Z +2025-02-13T20:02:01.9290590Z [6/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9291126Z  +2025-02-13T20:02:01.9291460Z  Detected Chips: 1 +2025-02-13T20:02:01.9291696Z +2025-02-13T20:02:01.9291892Z  Detecting ARC: - +2025-02-13T20:02:01.9292129Z +2025-02-13T20:02:01.9292325Z  Detecting DRAM: - +2025-02-13T20:02:01.9292564Z +2025-02-13T20:02:01.9292922Z [6/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9293465Z  +2025-02-13T20:02:01.9293813Z  Detected Chips: 1 +2025-02-13T20:02:01.9294046Z +2025-02-13T20:02:01.9294246Z  Detecting ARC: \ +2025-02-13T20:02:01.9294471Z +2025-02-13T20:02:01.9294848Z  Detecting DRAM: \ +2025-02-13T20:02:01.9295109Z +2025-02-13T20:02:01.9295491Z [6/900] [0/16] ETH: Waiting for initial training to complete: \ +2025-02-13T20:02:01.9296032Z  +2025-02-13T20:02:01.9296371Z  Detected Chips: 1 +2025-02-13T20:02:01.9296610Z +2025-02-13T20:02:01.9296807Z  Detecting ARC: | +2025-02-13T20:02:01.9297045Z +2025-02-13T20:02:01.9297238Z  Detecting DRAM: | +2025-02-13T20:02:01.9297477Z +2025-02-13T20:02:01.9297973Z [6/900] [0/16] ETH: Waiting for initial training to complete: | +2025-02-13T20:02:01.9298542Z  +2025-02-13T20:02:01.9298904Z  Detected Chips: 1 +2025-02-13T20:02:01.9299139Z +2025-02-13T20:02:01.9299354Z  Detecting ARC: / +2025-02-13T20:02:01.9299592Z +2025-02-13T20:02:01.9299811Z  Detecting DRAM: / +2025-02-13T20:02:01.9300049Z +2025-02-13T20:02:01.9300428Z [6/900] [0/16] ETH: Waiting for initial training to complete: / +2025-02-13T20:02:01.9300995Z  +2025-02-13T20:02:01.9301355Z  Detected Chips: 1 +2025-02-13T20:02:01.9301590Z +2025-02-13T20:02:01.9301786Z  Detecting ARC: - +2025-02-13T20:02:01.9302036Z +2025-02-13T20:02:01.9302233Z  Detecting DRAM: - +2025-02-13T20:02:01.9302483Z +2025-02-13T20:02:01.9302848Z [6/900] [0/16] ETH: Waiting for initial training to complete: - +2025-02-13T20:02:01.9303401Z  +2025-02-13T20:02:01.9303765Z  Detected Chips: 1 
[tt-smi reset progress output elided: repeated "Detecting ARC", "Detecting DRAM" and "[n/900] [0/16] ETH: Waiting for initial training to complete" spinner lines; each pass reports "Detected Chips: 1"]
+2025-02-13T20:02:01.9319847Z ##[notice]tt-smi reset was successful
+2025-02-13T20:02:01.9333781Z Starting pci link reset on WH devices at pci indices: 0
+2025-02-13T20:02:01.9334495Z Finishing pci link reset on WH devices at pci indices: 0
+2025-02-13T20:02:01.9335335Z Re-initializing boards after reset....
[board re-initialization progress output elided: the same ARC/DRAM/ETH spinner lines repeat until initial training completes; each pass reports "Detected Chips: 1"]
+2025-02-13T20:02:01.9588678Z [] [16/16] ETH: / == *\N\o\ \c\h\i\p\s\ \d\e\t\e\c\t\e\d* ]]
+2025-02-13T20:02:01.9589355Z + break
+2025-02-13T20:02:01.9589724Z + '[' 1 -eq 10 ']'
+2025-02-13T20:02:01.9590281Z + echo '::notice title=reset-successful-startup::tt-smi reset was successful'
+2025-02-13T20:02:01.9590846Z + check_hugepages_service_status=0
+2025-02-13T20:02:01.9591302Z + sudo systemctl status tenstorrent-hugepages.service
+2025-02-13T20:02:01.9591860Z + check_hugepages_service_status=3
+2025-02-13T20:02:01.9592225Z + '[' 3 -eq 4 ']'
+2025-02-13T20:02:01.9593132Z + echo '::notice title=hugepages-service-found-startup::Hugepages service found. Command returned with exit code 3. Restarting it so we can ensure hugepages are available'
+2025-02-13T20:02:01.9594174Z + sudo systemctl restart tenstorrent-hugepages.service
+2025-02-13T20:02:01.9595013Z ● tenstorrent-hugepages.service - Script that configures hugepages for Tenstorrent ASICs
+2025-02-13T20:02:01.9595962Z Loaded: loaded (/lib/systemd/system/tenstorrent-hugepages.service; enabled; vendor preset: enabled)
+2025-02-13T20:02:01.9596818Z Active: failed (Result: exit-code) since Thu 2025-02-13 19:54:06 UTC; 7min ago
+2025-02-13T20:02:01.9597638Z Process: 929227 ExecStart=/opt/tenstorrent/bin/hugepages-setup.sh (code=exited, status=1/FAILURE)
+2025-02-13T20:02:01.9598351Z Main PID: 929227 (code=exited, status=1/FAILURE)
+2025-02-13T20:02:01.9598654Z
+2025-02-13T20:02:01.9599045Z Feb 13 19:54:06 tt-metal-ci-vm-68 systemd[1]: Started Script that configures hugepages for Tenstorrent ASICs.
+2025-02-13T20:02:01.9599869Z Feb 13 19:54:06 tt-metal-ci-vm-68 hugepages-setup.sh[929227]: Node 0 hugepages before: 0
+2025-02-13T20:02:01.9600598Z Feb 13 19:54:06 tt-metal-ci-vm-68 hugepages-setup.sh[929227]: Node 0 hugepages needed: 4
+2025-02-13T20:02:01.9601457Z Feb 13 19:54:06 tt-metal-ci-vm-68 hugepages-setup.sh[929227]: Node 0 hugepages after: 0
+2025-02-13T20:02:01.9602286Z Feb 13 19:54:06 tt-metal-ci-vm-68 hugepages-setup.sh[929227]: Failed to get requested 4 hugepages, only got 0
+2025-02-13T20:02:01.9603288Z Feb 13 19:54:06 tt-metal-ci-vm-68 systemd[1]: tenstorrent-hugepages.service: Main process exited, code=exited, status=1/FAILURE
+2025-02-13T20:02:01.9604296Z Feb 13 19:54:06 tt-metal-ci-vm-68 systemd[1]: tenstorrent-hugepages.service: Failed with result 'exit-code'.
+2025-02-13T20:02:01.9606576Z ##[notice]Hugepages service found. Command returned with exit code 3. Restarting it so we can ensure hugepages are available
+2025-02-13T20:02:01.9778291Z ++ date +%s
+2025-02-13T20:02:01.9791878Z + hugepages_check_start=1739476921
+2025-02-13T20:02:01.9813506Z + hugepages_check_timeout=60
+2025-02-13T20:02:01.9815118Z ++ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
+2025-02-13T20:02:01.9815871Z + [[ 1 -eq 0 ]]
+2025-02-13T20:02:01.9818544Z ##[notice]Hugepages is now setup.
+2025-02-13T20:02:01.9820624Z + echo '::notice title=hugepages-setup-success-startup::Hugepages is now setup.'
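For reference, the set -x fragments above trace the runner's startup health checks: retry the tt-smi reset until the chips respond, then make sure 1 GiB hugepages are configured before the tests start. Below is a minimal bash sketch of that flow, reconstructed only from this trace; the actual CI script, its variable names, and the exact tt-smi reset invocation are assumptions rather than something this log confirms.

    # Sketch of the startup checks implied by the trace above (illustrative, not the real script).
    # 1. Reset the Tenstorrent device, retrying up to 10 times until tt-smi
    #    stops reporting "No chips detected".
    for attempt in $(seq 1 10); do
      reset_output=$(tt-smi -r 0 2>&1) || true   # reset invocation assumed; the log only shows its output
      if [[ "$reset_output" != *"No chips detected"* ]]; then
        echo "::notice title=reset-successful-startup::tt-smi reset was successful"
        break
      fi
      [ "$attempt" -eq 10 ] && { echo "device reset failed after $attempt attempts"; exit 1; }
    done

    # 2. Ensure 1 GiB hugepages exist: restart the service if it is installed but
    #    not active (systemctl status returned 3 above), then poll the kernel
    #    counter for up to 60 seconds.
    status=0
    sudo systemctl status tenstorrent-hugepages.service || status=$?
    if [ "$status" -ne 4 ]; then    # 4 == "no such unit"
      sudo systemctl restart tenstorrent-hugepages.service
    fi
    check_start=$(date +%s)
    check_timeout=60
    while [ "$(cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages)" -eq 0 ]; do
      if [ $(( $(date +%s) - check_start )) -ge "$check_timeout" ]; then
        echo "hugepages were not configured within ${check_timeout}s"; exit 1
      fi
      sleep 1
    done
    echo "::notice title=hugepages-setup-success-startup::Hugepages is now setup."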
+2025-02-13T20:02:01.9821433Z + echo 'Printing out cpu information...'
+2025-02-13T20:02:01.9821992Z + lscpu
+2025-02-13T20:02:01.9822397Z Printing out cpu information...
+2025-02-13T20:02:01.9854352Z Architecture: x86_64
+2025-02-13T20:02:01.9855159Z CPU op-mode(s): 32-bit, 64-bit
+2025-02-13T20:02:01.9855680Z Byte Order: Little Endian
+2025-02-13T20:02:01.9856248Z Address sizes: 40 bits physical, 48 bits virtual
+2025-02-13T20:02:01.9856791Z CPU(s): 14
+2025-02-13T20:02:01.9857246Z On-line CPU(s) list: 0-13
+2025-02-13T20:02:01.9857701Z Thread(s) per core: 1
+2025-02-13T20:02:01.9859578Z Core(s) per socket: 1
+2025-02-13T20:02:01.9860164Z Socket(s): 14
+2025-02-13T20:02:01.9860629Z NUMA node(s): 2
+2025-02-13T20:02:01.9861674Z Vendor ID: AuthenticAMD
+2025-02-13T20:02:01.9862164Z CPU family: 23
+2025-02-13T20:02:01.9862606Z Model: 49
+2025-02-13T20:02:01.9863122Z Model name: AMD EPYC-Rome Processor
+2025-02-13T20:02:01.9864082Z Stepping: 0
+2025-02-13T20:02:01.9864532Z CPU MHz: 2300.000
+2025-02-13T20:02:01.9865004Z BogoMIPS: 4600.00
+2025-02-13T20:02:01.9865518Z Virtualization: AMD-V
+2025-02-13T20:02:01.9865967Z Hypervisor vendor: KVM
+2025-02-13T20:02:01.9866483Z Virtualization type: full
+2025-02-13T20:02:01.9866953Z L1d cache: 448 KiB
+2025-02-13T20:02:01.9867414Z L1i cache: 448 KiB
+2025-02-13T20:02:01.9867876Z L2 cache: 7 MiB
+2025-02-13T20:02:01.9868330Z L3 cache: 224 MiB
+2025-02-13T20:02:01.9868794Z NUMA node0 CPU(s): 0-6
+2025-02-13T20:02:01.9869223Z NUMA node1 CPU(s): 7-13
[lscpu vulnerability and CPU-flags lines elided: Retbleed is reported as Vulnerable; every other listed vulnerability is Not affected or mitigated]
+2025-02-13T20:02:02.0115124Z ##[group]Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
+2025-02-13T20:02:02.0115857Z with:
+2025-02-13T20:02:02.0116326Z token: ***
+2025-02-13T20:02:02.0116641Z fetch-depth: 1
+2025-02-13T20:02:02.0116962Z env:
+2025-02-13T20:02:02.0117249Z ARCH_NAME: wormhole_b0
+2025-02-13T20:02:02.0117611Z LOGURU_LEVEL: INFO
+2025-02-13T20:02:02.0117975Z ##[endgroup]
+2025-02-13T20:02:02.0204518Z ##[group]Run set -x
[workspace cleanup script elided: set -x; ls -al; inspect semicolon_delimited_script if present; sudo rm -rf deleteme docker-job; then, if .git exists, git clean -xffd, delete any .git/*.lock files, and git submodule deinit -f --all]
+2025-02-13T20:02:02.0237106Z ##[endgroup]
[cleanup trace elided: ls -al lists the previous run's checkout (build/, built/, generated/, python_env/, runtime/, ttm_any.tar, ttnn/, tt_metal/, tests/, ...); semicolon_delimited_script is an ASCII helper that optionally installs a wheel and pins pip==21.2.4 plus requirements-dev.txt; git clean -xffd removes the generated directories and ttm_any.tar, lock files are deleted, and all seven submodules are de-inited]
[actions/checkout@v4 output elided: the previous branch smanoj/conv_device_weights (HEAD 68e85df3d) is deleted, the repository is cleaned and reset, auth is configured, and refs/remotes/origin/sagarwal/multi_page_buffer is fetched with --depth=1]
+2025-02-13T20:02:06.6429331Z [command]/usr/bin/git checkout --progress --force -B sagarwal/multi_page_buffer refs/remotes/origin/sagarwal/multi_page_buffer
+2025-02-13T20:02:06.8201360Z Switched to a new branch 'sagarwal/multi_page_buffer'
[submodule setup elided: llama, wandb-cpp, tracy, tt_llk_blackhole, tt_llk_grayskull, tt_llk_wormhole_b0 and umd are registered and checked out at their pinned commits; credentials and url.insteadOf rewrites are persisted per submodule, with duplicate-alias warnings for git@github.com: and org-64161552@github.com:]
+2025-02-13T20:02:07.9462430Z [command]/usr/bin/git log -1 --format=%H
+2025-02-13T20:02:07.9520783Z ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70
+2025-02-13T20:02:07.9664342Z ##[group]Run git submodule foreach 'git clean -xffd'
[submodule clean trace elided: git clean -xffd runs in each of the seven submodules]
+2025-02-13T20:02:08.0238462Z Prepare all required actions
+2025-02-13T20:02:08.0239161Z Getting action download info
+2025-02-13T20:02:08.1927444Z Download action repository 'actions/download-artifact@v4' (SHA:fa0a91b85d4f404e444e00e005971372dc801d16)
(SHA:94c3c3d9567a0205de6da68a76c428ce4e769af1) +2025-02-13T20:02:09.8050374Z ##[group]Run ./.github/actions/prepare-metal-run +2025-02-13T20:02:09.8050815Z with: +2025-02-13T20:02:09.8051108Z is_profiler: false +2025-02-13T20:02:09.8051458Z python-version: 3.8 +2025-02-13T20:02:09.8051827Z run-telemetry: false +2025-02-13T20:02:09.8052158Z env: +2025-02-13T20:02:09.8052445Z ARCH_NAME: wormhole_b0 +2025-02-13T20:02:09.8052783Z LOGURU_LEVEL: INFO +2025-02-13T20:02:09.8053086Z ##[endgroup] +2025-02-13T20:02:09.8122971Z ##[group]Run actions/download-artifact@v4 +2025-02-13T20:02:09.8123425Z with: +2025-02-13T20:02:09.8123714Z name: TTMetal_build_any +2025-02-13T20:02:09.8124103Z merge-multiple: false +2025-02-13T20:02:09.8124465Z repository: tenstorrent/tt-metal +2025-02-13T20:02:09.8124847Z run-id: 13315815702 +2025-02-13T20:02:09.8125142Z env: +2025-02-13T20:02:09.8125421Z ARCH_NAME: wormhole_b0 +2025-02-13T20:02:09.8125753Z LOGURU_LEVEL: INFO +2025-02-13T20:02:09.8126085Z ##[endgroup] +2025-02-13T20:02:10.0597799Z Downloading single artifact +2025-02-13T20:02:10.3014239Z Preparing to download the following artifacts: +2025-02-13T20:02:10.3015062Z - TTMetal_build_any (ID: 2588416029, Size: 171796974) +2025-02-13T20:02:10.4427634Z Redirecting to blob download url: https://productionresultssa8.blob.core.windows.net/actions-results/c50d1cc6-5c31-4c4c-b0e4-cb91df2420e1/workflow-job-run-85e4bcb1-b635-5839-8d32-ecb05ba8175c/artifacts/220fe10383c34fbe00d66e183fcfa42d19c438ee1c01790da9aeb9ea9685c6a0.zip +2025-02-13T20:02:10.4429548Z Starting download of artifact to: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:02:10.7055730Z (node:935428) [DEP0005] DeprecationWarning: Buffer() is deprecated due to security and usability issues. Please use the Buffer.alloc(), Buffer.allocUnsafe(), or Buffer.from() methods instead. +2025-02-13T20:02:10.7057178Z (Use `node --trace-deprecation ...` to show where the warning was created) +2025-02-13T20:02:20.8913190Z Artifact download completed successfully. +2025-02-13T20:02:20.8913882Z Total of 1 artifact(s) downloaded +2025-02-13T20:02:20.8920053Z Download artifact has finished successfully +2025-02-13T20:02:20.9081942Z ##[group]Run tar -xvf ttm_any.tar +2025-02-13T20:02:20.9082393Z tar -xvf ttm_any.tar +2025-02-13T20:02:20.9104203Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:02:20.9104695Z env: + +2025-02-13T20:04:10.8487948Z [ OK ] DeviceInit/DeviceParamFixture.TensixDeviceLoadBlankKernels/1 (6 ms) +2025-02-13T20:04:10.8489122Z [----------] 4 tests from DeviceInit/DeviceParamFixture (366 ms total) +2025-02-13T20:04:10.8489529Z +2025-02-13T20:04:10.8489710Z [----------] Global test environment tear-down +2025-02-13T20:04:10.8499044Z [==========] 30 tests from 7 test suites ran. (3503 ms total) +2025-02-13T20:04:10.8499669Z [ PASSED ] 13 tests. 
+2025-02-13T20:04:10.8500106Z [ SKIPPED ] 17 tests, listed below: +2025-02-13T20:04:10.8500661Z [ SKIPPED ] N300DeviceFixture.EthValidateEthernetConnectivity +2025-02-13T20:04:10.8501324Z [ SKIPPED ] N300DeviceFixture.EthInvalidLogicalEthernetCore +2025-02-13T20:04:10.8501969Z [ SKIPPED ] N300DeviceFixture.EthValidateAllEthernetCoreMapping +2025-02-13T20:04:10.8502658Z [ SKIPPED ] N300DeviceFixture.EthValidatePhysicalCoreConversion +2025-02-13T20:04:10.8503343Z [ SKIPPED ] N300DeviceFixture.ActiveEthValidateEthernetSockets +2025-02-13T20:04:10.8503955Z [ SKIPPED ] DevicePool.DevicePoolAddDevices +2025-02-13T20:04:10.8504466Z [ SKIPPED ] DevicePool.DevicePoolReduceDevices +2025-02-13T20:04:10.8505163Z [ SKIPPED ] TGFixture.ActiveEthValidateNumLinksBetweenAdjacentGalaxyChips +2025-02-13T20:04:10.8505871Z [ SKIPPED ] TGFixture.ValidateNumMMIOChips +2025-02-13T20:04:10.8506411Z [ SKIPPED ] TGFixture.ValidateNumGalaxyChips +2025-02-13T20:04:10.8506941Z [ SKIPPED ] TGFixture.ValidateChipBoardTypes +2025-02-13T20:04:10.8507716Z [ SKIPPED ] GalaxyFixture.ActiveEthValidateLinksBetweenMMIOAndGalaxyChips +2025-02-13T20:04:10.8508570Z [ SKIPPED ] GalaxyFixture.ValidateAllGalaxyChipsAreUnharvested +2025-02-13T20:04:10.8509298Z [ SKIPPED ] GalaxyFixture.ValidateAllMMIOChipsHaveSingleRowHarvested +2025-02-13T20:04:10.8509936Z [ SKIPPED ] TGGFixture.ValidateNumMMIOChips +2025-02-13T20:04:10.8510439Z [ SKIPPED ] TGGFixture.ValidateNumGalaxyChips +2025-02-13T20:04:10.8510949Z [ SKIPPED ] TGGFixture.ValidateChipBoardTypes +2025-02-13T20:04:10.8511777Z  Device | INFO  | Closing user mode device drivers +2025-02-13T20:04:12.1677100Z Prepare all required actions +2025-02-13T20:04:12.1677664Z Getting action download info +2025-02-13T20:04:12.4634756Z Download action repository 'actions/upload-artifact@v4' (SHA:65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08) +2025-02-13T20:04:13.2056274Z ##[group]Run ./.github/actions/upload-artifact-with-job-uuid +2025-02-13T20:04:13.2057082Z with: +2025-02-13T20:04:13.2057400Z path: generated/test_reports/ + +2025-02-13T20:04:13.2057798Z prefix: test_reports_ +2025-02-13T20:04:13.2058138Z env: +2025-02-13T20:04:13.2058440Z ARCH_NAME: wormhole_b0 +2025-02-13T20:04:13.2058796Z LOGURU_LEVEL: INFO +2025-02-13T20:04:13.2059317Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2060118Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:04:13.2060907Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2061671Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2062412Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2063165Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:04:13.2063939Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:04:13.2064588Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:04:13.2065452Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:04:13.2066250Z RUNNER_UID: 1000 +2025-02-13T20:04:13.2066799Z RUNNER_GID: 1000 +2025-02-13T20:04:13.2067124Z ##[endgroup] +2025-02-13T20:04:13.2125063Z ##[group]Run uuid=$(uuidgen) +2025-02-13T20:04:13.2125484Z uuid=$(uuidgen) +2025-02-13T20:04:13.2125901Z artifact_name="test_reports_$uuid" 
+2025-02-13T20:04:13.2126423Z echo "[UPLOAD-ARTIFACT-UUID] $artifact_name" +2025-02-13T20:04:13.2127034Z echo "artifact-name=$artifact_name" >> "$GITHUB_OUTPUT" +2025-02-13T20:04:13.2149156Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:04:13.2149633Z env: +2025-02-13T20:04:13.2149992Z ARCH_NAME: wormhole_b0 +2025-02-13T20:04:13.2150400Z LOGURU_LEVEL: INFO +2025-02-13T20:04:13.2150978Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2151797Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:04:13.2152590Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2153318Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2154241Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2154982Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:04:13.2155730Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:04:13.2156540Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:04:13.2157395Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:04:13.2158192Z RUNNER_UID: 1000 +2025-02-13T20:04:13.2158546Z RUNNER_GID: 1000 +2025-02-13T20:04:13.2158915Z ##[endgroup] +2025-02-13T20:04:13.2220585Z [UPLOAD-ARTIFACT-UUID] test_reports_304c9db0-c03c-4ec4-8503-cd90ed1e264c +2025-02-13T20:04:13.2279075Z ##[group]Run actions/upload-artifact@v4 +2025-02-13T20:04:13.2279537Z with: +2025-02-13T20:04:13.2279943Z name: test_reports_304c9db0-c03c-4ec4-8503-cd90ed1e264c +2025-02-13T20:04:13.2280474Z path: generated/test_reports/ + +2025-02-13T20:04:13.2280907Z if-no-files-found: warn +2025-02-13T20:04:13.2281319Z compression-level: 6 +2025-02-13T20:04:13.2281694Z overwrite: false +2025-02-13T20:04:13.2282047Z include-hidden-files: false +2025-02-13T20:04:13.2282772Z env: +2025-02-13T20:04:13.2283109Z ARCH_NAME: wormhole_b0 +2025-02-13T20:04:13.2283484Z LOGURU_LEVEL: INFO +2025-02-13T20:04:13.2284053Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2284869Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:04:13.2285971Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2286706Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2287440Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:04:13.2288192Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:04:13.2288959Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:04:13.2289592Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:04:13.2290483Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:04:13.2291294Z RUNNER_UID: 1000 +2025-02-13T20:04:13.2291638Z RUNNER_GID: 1000 +2025-02-13T20:04:13.2291975Z ##[endgroup] +2025-02-13T20:04:13.5173920Z With the provided path, there will be 1 file uploaded +2025-02-13T20:04:13.5180547Z Artifact name is valid! +2025-02-13T20:04:13.5182675Z Root directory input is valid! 
+2025-02-13T20:04:13.7612885Z Beginning upload of artifact content to blob storage +2025-02-13T20:04:13.9946020Z Uploaded bytes 1502 +2025-02-13T20:04:14.0553702Z Finished uploading artifact content to blob storage! +2025-02-13T20:04:14.0555271Z SHA256 hash of uploaded artifact zip is 1776c969fd4ce0e532f0d0d5c56885d18266030c15230edf5f2a99d507142b36 +2025-02-13T20:04:14.0558400Z Finalizing artifact upload +2025-02-13T20:04:14.1785523Z Artifact test_reports_304c9db0-c03c-4ec4-8503-cd90ed1e264c.zip successfully finalized. Artifact ID 2588438668 +2025-02-13T20:04:14.1787263Z Artifact test_reports_304c9db0-c03c-4ec4-8503-cd90ed1e264c has been successfully uploaded! Final size is 1502 bytes. Artifact ID is 2588438668 +2025-02-13T20:04:14.1795001Z Artifact download URL: https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/artifacts/2588438668 +2025-02-13T20:04:14.2020183Z Post job cleanup. +2025-02-13T20:04:14.2094157Z Post job cleanup. +2025-02-13T20:04:14.2964363Z [command]/usr/bin/git version +2025-02-13T20:04:14.3004908Z git version 2.25.1 +2025-02-13T20:04:14.3044758Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/8f65a5bc-195a-41a4-a81a-c97fc0af023a/.gitconfig' +2025-02-13T20:04:14.3057401Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/8f65a5bc-195a-41a4-a81a-c97fc0af023a' before making global git config changes +2025-02-13T20:04:14.3058519Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:04:14.3062979Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:04:14.3091378Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:04:14.3123168Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:04:14.3411167Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:04:14.3460150Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:04:14.3514222Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:04:14.3564199Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:04:14.3618509Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:04:14.3671996Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:04:14.3726540Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:04:14.3793724Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:04:14.3815107Z http.https://github.com/.extraheader +2025-02-13T20:04:14.3824803Z [command]/usr/bin/git config --local --unset-all http.https://github.com/.extraheader +2025-02-13T20:04:14.3853082Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:04:14.4119491Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:04:14.4170364Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:04:14.4213111Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:04:14.4258939Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:04:14.4307150Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:04:14.4356637Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' 
+2025-02-13T20:04:14.4399904Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:04:14.4585660Z Post job cleanup. +2025-02-13T20:04:14.8133988Z [command]/usr/bin/docker logout https://ghcr.io +2025-02-13T20:04:14.8302373Z Removing login credentials for ghcr.io +2025-02-13T20:04:14.8351075Z ##[group]Post cache +2025-02-13T20:04:14.8352495Z State not set +2025-02-13T20:04:14.8353503Z ##[endgroup] +2025-02-13T20:04:14.8525386Z Post job cleanup. +2025-02-13T20:04:14.8612816Z Post job cleanup. +2025-02-13T20:04:14.9388997Z Post job cleanup. +2025-02-13T20:04:15.1024111Z Cache hit occurred on the primary key setup-venv-Linux-py-3.8.18-/home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/bin/python-509e0fbc74e4697ea036d8e6b4ed76321c253e4ffef8468c11ee556fb8e370e2-./create_venv.sh, not saving cache. +2025-02-13T20:04:15.1138078Z Post job cleanup. +2025-02-13T20:04:15.3232933Z Post job cleanup. +2025-02-13T20:04:15.3308851Z Post job cleanup. +2025-02-13T20:04:15.4753286Z [command]/usr/bin/git version +2025-02-13T20:04:15.4796448Z git version 2.25.1 +2025-02-13T20:04:15.4837147Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/6339d470-e7f4-4506-b66c-6f14a7fa0ec4/.gitconfig' +2025-02-13T20:04:15.4847551Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/6339d470-e7f4-4506-b66c-6f14a7fa0ec4' before making global git config changes +2025-02-13T20:04:15.4848659Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:04:15.4853887Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:04:15.4894618Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:04:15.4934580Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:04:15.5212251Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:04:15.5270766Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:04:15.5321269Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:04:15.5379962Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:04:15.5428642Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:04:15.5476693Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:04:15.5528887Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:04:15.5599050Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:04:15.5637009Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:04:15.5907117Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:04:15.5960200Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:04:15.6007758Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:04:15.6056367Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:04:15.6108245Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:04:15.6154403Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:04:15.6202309Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:04:15.6390862Z A job completed hook has been configured by the self-hosted runner administrator 
+2025-02-13T20:04:15.6423731Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/wormhole_b0/cleanup.sh' +2025-02-13T20:04:15.6439454Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:04:15.6439988Z ##[endgroup] +2025-02-13T20:04:15.6501713Z Current date / time is Thu Feb 13 20:04:15 UTC 2025 +2025-02-13T20:04:15.8536860Z Cleaning up orphan processes diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190230023.log b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190230023.log new file mode 100644 index 00000000000..ee0d0865c1f --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190230023.log @@ -0,0 +1,4710 @@ +2025-02-13T20:07:34.5693260Z Current runner version: '2.322.0' +2025-02-13T20:07:34.5700744Z Runner name: 'tt-metal-ci-vm-27' +2025-02-13T20:07:34.5701785Z Runner group name: 'Default' +2025-02-13T20:07:34.5703016Z Machine name: 'tt-metal-ci-vm-27' +2025-02-13T20:07:34.5707534Z ##[group]GITHUB_TOKEN Permissions +2025-02-13T20:07:34.5710225Z Actions: read +2025-02-13T20:07:34.5711030Z Contents: write +2025-02-13T20:07:34.5711788Z Metadata: read +2025-02-13T20:07:34.5712540Z Packages: write +2025-02-13T20:07:34.5713332Z Pages: write +2025-02-13T20:07:34.5714057Z PullRequests: write +2025-02-13T20:07:34.5714835Z ##[endgroup] +2025-02-13T20:07:34.5718577Z Secret source: Actions +2025-02-13T20:07:34.5719623Z Prepare workflow directory +2025-02-13T20:07:34.8289514Z Prepare all required actions +2025-02-13T20:07:34.8337941Z Getting action download info +2025-02-13T20:07:35.0035266Z Download action repository 'tenstorrent/tt-metal@main' (SHA:ac426de3d4a9c274964843fdae6aa83ea3960a30) +2025-02-13T20:07:41.0005635Z Getting action download info +2025-02-13T20:07:41.1496756Z Download action repository 'actions/checkout@v4' (SHA:11bd71901bbe5b1630ceea73d27597364c9af683) +2025-02-13T20:07:41.7716175Z Uses: tenstorrent/tt-metal/.github/workflows/cpp-post-commit.yaml@refs/heads/sagarwal/multi_page_buffer (ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70) +2025-02-13T20:07:41.7718816Z ##[group] Inputs +2025-02-13T20:07:41.7719337Z build-type: Release +2025-02-13T20:07:41.7720155Z with-retries: false +2025-02-13T20:07:41.7720636Z arch: wormhole_b0 +2025-02-13T20:07:41.7721391Z runner-label: N150 +2025-02-13T20:07:41.7722450Z timeout: 35 +2025-02-13T20:07:41.7722865Z os: ubuntu-20.04 +2025-02-13T20:07:41.7723316Z ##[endgroup] +2025-02-13T20:07:41.7723940Z Complete job name: cpp-unit-tests (wormhole_b0, N150) / tools wormhole_b0 N150 +2025-02-13T20:07:41.8356039Z A job started hook has been configured by the self-hosted runner administrator +2025-02-13T20:07:41.8507686Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/wormhole_b0/reset.sh' +2025-02-13T20:07:41.8527101Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:07:41.8528273Z ##[endgroup] +2025-02-13T20:07:41.8697900Z ++ date +2025-02-13T20:07:41.8698574Z + echo Current date / time is Thu Feb 13 20:07:41 UTC 2025 +2025-02-13T20:07:41.8699177Z + set_e_was_enabled=false +2025-02-13T20:07:41.8699704Z + [[ ehxB == *e* ]] +2025-02-13T20:07:41.8700160Z + set_e_was_enabled=true +2025-02-13T20:07:41.8700625Z + set +e +2025-02-13T20:07:41.8701044Z + docker image prune +2025-02-13T20:07:41.8701551Z Current date / time is Thu Feb 13 20:07:41 UTC 2025 +2025-02-13T20:07:41.8831021Z WARNING! This will remove all dangling images. 
+2025-02-13T20:07:41.8865121Z ++ df +2025-02-13T20:07:41.8868541Z ++ awk '{print $5}' +2025-02-13T20:07:41.8870844Z ++ sed s/%// +2025-02-13T20:07:41.8871497Z +++ findmnt -n -o SOURCE / +2025-02-13T20:07:41.8897608Z ++ grep -w '^/dev/vda1' +2025-02-13T20:07:41.8918403Z + disk_usage_before=67 +2025-02-13T20:07:41.8934925Z + echo '::notice title=disk-usage-before-startup::Disk usage is 67 %' +2025-02-13T20:07:41.8935762Z + '[' 67 -ge 90 ']' +2025-02-13T20:07:41.8936940Z Are you sure you want to continue? [y/N] ::notice title=disk-usage-before-startup::Disk usage is 67 % +2025-02-13T20:07:41.8937901Z ++ df +2025-02-13T20:07:41.8938391Z ++ awk '{print $5}' +2025-02-13T20:07:41.8938914Z ++ sed s/%// +2025-02-13T20:07:41.8939442Z +++ findmnt -n -o SOURCE / +2025-02-13T20:07:41.8952366Z ++ grep -w '^/dev/vda1' +2025-02-13T20:07:41.9028489Z + disk_usage_after=67 +2025-02-13T20:07:41.9061562Z ##[notice]Disk usage is 67 % +2025-02-13T20:07:41.9069851Z + echo '::notice title=disk-usage-after-startup::Disk usage is 67 %' +2025-02-13T20:07:41.9070544Z + '[' 67 -ge 90 ']' +2025-02-13T20:07:41.9070995Z ++ lsmod +2025-02-13T20:07:41.9071499Z + lsmod_output='Module Size Used by +2025-02-13T20:07:41.9072111Z wekafsio 70086656 1 +2025-02-13T20:07:41.9072657Z wekafsgw 40960 4 wekafsio +2025-02-13T20:07:41.9073193Z xt_nat 16384 0 +2025-02-13T20:07:41.9073676Z xt_tcpudp 20480 0 +2025-02-13T20:07:41.9074720Z veth 28672 0 +2025-02-13T20:07:41.9075234Z uio_pci_generic 16384 0 +2025-02-13T20:07:41.9075757Z igb_uio 20480 0 +2025-02-13T20:07:41.9076324Z uio 20480 2 igb_uio,uio_pci_generic +2025-02-13T20:07:41.9076906Z xt_conntrack 16384 1 +2025-02-13T20:07:41.9077420Z xt_MASQUERADE 20480 1 +2025-02-13T20:07:41.9077951Z nf_conntrack_netlink 45056 0 +2025-02-13T20:07:41.9078512Z nfnetlink 16384 2 nf_conntrack_netlink +2025-02-13T20:07:41.9079120Z xfrm_user 36864 1 +2025-02-13T20:07:41.9079656Z xfrm_algo 16384 1 xfrm_user +2025-02-13T20:07:41.9080208Z iptable_nat 16384 1 +2025-02-13T20:07:41.9080802Z nf_nat 45056 3 xt_nat,iptable_nat,xt_MASQUERADE +2025-02-13T20:07:41.9081634Z nf_conntrack 139264 5 xt_conntrack,nf_nat,xt_nat,nf_conntrack_netlink,xt_MASQUERADE +2025-02-13T20:07:41.9082403Z nf_defrag_ipv6 24576 1 nf_conntrack +2025-02-13T20:07:41.9083012Z nf_defrag_ipv4 16384 1 nf_conntrack +2025-02-13T20:07:41.9083578Z xt_addrtype 16384 2 +2025-02-13T20:07:41.9084082Z iptable_filter 16384 1 +2025-02-13T20:07:41.9084617Z bpfilter 32768 0 +2025-02-13T20:07:41.9085140Z br_netfilter 28672 0 +2025-02-13T20:07:41.9085686Z bridge 176128 1 br_netfilter +2025-02-13T20:07:41.9086254Z stp 16384 1 bridge +2025-02-13T20:07:41.9086800Z llc 16384 2 bridge,stp +2025-02-13T20:07:41.9087348Z aufs 262144 0 +2025-02-13T20:07:41.9089416Z xfs 1286144 2 +2025-02-13T20:07:41.9089907Z overlay 118784 0 +2025-02-13T20:07:41.9090428Z rdma_ucm 28672 0 +2025-02-13T20:07:41.9091175Z rdma_cm 110592 1 rdma_ucm +2025-02-13T20:07:41.9091706Z iw_cm 49152 1 rdma_cm +2025-02-13T20:07:41.9092197Z ib_ipoib 131072 0 +2025-02-13T20:07:41.9092718Z ib_cm 114688 2 rdma_cm,ib_ipoib +2025-02-13T20:07:41.9093272Z ib_umad 28672 8 +2025-02-13T20:07:41.9093752Z nls_iso8859_1 16384 1 +2025-02-13T20:07:41.9094222Z dm_multipath 32768 0 +2025-02-13T20:07:41.9094701Z scsi_dh_rdac 16384 0 +2025-02-13T20:07:41.9095193Z scsi_dh_emc 16384 0 +2025-02-13T20:07:41.9095675Z scsi_dh_alua 20480 0 +2025-02-13T20:07:41.9096148Z mlx5_ib 397312 0 +2025-02-13T20:07:41.9096677Z ib_uverbs 139264 18 rdma_ucm,mlx5_ib +2025-02-13T20:07:41.9097209Z kvm_amd 98304 0 +2025-02-13T20:07:41.9097696Z 
ccp 90112 1 kvm_amd +2025-02-13T20:07:41.9098209Z kvm 667648 1 kvm_amd +2025-02-13T20:07:41.9098798Z joydev 24576 0 +2025-02-13T20:07:41.9099345Z input_leds 16384 0 +2025-02-13T20:07:41.9099863Z serio_raw 20480 0 +2025-02-13T20:07:41.9100708Z ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm +2025-02-13T20:07:41.9101609Z tenstorrent 49152 0 +2025-02-13T20:07:41.9102211Z sch_fq_codel 20480 45 +2025-02-13T20:07:41.9102849Z binfmt_misc 24576 1 +2025-02-13T20:07:41.9103473Z msr 16384 0 +2025-02-13T20:07:41.9104078Z efi_pstore 16384 0 +2025-02-13T20:07:41.9104677Z virtio_rng 16384 0 +2025-02-13T20:07:41.9105318Z ip_tables 32768 2 iptable_filter,iptable_nat +2025-02-13T20:07:41.9106315Z x_tables 40960 7 xt_conntrack,iptable_filter,xt_tcpudp,xt_addrtype,xt_nat,ip_tables,xt_MASQUERADE +2025-02-13T20:07:41.9107277Z autofs4 45056 2 +2025-02-13T20:07:41.9107873Z btrfs 1269760 0 +2025-02-13T20:07:41.9108505Z zstd_compress 167936 1 btrfs +2025-02-13T20:07:41.9109150Z raid10 61440 0 +2025-02-13T20:07:41.9109728Z raid456 155648 0 +2025-02-13T20:07:41.9110362Z async_raid6_recov 24576 1 raid456 +2025-02-13T20:07:41.9111122Z async_memcpy 20480 2 raid456,async_raid6_recov +2025-02-13T20:07:41.9112286Z async_pq 24576 2 raid456,async_raid6_recov +2025-02-13T20:07:41.9113148Z async_xor 20480 3 async_pq,raid456,async_raid6_recov +2025-02-13T20:07:41.9114100Z async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov +2025-02-13T20:07:41.9114926Z xor 24576 2 async_xor,btrfs +2025-02-13T20:07:41.9115789Z raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov +2025-02-13T20:07:41.9116818Z libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 +2025-02-13T20:07:41.9117658Z raid1 45056 0 +2025-02-13T20:07:41.9118274Z raid0 24576 0 +2025-02-13T20:07:41.9118809Z multipath 20480 0 +2025-02-13T20:07:41.9119405Z linear 20480 0 +2025-02-13T20:07:41.9119941Z hid_generic 16384 0 +2025-02-13T20:07:41.9120492Z usbhid 57344 0 +2025-02-13T20:07:41.9121075Z hid 131072 2 usbhid,hid_generic +2025-02-13T20:07:41.9121655Z mlx5_core 1626112 1 mlx5_ib +2025-02-13T20:07:41.9122186Z crct10dif_pclmul 16384 1 +2025-02-13T20:07:41.9122692Z crc32_pclmul 16384 0 +2025-02-13T20:07:41.9123204Z ghash_clmulni_intel 16384 0 +2025-02-13T20:07:41.9123695Z cirrus 16384 0 +2025-02-13T20:07:41.9124187Z aesni_intel 372736 0 +2025-02-13T20:07:41.9124728Z drm_kms_helper 184320 3 cirrus +2025-02-13T20:07:41.9125331Z pci_hyperv_intf 16384 1 mlx5_core +2025-02-13T20:07:41.9125898Z crypto_simd 16384 1 aesni_intel +2025-02-13T20:07:41.9126487Z syscopyarea 16384 1 drm_kms_helper +2025-02-13T20:07:41.9127239Z sysfillrect 16384 1 drm_kms_helper +2025-02-13T20:07:41.9128055Z mlxdevm 172032 1 mlx5_core +2025-02-13T20:07:41.9128613Z sysimgblt 16384 1 drm_kms_helper +2025-02-13T20:07:41.9129273Z auxiliary 16384 2 mlx5_ib,mlx5_core +2025-02-13T20:07:41.9129885Z fb_sys_fops 16384 1 drm_kms_helper +2025-02-13T20:07:41.9130512Z cryptd 24576 2 crypto_simd,ghash_clmulni_intel +2025-02-13T20:07:41.9131492Z mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core +2025-02-13T20:07:41.9132343Z ahci 40960 0 +2025-02-13T20:07:41.9132864Z tls 73728 1 mlx5_core +2025-02-13T20:07:41.9133431Z psmouse 155648 0 +2025-02-13T20:07:41.9133938Z libahci 36864 1 ahci +2025-02-13T20:07:41.9134460Z glue_helper 16384 1 aesni_intel +2025-02-13T20:07:41.9135012Z mlxfw 32768 1 mlx5_core +2025-02-13T20:07:41.9135602Z drm 495616 3 drm_kms_helper,cirrus +2025-02-13T20:07:41.9136181Z 
psample 20480 1 mlx5_core +2025-02-13T20:07:41.9136704Z virtio_blk 20480 3' +2025-02-13T20:07:41.9137200Z + grep -q tenstorrent +2025-02-13T20:07:41.9149723Z + echo Module Size Used by wekafsio 70086656 1 wekafsgw 40960 4 wekafsio xt_nat 16384 0 xt_tcpudp 20480 0 veth 28672 0 uio_pci_generic 16384 0 igb_uio 20480 0 uio 20480 2 igb_uio,uio_pci_generic xt_conntrack 16384 1 xt_MASQUERADE 20480 1 nf_conntrack_netlink 45056 0 nfnetlink 16384 2 nf_conntrack_netlink xfrm_user 36864 1 xfrm_algo 16384 1 xfrm_user iptable_nat 16384 1 nf_nat 45056 3 xt_nat,iptable_nat,xt_MASQUERADE nf_conntrack 139264 5 xt_conntrack,nf_nat,xt_nat,nf_conntrack_netlink,xt_MASQUERADE nf_defrag_ipv6 24576 1 nf_conntrack nf_defrag_ipv4 16384 1 nf_conntrack xt_addrtype 16384 2 iptable_filter 16384 1 bpfilter 32768 0 br_netfilter 28672 0 bridge 176128 1 br_netfilter stp 16384 1 bridge llc 16384 2 bridge,stp aufs 262144 0 xfs 1286144 2 overlay 118784 0 rdma_ucm 28672 0 rdma_cm 110592 1 rdma_ucm iw_cm 49152 1 rdma_cm ib_ipoib 131072 0 ib_cm 114688 2 rdma_cm,ib_ipoib ib_umad 28672 8 nls_iso8859_1 16384 1 dm_multipath 32768 0 scsi_dh_rdac 16384 0 scsi_dh_emc 16384 0 scsi_dh_alua 20480 0 mlx5_ib 397312 0 ib_uverbs 139264 18 rdma_ucm,mlx5_ib kvm_amd 98304 0 ccp 90112 1 kvm_amd kvm 667648 1 kvm_amd joydev 24576 0 input_leds 16384 0 serio_raw 20480 0 ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm tenstorrent 49152 0 sch_fq_codel 20480 45 binfmt_misc 24576 1 msr 16384 0 efi_pstore 16384 0 virtio_rng 16384 0 ip_tables 32768 2 iptable_filter,iptable_nat x_tables 40960 7 xt_conntrack,iptable_filter,xt_tcpudp,xt_addrtype,xt_nat,ip_tables,xt_MASQUERADE autofs4 45056 2 btrfs 1269760 0 zstd_compress 167936 1 btrfs raid10 61440 0 raid456 155648 0 async_raid6_recov 24576 1 raid456 async_memcpy 20480 2 raid456,async_raid6_recov async_pq 24576 2 raid456,async_raid6_recov async_xor 20480 3 async_pq,raid456,async_raid6_recov async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov xor 24576 2 async_xor,btrfs raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 raid1 45056 0 raid0 24576 0 multipath 20480 0 linear 20480 0 hid_generic 16384 0 usbhid 57344 0 hid 131072 2 usbhid,hid_generic mlx5_core 1626112 1 mlx5_ib crct10dif_pclmul 16384 1 crc32_pclmul 16384 0 ghash_clmulni_intel 16384 0 cirrus 16384 0 aesni_intel 372736 0 drm_kms_helper 184320 3 cirrus pci_hyperv_intf 16384 1 mlx5_core crypto_simd 16384 1 aesni_intel syscopyarea 16384 1 drm_kms_helper sysfillrect 16384 1 drm_kms_helper mlxdevm 172032 1 mlx5_core sysimgblt 16384 1 drm_kms_helper auxiliary 16384 2 mlx5_ib,mlx5_core fb_sys_fops 16384 1 drm_kms_helper cryptd 24576 2 crypto_simd,ghash_clmulni_intel mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core ahci 40960 0 tls 73728 1 mlx5_core psmouse 155648 0 libahci 36864 1 ahci glue_helper 16384 1 aesni_intel mlxfw 32768 1 mlx5_core drm 495616 3 drm_kms_helper,cirrus psample 20480 1 mlx5_core virtio_blk 20480 3 +2025-02-13T20:07:41.9161349Z + [[ 0 -ne 0 ]] +2025-02-13T20:07:41.9161774Z ++ lsof -w /dev/tenstorrent/0 +2025-02-13T20:07:42.0489691Z + lsof_output= +2025-02-13T20:07:42.0490337Z + '[' -n '' ']' +2025-02-13T20:07:42.0490844Z + i=0 +2025-02-13T20:07:42.0491302Z + iter_limit=10 +2025-02-13T20:07:42.0492035Z + echo '::notice title=printing-smi-info-startup::Touching and printing out SMI info' +2025-02-13T20:07:42.0492815Z + sleep 20 
+2025-02-13T20:07:42.0494833Z ##[notice]Touching and printing out SMI info +2025-02-13T20:08:02.0506856Z + sudo touch /opt/tt_metal_infra/smi.log +2025-02-13T20:08:02.0724785Z + sudo chown ubuntu /opt/tt_metal_infra/smi.log +2025-02-13T20:08:02.0934939Z + tt-smi-metal -s -f /opt/tt_metal_infra/smi.log +2025-02-13T20:08:02.4986397Z Gathering Information ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 +2025-02-13T20:08:02.4996132Z  Saved tt-smi log to: /opt/tt_metal_infra/smi.log  +2025-02-13T20:08:02.5623852Z + cat /opt/tt_metal_infra/smi.log +2025-02-13T20:08:02.5632639Z { +2025-02-13T20:08:02.5635109Z + echo '::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first' +2025-02-13T20:08:02.5636051Z "time": "2025-02-13T20:08:02.492455", +2025-02-13T20:08:02.5636631Z "host_info": { +2025-02-13T20:08:02.5637031Z "OS": "Linux", +2025-02-13T20:08:02.5637445Z "Distro": "Ubuntu 20.04.3 LTS", +2025-02-13T20:08:02.5637880Z "Kernel": "5.4.0-205-generic", +2025-02-13T20:08:02.5638329Z "Hostname": "tt-metal-ci-vm-27", +2025-02-13T20:08:02.5638850Z "Platform": "x86_64", +2025-02-13T20:08:02.5639281Z "Python": "3.8.10", +2025-02-13T20:08:02.5639647Z "Memory": "47.14 GB", +2025-02-13T20:08:02.5640042Z "Driver": "TTKMD 1.29" +2025-02-13T20:08:02.5640413Z }, +2025-02-13T20:08:02.5640732Z "device_info": [ +2025-02-13T20:08:02.5641126Z { +2025-02-13T20:08:02.5641585Z "smbus_telem": { +2025-02-13T20:08:02.5642053Z "BOARD_ID": "0x10001851172b005", +2025-02-13T20:08:02.5642590Z "SMBUS_TX_ENUM_VERSION": "0xba5e0001", +2025-02-13T20:08:02.5643085Z "SMBUS_TX_DEVICE_ID": "0x401e1e52", +2025-02-13T20:08:02.5644092Z "SMBUS_TX_ASIC_RO": "0x2d8b2", +2025-02-13T20:08:02.5644543Z "SMBUS_TX_ASIC_IDD": "0x93b", +2025-02-13T20:08:02.5645009Z "SMBUS_TX_BOARD_ID_HIGH": "0x1000185", +2025-02-13T20:08:02.5645504Z "SMBUS_TX_BOARD_ID_LOW": "0x1172b005", +2025-02-13T20:08:02.5646411Z + sleep 30 +2025-02-13T20:08:02.5646866Z "SMBUS_TX_ARC0_FW_VERSION": "0x21d0000", +2025-02-13T20:08:02.5647345Z "SMBUS_TX_ARC1_FW_VERSION": "0x21d0000", +2025-02-13T20:08:02.5647996Z "SMBUS_TX_ARC2_FW_VERSION": null, +2025-02-13T20:08:02.5648475Z "SMBUS_TX_ARC3_FW_VERSION": "0x21d0000", +2025-02-13T20:08:02.5649055Z "SMBUS_TX_SPIBOOTROM_FW_VERSION": "0x30b0000", +2025-02-13T20:08:02.5649545Z "SMBUS_TX_ETH_FW_VERSION": "0x6a000", +2025-02-13T20:08:02.5650018Z "SMBUS_TX_M3_BL_FW_VERSION": "0x81020000", +2025-02-13T20:08:02.5650496Z "SMBUS_TX_M3_APP_FW_VERSION": "0x50a0000", +2025-02-13T20:08:02.5650975Z "SMBUS_TX_DDR_SPEED": null, +2025-02-13T20:08:02.5651430Z "SMBUS_TX_DDR_STATUS": "0x2222222", +2025-02-13T20:08:02.5651901Z "SMBUS_TX_ETH_STATUS0": "0x11111111", +2025-02-13T20:08:02.5652370Z "SMBUS_TX_ETH_STATUS1": "0x11111111", +2025-02-13T20:08:02.5652859Z "SMBUS_TX_PCIE_STATUS": "0x11040000", +2025-02-13T20:08:02.5653345Z "SMBUS_TX_FAULTS": null, +2025-02-13T20:08:02.5653777Z "SMBUS_TX_ARC0_HEALTH": "0x43945e", +2025-02-13T20:08:02.5654240Z "SMBUS_TX_ARC1_HEALTH": "0x18d1a4", +2025-02-13T20:08:02.5654695Z "SMBUS_TX_ARC2_HEALTH": null, +2025-02-13T20:08:02.5655408Z "SMBUS_TX_ARC3_HEALTH": "0x2a33", +2025-02-13T20:08:02.5655899Z "SMBUS_TX_FAN_SPEED": "0xffffffff", +2025-02-13T20:08:02.5656369Z "SMBUS_TX_AICLK": "0x3e801f4", +2025-02-13T20:08:02.5656827Z "SMBUS_TX_AXICLK": "0x384", +2025-02-13T20:08:02.5657279Z "SMBUS_TX_ARCCLK": "0x21c", +2025-02-13T20:08:02.5657703Z "SMBUS_TX_THROTTLER": null, +2025-02-13T20:08:02.5658146Z "SMBUS_TX_VCORE": "0x2d5", +2025-02-13T20:08:02.5658611Z "SMBUS_TX_ASIC_TEMPERATURE": "0x26e0242", 
+2025-02-13T20:08:02.5659102Z "SMBUS_TX_VREG_TEMPERATURE": null, +2025-02-13T20:08:02.5659588Z "SMBUS_TX_BOARD_TEMPERATURE": "0x222523", +2025-02-13T20:08:02.5660054Z "SMBUS_TX_TDP": "0x64000e", +2025-02-13T20:08:02.5660483Z "SMBUS_TX_TDC": "0xf00012", +2025-02-13T20:08:02.5660940Z "SMBUS_TX_VDD_LIMITS": "0x3e802d0", +2025-02-13T20:08:02.5661424Z "SMBUS_TX_THM_LIMITS": "0x53004b", +2025-02-13T20:08:02.5661873Z "SMBUS_TX_WH_FW_DATE": "0x4b01121f", +2025-02-13T20:08:02.5662343Z "SMBUS_TX_ASIC_TMON0": "0x2e291e22", +2025-02-13T20:08:02.5662806Z "SMBUS_TX_ASIC_TMON1": "0x1c26", +2025-02-13T20:08:02.5663268Z "SMBUS_TX_MVDDQ_POWER": "0x190000", +2025-02-13T20:08:02.5663764Z "SMBUS_TX_GDDR_TRAIN_TEMP0": null, +2025-02-13T20:08:02.5664230Z "SMBUS_TX_GDDR_TRAIN_TEMP1": null, +2025-02-13T20:08:02.5664693Z "SMBUS_TX_BOOT_DATE": "0x520d1331", +2025-02-13T20:08:02.5665154Z "SMBUS_TX_RT_SECONDS": "0x448", +2025-02-13T20:08:02.5665601Z "SMBUS_TX_AUX_STATUS": null, +2025-02-13T20:08:02.5666049Z "SMBUS_TX_ETH_DEBUG_STATUS0": "0xccddddcc", +2025-02-13T20:08:02.5666553Z "SMBUS_TX_ETH_DEBUG_STATUS1": "0xccdddddd", +2025-02-13T20:08:02.5667073Z "SMBUS_TX_TT_FLASH_VERSION": "0x30100" +2025-02-13T20:08:02.5667513Z }, +2025-02-13T20:08:02.5667850Z "board_info": { +2025-02-13T20:08:02.5668231Z "bus_id": "0000:07:00.0", +2025-02-13T20:08:02.5668658Z "board_type": "n150 L", +2025-02-13T20:08:02.5669088Z "board_id": "010001851172b005", +2025-02-13T20:08:02.5669627Z "coords": "(0, 0, 0, 0)", +2025-02-13T20:08:02.5670051Z "dram_status": true, +2025-02-13T20:08:02.5670493Z "dram_speed": "12G", +2025-02-13T20:08:02.5670936Z "pcie_speed": 4, +2025-02-13T20:08:02.5671337Z "pcie_width": 16 +2025-02-13T20:08:02.5671728Z }, +2025-02-13T20:08:02.5672072Z "telemetry": { +2025-02-13T20:08:02.5672460Z "voltage": "0.72", +2025-02-13T20:08:02.5672846Z "current": " 18.0", +2025-02-13T20:08:02.5673260Z "power": " 14.0", +2025-02-13T20:08:02.5673658Z "aiclk": " 500", +2025-02-13T20:08:02.5674072Z "asic_temperature": "36.1" +2025-02-13T20:08:02.5674492Z }, +2025-02-13T20:08:02.5674820Z "firmwares": { +2025-02-13T20:08:02.5675209Z "arc_fw": "2.29.0.0", +2025-02-13T20:08:02.5675617Z "arc_fw_date": "2024-11-01", +2025-02-13T20:08:02.5676064Z "eth_fw": "6.10.0", +2025-02-13T20:08:02.5676498Z "m3_bl_fw": "129.2.0.0", +2025-02-13T20:08:02.5676924Z "m3_app_fw": "5.10.0.0", +2025-02-13T20:08:02.5677364Z "tt_flash_version": "0.3.1.0" +2025-02-13T20:08:02.5677776Z }, +2025-02-13T20:08:02.5678100Z "limits": { +2025-02-13T20:08:02.5678449Z "vdd_min": "0.72", +2025-02-13T20:08:02.5678844Z "vdd_max": "1.00", +2025-02-13T20:08:02.5679246Z "tdp_limit": "100", +2025-02-13T20:08:02.5679658Z "tdc_limit": "240", +2025-02-13T20:08:02.5680077Z "asic_fmax": "1000", +2025-02-13T20:08:02.5680631Z "therm_trip_l1_limit": "83", +2025-02-13T20:08:02.5681079Z "thm_limit": "75", +2025-02-13T20:08:02.5681532Z "bus_peak_limit": null +2025-02-13T20:08:02.5681914Z } +2025-02-13T20:08:02.5682229Z } +2025-02-13T20:08:02.5682529Z ] +2025-02-13T20:08:02.5683074Z }::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first +2025-02-13T20:08:32.5651367Z + '[' 0 -lt 10 ']' +2025-02-13T20:08:32.5651881Z + (( i++ )) +2025-02-13T20:08:32.5654515Z ++ tt-smi-metal -r 0 +2025-02-13T20:08:43.6502600Z + reset_output=' Starting pci link reset on WH devices at pci indices: 0  +2025-02-13T20:08:43.6503546Z  Finishing pci link reset on WH devices at pci indices: 0  +2025-02-13T20:08:43.6503986Z +2025-02-13T20:08:43.6504356Z  Re-initializing boards after reset....  
+2025-02-13T20:08:43.6505156Z  Done! Detected 1 boards on host. ' +2025-02-13T20:08:43.6505641Z + [[ 0 -ne 0 ]] +2025-02-13T20:08:43.6506645Z + [[  Starting pci link reset on WH devices at pci indices: 0  +2025-02-13T20:08:43.6507462Z  Finishing pci link reset on WH devices at pci indices: 0  +2025-02-13T20:08:43.6507845Z +2025-02-13T20:08:43.6508150Z  Re-initializing boards after reset....  +2025-02-13T20:08:43.6508914Z  Done! Detected 1 boards on host.  == *\N\o\ \c\h\i\p\s\ \d\e\t\e\c\t\e\d* ]] +2025-02-13T20:08:43.6509574Z + break +2025-02-13T20:08:43.6509939Z + '[' 1 -eq 10 ']' +2025-02-13T20:08:43.6510625Z + echo '::notice title=reset-successful-startup::tt-smi reset was successful' +2025-02-13T20:08:43.6511266Z + check_hugepages_service_status=0 +2025-02-13T20:08:43.6511801Z + sudo systemctl status tenstorrent-hugepages.service +2025-02-13T20:08:43.6540292Z ##[notice]tt-smi reset was successful +2025-02-13T20:08:43.6873732Z ● tenstorrent-hugepages.service - Script that configures hugepages for Tenstorrent ASICs +2025-02-13T20:08:43.6874854Z Loaded: loaded (/lib/systemd/system/tenstorrent-hugepages.service; enabled; vendor preset: enabled) +2025-02-13T20:08:43.6875836Z Active: failed (Result: exit-code) since Thu 2025-02-13 19:50:33 UTC; 18min ago +2025-02-13T20:08:43.6876670Z Process: 1295998 ExecStart=/opt/tenstorrent/bin/hugepages-setup.sh (code=exited, status=1/FAILURE) +2025-02-13T20:08:43.6877383Z Main PID: 1295998 (code=exited, status=1/FAILURE) +2025-02-13T20:08:43.6878216Z +2025-02-13T20:08:43.6878616Z Feb 13 19:50:33 tt-metal-ci-vm-27 systemd[1]: Started Script that configures hugepages for Tenstorrent ASICs. +2025-02-13T20:08:43.6879439Z Feb 13 19:50:33 tt-metal-ci-vm-27 hugepages-setup.sh[1295998]: Node 0 hugepages before: 0 +2025-02-13T20:08:43.6880228Z Feb 13 19:50:33 tt-metal-ci-vm-27 hugepages-setup.sh[1295998]: Node 0 hugepages needed: 4 +2025-02-13T20:08:43.6881091Z Feb 13 19:50:33 tt-metal-ci-vm-27 hugepages-setup.sh[1295998]: Node 0 hugepages after: 0 +2025-02-13T20:08:43.6881956Z Feb 13 19:50:33 tt-metal-ci-vm-27 hugepages-setup.sh[1295998]: Failed to get requested 4 hugepages, only got 0 +2025-02-13T20:08:43.6882963Z Feb 13 19:50:33 tt-metal-ci-vm-27 systemd[1]: tenstorrent-hugepages.service: Main process exited, code=exited, status=1/FAILURE +2025-02-13T20:08:43.6883966Z Feb 13 19:50:33 tt-metal-ci-vm-27 systemd[1]: tenstorrent-hugepages.service: Failed with result 'exit-code'. +2025-02-13T20:08:43.6884673Z + check_hugepages_service_status=3 +2025-02-13T20:08:43.6885044Z + '[' 3 -eq 4 ']' +2025-02-13T20:08:43.6885959Z + echo '::notice title=hugepages-service-found-startup::Hugepages service found. Command returned with exit code 3. Restarting it so we can ensure hugepages are available' +2025-02-13T20:08:43.6887059Z + sudo systemctl restart tenstorrent-hugepages.service +2025-02-13T20:08:43.6889023Z ##[notice]Hugepages service found. Command returned with exit code 3. Restarting it so we can ensure hugepages are available +2025-02-13T20:08:43.7158114Z ++ date +%s +2025-02-13T20:08:43.7181379Z + hugepages_check_start=1739477323 +2025-02-13T20:08:43.7182077Z + hugepages_check_timeout=60 +2025-02-13T20:08:43.7183085Z ++ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages +2025-02-13T20:08:43.7191868Z + [[ 1 -eq 0 ]] +2025-02-13T20:08:43.7192969Z + echo '::notice title=hugepages-setup-success-startup::Hugepages is now setup.' +2025-02-13T20:08:43.7194785Z ##[notice]Hugepages is now setup. +2025-02-13T20:08:43.7196473Z Printing out cpu information... 
+2025-02-13T20:08:43.7197014Z + echo 'Printing out cpu information...' +2025-02-13T20:08:43.7197500Z + lscpu +2025-02-13T20:08:43.7227834Z Architecture: x86_64 +2025-02-13T20:08:43.7228375Z CPU op-mode(s): 32-bit, 64-bit +2025-02-13T20:08:43.7228870Z Byte Order: Little Endian +2025-02-13T20:08:43.7229398Z Address sizes: 40 bits physical, 48 bits virtual +2025-02-13T20:08:43.7229903Z CPU(s): 14 +2025-02-13T20:08:43.7230316Z On-line CPU(s) list: 0-13 +2025-02-13T20:08:43.7230790Z Thread(s) per core: 1 +2025-02-13T20:08:43.7231196Z Core(s) per socket: 1 +2025-02-13T20:08:43.7231824Z Socket(s): 14 +2025-02-13T20:08:43.7232232Z NUMA node(s): 2 +2025-02-13T20:08:43.7232658Z Vendor ID: AuthenticAMD +2025-02-13T20:08:43.7233106Z CPU family: 23 +2025-02-13T20:08:43.7233515Z Model: 49 +2025-02-13T20:08:43.7233983Z Model name: AMD EPYC-Rome Processor +2025-02-13T20:08:43.7234467Z Stepping: 0 +2025-02-13T20:08:43.7234948Z CPU MHz: 2299.974 +2025-02-13T20:08:43.7235377Z BogoMIPS: 4599.94 +2025-02-13T20:08:43.7235797Z Virtualization: AMD-V +2025-02-13T20:08:43.7236228Z Hypervisor vendor: KVM +2025-02-13T20:08:43.7254373Z Virtualization type: full +2025-02-13T20:08:43.7254940Z L1d cache: 448 KiB +2025-02-13T20:08:43.7255389Z L1i cache: 448 KiB +2025-02-13T20:08:43.7256361Z L2 cache: 7 MiB +2025-02-13T20:08:43.7256989Z L3 cache: 224 MiB +2025-02-13T20:08:43.7257413Z NUMA node0 CPU(s): 0-6 +2025-02-13T20:08:43.7257821Z NUMA node1 CPU(s): 7-13 +2025-02-13T20:08:43.7258502Z Vulnerability Gather data sampling: Not affected +2025-02-13T20:08:43.7258999Z Vulnerability Itlb multihit: Not affected +2025-02-13T20:08:43.7259508Z Vulnerability L1tf: Not affected +2025-02-13T20:08:43.7259971Z Vulnerability Mds: Not affected +2025-02-13T20:08:43.7260432Z Vulnerability Meltdown: Not affected +2025-02-13T20:08:43.7260903Z Vulnerability Mmio stale data: Not affected +2025-02-13T20:08:43.7261372Z Vulnerability Retbleed: Vulnerable +2025-02-13T20:08:43.7262180Z Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +2025-02-13T20:08:43.7263141Z Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +2025-02-13T20:08:43.7264261Z Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +2025-02-13T20:08:43.7265140Z Vulnerability Srbds: Not affected +2025-02-13T20:08:43.7265612Z Vulnerability Tsx async abort: Not affected +2025-02-13T20:08:43.7268637Z Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr wbnoinvd arat npt nrip_save umip rdpid +2025-02-13T20:08:43.7486949Z ##[group]Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main +2025-02-13T20:08:43.7487731Z with: +2025-02-13T20:08:43.7488195Z token: *** +2025-02-13T20:08:43.7488490Z fetch-depth: 1 +2025-02-13T20:08:43.7488779Z env: +2025-02-13T20:08:43.7489089Z ARCH_NAME: wormhole_b0 +2025-02-13T20:08:43.7489412Z LOGURU_LEVEL: INFO +2025-02-13T20:08:43.7489702Z ##[endgroup] +2025-02-13T20:08:43.7578650Z 
##[group]Run set -x +2025-02-13T20:08:43.7579021Z set -x +2025-02-13T20:08:43.7579315Z ls -al +2025-02-13T20:08:43.7579686Z if [ -f "semicolon_delimited_script" ]; then +2025-02-13T20:08:43.7580159Z  file semicolon_delimited_script +2025-02-13T20:08:43.7580595Z  head semicolon_delimited_script +2025-02-13T20:08:43.7580977Z fi +2025-02-13T20:08:43.7581282Z sudo rm -rf deleteme +2025-02-13T20:08:43.7581651Z sudo rm -rf docker-job +2025-02-13T20:08:43.7582021Z if [ -d ".git" ]; then +2025-02-13T20:08:43.7582434Z  echo 'Cleaning repo' +2025-02-13T20:08:43.7582807Z  git clean -xffd +2025-02-13T20:08:43.7583155Z  echo 'Done git clean -xffd' +2025-02-13T20:08:43.7583597Z  echo 'Attempting to delete any lock files' +2025-02-13T20:08:43.7584096Z  find .git -type f -iname '*.lock' -delete +2025-02-13T20:08:43.7584550Z  echo 'Done deleting lock files' +2025-02-13T20:08:43.7584969Z  echo 'De-init-ing submodules' +2025-02-13T20:08:43.7585387Z  git submodule deinit -f --all +2025-02-13T20:08:43.7585809Z  echo 'Done de-initing submodules' +2025-02-13T20:08:43.7586269Z fi +2025-02-13T20:08:43.7606032Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:08:43.7606493Z env: +2025-02-13T20:08:43.7606775Z ARCH_NAME: wormhole_b0 +2025-02-13T20:08:43.7607091Z LOGURU_LEVEL: INFO +2025-02-13T20:08:43.7607392Z ##[endgroup] +2025-02-13T20:08:43.7645153Z + ls -al +2025-02-13T20:08:43.7660499Z total 359828 +2025-02-13T20:08:43.7661531Z drwxr-xr-x 24 ubuntu ubuntu 4096 Feb 13 19:55 . +2025-02-13T20:08:43.7662119Z drwxr-xr-x 3 ubuntu ubuntu 4096 Apr 15 2024 .. +2025-02-13T20:08:43.7662647Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 19:52 .cache +2025-02-13T20:08:43.7663447Z -rw-r--r-- 1 ubuntu ubuntu 3966 Jan 2 08:46 .clang-format +2025-02-13T20:08:43.7664064Z -rw-r--r-- 1 ubuntu ubuntu 6268 Jan 26 15:59 .clang-format-ignore +2025-02-13T20:08:43.7664689Z -rw-r--r-- 1 ubuntu ubuntu 6374 Jan 26 15:59 .clang-tidy +2025-02-13T20:08:43.7665213Z -rw-r--r-- 1 ubuntu ubuntu 43 Sep 25 11:01 .clangd +2025-02-13T20:08:43.7665721Z -rw-r--r-- 1 ubuntu ubuntu 222 Oct 25 23:14 .gersemirc +2025-02-13T20:08:43.7666297Z drwxr-xr-x 10 ubuntu ubuntu 4096 Feb 13 20:07 .git +2025-02-13T20:08:43.7666859Z -rw-r--r-- 1 ubuntu ubuntu 239 Jan 2 08:46 .git-blame-ignore-revs +2025-02-13T20:08:43.7667453Z -rw-r--r-- 1 ubuntu ubuntu 35 Jan 2 08:46 .gitattributes +2025-02-13T20:08:43.7667997Z drwxr-xr-x 6 ubuntu ubuntu 4096 Feb 13 05:46 .github +2025-02-13T20:08:43.7668524Z -rw-r--r-- 1 ubuntu ubuntu 1730 Jan 21 18:03 .gitignore +2025-02-13T20:08:43.7669070Z -rw-r--r-- 1 ubuntu ubuntu 991 Feb 4 23:43 .gitmodules +2025-02-13T20:08:43.7669653Z drwx------ 6 ubuntu ubuntu 4096 Feb 13 19:52 .local +2025-02-13T20:08:43.7670210Z -rw-r--r-- 1 ubuntu ubuntu 932 Jan 2 08:46 .pre-commit-config.yaml +2025-02-13T20:08:43.7670813Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 13 19:52 .pytest_cache +2025-02-13T20:08:43.7671378Z -rw-r--r-- 1 ubuntu ubuntu 15813574 Feb 13 05:46 .test_durations +2025-02-13T20:08:43.7672210Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 19:52 .ttnn_runtime_artifacts +2025-02-13T20:08:43.7672785Z -rw-r--r-- 1 ubuntu ubuntu 213 Nov 25 22:06 .yamllint +2025-02-13T20:08:43.7673315Z -rw-r--r-- 1 ubuntu ubuntu 11086 Feb 13 05:46 CMakeLists.txt +2025-02-13T20:08:43.7674050Z -rw-r--r-- 1 ubuntu ubuntu 2231 Feb 4 23:43 CMakePresets.json +2025-02-13T20:08:43.7674604Z -rw-r--r-- 1 ubuntu ubuntu 11478 Feb 13 05:46 CODEOWNERS +2025-02-13T20:08:43.7675166Z -rw-r--r-- 1 ubuntu ubuntu 5253 Sep 19 18:09 CODE_OF_CONDUCT.md 
+2025-02-13T20:08:43.7675795Z -rw-r--r-- 1 ubuntu ubuntu 36527 Jan 15 01:12 CONTRIBUTING.md +2025-02-13T20:08:43.7676342Z -rw-r--r-- 1 ubuntu ubuntu 126373 Jan 26 15:59 Doxyfile +2025-02-13T20:08:43.7676864Z -rw-r--r-- 1 ubuntu ubuntu 6046 Feb 4 23:43 INSTALLING.md +2025-02-13T20:08:43.7677386Z -rw-r--r-- 1 ubuntu ubuntu 11825 Sep 24 08:48 LICENSE +2025-02-13T20:08:43.7677916Z -rw-r--r-- 1 ubuntu ubuntu 1562 Jan 27 05:29 MANIFEST.in +2025-02-13T20:08:43.7678466Z -rw-r--r-- 1 ubuntu ubuntu 18372 Feb 13 05:46 METALIUM_GUIDE.md +2025-02-13T20:08:43.7679007Z -rw-r--r-- 1 ubuntu ubuntu 15526 Feb 13 05:46 README.md +2025-02-13T20:08:43.7679538Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 13 19:52 __pycache__ +2025-02-13T20:08:43.7680098Z -rwxr-xr-x 1 ubuntu ubuntu 11097 Feb 13 05:46 build_metal.sh +2025-02-13T20:08:43.7680713Z -rw-r--r-- 1 ubuntu ubuntu 1438 Sep 24 08:48 check_copyright_config.yaml +2025-02-13T20:08:43.7681294Z -rw-r--r-- 1 ubuntu ubuntu 1821 Sep 19 18:09 cloc.sh +2025-02-13T20:08:43.7681798Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 05:46 cmake +2025-02-13T20:08:43.7682320Z -rw-r--r-- 1 ubuntu ubuntu 23178 Feb 13 05:46 conftest.py +2025-02-13T20:08:43.7683068Z drwxr-xr-x 2 ubuntu ubuntu 4096 Nov 26 11:03 contributing +2025-02-13T20:08:43.7683636Z -rwxr-xr-x 1 ubuntu ubuntu 1420 Oct 25 23:14 create_venv.sh +2025-02-13T20:08:43.7684197Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 13 05:46 dependencies +2025-02-13T20:08:43.7684746Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 13 05:46 dockerfile +2025-02-13T20:08:43.7685268Z drwxr-xr-x 3 ubuntu ubuntu 4096 Jan 28 11:27 docs +2025-02-13T20:08:43.7685770Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 13 19:52 generated +2025-02-13T20:08:43.7686343Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 4 23:43 infra +2025-02-13T20:08:43.7686921Z -rwxr-xr-x 1 ubuntu ubuntu 6885 Feb 13 05:46 install_dependencies.sh +2025-02-13T20:08:43.7687500Z drwxr-xr-x 10 ubuntu ubuntu 4096 Feb 13 19:52 models +2025-02-13T20:08:43.7688174Z -rw-r--r-- 1 ubuntu ubuntu 1042 Jan 2 08:46 pyproject.toml +2025-02-13T20:08:43.7688869Z -rw-r--r-- 1 ubuntu ubuntu 1200 Sep 24 08:48 pytest.ini +2025-02-13T20:08:43.7689391Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 05:46 scripts +2025-02-13T20:08:43.7689948Z -rw-r--r-- 1 root root 228 Feb 13 19:51 semicolon_delimited_script +2025-02-13T20:08:43.7690508Z -rw-r--r-- 1 ubuntu ubuntu 7551 Feb 4 23:43 setup.py +2025-02-13T20:08:43.7691044Z drwxr-xr-x 24 ubuntu ubuntu 4096 Jan 15 01:12 tech_reports +2025-02-13T20:08:43.7691563Z drwxr-xr-x 11 ubuntu ubuntu 4096 Feb 13 05:46 tests +2025-02-13T20:08:43.7692070Z drwxr-xr-x 11 ubuntu ubuntu 4096 Feb 13 05:46 tt-train +2025-02-13T20:08:43.7692598Z drwxr-xr-x 5 ubuntu ubuntu 4096 Feb 13 19:50 tt_fabric +2025-02-13T20:08:43.7693337Z drwxr-xr-x 22 ubuntu ubuntu 4096 Feb 13 08:16 tt_metal +2025-02-13T20:08:43.7693859Z drwxr-xr-x 9 ubuntu ubuntu 4096 Feb 13 19:50 ttnn +2025-02-13T20:08:43.7694529Z -rw-r--r-- 1 ubuntu ubuntu 137787499 Feb 13 19:50 ttnn-0.56.0rc27.dev24+any-cp38-cp38-linux_x86_64.whl +2025-02-13T20:08:43.7695313Z -rw-r--r-- 1 ubuntu ubuntu 214270360 Feb 13 19:51 ttnn-0.56.0rc27.dev24+any.tar.gz +2025-02-13T20:08:43.7695976Z -rw-r--r-- 1 ubuntu ubuntu 85467 Feb 13 19:55 ttnn_prediction_demo.jpg +2025-02-13T20:08:43.7703610Z + '[' -f semicolon_delimited_script ']' +2025-02-13T20:08:43.7704284Z + file semicolon_delimited_script +2025-02-13T20:08:43.7704722Z semicolon_delimited_script: ASCII text +2025-02-13T20:08:43.7705211Z + head semicolon_delimited_script +2025-02-13T20:08:43.7713046Z set -eu +2025-02-13T20:08:43.7713248Z 
+2025-02-13T20:08:43.7713593Z install_wheel=true +2025-02-13T20:08:43.7714523Z if [ "${install_wheel,,}" == "true" ]; then +2025-02-13T20:08:43.7714987Z + sudo rm -rf deleteme +2025-02-13T20:08:43.7715506Z WHEEL_FILENAME=$(ls -1 *.whl) +2025-02-13T20:08:43.7715896Z pip3 install "$WHEEL_FILENAME" +2025-02-13T20:08:43.7716247Z fi +2025-02-13T20:08:43.7716394Z +2025-02-13T20:08:43.7716746Z source tests/scripts/run_python_model_tests.sh && run_python_model_tests_wormhole_b0 +2025-02-13T20:08:43.7717226Z +2025-02-13T20:08:43.7935517Z + sudo rm -rf docker-job +2025-02-13T20:08:43.8151710Z + '[' -d .git ']' +2025-02-13T20:08:43.8152085Z Cleaning repo +2025-02-13T20:08:43.8152483Z + echo 'Cleaning repo' +2025-02-13T20:08:43.8152811Z + git clean -xffd +2025-02-13T20:08:45.1837000Z Removing .cache/ +2025-02-13T20:08:45.1837480Z Removing .local/ +2025-02-13T20:08:45.1837919Z Removing .pytest_cache/ +2025-02-13T20:08:45.1838432Z Removing .ttnn_runtime_artifacts/ +2025-02-13T20:08:45.1838910Z Removing __pycache__/ +2025-02-13T20:08:45.1839312Z Removing generated/ +2025-02-13T20:08:45.1856226Z Removing models/__pycache__/ +2025-02-13T20:08:45.1856641Z Removing models/common/__pycache__/ +2025-02-13T20:08:45.1857252Z Removing models/demos/falcon7b_common/tests/__pycache__/ +2025-02-13T20:08:45.1857957Z Removing models/demos/falcon7b_common/tests/unit_tests/__pycache__/ +2025-02-13T20:08:45.1858583Z Removing models/demos/falcon7b_common/tt/__pycache__/ +2025-02-13T20:08:45.1859095Z Removing models/demos/llama3/tests/__pycache__/ +2025-02-13T20:08:45.1859548Z Removing models/demos/llama3/tt/__pycache__/ +2025-02-13T20:08:45.1860023Z Removing models/demos/ttnn_resnet/tests/__pycache__/ +2025-02-13T20:08:45.1860523Z + echo 'Done git clean -xffd' +2025-02-13T20:08:45.1860910Z + echo 'Attempting to delete any lock files' +2025-02-13T20:08:45.1861348Z + find .git -type f -iname '*.lock' -delete +2025-02-13T20:08:45.1861801Z Removing models/demos/ttnn_resnet/tt/__pycache__/ +2025-02-13T20:08:45.1862322Z Removing models/demos/wormhole/mamba/reference/__pycache__/ +2025-02-13T20:08:45.1862893Z Removing models/demos/wormhole/mamba/tests/__pycache__/ +2025-02-13T20:08:45.1863458Z Removing models/demos/wormhole/mamba/tt/__pycache__/ +2025-02-13T20:08:45.1863944Z Removing models/demos/yolov4/demo/__pycache__/ +2025-02-13T20:08:45.1864413Z Removing models/demos/yolov4/reference/__pycache__/ +2025-02-13T20:08:45.1864898Z Removing models/demos/yolov4/ttnn/__pycache__/ +2025-02-13T20:08:45.1865440Z Removing models/experimental/functional_unet/tests/__pycache__/ +2025-02-13T20:08:45.1866777Z Removing models/experimental/functional_unet/tt/__pycache__/ +2025-02-13T20:08:45.1867290Z Removing semicolon_delimited_script +2025-02-13T20:08:45.1867687Z Removing tests/scripts/__pycache__/ +2025-02-13T20:08:45.1868209Z Removing tests/tt_eager/python_api_testing/sweep_tests/__pycache__/ +2025-02-13T20:08:45.1868740Z Removing tests/ttnn/__pycache__/ +2025-02-13T20:08:45.1869209Z Removing tests/ttnn/integration_tests/resnet/__pycache__/ +2025-02-13T20:08:45.1869787Z Removing ttnn-0.56.0rc27.dev24+any-cp38-cp38-linux_x86_64.whl +2025-02-13T20:08:45.1870303Z Removing ttnn-0.56.0rc27.dev24+any.tar.gz +2025-02-13T20:08:45.1870732Z Removing ttnn_prediction_demo.jpg +2025-02-13T20:08:45.1871117Z Done git clean -xffd +2025-02-13T20:08:45.1871461Z Attempting to delete any lock files +2025-02-13T20:08:45.2694511Z + echo 'Done deleting lock files' +2025-02-13T20:08:45.2694963Z Done deleting lock files +2025-02-13T20:08:45.2695362Z De-init-ing 
submodules +2025-02-13T20:08:45.2695787Z + echo 'De-init-ing submodules' +2025-02-13T20:08:45.2696253Z + git submodule deinit -f --all +2025-02-13T20:08:45.2964208Z Cleared directory 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:45.2992715Z Submodule 'models/demos/t3000/llama2_70b/reference/llama' (https://github.com/tenstorrent-metal/llama.git) unregistered for path 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:45.2994025Z Cleared directory 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:45.3148797Z Submodule '3rd_party/wandb-cpp' (https://github.com/yhisaki/wandb-cpp) unregistered for path 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:45.3149679Z Cleared directory 'tt_metal/third_party/tracy' +2025-02-13T20:08:45.3188563Z Submodule 'tt_metal/third_party/tracy' (https://github.com/tenstorrent-metal/tracy.git) unregistered for path 'tt_metal/third_party/tracy' +2025-02-13T20:08:45.3189735Z Cleared directory 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:45.3219452Z Submodule 'tt_metal/third_party/tt_llk_blackhole' (https://github.com/tenstorrent/tt-llk-bh.git) unregistered for path 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:45.3220677Z Cleared directory 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:45.3251932Z Submodule 'tt_metal/third_party/tt_llk_grayskull' (https://github.com/tenstorrent/tt-llk-gs.git) unregistered for path 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:45.3253009Z Cleared directory 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:45.3399848Z Submodule 'tt_metal/third_party/tt_llk_wormhole_b0' (https://github.com/tenstorrent/tt-llk-wh-b0.git) unregistered for path 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:45.3401008Z Cleared directory 'tt_metal/third_party/umd' +2025-02-13T20:08:45.3416542Z Submodule 'tt_metal/third_party/umd' (https://github.com/tenstorrent/tt-umd.git) unregistered for path 'tt_metal/third_party/umd' +2025-02-13T20:08:45.3428623Z + echo 'Done de-initing submodules' +2025-02-13T20:08:45.3429110Z Done de-initing submodules +2025-02-13T20:08:45.3533082Z ##[group]Run actions/checkout@v4 +2025-02-13T20:08:45.3533470Z with: +2025-02-13T20:08:45.3534002Z token: *** +2025-02-13T20:08:45.3534296Z fetch-depth: 1 +2025-02-13T20:08:45.3534610Z lfs: false +2025-02-13T20:08:45.3534918Z submodules: recursive +2025-02-13T20:08:45.3535256Z clean: true +2025-02-13T20:08:45.3535584Z repository: tenstorrent/tt-metal +2025-02-13T20:08:45.3535974Z ssh-strict: true +2025-02-13T20:08:45.3536289Z ssh-user: git +2025-02-13T20:08:45.3536619Z persist-credentials: true +2025-02-13T20:08:45.3536998Z sparse-checkout-cone-mode: true +2025-02-13T20:08:45.3537386Z fetch-tags: false +2025-02-13T20:08:45.3537708Z show-progress: true +2025-02-13T20:08:45.3538814Z set-safe-directory: true +2025-02-13T20:08:45.3539175Z env: +2025-02-13T20:08:45.3539472Z ARCH_NAME: wormhole_b0 +2025-02-13T20:08:45.3539814Z LOGURU_LEVEL: INFO +2025-02-13T20:08:45.3540124Z ##[endgroup] +2025-02-13T20:08:45.4889419Z Syncing repository: tenstorrent/tt-metal +2025-02-13T20:08:45.4892541Z ##[group]Getting Git version info +2025-02-13T20:08:45.4893313Z Working directory is '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal' +2025-02-13T20:08:45.4894457Z [command]/usr/bin/git version +2025-02-13T20:08:45.4894969Z git version 2.25.1 +2025-02-13T20:08:45.4923042Z ##[endgroup] +2025-02-13T20:08:45.4937326Z Copying '/home/ubuntu/.gitconfig' to 
'/home/ubuntu/actions-runner/_work/_temp/5b2486bd-7479-4a13-a631-66196d42bf4d/.gitconfig' +2025-02-13T20:08:45.4950438Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/5b2486bd-7479-4a13-a631-66196d42bf4d' before making global git config changes +2025-02-13T20:08:45.4951965Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:08:45.4957202Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:08:45.5000814Z [command]/usr/bin/git config --local --get remote.origin.url +2025-02-13T20:08:45.5026879Z https://github.com/tenstorrent/tt-metal +2025-02-13T20:08:45.5047397Z ##[group]Removing previously created refs, to avoid conflicts +2025-02-13T20:08:45.5052180Z [command]/usr/bin/git rev-parse --symbolic-full-name --verify --quiet HEAD +2025-02-13T20:08:45.5074929Z refs/heads/smanoj/conv_device_weights +2025-02-13T20:08:45.5086189Z [command]/usr/bin/git checkout --detach +2025-02-13T20:08:45.5643076Z HEAD is now at 68e85df3d #0: Skip weights bfloat8 on grayskull +2025-02-13T20:08:45.6340745Z [command]/usr/bin/git branch --delete --force smanoj/conv_device_weights +2025-02-13T20:08:45.6411914Z Deleted branch smanoj/conv_device_weights (was 68e85df3d). +2025-02-13T20:08:45.6853437Z ##[endgroup] +2025-02-13T20:08:45.6854454Z [command]/usr/bin/git submodule status +2025-02-13T20:08:45.7119732Z -29125b7ad8b5513eeaa4417ed92892bf39c8bd74 models/demos/t3000/llama2_70b/reference/llama +2025-02-13T20:08:45.7120615Z -368cd07f89f497df20a66936fbfae3956f151af4 tt-train/3rd_party/wandb-cpp +2025-02-13T20:08:45.7121388Z -71d4c8d378b52af7da7012b9b595a61e9304f0bb tt_metal/third_party/tracy +2025-02-13T20:08:45.7122184Z -9fd3e2d93d1532373f52e11e963de40c1cdf9a55 tt_metal/third_party/tt_llk_blackhole +2025-02-13T20:08:45.7122991Z -0c04db64275a4bd36a7e14d3c533855cb33f6a20 tt_metal/third_party/tt_llk_grayskull +2025-02-13T20:08:45.7123808Z -0ec3177bfc262f7edf6cfc19531ecb8f669895d2 tt_metal/third_party/tt_llk_wormhole_b0 +2025-02-13T20:08:45.7124562Z -5de287e9c5b2fa3d55fbfd53e9bc59e2050f32fb tt_metal/third_party/umd +2025-02-13T20:08:45.7128680Z ##[group]Cleaning the repository +2025-02-13T20:08:45.7133811Z [command]/usr/bin/git clean -ffdx +2025-02-13T20:08:45.7389172Z [command]/usr/bin/git reset --hard HEAD +2025-02-13T20:08:45.8062722Z HEAD is now at 68e85df3d #0: Skip weights bfloat8 on grayskull +2025-02-13T20:08:45.8079206Z ##[endgroup] +2025-02-13T20:08:45.8080468Z ##[group]Disabling automatic garbage collection +2025-02-13T20:08:45.8085106Z [command]/usr/bin/git config --local gc.auto 0 +2025-02-13T20:08:45.8120004Z ##[endgroup] +2025-02-13T20:08:45.8120658Z ##[group]Setting up auth +2025-02-13T20:08:45.8125841Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:08:45.8153500Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:08:45.8469831Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:08:45.8500975Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:08:45.8772444Z [command]/usr/bin/git config --local 
http.https://github.com/.extraheader AUTHORIZATION: basic *** +2025-02-13T20:08:45.8813180Z ##[endgroup] +2025-02-13T20:08:45.8813821Z ##[group]Fetching the repository +2025-02-13T20:08:45.8822223Z [command]/usr/bin/git -c protocol.version=2 fetch --no-tags --prune --no-recurse-submodules --depth=1 origin +ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70:refs/remotes/origin/sagarwal/multi_page_buffer +2025-02-13T20:08:46.4579427Z From https://github.com/tenstorrent/tt-metal +2025-02-13T20:08:46.4580517Z + 6d3999637...ac8ce51fe ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70 -> origin/sagarwal/multi_page_buffer (forced update) +2025-02-13T20:08:46.4611355Z ##[endgroup] +2025-02-13T20:08:46.4611931Z ##[group]Determining the checkout info +2025-02-13T20:08:46.4613142Z ##[endgroup] +2025-02-13T20:08:46.4613884Z ##[group]Checking out the ref +2025-02-13T20:08:46.4620326Z [command]/usr/bin/git checkout --progress --force -B sagarwal/multi_page_buffer refs/remotes/origin/sagarwal/multi_page_buffer +2025-02-13T20:08:46.5683314Z Previous HEAD position was 68e85df3d #0: Skip weights bfloat8 on grayskull +2025-02-13T20:08:46.5849183Z Switched to a new branch 'sagarwal/multi_page_buffer' +2025-02-13T20:08:46.5851158Z Branch 'sagarwal/multi_page_buffer' set up to track remote branch 'sagarwal/multi_page_buffer' from 'origin'. +2025-02-13T20:08:46.6562906Z ##[endgroup] +2025-02-13T20:08:46.6563498Z ##[group]Setting up auth for fetching submodules +2025-02-13T20:08:46.6571395Z [command]/usr/bin/git config --global http.https://github.com/.extraheader AUTHORIZATION: basic *** +2025-02-13T20:08:46.6612467Z [command]/usr/bin/git config --global --unset-all url.https://github.com/.insteadOf +2025-02-13T20:08:46.6639064Z [command]/usr/bin/git config --global --add url.https://github.com/.insteadOf git@github.com: +2025-02-13T20:08:46.6667862Z [command]/usr/bin/git config --global --add url.https://github.com/.insteadOf org-64161552@github.com: +2025-02-13T20:08:46.6698835Z ##[endgroup] +2025-02-13T20:08:46.6700259Z ##[group]Fetching submodules +2025-02-13T20:08:46.6702215Z [command]/usr/bin/git submodule sync --recursive +2025-02-13T20:08:46.6976806Z [command]/usr/bin/git -c protocol.version=2 submodule update --init --force --depth=1 --recursive +2025-02-13T20:08:46.7241730Z Submodule 'models/demos/t3000/llama2_70b/reference/llama' (https://github.com/tenstorrent-metal/llama.git) registered for path 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:46.7243688Z Submodule '3rd_party/wandb-cpp' (https://github.com/yhisaki/wandb-cpp) registered for path 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:46.7248264Z Submodule 'tt_metal/third_party/tracy' (https://github.com/tenstorrent-metal/tracy.git) registered for path 'tt_metal/third_party/tracy' +2025-02-13T20:08:46.7252099Z Submodule 'tt_metal/third_party/tt_llk_blackhole' (https://github.com/tenstorrent/tt-llk-bh.git) registered for path 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:46.7255471Z Submodule 'tt_metal/third_party/tt_llk_grayskull' (https://github.com/tenstorrent/tt-llk-gs.git) registered for path 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:46.7259473Z Submodule 'tt_metal/third_party/tt_llk_wormhole_b0' (https://github.com/tenstorrent/tt-llk-wh-b0.git) registered for path 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:46.7263022Z Submodule 'tt_metal/third_party/umd' (https://github.com/tenstorrent/tt-umd.git) registered for path 'tt_metal/third_party/umd' +2025-02-13T20:08:46.7752709Z Submodule path 
'models/demos/t3000/llama2_70b/reference/llama': checked out '29125b7ad8b5513eeaa4417ed92892bf39c8bd74' +2025-02-13T20:08:46.8134141Z Submodule path 'tt-train/3rd_party/wandb-cpp': checked out '368cd07f89f497df20a66936fbfae3956f151af4' +2025-02-13T20:08:46.9586228Z Submodule path 'tt_metal/third_party/tracy': checked out '71d4c8d378b52af7da7012b9b595a61e9304f0bb' +2025-02-13T20:08:46.9944856Z Submodule path 'tt_metal/third_party/tt_llk_blackhole': checked out '9fd3e2d93d1532373f52e11e963de40c1cdf9a55' +2025-02-13T20:08:47.0262534Z Submodule path 'tt_metal/third_party/tt_llk_grayskull': checked out '0c04db64275a4bd36a7e14d3c533855cb33f6a20' +2025-02-13T20:08:47.0579961Z Submodule path 'tt_metal/third_party/tt_llk_wormhole_b0': checked out '0ec3177bfc262f7edf6cfc19531ecb8f669895d2' +2025-02-13T20:08:47.3320644Z WARNING: Multiple 'url.*..insteadof' keys with the same alias: "git@github.com:" +2025-02-13T20:08:47.3323104Z WARNING: Multiple 'url.*..insteadof' keys with the same alias: "org-64161552@github.com:" +2025-02-13T20:08:47.3400802Z Submodule path 'tt_metal/third_party/umd': checked out '5de287e9c5b2fa3d55fbfd53e9bc59e2050f32fb' +2025-02-13T20:08:47.3483622Z [command]/usr/bin/git submodule foreach --recursive git config --local gc.auto 0 +2025-02-13T20:08:47.3733518Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:47.3778612Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:47.3817110Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:08:47.3857080Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:47.3898745Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:47.3938021Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:47.3978076Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:08:47.4030601Z ##[endgroup] +2025-02-13T20:08:47.4031319Z ##[group]Persisting credentials for submodules +2025-02-13T20:08:47.4037984Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'url\.https\:\/\/github\.com\/\.insteadOf' && git config --local --unset-all 'url.https://github.com/.insteadOf' || :" +2025-02-13T20:08:47.4291108Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:47.4316862Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4317351Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4348584Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:47.4376547Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4377083Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4421317Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:08:47.4443928Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4444425Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4485717Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:47.4510344Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4510826Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4545752Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:47.4570548Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4571038Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4609317Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:47.4636645Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4637127Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4675630Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:08:47.4699473Z url.https://github.com/.insteadof +2025-02-13T20:08:47.4699985Z 
url.https://github.com/.insteadof +2025-02-13T20:08:47.4752643Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local 'http.https://github.com/.extraheader' 'AUTHORIZATION: basic ***' && git config --local --show-origin --name-only --get-regexp remote.origin.url" +2025-02-13T20:08:47.5007669Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:47.5062899Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/models/demos/t3000/llama2_70b/reference/llama/config remote.origin.url +2025-02-13T20:08:47.5081708Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:47.5126330Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/3rd_party/wandb-cpp/config remote.origin.url +2025-02-13T20:08:47.5149840Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:08:47.5193234Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/tracy/config remote.origin.url +2025-02-13T20:08:47.5216364Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:47.5259512Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/tt_llk_blackhole/config remote.origin.url +2025-02-13T20:08:47.5276841Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:47.5317679Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/tt_llk_grayskull/config remote.origin.url +2025-02-13T20:08:47.5336978Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:47.5377864Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/tt_llk_wormhole_b0/config remote.origin.url +2025-02-13T20:08:47.5399842Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:08:47.5444257Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/umd/config remote.origin.url +2025-02-13T20:08:47.5532546Z [command]/usr/bin/git submodule foreach --recursive git config --local --add 'url.https://github.com/.insteadOf' 'git@github.com:' +2025-02-13T20:08:47.5798039Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:47.5839703Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:47.5877403Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:08:47.5922556Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:47.5960696Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:47.6001822Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:47.6043989Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:08:47.6104420Z [command]/usr/bin/git submodule foreach --recursive git config --local --add 'url.https://github.com/.insteadOf' 'org-64161552@github.com:' +2025-02-13T20:08:47.6354499Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:47.6391865Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:47.6431087Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:08:47.6472426Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:47.6511021Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:47.6558695Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:47.6603915Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:08:47.6656762Z ##[endgroup] +2025-02-13T20:08:47.6729193Z [command]/usr/bin/git log -1 --format=%H +2025-02-13T20:08:47.6790381Z ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70 +2025-02-13T20:08:47.6938939Z ##[group]Run git 
submodule foreach 'git clean -xffd' +2025-02-13T20:08:47.6939635Z git submodule foreach 'git clean -xffd' +2025-02-13T20:08:47.6961951Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:08:47.6962449Z env: +2025-02-13T20:08:47.6962827Z ARCH_NAME: wormhole_b0 +2025-02-13T20:08:47.6963213Z LOGURU_LEVEL: INFO +2025-02-13T20:08:47.6963584Z ##[endgroup] +2025-02-13T20:08:47.7259857Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:47.7288577Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:47.7312804Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:08:47.7353812Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:47.7379696Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:47.7405252Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:47.7430130Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:08:47.7544870Z Prepare all required actions +2025-02-13T20:08:47.7545518Z Getting action download info +2025-02-13T20:08:47.9103371Z Download action repository 'actions/download-artifact@v4' (SHA:fa0a91b85d4f404e444e00e005971372dc801d16) +2025-02-13T20:08:48.5748373Z Download action repository 'catchpoint/workflow-telemetry-action@v2' (SHA:94c3c3d9567a0205de6da68a76c428ce4e769af1) +2025-02-13T20:08:49.4557046Z ##[group]Run ./.github/actions/prepare-metal-run +2025-02-13T20:08:49.4557466Z with: +2025-02-13T20:08:49.4557756Z is_profiler: false +2025-02-13T20:08:49.4558089Z python-version: 3.8 +2025-02-13T20:08:49.4558416Z run-telemetry: false +2025-02-13T20:08:49.4558723Z env: +2025-02-13T20:08:49.4558998Z ARCH_NAME: wormhole_b0 +2025-02-13T20:08:49.4559321Z LOGURU_LEVEL: INFO +2025-02-13T20:08:49.4559616Z ##[endgroup] +2025-02-13T20:08:49.4624908Z ##[group]Run actions/download-artifact@v4 +2025-02-13T20:08:49.4625309Z with: +2025-02-13T20:08:49.4625598Z name: TTMetal_build_any +2025-02-13T20:08:49.4625978Z merge-multiple: false +2025-02-13T20:08:49.4626578Z repository: tenstorrent/tt-metal +2025-02-13T20:08:49.4626955Z run-id: 13315815702 +2025-02-13T20:08:49.4627241Z env: +2025-02-13T20:08:49.4627513Z ARCH_NAME: wormhole_b0 +2025-02-13T20:08:49.4627840Z LOGURU_LEVEL: INFO +2025-02-13T20:08:49.4628141Z ##[endgroup] +2025-02-13T20:08:49.7128203Z Downloading single artifact +2025-02-13T20:08:49.9225508Z Preparing to download the following artifacts: +2025-02-13T20:08:49.9226167Z - TTMetal_build_any (ID: 2588416029, Size: 171796974) +2025-02-13T20:08:50.0511628Z Redirecting to blob download url: https://productionresultssa8.blob.core.windows.net/actions-results/c50d1cc6-5c31-4c4c-b0e4-cb91df2420e1/workflow-job-run-85e4bcb1-b635-5839-8d32-ecb05ba8175c/artifacts/220fe10383c34fbe00d66e183fcfa42d19c438ee1c01790da9aeb9ea9685c6a0.zip +2025-02-13T20:08:50.0513477Z Starting download of artifact to: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:08:50.2958610Z (node:1330807) [DEP0005] DeprecationWarning: Buffer() is deprecated due to security and usability issues. Please use the Buffer.alloc(), Buffer.allocUnsafe(), or Buffer.from() methods instead. +2025-02-13T20:08:50.2959967Z (Use `node --trace-deprecation ...` to show where the warning was created) +2025-02-13T20:09:00.2272628Z Artifact download completed successfully. 
+2025-02-13T20:09:00.2273139Z Total of 1 artifact(s) downloaded +2025-02-13T20:09:00.2279826Z Download artifact has finished successfully +2025-02-13T20:09:00.2462112Z ##[group]Run tar -xvf ttm_any.tar +2025-02-13T20:09:00.2462550Z tar -xvf ttm_any.tar +2025-02-13T20:09:00.2482448Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:09:00.2482929Z env: +2025-02-13T20:09:00.2483228Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:00.2483578Z LOGURU_LEVEL: INFO +2025-02-13T20:09:00.2483896Z ##[endgroup] +2025-02-13T20:09:00.2541324Z ttnn/ttnn/_ttnn.so +2025-02-13T20:09:00.3693817Z build/lib/ +2025-02-13T20:09:00.3694213Z build/lib/libtt_fabric.so +2025-02-13T20:09:00.3706804Z build/lib/libgmock.so +2025-02-13T20:09:00.3708368Z build/lib/pkgconfig/ +2025-02-13T20:09:00.3708755Z build/lib/pkgconfig/libuv-static.pc +2025-02-13T20:09:00.3709147Z build/lib/_ttnn.so +2025-02-13T20:09:00.4763803Z build/lib/libdevice.so +2025-02-13T20:09:00.4858101Z build/lib/libgtest.so +2025-02-13T20:09:00.4864394Z build/lib/libgtest.so.1.13.0 +2025-02-13T20:09:00.4864786Z build/lib/libbenchmark.so +2025-02-13T20:09:00.4868842Z build/lib/cmake/ +2025-02-13T20:09:00.4869181Z build/lib/cmake/umd/ +2025-02-13T20:09:00.4869574Z build/lib/cmake/umd/umdConfigVersion.cmake +2025-02-13T20:09:00.4870022Z build/lib/cmake/umd/umdConfig.cmake +2025-02-13T20:09:00.4870464Z build/lib/cmake/umd/umdTargets-release.cmake +2025-02-13T20:09:00.4870935Z build/lib/cmake/umd/umdTargets.cmake +2025-02-13T20:09:00.4871337Z build/lib/cmake/msgpack-cxx/ +2025-02-13T20:09:00.4871787Z build/lib/cmake/msgpack-cxx/msgpack-cxx-config.cmake +2025-02-13T20:09:00.4872378Z build/lib/cmake/msgpack-cxx/msgpack-cxx-config-version.cmake +2025-02-13T20:09:00.4873748Z build/lib/cmake/msgpack-cxx/msgpack-cxx-targets.cmake +2025-02-13T20:09:00.4874243Z build/lib/cmake/libuv/ +2025-02-13T20:09:00.4874669Z build/lib/cmake/libuv/libuvConfig-release.cmake +2025-02-13T20:09:00.4875138Z build/lib/cmake/libuv/libuvConfig.cmake +2025-02-13T20:09:00.4875543Z build/lib/cmake/nng/ +2025-02-13T20:09:00.4875900Z build/lib/cmake/nng/nng-config.cmake +2025-02-13T20:09:00.4876317Z build/lib/cmake/nng/nng-config-version.cmake +2025-02-13T20:09:00.4876762Z build/lib/cmake/nng/nng-targets.cmake +2025-02-13T20:09:00.4877205Z build/lib/cmake/nng/nng-targets-release.cmake +2025-02-13T20:09:00.4877648Z build/lib/libgmock.so.1.13.0 +2025-02-13T20:09:00.4878032Z build/lib/libbenchmark.so.1.9.1 +2025-02-13T20:09:00.4878419Z build/lib/libgmock_main.so.1.13.0 +2025-02-13T20:09:00.4878813Z build/lib/libgmock_main.so +2025-02-13T20:09:00.4879179Z build/lib/libbenchmark.so.1 +2025-02-13T20:09:00.4879515Z build/lib/libnng.a +2025-02-13T20:09:00.4884517Z build/lib/libtt_metal.so +2025-02-13T20:09:00.4929994Z build/lib/libuv.a +2025-02-13T20:09:00.4933403Z ttnn/ttnn/_ttnn.so +2025-02-13T20:09:00.4933762Z build/programming_examples/ +2025-02-13T20:09:00.4934185Z build/programming_examples/vecadd_multi_core +2025-02-13T20:09:00.4934658Z build/programming_examples/eltwise_binary +2025-02-13T20:09:00.4937194Z build/programming_examples/matmul_multicore_reuse +2025-02-13T20:09:00.4944278Z build/programming_examples/distributed/ +2025-02-13T20:09:00.4945102Z build/programming_examples/distributed/distributed_program_dispatch +2025-02-13T20:09:00.4945830Z build/programming_examples/distributed/distributed_eltwise_add +2025-02-13T20:09:00.4946462Z build/programming_examples/distributed/distributed_buffer_rw +2025-02-13T20:09:00.4947045Z 
build/programming_examples/hello_world_compute_kernel +2025-02-13T20:09:00.4947534Z build/programming_examples/contributed/ +2025-02-13T20:09:00.4948033Z build/programming_examples/contributed/vecadd +2025-02-13T20:09:00.4948562Z build/programming_examples/profiler/ +2025-02-13T20:09:00.4949305Z build/programming_examples/profiler/test_custom_cycle_count_slow_dispatch +2025-02-13T20:09:00.4950091Z build/programming_examples/profiler/test_timestamped_events +2025-02-13T20:09:00.4952880Z build/programming_examples/profiler/test_custom_cycle_count +2025-02-13T20:09:00.4955286Z build/programming_examples/profiler/test_dispatch_cores +2025-02-13T20:09:00.4957578Z build/programming_examples/profiler/test_multi_op +2025-02-13T20:09:00.4960038Z build/programming_examples/profiler/test_full_buffer +2025-02-13T20:09:00.4962359Z build/programming_examples/vecadd_sharding +2025-02-13T20:09:00.4964856Z build/programming_examples/loopback +2025-02-13T20:09:00.4967386Z build/programming_examples/pad_multi_core +2025-02-13T20:09:00.4968034Z build/programming_examples/hello_world_datatypes_kernel +2025-02-13T20:09:00.4969043Z build/programming_examples/hello_world_datamovement_kernel +2025-02-13T20:09:00.4969627Z build/programming_examples/matmul_multi_core +2025-02-13T20:09:00.4972368Z build/programming_examples/add_2_integers_in_riscv +2025-02-13T20:09:00.4972898Z build/programming_examples/shard_data_rm +2025-02-13T20:09:00.4988966Z build/programming_examples/matmul_single_core +2025-02-13T20:09:00.4989538Z build/programming_examples/eltwise_sfpu +2025-02-13T20:09:00.4990097Z build/programming_examples/add_2_integers_in_compute +2025-02-13T20:09:00.4990840Z build/programming_examples/matmul_multicore_reuse_mcast +2025-02-13T20:09:00.4991343Z build/test/ +2025-02-13T20:09:00.4991627Z build/test/tt_eager/ +2025-02-13T20:09:00.4991956Z build/test/tt_eager/tensors/ +2025-02-13T20:09:00.4992347Z build/test/tt_eager/tensors/test_copy_and_move +2025-02-13T20:09:00.4992843Z build/test/tt_eager/tensors/test_raw_host_memory_pointer +2025-02-13T20:09:00.4993356Z build/test/tt_eager/tensors/test_async_tensor_apis +2025-02-13T20:09:00.4997452Z build/test/tt_eager/tensors/test_host_device_loopback +2025-02-13T20:09:00.4999804Z build/test/tt_eager/ops/ +2025-02-13T20:09:00.5001757Z build/test/tt_eager/ops/test_softmax_op +2025-02-13T20:09:00.5003693Z build/test/tt_eager/ops/test_average_pool +2025-02-13T20:09:00.5006239Z build/test/tt_eager/ops/test_conv_prepare_weights_and_biases +2025-02-13T20:09:00.5009686Z build/test/tt_eager/ops/test_sfpu +2025-02-13T20:09:00.5012915Z build/test/tt_eager/ops/test_sliding_window_ops +2025-02-13T20:09:00.5015920Z build/test/tt_eager/ops/test_layernorm_op +2025-02-13T20:09:00.5018646Z build/test/tt_eager/ops/test_fold_op +2025-02-13T20:09:00.5021745Z build/test/tt_eager/ops/test_eltwise_unary_op +2025-02-13T20:09:00.5026717Z build/test/tt_eager/ops/test_bcast_op +2025-02-13T20:09:00.5030064Z build/test/tt_eager/ops/test_bmm_op +2025-02-13T20:09:00.5033321Z build/test/tt_eager/ops/test_eltwise_binary_op +2025-02-13T20:09:00.5036632Z build/test/tt_eager/integration_tests/ +2025-02-13T20:09:00.5037345Z build/test/tt_eager/integration_tests/test_bert +2025-02-13T20:09:00.5043308Z build/test/ttnn/ +2025-02-13T20:09:00.5043663Z build/test/ttnn/unit_tests_ttnn_ccl +2025-02-13T20:09:00.5060165Z build/test/ttnn/galaxy_unit_tests_ttnn +2025-02-13T20:09:00.5064639Z build/test/ttnn/unit_tests_ttnn +2025-02-13T20:09:00.5080197Z build/test/ttnn/test_multi_device +2025-02-13T20:09:00.5083709Z 
build/test/ttnn/test_distributed_atexit +2025-02-13T20:09:00.5084150Z build/test/ttnn/unit_tests_ttnn_tensor +2025-02-13T20:09:00.5106510Z build/test/ttnn/test_distributed +2025-02-13T20:09:00.5111719Z build/test/tt_metal/ +2025-02-13T20:09:00.5112104Z build/test/tt_metal/test_dataflow_cb +2025-02-13T20:09:00.5114415Z build/test/tt_metal/test_add_two_ints +2025-02-13T20:09:00.5116925Z build/test/tt_metal/test_dram_loopback_single_core +2025-02-13T20:09:00.5119073Z build/test/tt_metal/test_interleaved_l1_buffer +2025-02-13T20:09:00.5121474Z build/test/tt_metal/unit_tests_debug_tools_grayskull +2025-02-13T20:09:00.5130495Z build/test/tt_metal/unit_tests_eth_wormhole_b0 +2025-02-13T20:09:00.5135413Z build/test/tt_metal/unit_tests_api_grayskull +2025-02-13T20:09:00.5150560Z build/test/tt_metal/test_clean_init +2025-02-13T20:09:00.5153078Z build/test/tt_metal/test_multiple_programs +2025-02-13T20:09:00.5155820Z build/test/tt_metal/tt_fabric/ +2025-02-13T20:09:00.5156260Z build/test/tt_metal/tt_fabric/fabric_unit_tests +2025-02-13T20:09:00.5158407Z build/test/tt_metal/test_core_range_set +2025-02-13T20:09:00.5161187Z build/test/tt_metal/unit_tests_api_wormhole_b0 +2025-02-13T20:09:00.5175931Z build/test/tt_metal/test_stress_noc_mcast +2025-02-13T20:09:00.5178222Z build/test/tt_metal/unit_tests_dispatch_blackhole +2025-02-13T20:09:00.5201590Z build/test/tt_metal/distributed/ +2025-02-13T20:09:00.5202190Z build/test/tt_metal/distributed/distributed_unit_tests_wormhole_b0 +2025-02-13T20:09:00.5205298Z build/test/tt_metal/distributed/distributed_unit_tests_grayskull +2025-02-13T20:09:00.5212807Z build/test/tt_metal/distributed/distributed_unit_tests_blackhole +2025-02-13T20:09:00.5218768Z build/test/tt_metal/unit_tests_eth_grayskull +2025-02-13T20:09:00.5223534Z build/test/tt_metal/unit_tests_device +2025-02-13T20:09:00.5228406Z build/test/tt_metal/test_bmm +2025-02-13T20:09:00.5231183Z build/test/tt_metal/unit_tests_noc +2025-02-13T20:09:00.5234042Z build/test/tt_metal/test_matmul_single_tile_bfp8b +2025-02-13T20:09:00.5236650Z build/test/tt_metal/unit_tests_debug_tools_wormhole_b0 +2025-02-13T20:09:00.5245560Z build/test/tt_metal/test_matmul_single_tile_output_in_l1 +2025-02-13T20:09:00.5247973Z build/test/tt_metal/unit_tests_eth_blackhole +2025-02-13T20:09:00.5253646Z build/test/tt_metal/test_compile_program +2025-02-13T20:09:00.5256459Z build/test/tt_metal/test_interleaved_layouts +2025-02-13T20:09:00.5259031Z build/test/tt_metal/test_eltwise_binary +2025-02-13T20:09:00.5261667Z build/test/tt_metal/test_dram_copy_sticks_multi_core +2025-02-13T20:09:00.5264024Z build/test/tt_metal/unit_tests_dispatch_wormhole_b0 +2025-02-13T20:09:00.5282904Z build/test/tt_metal/unit_tests_debug_tools_blackhole +2025-02-13T20:09:00.5291389Z build/test/tt_metal/test_datacopy_output_in_l1 +2025-02-13T20:09:00.5293514Z build/test/tt_metal/test_multi_core_kernel +2025-02-13T20:09:00.5296096Z build/test/tt_metal/unit_tests_integration +2025-02-13T20:09:00.5305193Z build/test/tt_metal/test_datacopy_bfp8b +2025-02-13T20:09:00.5307426Z build/test/tt_metal/test_transpose_hc +2025-02-13T20:09:00.5310086Z build/test/tt_metal/unit_tests_lightmetal +2025-02-13T20:09:00.5313550Z build/test/tt_metal/test_generic_binary_reader_matmul_large_block +2025-02-13T20:09:00.5316178Z build/test/tt_metal/test_bcast +2025-02-13T20:09:00.5318954Z build/test/tt_metal/test_untilize_eltwise_binary +2025-02-13T20:09:00.5321367Z build/test/tt_metal/test_bfp8_conversion +2025-02-13T20:09:00.5323804Z build/test/tt_metal/unit_tests_llk 
+2025-02-13T20:09:00.5334936Z build/test/tt_metal/test_compile_sets_kernel_binaries +2025-02-13T20:09:00.5337473Z build/test/tt_metal/unit_tests_stl +2025-02-13T20:09:00.5343271Z build/test/tt_metal/unit_tests_api_blackhole +2025-02-13T20:09:00.5356894Z build/test/tt_metal/test_datacopy +2025-02-13T20:09:00.5359133Z build/test/tt_metal/perf_microbenchmark/ +2025-02-13T20:09:00.5359651Z build/test/tt_metal/perf_microbenchmark/1_compute_mm/ +2025-02-13T20:09:00.5360309Z build/test/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm_grayskull +2025-02-13T20:09:00.5364122Z build/test/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm_wormhole_b0 +2025-02-13T20:09:00.5368213Z build/test/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm_blackhole +2025-02-13T20:09:00.5372466Z build/test/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/ +2025-02-13T20:09:00.5373440Z build/test/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write_blackhole +2025-02-13T20:09:00.5376494Z build/test/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write_wormhole_b0 +2025-02-13T20:09:00.5379995Z build/test/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write_grayskull +2025-02-13T20:09:00.5383232Z build/test/tt_metal/perf_microbenchmark/2_noc_rtor/ +2025-02-13T20:09:00.5383892Z build/test/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor_blackhole +2025-02-13T20:09:00.5386597Z build/test/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor_wormhole_b0 +2025-02-13T20:09:00.5389205Z build/test/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor_grayskull +2025-02-13T20:09:00.5391820Z build/test/tt_metal/perf_microbenchmark/6_dram_offchip/ +2025-02-13T20:09:00.5392555Z build/test/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip_wormhole_b0 +2025-02-13T20:09:00.5395075Z build/test/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip_grayskull +2025-02-13T20:09:00.5398019Z build/test/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip_blackhole +2025-02-13T20:09:00.5401110Z build/test/tt_metal/perf_microbenchmark/7_kernel_launch/ +2025-02-13T20:09:00.5401809Z build/test/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch_grayskull +2025-02-13T20:09:00.5403644Z build/test/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch_blackhole +2025-02-13T20:09:00.5406391Z build/test/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch_wormhole_b0 +2025-02-13T20:09:00.5409122Z build/test/tt_metal/perf_microbenchmark/3_pcie_transfer/ +2025-02-13T20:09:00.5409876Z build/test/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer_grayskull +2025-02-13T20:09:00.5411912Z build/test/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie_wormhole_b0 +2025-02-13T20:09:00.5414665Z build/test/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer_blackhole +2025-02-13T20:09:00.5417208Z build/test/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie_grayskull +2025-02-13T20:09:00.5419894Z build/test/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie_blackhole +2025-02-13T20:09:00.5422651Z build/test/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer_wormhole_b0 +2025-02-13T20:09:00.5425205Z build/test/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/ +2025-02-13T20:09:00.5426096Z 
build/test/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb_wormhole_b0 +2025-02-13T20:09:00.5429965Z build/test/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb_blackhole +2025-02-13T20:09:00.5433491Z build/test/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb_grayskull +2025-02-13T20:09:00.5437013Z build/test/tt_metal/perf_microbenchmark/dispatch/ +2025-02-13T20:09:00.5437640Z build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher_grayskull +2025-02-13T20:09:00.5442436Z build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency_wormhole_b0 +2025-02-13T20:09:00.5445566Z build/test/tt_metal/perf_microbenchmark/dispatch/test_dispatcher_grayskull +2025-02-13T20:09:00.5449949Z build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_blackhole +2025-02-13T20:09:00.5453084Z build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher_blackhole +2025-02-13T20:09:00.5458177Z build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_grayskull +2025-02-13T20:09:00.5461273Z build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_wormhole_b0 +2025-02-13T20:09:00.5464399Z build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency_grayskull +2025-02-13T20:09:00.5467512Z build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher_wormhole_b0 +2025-02-13T20:09:00.5472274Z build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency_blackhole +2025-02-13T20:09:00.5475450Z build/test/tt_metal/perf_microbenchmark/dispatch/test_dispatcher_blackhole +2025-02-13T20:09:00.5478991Z build/test/tt_metal/perf_microbenchmark/dispatch/test_dispatcher_wormhole_b0 +2025-02-13T20:09:00.5482699Z build/test/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/ +2025-02-13T20:09:00.5483609Z build/test/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul_grayskull +2025-02-13T20:09:00.5487006Z build/test/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul_wormhole_b0 +2025-02-13T20:09:00.5491144Z build/test/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul_blackhole +2025-02-13T20:09:00.5494649Z build/test/tt_metal/perf_microbenchmark/routing/ +2025-02-13T20:09:00.5495323Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep_blackhole +2025-02-13T20:09:00.5498655Z build/test/tt_metal/perf_microbenchmark/routing/test_tx_rx_wormhole_b0 +2025-02-13T20:09:00.5501296Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel_blackhole +2025-02-13T20:09:00.5504825Z build/test/tt_metal/perf_microbenchmark/routing/test_tx_rx_blackhole +2025-02-13T20:09:00.5507285Z build/test/tt_metal/perf_microbenchmark/routing/test_tx_rx_grayskull +2025-02-13T20:09:00.5509852Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep_grayskull +2025-02-13T20:09:00.5513868Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux_blackhole +2025-02-13T20:09:00.5517381Z build/test/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level_blackhole +2025-02-13T20:09:00.5520188Z build/test/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level_wormhole_b0 +2025-02-13T20:09:00.5523071Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel_blackhole +2025-02-13T20:09:00.5526520Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel_grayskull 
+2025-02-13T20:09:00.5530336Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux_wormhole_b0 +2025-02-13T20:09:00.5533711Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep_blackhole +2025-02-13T20:09:00.5537677Z build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity_grayskull +2025-02-13T20:09:00.5540776Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep_grayskull +2025-02-13T20:09:00.5544521Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel_grayskull +2025-02-13T20:09:00.5548231Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux_grayskull +2025-02-13T20:09:00.5552010Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep_wormhole_b0 +2025-02-13T20:09:00.5555691Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel_wormhole_b0 +2025-02-13T20:09:00.5559398Z build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity_blackhole +2025-02-13T20:09:00.5562342Z build/test/tt_metal/perf_microbenchmark/routing/test_mux_demux_blackhole +2025-02-13T20:09:00.5565914Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep_wormhole_b0 +2025-02-13T20:09:00.5570032Z build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity_wormhole_b0 +2025-02-13T20:09:00.5573053Z build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_grayskull +2025-02-13T20:09:00.5577615Z build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity_blackhole +2025-02-13T20:09:00.5580759Z build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity_grayskull +2025-02-13T20:09:00.5583458Z build/test/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level_grayskull +2025-02-13T20:09:00.5586230Z build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_blackhole +2025-02-13T20:09:00.5590700Z build/test/tt_metal/perf_microbenchmark/routing/test_mux_demux_wormhole_b0 +2025-02-13T20:09:00.5594236Z build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 +2025-02-13T20:09:00.5598449Z build/test/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel_wormhole_b0 +2025-02-13T20:09:00.5602146Z build/test/tt_metal/perf_microbenchmark/routing/test_mux_demux_grayskull +2025-02-13T20:09:00.5605641Z build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity_wormhole_b0 +2025-02-13T20:09:00.5608669Z build/test/tt_metal/perf_microbenchmark/2_noc_adjacent/ +2025-02-13T20:09:00.5609390Z build/test/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent_blackhole +2025-02-13T20:09:00.5611975Z build/test/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent_wormhole_b0 +2025-02-13T20:09:00.5614654Z build/test/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent_grayskull +2025-02-13T20:09:00.5617526Z build/test/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/ +2025-02-13T20:09:00.5618329Z build/test/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read_grayskull +2025-02-13T20:09:00.5620929Z build/test/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read_blackhole +2025-02-13T20:09:00.5624167Z build/test/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read_wormhole_b0 +2025-02-13T20:09:00.5627294Z build/test/tt_metal/perf_microbenchmark/noc/ +2025-02-13T20:09:00.5628068Z 
build/test/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency_blackhole +2025-02-13T20:09:00.5630195Z build/test/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency_wormhole_b0 +2025-02-13T20:09:00.5632714Z build/test/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency_grayskull +2025-02-13T20:09:00.5634971Z build/test/tt_metal/perf_microbenchmark/ethernet/ +2025-02-13T20:09:00.5635758Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm_grayskull +2025-02-13T20:09:00.5638220Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm_grayskull +2025-02-13T20:09:00.5641286Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm_wormhole_b0 +2025-02-13T20:09:00.5644064Z build/test/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional_wormhole_b0 +2025-02-13T20:09:00.5646795Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm_wormhole_b0 +2025-02-13T20:09:00.5650012Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm_blackhole +2025-02-13T20:09:00.5652575Z build/test/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional_blackhole +2025-02-13T20:09:00.5655695Z build/test/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional_grayskull +2025-02-13T20:09:00.5658429Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm_blackhole +2025-02-13T20:09:00.5661316Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm_blackhole +2025-02-13T20:09:00.5664114Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data_blackhole +2025-02-13T20:09:00.5667007Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data_grayskull +2025-02-13T20:09:00.5669816Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data_wormhole_b0 +2025-02-13T20:09:00.5672513Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm_grayskull +2025-02-13T20:09:00.5675168Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm_grayskull +2025-02-13T20:09:00.5678314Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm_blackhole +2025-02-13T20:09:00.5681160Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm_wormhole_b0 +2025-02-13T20:09:00.5684076Z build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm_wormhole_b0 +2025-02-13T20:09:00.5686575Z build/test/tt_metal/perf_microbenchmark/old/ +2025-02-13T20:09:00.5687082Z build/test/tt_metal/perf_microbenchmark/old/matmul/ +2025-02-13T20:09:00.5687827Z build/test/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1_grayskull +2025-02-13T20:09:00.5690720Z build/test/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1_blackhole +2025-02-13T20:09:00.5694097Z build/test/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1_wormhole_b0 +2025-02-13T20:09:00.5697290Z build/test/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1_wormhole_b0 +2025-02-13T20:09:00.5701221Z build/test/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1_blackhole +2025-02-13T20:09:00.5704896Z 
build/test/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1_grayskull +2025-02-13T20:09:00.5708604Z build/test/tt_metal/perf_microbenchmark/old/pcie/ +2025-02-13T20:09:00.5709229Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1_grayskull +2025-02-13T20:09:00.5711418Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1_wormhole_b0 +2025-02-13T20:09:00.5713838Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer_grayskull +2025-02-13T20:09:00.5716272Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer_old_wormhole_b0 +2025-02-13T20:09:00.5718807Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram_grayskull +2025-02-13T20:09:00.5721508Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram_blackhole +2025-02-13T20:09:00.5723699Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1_blackhole +2025-02-13T20:09:00.5726195Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer_old_grayskull +2025-02-13T20:09:00.5729101Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer_blackhole +2025-02-13T20:09:00.5731840Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer_wormhole_b0 +2025-02-13T20:09:00.5734373Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer_old_blackhole +2025-02-13T20:09:00.5736852Z build/test/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram_wormhole_b0 +2025-02-13T20:09:00.5739210Z build/test/tt_metal/perf_microbenchmark/old/noc/ +2025-02-13T20:09:00.5739861Z build/test/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1_grayskull +2025-02-13T20:09:00.5742770Z build/test/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1_grayskull +2025-02-13T20:09:00.5745844Z build/test/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1_wormhole_b0 +2025-02-13T20:09:00.5749195Z build/test/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1_blackhole +2025-02-13T20:09:00.5752126Z build/test/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1_blackhole +2025-02-13T20:09:00.5755349Z build/test/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1_wormhole_b0 +2025-02-13T20:09:00.5758407Z build/test/tt_metal/unit_tests_dispatch_grayskull +2025-02-13T20:09:00.5781198Z build/tools/ +2025-02-13T20:09:00.5781543Z build/tools/watcher_dump +2025-02-13T20:09:00.5783803Z build/tools/lightmetal_runner +2025-02-13T20:09:00.5786245Z build/tt-train/ +2025-02-13T20:09:00.5786618Z build/tt-train/DartConfiguration.tcl +2025-02-13T20:09:00.5787059Z build/tt-train/CMakeFiles/ +2025-02-13T20:09:00.5787485Z build/tt-train/CTestTestfile.cmake +2025-02-13T20:09:00.5787872Z build/tt-train/sources/ +2025-02-13T20:09:00.5788235Z build/tt-train/sources/CMakeFiles/ +2025-02-13T20:09:00.5788664Z build/tt-train/sources/CTestTestfile.cmake +2025-02-13T20:09:00.5789142Z build/tt-train/sources/ttml/ +2025-02-13T20:09:00.5789539Z build/tt-train/sources/ttml/libttml.a +2025-02-13T20:09:00.5945085Z build/tt-train/sources/ttml/CMakeFiles/ +2025-02-13T20:09:00.5945634Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ +2025-02-13T20:09:00.5946210Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/optimizers/ +2025-02-13T20:09:00.5946938Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/optimizers/optimizer_base.cpp.o +2025-02-13T20:09:00.5949713Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/optimizers/sgd.cpp.o +2025-02-13T20:09:00.5953345Z 
build/tt-train/sources/ttml/CMakeFiles/ttml.dir/optimizers/adamw.cpp.o +2025-02-13T20:09:00.5960156Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/ +2025-02-13T20:09:00.5960799Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/unary_ops.cpp.o +2025-02-13T20:09:00.5965969Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/binary_ops.cpp.o +2025-02-13T20:09:00.5970386Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/embedding_op.cpp.o +2025-02-13T20:09:00.5972605Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/losses.cpp.o +2025-02-13T20:09:00.5977125Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/distributed/ +2025-02-13T20:09:00.5977904Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/distributed/comm_ops.cpp.o +2025-02-13T20:09:00.5979401Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/scaled_dot_product_attention.cpp.o +2025-02-13T20:09:00.5983161Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/layernorm_op.cpp.o +2025-02-13T20:09:00.5991565Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/linear_op.cpp.o +2025-02-13T20:09:00.5994465Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/multi_head_utils.cpp.o +2025-02-13T20:09:00.5997600Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/ops/dropout_op.cpp.o +2025-02-13T20:09:00.5999199Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/ +2025-02-13T20:09:00.5999840Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/gpt_block.cpp.o +2025-02-13T20:09:00.6002241Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/positional_embeddings.cpp.o +2025-02-13T20:09:00.6004216Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/single_head_attention.cpp.o +2025-02-13T20:09:00.6005193Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/multi_layer_perceptron.cpp.o +2025-02-13T20:09:00.6006230Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/distributed/ +2025-02-13T20:09:00.6007029Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/distributed/linear.cpp.o +2025-02-13T20:09:00.6009870Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/embedding_module.cpp.o +2025-02-13T20:09:00.6013027Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/linear_module.cpp.o +2025-02-13T20:09:00.6013877Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/multi_head_attention.cpp.o +2025-02-13T20:09:00.6014716Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/layer_norm_module.cpp.o +2025-02-13T20:09:00.6015541Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/modules/dropout_module.cpp.o +2025-02-13T20:09:00.6016696Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/models/ +2025-02-13T20:09:00.6017377Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/models/gpt2.cpp.o +2025-02-13T20:09:00.6021877Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/models/mlp.cpp.o +2025-02-13T20:09:00.6023732Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/models/linear_regression.cpp.o +2025-02-13T20:09:00.6024448Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/init/ +2025-02-13T20:09:00.6025144Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/init/tensor_initializers.cpp.o +2025-02-13T20:09:00.6025969Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/init/cpu_initializers.cpp.o +2025-02-13T20:09:00.6026706Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/serialization/ +2025-02-13T20:09:00.6027467Z build/tt-train/sources/ttml/CMakeFiles/ttml.dir/serialization/msgpack_file.cpp.o +2025-02-13T20:09:00.6032648Z 
[CI build-artifact log truncated: timestamped listing of tt-train build outputs (ttml library, example and ttml_tests object files, wandb-cpp), tokenizer/dataset data files, runtime/hw libraries and linker scripts for blackhole/wormhole/grayskull, and the bundled sfpi riscv32-unknown-elf toolchain headers (newlib, libstdc++ 12.4.0).]
+2025-02-13T20:09:00.7353451Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/concept_check.h +2025-02-13T20:09:00.7354221Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/std_mutex.h +2025-02-13T20:09:00.7355166Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_map.h +2025-02-13T20:09:00.7355915Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_algobase.h +2025-02-13T20:09:00.7356671Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_tree.h +2025-02-13T20:09:00.7357430Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_relops.h +2025-02-13T20:09:00.7358240Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/move_only_function.h +2025-02-13T20:09:00.7359229Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/ranges_algobase.h +2025-02-13T20:09:00.7360036Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/mofunc_impl.h +2025-02-13T20:09:00.7360944Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/erase_if.h +2025-02-13T20:09:00.7361708Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/locale_facets.tcc +2025-02-13T20:09:00.7362521Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/exception_defines.h +2025-02-13T20:09:00.7363358Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/parse_numbers.h +2025-02-13T20:09:00.7364145Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/algorithmfwd.h +2025-02-13T20:09:00.7364895Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/random.tcc +2025-02-13T20:09:00.7365897Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/enable_special_members.h +2025-02-13T20:09:00.7366721Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/locale_facets.h +2025-02-13T20:09:00.7367507Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/basic_ios.tcc +2025-02-13T20:09:00.7368392Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/ios_base.h +2025-02-13T20:09:00.7369177Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/this_thread_sleep.h +2025-02-13T20:09:00.7381689Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/deque.tcc +2025-02-13T20:09:00.7382546Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/unordered_map.h +2025-02-13T20:09:00.7383366Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/shared_ptr_atomic.h +2025-02-13T20:09:00.7384208Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_stack.h +2025-02-13T20:09:00.7384966Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/functexcept.h +2025-02-13T20:09:00.7385738Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/exception_ptr.h +2025-02-13T20:09:00.7386701Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_set.h +2025-02-13T20:09:00.7387426Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/chrono.h +2025-02-13T20:09:00.7388156Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/shared_ptr.h +2025-02-13T20:09:00.7388898Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/localefwd.h +2025-02-13T20:09:00.7389640Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/charconv.h +2025-02-13T20:09:00.7390390Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/vector.tcc 
+2025-02-13T20:09:00.7391305Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/uniform_int_dist.h +2025-02-13T20:09:00.7392146Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/forward_list.h +2025-02-13T20:09:00.7392921Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/atomic_futex.h +2025-02-13T20:09:00.7393903Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/functional_hash.h +2025-02-13T20:09:00.7394884Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/align.h +2025-02-13T20:09:00.7395604Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/fs_fwd.h +2025-02-13T20:09:00.7396344Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/alloc_traits.h +2025-02-13T20:09:00.7397117Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/valarray_after.h +2025-02-13T20:09:00.7397890Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/ranges_base.h +2025-02-13T20:09:00.7398644Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_function.h +2025-02-13T20:09:00.7399409Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/locale_conv.h +2025-02-13T20:09:00.7400211Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/locale_facets_nonio.h +2025-02-13T20:09:00.7401017Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/regex_automaton.h +2025-02-13T20:09:00.7401998Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/regex_scanner.tcc +2025-02-13T20:09:00.7403046Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_iterator_base_types.h +2025-02-13T20:09:00.7403865Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/ranges_algo.h +2025-02-13T20:09:00.7404785Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/memoryfwd.h +2025-02-13T20:09:00.7405590Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/locale_facets_nonio.tcc +2025-02-13T20:09:00.7406436Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/iterator_concepts.h +2025-02-13T20:09:00.7407231Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_heap.h +2025-02-13T20:09:00.7408100Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stream_iterator.h +2025-02-13T20:09:00.7408886Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/regex_error.h +2025-02-13T20:09:00.7409651Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/hash_bytes.h +2025-02-13T20:09:00.7410387Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/fs_dir.h +2025-02-13T20:09:00.7411153Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/valarray_array.tcc +2025-02-13T20:09:00.7412121Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_bvector.h +2025-02-13T20:09:00.7412960Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/streambuf_iterator.h +2025-02-13T20:09:00.7413817Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/utility.h +2025-02-13T20:09:00.7414618Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/cxxabi_forced.h +2025-02-13T20:09:00.7415538Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/cow_string.h +2025-02-13T20:09:00.7416298Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/unique_lock.h +2025-02-13T20:09:00.7417069Z 
runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/regex_scanner.h +2025-02-13T20:09:00.7417845Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/refwrap.h +2025-02-13T20:09:00.7418574Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/std_abs.h +2025-02-13T20:09:00.7419288Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/gslice.h +2025-02-13T20:09:00.7420153Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/basic_ios.h +2025-02-13T20:09:00.7420956Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/sstream.tcc +2025-02-13T20:09:00.7421710Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/ranges_cmp.h +2025-02-13T20:09:00.7422638Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/hashtable.h +2025-02-13T20:09:00.7422964Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_deque.h +2025-02-13T20:09:00.7423300Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/locale_classes.tcc +2025-02-13T20:09:00.7423823Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/cpp_type_traits.h +2025-02-13T20:09:00.7424143Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/ostream.tcc +2025-02-13T20:09:00.7424465Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/stl_numeric.h +2025-02-13T20:09:00.7424755Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/fs_ops.h +2025-02-13T20:09:00.7425078Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/bits/unique_ptr.h +2025-02-13T20:09:00.7425329Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/ +2025-02-13T20:09:00.7425784Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/hash_set +2025-02-13T20:09:00.7426120Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/aligned_buffer.h +2025-02-13T20:09:00.7426455Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/rc_string_base.h +2025-02-13T20:09:00.7426805Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/stdio_sync_filebuf.h +2025-02-13T20:09:00.7427322Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/malloc_allocator.h +2025-02-13T20:09:00.7427666Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/bitmap_allocator.h +2025-02-13T20:09:00.7427986Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/enc_filebuf.h +2025-02-13T20:09:00.7428296Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pointer.h +2025-02-13T20:09:00.7428644Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/string_conversions.h +2025-02-13T20:09:00.7428953Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/algorithm +2025-02-13T20:09:00.7429269Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/vstring_util.h +2025-02-13T20:09:00.7429571Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/ropeimpl.h +2025-02-13T20:09:00.7429898Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/new_allocator.h +2025-02-13T20:09:00.7430222Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/type_traits.h +2025-02-13T20:09:00.7430509Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/numeric +2025-02-13T20:09:00.7430851Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/debug_allocator.h +2025-02-13T20:09:00.7431164Z 
runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/hash_map +2025-02-13T20:09:00.7431504Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/vstring.tcc +2025-02-13T20:09:00.7431789Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/slist +2025-02-13T20:09:00.7432135Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pod_char_traits.h +2025-02-13T20:09:00.7432517Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/codecvt_specializations.h +2025-02-13T20:09:00.7432815Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/random.tcc +2025-02-13T20:09:00.7433253Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/vstring.h +2025-02-13T20:09:00.7433575Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/cast.h +2025-02-13T20:09:00.7433881Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/atomicity.h +2025-02-13T20:09:00.7434201Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/vstring_fwd.h +2025-02-13T20:09:00.7434556Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/extptr_allocator.h +2025-02-13T20:09:00.7434871Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/alloc_traits.h +2025-02-13T20:09:00.7435188Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/functional +2025-02-13T20:09:00.7435508Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/sso_string_base.h +2025-02-13T20:09:00.7435799Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/memory +2025-02-13T20:09:00.7436123Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/stdio_filebuf.h +2025-02-13T20:09:00.7436781Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/concurrence.h +2025-02-13T20:09:00.7437084Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/cmath +2025-02-13T20:09:00.7437400Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/typelist.h +2025-02-13T20:09:00.7437723Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/throw_allocator.h +2025-02-13T20:09:00.7438052Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/numeric_traits.h +2025-02-13T20:09:00.7438339Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/iterator +2025-02-13T20:09:00.7438619Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/random +2025-02-13T20:09:00.7438904Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/ +2025-02-13T20:09:00.7439291Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/hash_policy.hpp +2025-02-13T20:09:00.7439636Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/trie_policy.hpp +2025-02-13T20:09:00.7440008Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/tag_and_trait.hpp +2025-02-13T20:09:00.7440523Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/list_update_policy.hpp +2025-02-13T20:09:00.7440876Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/tree_policy.hpp +2025-02-13T20:09:00.7441235Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/exception.hpp +2025-02-13T20:09:00.7441598Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/assoc_container.hpp +2025-02-13T20:09:00.7441958Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ +2025-02-13T20:09:00.7442392Z 
runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/standard_policies.hpp +2025-02-13T20:09:00.7442790Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/ +2025-02-13T20:09:00.7443356Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/constructor_destructor_fn_imps.hpp +2025-02-13T20:09:00.7444109Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/find_fn_imps.hpp +2025-02-13T20:09:00.7444585Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/debug_fn_imps.hpp +2025-02-13T20:09:00.7445064Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/erase_fn_imps.hpp +2025-02-13T20:09:00.7445531Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/info_fn_imps.hpp +2025-02-13T20:09:00.7446021Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/iterators_fn_imps.hpp +2025-02-13T20:09:00.7446491Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/insert_fn_imps.hpp +2025-02-13T20:09:00.7446956Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/lu_map_.hpp +2025-02-13T20:09:00.7447455Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/entry_metadata_base.hpp +2025-02-13T20:09:00.7448087Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_map_/trace_fn_imps.hpp +2025-02-13T20:09:00.7448464Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/trie_policy/ +2025-02-13T20:09:00.7448987Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/trie_policy/sample_trie_access_traits.hpp +2025-02-13T20:09:00.7449469Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/trie_policy/node_metadata_selector.hpp +2025-02-13T20:09:00.7450005Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/trie_policy/trie_string_access_traits_imp.hpp +2025-02-13T20:09:00.7450700Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/trie_policy/sample_trie_node_update.hpp +2025-02-13T20:09:00.7451240Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/trie_policy/prefix_search_node_update_imp.hpp +2025-02-13T20:09:00.7451716Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/trie_policy/order_statistics_imp.hpp +2025-02-13T20:09:00.7452186Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/trie_policy/trie_policy_base.hpp +2025-02-13T20:09:00.7452585Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pairing_heap_/ +2025-02-13T20:09:00.7453078Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pairing_heap_/find_fn_imps.hpp +2025-02-13T20:09:00.7453582Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pairing_heap_/debug_fn_imps.hpp +2025-02-13T20:09:00.7454032Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pairing_heap_/erase_fn_imps.hpp +2025-02-13T20:09:00.7454749Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pairing_heap_/pairing_heap_.hpp +2025-02-13T20:09:00.7455434Z 
runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pairing_heap_/split_join_fn_imps.hpp +2025-02-13T20:09:00.7455984Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pairing_heap_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7456435Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pairing_heap_/insert_fn_imps.hpp +2025-02-13T20:09:00.7456815Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/ +2025-02-13T20:09:00.7457259Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/debug_fn_imps.hpp +2025-02-13T20:09:00.7457716Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/erase_fn_imps.hpp +2025-02-13T20:09:00.7458148Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/traits.hpp +2025-02-13T20:09:00.7458608Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/info_fn_imps.hpp +2025-02-13T20:09:00.7459055Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/node_iterators.hpp +2025-02-13T20:09:00.7459532Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/iterators_fn_imps.hpp +2025-02-13T20:09:00.7459994Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/split_join_fn_imps.hpp +2025-02-13T20:09:00.7460479Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/policy_access_fn_imps.hpp +2025-02-13T20:09:00.7461004Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7461473Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/insert_fn_imps.hpp +2025-02-13T20:09:00.7461923Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/ov_tree_map_/ov_tree_map_.hpp +2025-02-13T20:09:00.7462375Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/ +2025-02-13T20:09:00.7462827Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/find_fn_imps.hpp +2025-02-13T20:09:00.7463310Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/debug_fn_imps.hpp +2025-02-13T20:09:00.7463797Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/erase_fn_imps.hpp +2025-02-13T20:09:00.7464293Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/traits.hpp +2025-02-13T20:09:00.7464777Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/info_fn_imps.hpp +2025-02-13T20:09:00.7465611Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/node_iterators.hpp +2025-02-13T20:09:00.7466122Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/iterators_fn_imps.hpp +2025-02-13T20:09:00.7466611Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/split_join_fn_imps.hpp +2025-02-13T20:09:00.7467110Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/policy_access_fn_imps.hpp +2025-02-13T20:09:00.7467599Z 
runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/point_iterators.hpp +2025-02-13T20:09:00.7468154Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7468632Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/insert_fn_imps.hpp +2025-02-13T20:09:00.7469113Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/r_erase_fn_imps.hpp +2025-02-13T20:09:00.7469721Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/rotate_fn_imps.hpp +2025-02-13T20:09:00.7470194Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/bin_search_tree_/bin_search_tree_.hpp +2025-02-13T20:09:00.7470808Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_/ +2025-02-13T20:09:00.7471282Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_/debug_fn_imps.hpp +2025-02-13T20:09:00.7471828Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7472291Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_/binomial_heap_.hpp +2025-02-13T20:09:00.7472751Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/ +2025-02-13T20:09:00.7473292Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/debug_fn_imps.hpp +2025-02-13T20:09:00.7473810Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/erase_fn_imps.hpp +2025-02-13T20:09:00.7474377Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/point_const_iterator.hpp +2025-02-13T20:09:00.7474891Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/info_fn_imps.hpp +2025-02-13T20:09:00.7475454Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/iterators_fn_imps.hpp +2025-02-13T20:09:00.7476176Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/node.hpp +2025-02-13T20:09:00.7476786Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/policy_access_fn_imps.hpp +2025-02-13T20:09:00.7477384Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/left_child_next_sibling_heap_.hpp +2025-02-13T20:09:00.7477999Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7478523Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/insert_fn_imps.hpp +2025-02-13T20:09:00.7479056Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/trace_fn_imps.hpp +2025-02-13T20:09:00.7479582Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/left_child_next_sibling_heap_/const_iterator.hpp +2025-02-13T20:09:00.7480185Z 
runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/container_base_dispatch.hpp +2025-02-13T20:09:00.7480607Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/unordered_iterator/ +2025-02-13T20:09:00.7481134Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/unordered_iterator/point_const_iterator.hpp +2025-02-13T20:09:00.7481658Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/unordered_iterator/point_iterator.hpp +2025-02-13T20:09:00.7482155Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/unordered_iterator/const_iterator.hpp +2025-02-13T20:09:00.7482612Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/unordered_iterator/iterator.hpp +2025-02-13T20:09:00.7483035Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_base_/ +2025-02-13T20:09:00.7483558Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_base_/binomial_heap_base_.hpp +2025-02-13T20:09:00.7484176Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_base_/find_fn_imps.hpp +2025-02-13T20:09:00.7484653Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_base_/debug_fn_imps.hpp +2025-02-13T20:09:00.7485140Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_base_/erase_fn_imps.hpp +2025-02-13T20:09:00.7485637Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_base_/split_join_fn_imps.hpp +2025-02-13T20:09:00.7486436Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_base_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7486947Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binomial_heap_base_/insert_fn_imps.hpp +2025-02-13T20:09:00.7487360Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/branch_policy/ +2025-02-13T20:09:00.7487950Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/branch_policy/null_node_metadata.hpp +2025-02-13T20:09:00.7488383Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/branch_policy/traits.hpp +2025-02-13T20:09:00.7488854Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/branch_policy/branch_policy.hpp +2025-02-13T20:09:00.7489317Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/priority_queue_base_dispatch.hpp +2025-02-13T20:09:00.7489688Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/ +2025-02-13T20:09:00.7490134Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/find_fn_imps.hpp +2025-02-13T20:09:00.7490579Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/splay_fn_imps.hpp +2025-02-13T20:09:00.7491045Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/debug_fn_imps.hpp +2025-02-13T20:09:00.7491500Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/erase_fn_imps.hpp +2025-02-13T20:09:00.7491959Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/traits.hpp +2025-02-13T20:09:00.7492448Z 
runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/info_fn_imps.hpp +2025-02-13T20:09:00.7492914Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/split_join_fn_imps.hpp +2025-02-13T20:09:00.7493320Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/node.hpp +2025-02-13T20:09:00.7493994Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/splay_tree_.hpp +2025-02-13T20:09:00.7494712Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7495183Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/splay_tree_/insert_fn_imps.hpp +2025-02-13T20:09:00.7495541Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/ +2025-02-13T20:09:00.7495985Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/find_fn_imps.hpp +2025-02-13T20:09:00.7496442Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/insert_join_fn_imps.hpp +2025-02-13T20:09:00.7497048Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/debug_fn_imps.hpp +2025-02-13T20:09:00.7497497Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/erase_fn_imps.hpp +2025-02-13T20:09:00.7497919Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/traits.hpp +2025-02-13T20:09:00.7498378Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/info_fn_imps.hpp +2025-02-13T20:09:00.7498980Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/iterators_fn_imps.hpp +2025-02-13T20:09:00.7499452Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/policy_access_fn_imps.hpp +2025-02-13T20:09:00.7499975Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7500422Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/trace_fn_imps.hpp +2025-02-13T20:09:00.7500847Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/pat_trie_.hpp +2025-02-13T20:09:00.7501294Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/r_erase_fn_imps.hpp +2025-02-13T20:09:00.7501766Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/synth_access_traits.hpp +2025-02-13T20:09:00.7502294Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/pat_trie_base.hpp +2025-02-13T20:09:00.7502789Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/rotate_fn_imps.hpp +2025-02-13T20:09:00.7503236Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/split_fn_imps.hpp +2025-02-13T20:09:00.7503678Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/pat_trie_/update_fn_imps.hpp +2025-02-13T20:09:00.7504048Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/tree_policy/ +2025-02-13T20:09:00.7504706Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/tree_policy/node_metadata_selector.hpp 
+2025-02-13T20:09:00.7505210Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/tree_policy/sample_tree_node_update.hpp +2025-02-13T20:09:00.7505700Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/tree_policy/order_statistics_imp.hpp +2025-02-13T20:09:00.7506098Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/type_utils.hpp +2025-02-13T20:09:00.7506496Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/rc_binomial_heap_/ +2025-02-13T20:09:00.7506970Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/rc_binomial_heap_/debug_fn_imps.hpp +2025-02-13T20:09:00.7507447Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/rc_binomial_heap_/erase_fn_imps.hpp +2025-02-13T20:09:00.7508067Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/rc_binomial_heap_/rc_binomial_heap_.hpp +2025-02-13T20:09:00.7508503Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/rc_binomial_heap_/rc.hpp +2025-02-13T20:09:00.7509173Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/rc_binomial_heap_/split_join_fn_imps.hpp +2025-02-13T20:09:00.7509749Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/rc_binomial_heap_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7510232Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/rc_binomial_heap_/insert_fn_imps.hpp +2025-02-13T20:09:00.7510701Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/rc_binomial_heap_/trace_fn_imps.hpp +2025-02-13T20:09:00.7511126Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/ +2025-02-13T20:09:00.7511676Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/constructor_destructor_fn_imps.hpp +2025-02-13T20:09:00.7512149Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/find_fn_imps.hpp +2025-02-13T20:09:00.7512630Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/debug_fn_imps.hpp +2025-02-13T20:09:00.7513392Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/constructor_destructor_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7513877Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/erase_fn_imps.hpp +2025-02-13T20:09:00.7514395Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/info_fn_imps.hpp +2025-02-13T20:09:00.7514911Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/find_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7515647Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/resize_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7516185Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/find_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7516735Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/erase_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7517225Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/resize_fn_imps.hpp 
+2025-02-13T20:09:00.7517711Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/gp_ht_map_.hpp +2025-02-13T20:09:00.7518334Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/iterator_fn_imps.hpp +2025-02-13T20:09:00.7518857Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/policy_access_fn_imps.hpp +2025-02-13T20:09:00.7519378Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/erase_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7519923Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/insert_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7520492Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/insert_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7521055Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/resize_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7521674Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/insert_fn_imps.hpp +2025-02-13T20:09:00.7522163Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/trace_fn_imps.hpp +2025-02-13T20:09:00.7522683Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/debug_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7523324Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/constructor_destructor_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7524007Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/gp_hash_table_map_/debug_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7524432Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/tree_trace_base.hpp +2025-02-13T20:09:00.7524851Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_policy/ +2025-02-13T20:09:00.7525378Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_policy/sample_update_policy.hpp +2025-02-13T20:09:00.7526098Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/list_update_policy/lu_counter_metadata.hpp +2025-02-13T20:09:00.7526482Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/ +2025-02-13T20:09:00.7526932Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/find_fn_imps.hpp +2025-02-13T20:09:00.7527372Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/entry_cmp.hpp +2025-02-13T20:09:00.7527947Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/debug_fn_imps.hpp +2025-02-13T20:09:00.7528587Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/erase_fn_imps.hpp +2025-02-13T20:09:00.7529255Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/binary_heap_.hpp +2025-02-13T20:09:00.7529756Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/point_const_iterator.hpp +2025-02-13T20:09:00.7530200Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/info_fn_imps.hpp +2025-02-13T20:09:00.7530654Z 
runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/resize_policy.hpp +2025-02-13T20:09:00.7531144Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/iterators_fn_imps.hpp +2025-02-13T20:09:00.7531685Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/split_join_fn_imps.hpp +2025-02-13T20:09:00.7532143Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/entry_pred.hpp +2025-02-13T20:09:00.7532650Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/policy_access_fn_imps.hpp +2025-02-13T20:09:00.7533186Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7533657Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/insert_fn_imps.hpp +2025-02-13T20:09:00.7534105Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/trace_fn_imps.hpp +2025-02-13T20:09:00.7534569Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/binary_heap_/const_iterator.hpp +2025-02-13T20:09:00.7534982Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/debug_map_base.hpp +2025-02-13T20:09:00.7535397Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/ +2025-02-13T20:09:00.7535945Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/constructor_destructor_fn_imps.hpp +2025-02-13T20:09:00.7536592Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/cc_ht_map_.hpp +2025-02-13T20:09:00.7537067Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/find_fn_imps.hpp +2025-02-13T20:09:00.7537544Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/debug_fn_imps.hpp +2025-02-13T20:09:00.7538147Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/constructor_destructor_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7538818Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/erase_fn_imps.hpp +2025-02-13T20:09:00.7539518Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/cond_key_dtor_entry_dealtor.hpp +2025-02-13T20:09:00.7540013Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/info_fn_imps.hpp +2025-02-13T20:09:00.7540527Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/entry_list_fn_imps.hpp +2025-02-13T20:09:00.7541045Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/find_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7541581Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/resize_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7542235Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/erase_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7542725Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/resize_fn_imps.hpp +2025-02-13T20:09:00.7543390Z 
runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/iterators_fn_imps.hpp +2025-02-13T20:09:00.7544127Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/policy_access_fn_imps.hpp +2025-02-13T20:09:00.7544660Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/erase_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7545192Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/insert_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7545713Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/insert_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7546230Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/resize_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7546724Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/insert_fn_imps.hpp +2025-02-13T20:09:00.7547210Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/trace_fn_imps.hpp +2025-02-13T20:09:00.7547736Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/debug_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7548343Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/constructor_destructor_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7548817Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/size_fn_imps.hpp +2025-02-13T20:09:00.7549347Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/debug_no_store_hash_fn_imps.hpp +2025-02-13T20:09:00.7549829Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/cc_hash_table_map_/cmp_fn_imps.hpp +2025-02-13T20:09:00.7550324Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/eq_fn/ +2025-02-13T20:09:00.7550754Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/eq_fn/hash_eq_fn.hpp +2025-02-13T20:09:00.7551164Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/eq_fn/eq_by_less.hpp +2025-02-13T20:09:00.7551548Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/thin_heap_/ +2025-02-13T20:09:00.7551989Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/thin_heap_/find_fn_imps.hpp +2025-02-13T20:09:00.7552441Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/thin_heap_/debug_fn_imps.hpp +2025-02-13T20:09:00.7552935Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/thin_heap_/erase_fn_imps.hpp +2025-02-13T20:09:00.7553540Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/thin_heap_/split_join_fn_imps.hpp +2025-02-13T20:09:00.7554105Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/thin_heap_/constructors_destructor_fn_imps.hpp +2025-02-13T20:09:00.7554813Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/thin_heap_/insert_fn_imps.hpp +2025-02-13T20:09:00.7555285Z runtime/sfpi/compiler/riscv32-unknown-elf/include/c++/12.4.0/ext/pb_ds/detail/thin_heap_/trace_fn_imps.hpp +2025-02-13T20:09:00.7555731Z 
+2025-02-13T20:09:00Z [CI log excerpt: tar extraction listing of the SFPI RISC-V GCC 12.4.0 toolchain under runtime/sfpi/compiler/ — C++ standard library headers (pb_ds, parallel, pstl, debug, experimental, tr2), newlib/ssp headers, libc/libm/libstdc++/libgcc and spec files for the rv32i_xttgs, rv32im_xttbh and rv32im_xttwh ilp32 multilibs, linker scripts, binutils (as, ld, objcopy, objdump, nm, ar, strip, readelf, ranlib), compiler executables (cc1, cc1plus, lto1, collect2, lto-wrapper, g++-mapper-server), and GCC plugin headers. Full per-file listing omitted.]
runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/omp-simd-clone.h +2025-02-13T20:09:00.8791099Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/options.h +2025-02-13T20:09:00.8791541Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-match.h +2025-02-13T20:09:00.8791915Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/backend.h +2025-02-13T20:09:00.8792296Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-data-ref.h +2025-02-13T20:09:00.8792683Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/line-map.h +2025-02-13T20:09:00.8793068Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-live.h +2025-02-13T20:09:00.8793416Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/rtl-ssa.h +2025-02-13T20:09:00.8793816Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/shrink-wrap.h +2025-02-13T20:09:00.8794160Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gcse.h +2025-02-13T20:09:00.8794560Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hosthooks.h +2025-02-13T20:09:00.8795039Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/collect-utils.h +2025-02-13T20:09:00.8795410Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple.h +2025-02-13T20:09:00.8795810Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/wide-int-bitmask.h +2025-02-13T20:09:00.8796201Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/predict.def +2025-02-13T20:09:00.8796587Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/insn-constants.h +2025-02-13T20:09:00.8796956Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/output.h +2025-02-13T20:09:00.8797343Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/shortest-paths.h +2025-02-13T20:09:00.8797738Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/sparseset.h +2025-02-13T20:09:00.8798092Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/json.h +2025-02-13T20:09:00.8798505Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/toplev.h +2025-02-13T20:09:00.8798888Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/function-abi.h +2025-02-13T20:09:00.8799346Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/collect2.h +2025-02-13T20:09:00.8799776Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/incpath.h +2025-02-13T20:09:00.8800238Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/signop.h +2025-02-13T20:09:00.8800655Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/md5.h +2025-02-13T20:09:00.8801129Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/opt-problem.h +2025-02-13T20:09:00.8801497Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/configargs.h +2025-02-13T20:09:00.8801911Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/color-macros.h +2025-02-13T20:09:00.8802262Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/system.h +2025-02-13T20:09:00.8802712Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/selftest-diagnostic.h +2025-02-13T20:09:00.8803069Z 
runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hashtab.h +2025-02-13T20:09:00.8803452Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-predict.h +2025-02-13T20:09:00.8803854Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-walk.h +2025-02-13T20:09:00.8804201Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gsyms.h +2025-02-13T20:09:00.8804576Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/errors.h +2025-02-13T20:09:00.8804934Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ipa-prop.h +2025-02-13T20:09:00.8805440Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa.h +2025-02-13T20:09:00.8805818Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/bb-reorder.h +2025-02-13T20:09:00.8806206Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dwarf2out.h +2025-02-13T20:09:00.8806583Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-expr.h +2025-02-13T20:09:00.8806979Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-affine.h +2025-02-13T20:09:00.8807311Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/df.h +2025-02-13T20:09:00.8807869Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-ter.h +2025-02-13T20:09:00.8808296Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/pointer-query.h +2025-02-13T20:09:00.8808735Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tm.h +2025-02-13T20:09:00.8809175Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hwint.h +2025-02-13T20:09:00.8809770Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/coretypes.h +2025-02-13T20:09:00.8810132Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/langhooks.h +2025-02-13T20:09:00.8810516Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hash-map.h +2025-02-13T20:09:00.8810866Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/defaults.h +2025-02-13T20:09:00.8811242Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/cfgbuild.h +2025-02-13T20:09:00.8811664Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/target-hooks-macros.h +2025-02-13T20:09:00.8812112Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-loop-manip.h +2025-02-13T20:09:00.8812470Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/trans-mem.h +2025-02-13T20:09:00.8812877Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/vtable-verify.h +2025-02-13T20:09:00.8813249Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/fold-const.h +2025-02-13T20:09:00.8813584Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/intl.h +2025-02-13T20:09:00.8814013Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/optinfo-emit-json.h +2025-02-13T20:09:00.8814381Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gcse-common.h +2025-02-13T20:09:00.8814808Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/read-rtl-function.h +2025-02-13T20:09:00.8815241Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/selftest-rtl.h 
+2025-02-13T20:09:00.8815616Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/rtlhash.h +2025-02-13T20:09:00.8815981Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/omp-offload.h +2025-02-13T20:09:00.8816365Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/profile.h +2025-02-13T20:09:00.8816719Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/regset.h +2025-02-13T20:09:00.8817120Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-dce.h +2025-02-13T20:09:00.8817466Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/cfganal.h +2025-02-13T20:09:00.8817877Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/poly-int-types.h +2025-02-13T20:09:00.8818245Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-fold.h +2025-02-13T20:09:00.8818653Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-streamer.h +2025-02-13T20:09:00.8819029Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-ccp.h +2025-02-13T20:09:00.8819405Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/resource.h +2025-02-13T20:09:00.8819988Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-predicate-analysis.h +2025-02-13T20:09:00.8820439Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/optabs-query.h +2025-02-13T20:09:00.8820795Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple.def +2025-02-13T20:09:00.8821200Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ipa-predicate.h +2025-02-13T20:09:00.8821565Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/sel-sched.h +2025-02-13T20:09:00.8821953Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/insn-addr.h +2025-02-13T20:09:00.8822341Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/builtin-types.def +2025-02-13T20:09:00.8822765Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/lto-section-names.h +2025-02-13T20:09:00.8823145Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/fibonacci_heap.h +2025-02-13T20:09:00.8823632Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/input.h +2025-02-13T20:09:00.8823977Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/rtlanal.h +2025-02-13T20:09:00.8824411Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-pretty-print.h +2025-02-13T20:09:00.8824741Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dce.h +2025-02-13T20:09:00.8825127Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/mem-stats.h +2025-02-13T20:09:00.8825468Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/genrtl.h +2025-02-13T20:09:00.8825811Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/expmed.h +2025-02-13T20:09:00.8826243Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-operands.h +2025-02-13T20:09:00.8826626Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-diagnostic.h +2025-02-13T20:09:00.8827052Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ipa-icf-gimple.h +2025-02-13T20:09:00.8827426Z 
runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-range.h +2025-02-13T20:09:00.8827846Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/diagnostic-color.h +2025-02-13T20:09:00.8828238Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/vec-perm-indices.h +2025-02-13T20:09:00.8828628Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hash-traits.h +2025-02-13T20:09:00.8829014Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/builtin-attrs.def +2025-02-13T20:09:00.8829445Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-range-cache.h +2025-02-13T20:09:00.8829783Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/expr.h +2025-02-13T20:09:00.8830248Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-switch-conversion.h +2025-02-13T20:09:00.8830639Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/lower-subreg.h +2025-02-13T20:09:00.8831017Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hash-set.h +2025-02-13T20:09:00.8831449Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/highlev-plugin-common.h +2025-02-13T20:09:00.8831815Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ubsan.h +2025-02-13T20:09:00.8832167Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/poly-int.h +2025-02-13T20:09:00.8832549Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/regcprop.h +2025-02-13T20:09:00.8832921Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hard-reg-set.h +2025-02-13T20:09:00.8833309Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/print-rtl.h +2025-02-13T20:09:00.8833807Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-nested.h +2025-02-13T20:09:00.8834182Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/conditions.h +2025-02-13T20:09:00.8834546Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/xcoff.h +2025-02-13T20:09:00.8834967Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/loop-unroll.h +2025-02-13T20:09:00.8835350Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/plugin-version.h +2025-02-13T20:09:00.8835718Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/target.h +2025-02-13T20:09:00.8836056Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/align.h +2025-02-13T20:09:00.8836419Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/recog.h +2025-02-13T20:09:00.8836764Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/cfgloop.h +2025-02-13T20:09:00.8837107Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/objc/ +2025-02-13T20:09:00.8837648Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/objc/objc-tree.def +2025-02-13T20:09:00.8838000Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/memmodel.h +2025-02-13T20:09:00.8838405Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-outof-ssa.h +2025-02-13T20:09:00.8838732Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ira.h +2025-02-13T20:09:00.8839142Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/opt-suggestions.h 
+2025-02-13T20:09:00.8839523Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-builder.h +2025-02-13T20:09:00.8839908Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/all-tree.def +2025-02-13T20:09:00.8840279Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/edit-context.h +2025-02-13T20:09:00.8840712Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/insn-modes-inline.h +2025-02-13T20:09:00.8841067Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/timevar.h +2025-02-13T20:09:00.8841442Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/obstack.h +2025-02-13T20:09:00.8841804Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/auto-host.h +2025-02-13T20:09:00.8842215Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/target-insns.def +2025-02-13T20:09:00.8842583Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/diagnostic.h +2025-02-13T20:09:00.8843001Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/optabs-libfuncs.h +2025-02-13T20:09:00.8843357Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/emit-rtl.h +2025-02-13T20:09:00.8843746Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dominance.h +2025-02-13T20:09:00.8844161Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-range-gori.h +2025-02-13T20:09:00.8844549Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-cfg.h +2025-02-13T20:09:00.8844916Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/cppdefault.h +2025-02-13T20:09:00.8845282Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-low.h +2025-02-13T20:09:00.8845706Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-address.h +2025-02-13T20:09:00.8846059Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/optabs.def +2025-02-13T20:09:00.8846433Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/optinfo.h +2025-02-13T20:09:00.8846778Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/optabs.h +2025-02-13T20:09:00.8847167Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/timevar.def +2025-02-13T20:09:00.8847865Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/regs.h +2025-02-13T20:09:00.8848273Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/context.h +2025-02-13T20:09:00.8848670Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/fold-const-call.h +2025-02-13T20:09:00.8849067Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/cfgcleanup.h +2025-02-13T20:09:00.8849413Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/graph.h +2025-02-13T20:09:00.8849783Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/calls.h +2025-02-13T20:09:00.8850131Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-eh.h +2025-02-13T20:09:00.8850508Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/lra-int.h +2025-02-13T20:09:00.8850868Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dwarf2asm.h +2025-02-13T20:09:00.8851283Z 
runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/collect2-aix.h +2025-02-13T20:09:00.8851851Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/substring-locations.h +2025-02-13T20:09:00.8852215Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/real.h +2025-02-13T20:09:00.8852600Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hash-map-traits.h +2025-02-13T20:09:00.8853018Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/file-prefix-map.h +2025-02-13T20:09:00.8853436Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gcov-counter.def +2025-02-13T20:09:00.8853813Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ipa-icf.h +2025-02-13T20:09:00.8854185Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gensupport.h +2025-02-13T20:09:00.8854543Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dfp.h +2025-02-13T20:09:00.8854898Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/cfgrtl.h +2025-02-13T20:09:00.8855346Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/omp-low.h +2025-02-13T20:09:00.8855689Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/convert.h +2025-02-13T20:09:00.8856043Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ssa.h +2025-02-13T20:09:00.8856393Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/target.def +2025-02-13T20:09:00.8856734Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/symtab.h +2025-02-13T20:09:00.8857110Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/version.h +2025-02-13T20:09:00.8857487Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ipa-reference.h +2025-02-13T20:09:00.8857900Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-strlen.h +2025-02-13T20:09:00.8858265Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-dump.h +2025-02-13T20:09:00.8858628Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ddg.h +2025-02-13T20:09:00.8858970Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tracer.h +2025-02-13T20:09:00.8859331Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/is-a.h +2025-02-13T20:09:00.8859711Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gomp-constants.h +2025-02-13T20:09:00.8860114Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/insn-notes.def +2025-02-13T20:09:00.8860471Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/sched-int.h +2025-02-13T20:09:00.8860841Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/prefix.h +2025-02-13T20:09:00.8861200Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/spellcheck.h +2025-02-13T20:09:00.8861589Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/cfg-flags.def +2025-02-13T20:09:00.8862094Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-parloops.h +2025-02-13T20:09:00.8862479Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/insn-attr.h +2025-02-13T20:09:00.8862840Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ipa-modref.h +2025-02-13T20:09:00.8863239Z 
runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/treestruct.def +2025-02-13T20:09:00.8863648Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-range-trace.h +2025-02-13T20:09:00.8864033Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-core.h +2025-02-13T20:09:00.8864388Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/addresses.h +2025-02-13T20:09:00.8864809Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/coroutine-builtins.def +2025-02-13T20:09:00.8865207Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dwarf2ctf.h +2025-02-13T20:09:00.8865682Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-chrec.h +2025-02-13T20:09:00.8866075Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/stor-layout.h +2025-02-13T20:09:00.8866435Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/safe-ctype.h +2025-02-13T20:09:00.8866845Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/symbol-summary.h +2025-02-13T20:09:00.8867212Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-inline.h +2025-02-13T20:09:00.8867592Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dumpfile.h +2025-02-13T20:09:00.8867967Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-phinodes.h +2025-02-13T20:09:00.8868356Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/omp-expand.h +2025-02-13T20:09:00.8868779Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/diagnostic-event-id.h +2025-02-13T20:09:00.8869178Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/machmode.def +2025-02-13T20:09:00.8869532Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/file-find.h +2025-02-13T20:09:00.8869928Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/memory-block.h +2025-02-13T20:09:00.8870295Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/b-header-vars +2025-02-13T20:09:00.8870671Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/passes.def +2025-02-13T20:09:00.8871015Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/sbitmap.h +2025-02-13T20:09:00.8871429Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/diagnostic-spec.h +2025-02-13T20:09:00.8871818Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/cif-code.def +2025-02-13T20:09:00.8872224Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/obstack-utils.h +2025-02-13T20:09:00.8872583Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/builtins.h +2025-02-13T20:09:00.8872964Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimplify.h +2025-02-13T20:09:00.8873306Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dbgcnt.h +2025-02-13T20:09:00.8873767Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ipa-param-manipulation.h +2025-02-13T20:09:00.8874116Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/sreal.h +2025-02-13T20:09:00.8874526Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/omp-builtins.def +2025-02-13T20:09:00.8874899Z 
runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/auto-profile.h +2025-02-13T20:09:00.8875314Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/opts-diagnostic.h +2025-02-13T20:09:00.8875791Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/mux-utils.h +2025-02-13T20:09:00.8876165Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ipa-inline.h +2025-02-13T20:09:00.8876541Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ansidecl.h +2025-02-13T20:09:00.8876904Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/insn-flags.h +2025-02-13T20:09:00.8877268Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gstab.h +2025-02-13T20:09:00.8877639Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ggc-internal.h +2025-02-13T20:09:00.8878057Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/splay-tree-utils.h +2025-02-13T20:09:00.8878425Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/basic-block.h +2025-02-13T20:09:00.8878805Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hw-doloop.h +2025-02-13T20:09:00.8879229Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-pretty-print.h +2025-02-13T20:09:00.8879714Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/limitx.h +2025-02-13T20:09:00.8880138Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-scopedtables.h +2025-02-13T20:09:00.8880561Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-hash-traits.h +2025-02-13T20:09:00.8880943Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/sel-sched-dump.h +2025-02-13T20:09:00.8881356Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/diagnostic-url.h +2025-02-13T20:09:00.8881713Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/ipa-utils.h +2025-02-13T20:09:00.8882079Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dojump.h +2025-02-13T20:09:00.8882477Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gcc-rich-location.h +2025-02-13T20:09:00.8882880Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/internal-fn.h +2025-02-13T20:09:00.8883233Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/digraph.h +2025-02-13T20:09:00.8883639Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/vector-builder.h +2025-02-13T20:09:00.8884011Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/rtlhooks-def.h +2025-02-13T20:09:00.8884377Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/plugin.h +2025-02-13T20:09:00.8886607Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/dbxout.h +2025-02-13T20:09:00.8886986Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/pretty-print.h +2025-02-13T20:09:00.8887417Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gimple-range-fold.h +2025-02-13T20:09:00.8887901Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/wide-int.h +2025-02-13T20:09:00.8888334Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/insn-modes.h +2025-02-13T20:09:00.8888724Z 
runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-streamer.h +2025-02-13T20:09:00.8889135Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/hosthooks-def.h +2025-02-13T20:09:00.8889516Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssanames.h +2025-02-13T20:09:00.8889895Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/read-md.h +2025-02-13T20:09:00.8890264Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/gcc-symtab.h +2025-02-13T20:09:00.8890668Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/lto-streamer.h +2025-02-13T20:09:00.8891036Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/alloc-pool.h +2025-02-13T20:09:00.8891447Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/run-rtl-passes.h +2025-02-13T20:09:00.8892058Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-object-size.h +2025-02-13T20:09:00.8892447Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/rtl-iter.h +2025-02-13T20:09:00.8892795Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/graphds.h +2025-02-13T20:09:00.8893182Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/filenames.h +2025-02-13T20:09:00.8893596Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/tree-ssa-threadedge.h +2025-02-13T20:09:00.8893979Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/targhooks.h +2025-02-13T20:09:00.8894335Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/glimits.h +2025-02-13T20:09:00.8894700Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/include/stab.def +2025-02-13T20:09:00.8895103Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/libcc1plugin.so +2025-02-13T20:09:00.8895671Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/libcp1plugin.so.0 +2025-02-13T20:09:00.8896022Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/libcc1plugin.so.0 +2025-02-13T20:09:00.8896395Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/libcp1plugin.la +2025-02-13T20:09:00.8896772Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/libcc1plugin.so.0.0.0 +2025-02-13T20:09:00.8897142Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/libcp1plugin.so.0.0.0 +2025-02-13T20:09:00.8897515Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/libcp1plugin.so +2025-02-13T20:09:00.8897856Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/libcc1plugin.la +2025-02-13T20:09:00.8898206Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/plugin/gtype.state +2025-02-13T20:09:00.8898509Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttbh/ +2025-02-13T20:09:00.8898874Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttbh/ilp32/ +2025-02-13T20:09:00.8899237Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttbh/ilp32/crtn.o +2025-02-13T20:09:00.8899614Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttbh/ilp32/crti.o +2025-02-13T20:09:00.8899997Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttbh/ilp32/libgcov.a +2025-02-13T20:09:00.8900449Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttbh/ilp32/crtbegin.o +2025-02-13T20:09:00.8900811Z 
runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttbh/ilp32/libgcc.a +2025-02-13T20:09:00.8919860Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttbh/ilp32/crtend.o +2025-02-13T20:09:00.8920184Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttwh/ +2025-02-13T20:09:00.8920536Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttwh/ilp32/ +2025-02-13T20:09:00.8920900Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttwh/ilp32/crtn.o +2025-02-13T20:09:00.8921290Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttwh/ilp32/crti.o +2025-02-13T20:09:00.8921653Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttwh/ilp32/libgcov.a +2025-02-13T20:09:00.8924918Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttwh/ilp32/crtbegin.o +2025-02-13T20:09:00.8925291Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttwh/ilp32/libgcc.a +2025-02-13T20:09:00.8949366Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/rv32im_xttwh/ilp32/crtend.o +2025-02-13T20:09:00.8949664Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/libgcc.a +2025-02-13T20:09:00.8976449Z runtime/sfpi/compiler/lib/gcc/riscv32-unknown-elf/12.4.0/crtend.o +2025-02-13T20:09:00.8976636Z runtime/sfpi/compiler/lib/libcc1.so.0 +2025-02-13T20:09:00.8976828Z runtime/sfpi/compiler/lib/libcc1.la +2025-02-13T20:09:00.8977002Z runtime/sfpi/compiler/lib/bfd-plugins/ +2025-02-13T20:09:00.8977429Z runtime/sfpi/compiler/lib/bfd-plugins/libdep.so +2025-02-13T20:09:00.8977608Z runtime/sfpi/compiler/bin/ +2025-02-13T20:09:00.8977844Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-addr2line +2025-02-13T20:09:00.8988666Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-size +2025-02-13T20:09:00.8999475Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-nm +2025-02-13T20:09:00.9000200Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-gcc +2025-02-13T20:09:00.9024137Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-readelf +2025-02-13T20:09:00.9024429Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-ranlib +2025-02-13T20:09:00.9024993Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-gcov-tool +2025-02-13T20:09:00.9031497Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-gcov +2025-02-13T20:09:00.9040491Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-objcopy +2025-02-13T20:09:00.9041078Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-gcc-ar +2025-02-13T20:09:00.9041316Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-elfedit +2025-02-13T20:09:00.9041551Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-gprof +2025-02-13T20:09:00.9053646Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-lto-dump +2025-02-13T20:09:00.9282473Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-gcc-ranlib +2025-02-13T20:09:00.9283187Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-c++ +2025-02-13T20:09:00.9308547Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-as +2025-02-13T20:09:00.9309177Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-ld +2025-02-13T20:09:00.9309460Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-strings +2025-02-13T20:09:00.9319426Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-strip +2025-02-13T20:09:00.9320209Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-cpp +2025-02-13T20:09:00.9359281Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-objdump +2025-02-13T20:09:00.9359627Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-ld.bfd +2025-02-13T20:09:00.9359862Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-c++filt 
+2025-02-13T20:09:00.9360159Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-gcov-dump +2025-02-13T20:09:00.9362076Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-gcc-12.4.0 +2025-02-13T20:09:00.9362346Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-ar +2025-02-13T20:09:00.9362583Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-gcc-nm +2025-02-13T20:09:00.9362832Z runtime/sfpi/compiler/bin/riscv32-unknown-elf-g++ +2025-02-13T20:09:00.9362992Z runtime/sfpi/compiler/share/ +2025-02-13T20:09:00.9363168Z runtime/sfpi/compiler/share/man/ +2025-02-13T20:09:00.9363368Z runtime/sfpi/compiler/share/man/man7/ +2025-02-13T20:09:00.9363572Z runtime/sfpi/compiler/share/man/man7/fsf-funding.7 +2025-02-13T20:09:00.9363792Z runtime/sfpi/compiler/share/man/man7/gfdl.7 +2025-02-13T20:09:00.9364010Z runtime/sfpi/compiler/share/man/man7/gpl.7 +2025-02-13T20:09:00.9364671Z runtime/sfpi/compiler/share/man/man1/ +2025-02-13T20:09:00.9365436Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-ar.1 +2025-02-13T20:09:00.9365750Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-readelf.1 +2025-02-13T20:09:00.9366097Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-strings.1 +2025-02-13T20:09:00.9366381Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-objdump.1 +2025-02-13T20:09:00.9367086Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-addr2line.1 +2025-02-13T20:09:00.9367352Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-gcc.1 +2025-02-13T20:09:00.9381292Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-dlltool.1 +2025-02-13T20:09:00.9382274Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-windmc.1 +2025-02-13T20:09:00.9382837Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-c++filt.1 +2025-02-13T20:09:00.9383125Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-size.1 +2025-02-13T20:09:00.9383388Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-as.1 +2025-02-13T20:09:00.9383669Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-lto-dump.1 +2025-02-13T20:09:00.9384249Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-strip.1 +2025-02-13T20:09:00.9384584Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-g++.1 +2025-02-13T20:09:00.9398681Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-windres.1 +2025-02-13T20:09:00.9398966Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-gcov.1 +2025-02-13T20:09:00.9399754Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-gcov-dump.1 +2025-02-13T20:09:00.9400094Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-objcopy.1 +2025-02-13T20:09:00.9400413Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-cpp.1 +2025-02-13T20:09:00.9401110Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-nm.1 +2025-02-13T20:09:00.9401418Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-gprof.1 +2025-02-13T20:09:00.9401738Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-elfedit.1 +2025-02-13T20:09:00.9402415Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-ranlib.1 +2025-02-13T20:09:00.9402775Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-ld.1 +2025-02-13T20:09:00.9404424Z runtime/sfpi/compiler/share/man/man1/riscv32-unknown-elf-gcov-tool.1 +2025-02-13T20:09:00.9404642Z runtime/sfpi/compiler/share/gcc-12.4.0/ +2025-02-13T20:09:00.9404849Z runtime/sfpi/compiler/share/gcc-12.4.0/python/ +2025-02-13T20:09:00.9405118Z runtime/sfpi/compiler/share/gcc-12.4.0/python/libstdcxx/ +2025-02-13T20:09:00.9405376Z 
runtime/sfpi/compiler/share/gcc-12.4.0/python/libstdcxx/v6/ +2025-02-13T20:09:00.9405718Z runtime/sfpi/compiler/share/gcc-12.4.0/python/libstdcxx/v6/xmethods.py +2025-02-13T20:09:00.9406031Z runtime/sfpi/compiler/share/gcc-12.4.0/python/libstdcxx/v6/printers.py +2025-02-13T20:09:00.9406781Z runtime/sfpi/compiler/share/gcc-12.4.0/python/libstdcxx/v6/__init__.py +2025-02-13T20:09:00.9407089Z runtime/sfpi/compiler/share/gcc-12.4.0/python/libstdcxx/__init__.py +2025-02-13T20:09:00.9407290Z runtime/sfpi/compiler/share/info/ +2025-02-13T20:09:00.9407496Z runtime/sfpi/compiler/share/info/gccinstall.info +2025-02-13T20:09:00.9410186Z runtime/sfpi/compiler/share/info/cpp.info +2025-02-13T20:09:00.9412626Z runtime/sfpi/compiler/share/info/gccint.info +2025-02-13T20:09:00.9437986Z runtime/sfpi/compiler/share/info/gprof.info +2025-02-13T20:09:00.9439157Z runtime/sfpi/compiler/share/info/dir +2025-02-13T20:09:00.9439786Z runtime/sfpi/compiler/share/info/as.info +2025-02-13T20:09:00.9451736Z runtime/sfpi/compiler/share/info/cppinternals.info +2025-02-13T20:09:00.9452299Z runtime/sfpi/compiler/share/info/gcc.info +2025-02-13T20:09:00.9481218Z runtime/sfpi/compiler/share/info/ld.info +2025-02-13T20:09:00.9485722Z runtime/sfpi/compiler/share/info/bfd.info +2025-02-13T20:09:00.9492529Z runtime/sfpi/compiler/share/info/binutils.info +2025-02-13T20:09:00.9495099Z runtime/sfpi/compiler/share/info/ctf-spec.info +2025-02-13T20:09:00.9496478Z runtime/sfpi/compiler/share/locale/ +2025-02-13T20:09:00.9496689Z runtime/sfpi/compiler/share/locale/zh_CN/ +2025-02-13T20:09:00.9496914Z runtime/sfpi/compiler/share/locale/zh_CN/LC_MESSAGES/ +2025-02-13T20:09:00.9497122Z runtime/sfpi/compiler/share/locale/eo/ +2025-02-13T20:09:00.9497345Z runtime/sfpi/compiler/share/locale/eo/LC_MESSAGES/ +2025-02-13T20:09:00.9497507Z runtime/sfpi/compiler/share/locale/vi/ +2025-02-13T20:09:00.9497687Z runtime/sfpi/compiler/share/locale/vi/LC_MESSAGES/ +2025-02-13T20:09:00.9497870Z runtime/sfpi/compiler/share/locale/ru/ +2025-02-13T20:09:00.9498074Z runtime/sfpi/compiler/share/locale/ru/LC_MESSAGES/ +2025-02-13T20:09:00.9498256Z runtime/sfpi/compiler/share/locale/pt/ +2025-02-13T20:09:00.9498447Z runtime/sfpi/compiler/share/locale/pt/LC_MESSAGES/ +2025-02-13T20:09:00.9498625Z runtime/sfpi/compiler/share/locale/bg/ +2025-02-13T20:09:00.9498815Z runtime/sfpi/compiler/share/locale/bg/LC_MESSAGES/ +2025-02-13T20:09:00.9499014Z runtime/sfpi/compiler/share/locale/hu/ +2025-02-13T20:09:00.9499204Z runtime/sfpi/compiler/share/locale/hu/LC_MESSAGES/ +2025-02-13T20:09:00.9499361Z runtime/sfpi/compiler/share/locale/de/ +2025-02-13T20:09:00.9499552Z runtime/sfpi/compiler/share/locale/de/LC_MESSAGES/ +2025-02-13T20:09:00.9499926Z runtime/sfpi/compiler/share/locale/sr/ +2025-02-13T20:09:00.9500169Z runtime/sfpi/compiler/share/locale/sr/LC_MESSAGES/ +2025-02-13T20:09:00.9500327Z runtime/sfpi/compiler/share/locale/ro/ +2025-02-13T20:09:00.9500541Z runtime/sfpi/compiler/share/locale/ro/LC_MESSAGES/ +2025-02-13T20:09:00.9500698Z runtime/sfpi/compiler/share/locale/fr/ +2025-02-13T20:09:00.9500931Z runtime/sfpi/compiler/share/locale/fr/LC_MESSAGES/ +2025-02-13T20:09:00.9501092Z runtime/sfpi/compiler/share/locale/uk/ +2025-02-13T20:09:00.9501303Z runtime/sfpi/compiler/share/locale/uk/LC_MESSAGES/ +2025-02-13T20:09:00.9501470Z runtime/sfpi/compiler/share/locale/fi/ +2025-02-13T20:09:00.9501681Z runtime/sfpi/compiler/share/locale/fi/LC_MESSAGES/ +2025-02-13T20:09:00.9501841Z runtime/sfpi/compiler/share/locale/id/ +2025-02-13T20:09:00.9502070Z 
+2025-02-13T20:09:00.9556692Z Prepare all required actions
+2025-02-13T20:09:00.9556917Z Getting action download info
+2025-02-13T20:09:01.0815935Z Download action repository 'getsentry/action-setup-venv@v2.1.1' (SHA:3a832a9604b3e1a4202ae559248f26867b467cc7)
+2025-02-13T20:09:01.4289665Z Getting action download info
+2025-02-13T20:09:01.5847761Z Download action repository 'actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c' (SHA:0a5c61591373683505ea898e09a3ea4f39ef2b9c)
+2025-02-13T20:09:02.3147540Z Download action repository 'actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57' (SHA:1bd1e32a3bdc45362d1e726936510720a7c30a57)
+2025-02-13T20:09:03.0834393Z ##[group]Run ./.github/actions/install-python-deps
+2025-02-13T20:09:03.0834846Z with:
+2025-02-13T20:09:03.0835145Z python-version: 3.8
+2025-02-13T20:09:03.0835468Z env:
+2025-02-13T20:09:03.0835966Z ARCH_NAME: wormhole_b0
+2025-02-13T20:09:03.0836340Z LOGURU_LEVEL: INFO
+2025-02-13T20:09:03.0836657Z ##[endgroup]
+2025-02-13T20:09:03.0900434Z ##[group]Run getsentry/action-setup-venv@v2.1.1
+2025-02-13T20:09:03.0900860Z with:
+2025-02-13T20:09:03.0901146Z python-version: 3.8
+2025-02-13T20:09:03.0901632Z venv-dir: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env
+2025-02-13T20:09:03.0902814Z cache-dependency-path: tt_metal/python_env/requirements-dev.txt
+docs/requirements-docs.txt
+tests/sweep_framework/requirements-sweeps.txt
+pyproject.toml
+create_venv.sh
+
+2025-02-13T20:09:03.0903843Z install-cmd: ./create_venv.sh
+2025-02-13T20:09:03.0904184Z env:
+2025-02-13T20:09:03.0904454Z ARCH_NAME: wormhole_b0
+2025-02-13T20:09:03.0904781Z LOGURU_LEVEL: INFO
+2025-02-13T20:09:03.0905091Z ##[endgroup]
+2025-02-13T20:09:03.0962776Z ##[group]Run actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c
+2025-02-13T20:09:03.0963302Z with:
+2025-02-13T20:09:03.0963595Z python-version: 3.8
+2025-02-13T20:09:03.0964138Z check-latest: false
+2025-02-13T20:09:03.0964575Z token: ***
+2025-02-13T20:09:03.0964876Z update-environment: true
+2025-02-13T20:09:03.0965234Z allow-prereleases: false
+2025-02-13T20:09:03.0965550Z env:
+2025-02-13T20:09:03.0965820Z ARCH_NAME: wormhole_b0
+2025-02-13T20:09:03.0966146Z LOGURU_LEVEL: INFO
+2025-02-13T20:09:03.0966451Z ##[endgroup]
+2025-02-13T20:09:03.2958261Z ##[group]Installed versions
+2025-02-13T20:09:03.3011667Z Successfully set up CPython (3.8.18)
+2025-02-13T20:09:03.3012447Z ##[endgroup]
+2025-02-13T20:09:03.3150208Z ##[group]Run echo '::remove-matcher owner=python::'
+2025-02-13T20:09:03.3150788Z echo '::remove-matcher owner=python::'
+2025-02-13T20:09:03.3176544Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
+2025-02-13T20:09:03.3177076Z env:
+2025-02-13T20:09:03.3177404Z ARCH_NAME: wormhole_b0
+2025-02-13T20:09:03.3177780Z LOGURU_LEVEL: INFO
+2025-02-13T20:09:03.3178369Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:03.3179240Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig
+2025-02-13T20:09:03.3180121Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:03.3180893Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:03.3181671Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:03.3182443Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib
+2025-02-13T20:09:03.3183028Z ##[endgroup]
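The group headers above come from the repository's local composite action ./.github/actions/install-python-deps, whose definition is not part of this log. As a hedged reconstruction of how the inputs printed above could be wired together (the file layout, input plumbing, and the ${{ github.workspace }} expression are assumptions, not taken from the log), the action metadata might look roughly like:

    # Hypothetical sketch of .github/actions/install-python-deps/action.yml,
    # reconstructed only from the inputs visible in the log; the real file may differ.
    name: Install Python deps
    inputs:
      python-version:
        required: true
    runs:
      using: composite
      steps:
        - uses: getsentry/action-setup-venv@v2.1.1
          with:
            python-version: ${{ inputs.python-version }}
            venv-dir: ${{ github.workspace }}/python_env
            cache-dependency-path: |
              tt_metal/python_env/requirements-dev.txt
              docs/requirements-docs.txt
              tests/sweep_framework/requirements-sweeps.txt
              pyproject.toml
              create_venv.sh
            install-cmd: ./create_venv.sh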
+2025-02-13T20:09:03.3985717Z ##[group]Run actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57
+2025-02-13T20:09:03.3986226Z with:
+2025-02-13T20:09:03.3986744Z path: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env
+2025-02-13T20:09:03.3988035Z key: setup-venv-Linux-py-3.8.18-/home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/bin/python-509e0fbc74e4697ea036d8e6b4ed76321c253e4ffef8468c11ee556fb8e370e2-./create_venv.sh
+2025-02-13T20:09:03.3989238Z enableCrossOsArchive: false
+2025-02-13T20:09:03.3989646Z fail-on-cache-miss: false
+2025-02-13T20:09:03.3990020Z lookup-only: false
+2025-02-13T20:09:03.3990352Z save-always: false
+2025-02-13T20:09:03.3990680Z env:
+2025-02-13T20:09:03.3990970Z ARCH_NAME: wormhole_b0
+2025-02-13T20:09:03.4011891Z LOGURU_LEVEL: INFO
+2025-02-13T20:09:03.4012636Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:03.4013533Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig
+2025-02-13T20:09:03.4014349Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:03.4015102Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:03.4015903Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:03.4016933Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib
+2025-02-13T20:09:03.4017548Z ##[endgroup]
+2025-02-13T20:09:03.7450799Z Cache hit for: setup-venv-Linux-py-3.8.18-/home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/bin/python-509e0fbc74e4697ea036d8e6b4ed76321c253e4ffef8468c11ee556fb8e370e2-./create_venv.sh
+2025-02-13T20:09:04.9552946Z Received 46137344 of 652437919 (7.1%), 43.9 MBs/sec
+2025-02-13T20:09:05.9552938Z Received 150994944 of 652437919 (23.1%), 71.9 MBs/sec
+2025-02-13T20:09:06.9559437Z Received 268435456 of 652437919 (41.1%), 85.2 MBs/sec
+2025-02-13T20:09:07.9565608Z Received 390070272 of 652437919 (59.8%), 92.9 MBs/sec
+2025-02-13T20:09:08.9568768Z Received 473956352 of 652437919 (72.6%), 90.3 MBs/sec
+2025-02-13T20:09:09.9564012Z Received 536870912 of 652437919 (82.3%), 85.3 MBs/sec
+2025-02-13T20:09:10.9575176Z Received 644049311 of 652437919 (98.7%), 87.7 MBs/sec
+2025-02-13T20:09:11.1707264Z Received 652437919 of 652437919 (100.0%), 86.2 MBs/sec
+2025-02-13T20:09:11.1712423Z Cache Size: ~622 MB (652437919 B)
+2025-02-13T20:09:11.1761294Z [command]/usr/bin/tar -xf /home/ubuntu/actions-runner/_work/_temp/ec88795b-dcf9-4708-b4c9-80bff2c14bbb/cache.tgz -P -C /home/ubuntu/actions-runner/_work/tt-metal/tt-metal -z
+2025-02-13T20:09:27.6307061Z Cache restored successfully
+2025-02-13T20:09:27.7249961Z Cache restored from key: setup-venv-Linux-py-3.8.18-/home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/bin/python-509e0fbc74e4697ea036d8e6b4ed76321c253e4ffef8468c11ee556fb8e370e2-./create_venv.sh
+2025-02-13T20:09:27.7540807Z ##[group]Run source /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env/bin/activate
+2025-02-13T20:09:27.7541770Z source /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env/bin/activate
+2025-02-13T20:09:27.7542491Z echo "VIRTUAL_ENV=${VIRTUAL_ENV}" >> $GITHUB_ENV
+2025-02-13T20:09:27.7543000Z echo "${VIRTUAL_ENV}/bin" >> $GITHUB_PATH
+2025-02-13T20:09:27.7565238Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}
+2025-02-13T20:09:27.7565731Z env:
+2025-02-13T20:09:27.7566075Z ARCH_NAME: wormhole_b0
+2025-02-13T20:09:27.7566440Z LOGURU_LEVEL: INFO
+2025-02-13T20:09:27.7566989Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:27.7568016Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig
+2025-02-13T20:09:27.7568796Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:27.7569508Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:27.7570228Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64
+2025-02-13T20:09:27.7570961Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib
+2025-02-13T20:09:27.7571603Z ##[endgroup]
+2025-02-13T20:09:27.7750339Z Prepare all required actions
+2025-02-13T20:09:27.7750829Z Getting action download info
+2025-02-13T20:09:27.9338580Z Download action repository 'docker/login-action@v3' (SHA:9780b0c442fbb1117ed29e0efdff1e18412f7567)
+2025-02-13T20:09:28.7131087Z Download action repository 'tenstorrent/docker-run-action@v5' (SHA:f939ca6b256fc7d5c78538d8af38b00a287e3415)
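The restored cache key above appears to combine the OS, the resolved Python version and interpreter path, a single SHA-256 digest, and the install command. How getsentry/action-setup-venv actually derives that digest is not shown in this log; purely as an illustration of the pattern, a plain actions/cache step could produce a key of the same shape by hashing the files listed in cache-dependency-path (the expressions below are assumptions, not the action's source):

    # Illustrative only: a stock actions/cache step with a key of the same shape
    # as the one restored above (assumed to hash the cache-dependency-path files).
    - uses: actions/cache@v4
      with:
        path: ${{ github.workspace }}/python_env
        key: setup-venv-${{ runner.os }}-py-3.8.18-${{ env.pythonLocation }}/bin/python-${{ hashFiles('tt_metal/python_env/requirements-dev.txt', 'docs/requirements-docs.txt', 'tests/sweep_framework/requirements-sweeps.txt', 'pyproject.toml', 'create_venv.sh') }}-./create_venv.sh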
(SHA:f939ca6b256fc7d5c78538d8af38b00a287e3415) +2025-02-13T20:09:29.1036517Z ##[group]Run ./.github/actions/docker-run +2025-02-13T20:09:29.1036951Z with: +2025-02-13T20:09:29.1037322Z docker_os_arch: tt-metalium/ubuntu-20.04-amd64 +2025-02-13T20:09:29.1038005Z docker_password: *** +2025-02-13T20:09:29.1039243Z docker_opts: -e TT_METAL_HOME=/home/ubuntu/actions-runner/_work/tt-metal/tt-metal +-e ARCH_NAME=wormhole_b0 +-e LD_LIBRARY_PATH=/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/build/lib +-e GTEST_OUTPUT=xml:generated/test_reports/ + +2025-02-13T20:09:29.1041217Z run_args: pip install --force-reinstall pip==21.2.4 +pip install -r tt_metal/python_env/requirements-dev.txt +pip install -e . +mkdir -p generated/test_reports +./tests/scripts/run_tools_tests.sh + +2025-02-13T20:09:29.1042344Z docker_username: sagarwalTT +2025-02-13T20:09:29.1042886Z device: -v /dev/hugepages-1G:/dev/hugepages-1G +--device /dev/tenstorrent + +2025-02-13T20:09:29.1043468Z install_wheel: false +2025-02-13T20:09:29.1043784Z env: +2025-02-13T20:09:29.1044083Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:29.1044430Z LOGURU_LEVEL: INFO +2025-02-13T20:09:29.1044942Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:29.1045738Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:09:29.1046506Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:29.1047211Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:29.1048228Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:29.1048965Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:09:29.1049689Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:09:29.1050225Z ##[endgroup] +2025-02-13T20:09:29.1076952Z ##[group]Build container for action use: '/home/ubuntu/actions-runner/_work/_actions/tenstorrent/docker-run-action/v5/Dockerfile'. 
+2025-02-13T20:09:29.1124242Z ##[command]/usr/bin/docker build -t e8ee94:a42ea301cca041f39695332b6ab5e9e4 -f "/home/ubuntu/actions-runner/_work/_actions/tenstorrent/docker-run-action/v5/Dockerfile" "/home/ubuntu/actions-runner/_work/_actions/tenstorrent/docker-run-action/v5" +2025-02-13T20:09:29.4675539Z #0 building with "default" instance using docker driver +2025-02-13T20:09:29.4675996Z +2025-02-13T20:09:29.4676195Z #1 [internal] load build definition from Dockerfile +2025-02-13T20:09:29.4676677Z #1 transferring dockerfile: 171B done +2025-02-13T20:09:29.4677075Z #1 DONE 0.0s +2025-02-13T20:09:29.4677260Z +2025-02-13T20:09:29.4677578Z #2 [internal] load metadata for public.ecr.aws/docker/library/docker:20.10 +2025-02-13T20:09:29.8379830Z #2 DONE 0.5s +2025-02-13T20:09:29.8713998Z +2025-02-13T20:09:29.8715286Z #3 [internal] load .dockerignore +2025-02-13T20:09:29.8715805Z #3 transferring context: 2B done +2025-02-13T20:09:29.8716425Z #3 DONE 0.0s +2025-02-13T20:09:29.8716611Z +2025-02-13T20:09:29.8717217Z #4 [1/3] FROM public.ecr.aws/docker/library/docker:20.10@sha256:2967f0819c84dd589ed0a023b9d25dcfe7a3c123d5bf784ffbb77edf55335f0c +2025-02-13T20:09:29.8717982Z #4 DONE 0.0s +2025-02-13T20:09:29.8718155Z +2025-02-13T20:09:29.8718293Z #5 [internal] load build context +2025-02-13T20:09:29.8718670Z #5 transferring context: 35B done +2025-02-13T20:09:29.8719026Z #5 DONE 0.0s +2025-02-13T20:09:29.8719188Z +2025-02-13T20:09:29.8719323Z #6 [2/3] RUN apk add bash +2025-02-13T20:09:29.8719643Z #6 CACHED +2025-02-13T20:09:29.8719796Z +2025-02-13T20:09:29.8719958Z #7 [3/3] COPY entrypoint.sh /entrypoint.sh +2025-02-13T20:09:29.8720335Z #7 CACHED +2025-02-13T20:09:29.8720939Z +2025-02-13T20:09:29.8721263Z #8 exporting to image +2025-02-13T20:09:29.8721601Z #8 exporting layers done +2025-02-13T20:09:29.8722370Z #8 writing image sha256:d99f28d2888e002da97cb2d4f5ef920b5a9db63871fdc0dbe63ba2509bf04528 done +2025-02-13T20:09:29.8723989Z #8 naming to docker.io/library/e8ee94:a42ea301cca041f39695332b6ab5e9e4 done +2025-02-13T20:09:29.8724880Z #8 DONE 0.0s +2025-02-13T20:09:29.8807465Z ##[endgroup] +2025-02-13T20:09:29.8851927Z Prepare all required actions +2025-02-13T20:09:29.8852406Z Getting action download info +2025-02-13T20:09:30.0317786Z Download action repository 'actions/checkout@v3' (SHA:f43a0e5ff2bd294095638e18286ca9a3d1956744) +2025-02-13T20:09:30.5942232Z ##[group]Run ./.github/actions/generate-docker-tag +2025-02-13T20:09:30.5942656Z with: +2025-02-13T20:09:30.5942963Z image: tt-metalium/ubuntu-20.04-amd64 +2025-02-13T20:09:30.5943335Z env: +2025-02-13T20:09:30.5943613Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:30.5943932Z LOGURU_LEVEL: INFO +2025-02-13T20:09:30.5944443Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.5945221Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:09:30.5945972Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.5946687Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.5947379Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.5948073Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:09:30.5948779Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:09:30.5949312Z ##[endgroup] +2025-02-13T20:09:30.5974005Z ##[group]Run echo "::notice::[DEPRECATION] 
This action is deprecated. Please migrate to reading the Docker image from the pipeline." +2025-02-13T20:09:30.5975262Z echo "::notice::[DEPRECATION] This action is deprecated. Please migrate to reading the Docker image from the pipeline." +2025-02-13T20:09:30.6000178Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:09:30.6000647Z env: +2025-02-13T20:09:30.6000917Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:30.6001450Z LOGURU_LEVEL: INFO +2025-02-13T20:09:30.6001933Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.6002694Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:09:30.6003444Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.6004128Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.6004808Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.6005502Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:09:30.6006217Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:09:30.6006754Z ##[endgroup] +2025-02-13T20:09:30.6050741Z ##[notice][DEPRECATION] This action is deprecated. Please migrate to reading the Docker image from the pipeline. +2025-02-13T20:09:30.6107017Z ##[group]Run actions/checkout@v3 +2025-02-13T20:09:30.6107433Z with: +2025-02-13T20:09:30.6107747Z fetch-depth: 1 +2025-02-13T20:09:30.6108094Z clean: false +2025-02-13T20:09:30.6108431Z repository: tenstorrent/tt-metal +2025-02-13T20:09:30.6109001Z token: *** +2025-02-13T20:09:30.6109307Z ssh-strict: true +2025-02-13T20:09:30.6109656Z persist-credentials: true +2025-02-13T20:09:30.6110051Z sparse-checkout-cone-mode: true +2025-02-13T20:09:30.6110447Z fetch-tags: false +2025-02-13T20:09:30.6110756Z lfs: false +2025-02-13T20:09:30.6111062Z submodules: false +2025-02-13T20:09:30.6111379Z set-safe-directory: true +2025-02-13T20:09:30.6111734Z env: +2025-02-13T20:09:30.6112033Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:30.6112379Z LOGURU_LEVEL: INFO +2025-02-13T20:09:30.6113052Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.6113844Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:09:30.6114621Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.6115351Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.6116087Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:30.6116848Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:09:30.6117593Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:09:30.6118158Z ##[endgroup] +2025-02-13T20:09:30.6994754Z Syncing repository: tenstorrent/tt-metal +2025-02-13T20:09:30.6999045Z ##[group]Getting Git version info +2025-02-13T20:09:30.6999877Z Working directory is '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal' +2025-02-13T20:09:30.7038783Z [command]/usr/bin/git version +2025-02-13T20:09:30.7080305Z git version 2.25.1 +2025-02-13T20:09:30.7109057Z ##[endgroup] +2025-02-13T20:09:30.7121223Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/2abee880-a096-428c-a290-253df4495270/.gitconfig' 
+2025-02-13T20:09:30.7135124Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/2abee880-a096-428c-a290-253df4495270' before making global git config changes +2025-02-13T20:09:30.7136674Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:09:30.7139480Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:09:30.7172087Z [command]/usr/bin/git config --local --get remote.origin.url +2025-02-13T20:09:30.7189566Z https://github.com/tenstorrent/tt-metal +2025-02-13T20:09:30.7206199Z ##[group]Removing previously created refs, to avoid conflicts +2025-02-13T20:09:30.7209826Z [command]/usr/bin/git rev-parse --symbolic-full-name --verify --quiet HEAD +2025-02-13T20:09:30.7227563Z refs/heads/sagarwal/multi_page_buffer +2025-02-13T20:09:30.7236699Z [command]/usr/bin/git checkout --detach +2025-02-13T20:09:31.1049877Z HEAD is now at ac8ce51fe Fixing merge conflict +2025-02-13T20:09:31.1766167Z [command]/usr/bin/git branch --delete --force sagarwal/multi_page_buffer +2025-02-13T20:09:31.1814710Z Deleted branch sagarwal/multi_page_buffer (was ac8ce51fe). +2025-02-13T20:09:31.2239720Z ##[endgroup] +2025-02-13T20:09:31.2243421Z [command]/usr/bin/git submodule status +2025-02-13T20:09:31.2602121Z 29125b7ad8b5513eeaa4417ed92892bf39c8bd74 models/demos/t3000/llama2_70b/reference/llama (remotes/origin/HEAD) +2025-02-13T20:09:31.2679689Z 368cd07f89f497df20a66936fbfae3956f151af4 tt-train/3rd_party/wandb-cpp (heads/master) +2025-02-13T20:09:31.2751070Z 71d4c8d378b52af7da7012b9b595a61e9304f0bb tt_metal/third_party/tracy (71d4c8d) +2025-02-13T20:09:31.2824892Z 9fd3e2d93d1532373f52e11e963de40c1cdf9a55 tt_metal/third_party/tt_llk_blackhole (remotes/origin/HEAD) +2025-02-13T20:09:31.2895192Z 0c04db64275a4bd36a7e14d3c533855cb33f6a20 tt_metal/third_party/tt_llk_grayskull (remotes/origin/HEAD) +2025-02-13T20:09:31.2968940Z 0ec3177bfc262f7edf6cfc19531ecb8f669895d2 tt_metal/third_party/tt_llk_wormhole_b0 (remotes/origin/HEAD) +2025-02-13T20:09:31.3043078Z 5de287e9c5b2fa3d55fbfd53e9bc59e2050f32fb tt_metal/third_party/umd (5de287e) +2025-02-13T20:09:31.3057484Z ##[group]Disabling automatic garbage collection +2025-02-13T20:09:31.3061614Z [command]/usr/bin/git config --local gc.auto 0 +2025-02-13T20:09:31.3086836Z ##[endgroup] +2025-02-13T20:09:31.3087812Z ##[group]Setting up auth +2025-02-13T20:09:31.3093567Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:09:31.3121090Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:09:31.3379801Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:09:31.3425440Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:09:31.3472718Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:09:31.3526947Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:09:31.3572486Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:09:31.3620194Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:09:31.3670617Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:09:31.3734275Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:09:31.3753255Z http.https://github.com/.extraheader +2025-02-13T20:09:31.3761911Z [command]/usr/bin/git config --local --unset-all 
http.https://github.com/.extraheader +2025-02-13T20:09:31.3792529Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:09:31.4049129Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:09:31.4074698Z http.https://github.com/.extraheader +2025-02-13T20:09:31.4105938Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:09:31.4132348Z http.https://github.com/.extraheader +2025-02-13T20:09:31.4170988Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:09:31.4197859Z http.https://github.com/.extraheader +2025-02-13T20:09:31.4230887Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:09:31.4258009Z http.https://github.com/.extraheader +2025-02-13T20:09:31.4294778Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:09:31.4321692Z http.https://github.com/.extraheader +2025-02-13T20:09:31.4358630Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:09:31.4383287Z http.https://github.com/.extraheader +2025-02-13T20:09:31.4417813Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:09:31.4444175Z http.https://github.com/.extraheader +2025-02-13T20:09:31.4504438Z [command]/usr/bin/git config --local http.https://github.com/.extraheader AUTHORIZATION: basic *** +2025-02-13T20:09:31.4543293Z ##[endgroup] +2025-02-13T20:09:31.4544313Z ##[group]Fetching the repository +2025-02-13T20:09:31.4551611Z [command]/usr/bin/git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70:refs/remotes/origin/sagarwal/multi_page_buffer +2025-02-13T20:09:31.8959580Z remote: Total 0 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0) +2025-02-13T20:09:31.9622973Z ##[endgroup] +2025-02-13T20:09:31.9623652Z ##[group]Determining the checkout info +2025-02-13T20:09:31.9627287Z ##[endgroup] +2025-02-13T20:09:31.9627931Z ##[group]Checking out the ref +2025-02-13T20:09:31.9632337Z [command]/usr/bin/git checkout --progress --force -B sagarwal/multi_page_buffer refs/remotes/origin/sagarwal/multi_page_buffer +2025-02-13T20:09:32.0173283Z Switched to a new branch 'sagarwal/multi_page_buffer' +2025-02-13T20:09:32.0174152Z Branch 'sagarwal/multi_page_buffer' set up to track remote branch 'sagarwal/multi_page_buffer' from 'origin'. 
+2025-02-13T20:09:32.0861445Z ##[endgroup] +2025-02-13T20:09:32.0927342Z [command]/usr/bin/git log -1 --format='%H' +2025-02-13T20:09:32.0980157Z 'ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70' +2025-02-13T20:09:32.1107982Z ##[group]Run BUILD_TAG=$(cat \ +2025-02-13T20:09:32.1108404Z BUILD_TAG=$(cat \ +2025-02-13T20:09:32.1108780Z  install_dependencies.sh \ +2025-02-13T20:09:32.1109196Z  dockerfile/Dockerfile \ +2025-02-13T20:09:32.1109651Z  tt_metal/python_env/requirements-dev.txt \ +2025-02-13T20:09:32.1110124Z  docs/requirements-docs.txt \ +2025-02-13T20:09:32.1110614Z  tests/sweep_framework/requirements-sweeps.txt \ +2025-02-13T20:09:32.1111119Z  | sha1sum | cut -d' ' -f1) +2025-02-13T20:09:32.1111571Z echo "BUILD_TAG=$BUILD_TAG" >> $GITHUB_ENV +2025-02-13T20:09:32.1112660Z echo "TT_METAL_DOCKER_IMAGE_TAG=ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:${BUILD_TAG}" >> $GITHUB_ENV +2025-02-13T20:09:32.1137487Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:09:32.1138022Z env: +2025-02-13T20:09:32.1138356Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:32.1138784Z LOGURU_LEVEL: INFO +2025-02-13T20:09:32.1139421Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1140250Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:09:32.1141049Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1141797Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1142534Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1143300Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:09:32.1144086Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:09:32.1144648Z ##[endgroup] +2025-02-13T20:09:32.1248909Z ##[group]Run echo "RUNNER_UID=$(id -u)" >> $GITHUB_ENV +2025-02-13T20:09:32.1249455Z echo "RUNNER_UID=$(id -u)" >> $GITHUB_ENV +2025-02-13T20:09:32.1249988Z echo "RUNNER_GID=$(id -g)" >> $GITHUB_ENV +2025-02-13T20:09:32.1274502Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:09:32.1274975Z env: +2025-02-13T20:09:32.1275266Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:32.1275601Z LOGURU_LEVEL: INFO +2025-02-13T20:09:32.1276094Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1276872Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:09:32.1277627Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1278327Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1279057Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1279762Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:09:32.1280481Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:09:32.1281410Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.1282276Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.1283088Z ##[endgroup] +2025-02-13T20:09:32.1407059Z ##[group]Run docker/login-action@v3 +2025-02-13T20:09:32.1407480Z with: +2025-02-13T20:09:32.1408284Z 
registry: https://ghcr.io +2025-02-13T20:09:32.1408671Z username: sagarwalTT +2025-02-13T20:09:32.1409367Z password: *** +2025-02-13T20:09:32.1409720Z ecr: auto +2025-02-13T20:09:32.1410043Z logout: true +2025-02-13T20:09:32.1410350Z env: +2025-02-13T20:09:32.1410700Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:32.1411067Z LOGURU_LEVEL: INFO +2025-02-13T20:09:32.1411588Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1412391Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:09:32.1413191Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1413916Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1414627Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.1415352Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:09:32.1416092Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:09:32.1416731Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.1417870Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.1418674Z RUNNER_UID: 1000 +2025-02-13T20:09:32.1418990Z RUNNER_GID: 1000 +2025-02-13T20:09:32.1419319Z ##[endgroup] +2025-02-13T20:09:32.4878465Z Logging into https://ghcr.io... +2025-02-13T20:09:32.9619762Z Login Succeeded! +2025-02-13T20:09:32.9735707Z ##[group]Run docker pull ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.9736917Z docker pull ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.9758862Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:09:32.9759314Z env: +2025-02-13T20:09:32.9759591Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:32.9759917Z LOGURU_LEVEL: INFO +2025-02-13T20:09:32.9760396Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.9761193Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:09:32.9761943Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.9762629Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.9763317Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:32.9764124Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:09:32.9764826Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:09:32.9765429Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.9766269Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.9767046Z RUNNER_UID: 1000 +2025-02-13T20:09:32.9767360Z RUNNER_GID: 1000 +2025-02-13T20:09:32.9767826Z ##[endgroup] +2025-02-13T20:09:33.6193731Z 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6: Pulling from tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64 +2025-02-13T20:09:33.6209385Z Digest: sha256:8a4d11f562408a7a138235af5a27a98439b4c5655255b17980d1a8dcbd067fd7 +2025-02-13T20:09:33.6210413Z Status: Image is up to date for 
ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:33.6224902Z ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:33.6338004Z ##[group]Run tenstorrent/docker-run-action@v5 +2025-02-13T20:09:33.6338466Z with: +2025-02-13T20:09:33.6338765Z shell: bash +2025-02-13T20:09:33.6339080Z username: sagarwalTT +2025-02-13T20:09:33.6339698Z password: *** +2025-02-13T20:09:33.6340026Z registry: ghcr.io +2025-02-13T20:09:33.6340684Z image: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:33.6344632Z options: -u 1000:1000 +--rm +-v /etc/passwd:/etc/passwd:ro +-v /etc/shadow:/etc/shadow:ro +-v /etc/bashrc:/etc/bashrc:ro +-v /home/ubuntu/actions-runner/_work/tt-metal/tt-metal:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal +--net=host +--log-driver local +--log-opt max-size=50m +-e TT_METAL_HOME=/home/ubuntu/actions-runner/_work/tt-metal/tt-metal +-e ARCH_NAME=wormhole_b0 +-e LD_LIBRARY_PATH=/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/build/lib +-e GTEST_OUTPUT=xml:generated/test_reports/ + +-e LOGURU_LEVEL=INFO +-e PYTHONPATH=/home/ubuntu/actions-runner/_work/tt-metal/tt-metal +-e HOME=/home/ubuntu/actions-runner/_work/tt-metal/tt-metal +-v /dev/hugepages-1G:/dev/hugepages-1G +--device /dev/tenstorrent + +-w /home/ubuntu/actions-runner/_work/tt-metal/tt-metal + +2025-02-13T20:09:33.6349252Z run: set -eu + +install_wheel=false +if [ "${install_wheel,,}" == "true" ]; then + WHEEL_FILENAME=$(ls -1 *.whl) + pip3 install "$WHEEL_FILENAME" +fi + +pip install --force-reinstall pip==21.2.4 +pip install -r tt_metal/python_env/requirements-dev.txt +pip install -e . 
+mkdir -p generated/test_reports +./tests/scripts/run_tools_tests.sh + + +2025-02-13T20:09:33.6350936Z env: +2025-02-13T20:09:33.6351497Z ARCH_NAME: wormhole_b0 +2025-02-13T20:09:33.6351842Z LOGURU_LEVEL: INFO +2025-02-13T20:09:33.6352336Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:33.6353145Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:09:33.6353938Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:33.6354692Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:33.6355442Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:09:33.6356178Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:09:33.6356935Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:09:33.6357594Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:33.6358460Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:33.6359261Z RUNNER_UID: 1000 +2025-02-13T20:09:33.6359605Z RUNNER_GID: 1000 +2025-02-13T20:09:33.6359956Z ##[endgroup] +2025-02-13T20:09:33.6487005Z ##[command]/usr/bin/docker run --name e8ee94a42ea301cca041f39695332b6ab5e9e4_3ff2c8 --label e8ee94 --workdir /github/workspace --rm -e "ARCH_NAME" -e "LOGURU_LEVEL" -e "pythonLocation" -e "PKG_CONFIG_PATH" -e "Python_ROOT_DIR" -e "Python2_ROOT_DIR" -e "Python3_ROOT_DIR" -e "LD_LIBRARY_PATH" -e "VIRTUAL_ENV" -e "BUILD_TAG" -e "TT_METAL_DOCKER_IMAGE_TAG" -e "RUNNER_UID" -e "RUNNER_GID" -e "INPUT_SHELL" -e "INPUT_USERNAME" -e "INPUT_PASSWORD" -e "INPUT_REGISTRY" -e "INPUT_IMAGE" -e "INPUT_OPTIONS" -e "INPUT_RUN" -e "INPUT_DOCKER_NETWORK" -e "HOME" -e "GITHUB_JOB" -e "GITHUB_REF" -e "GITHUB_SHA" -e "GITHUB_REPOSITORY" -e "GITHUB_REPOSITORY_OWNER" -e "GITHUB_REPOSITORY_OWNER_ID" -e "GITHUB_RUN_ID" -e "GITHUB_RUN_NUMBER" -e "GITHUB_RETENTION_DAYS" -e "GITHUB_RUN_ATTEMPT" -e "GITHUB_REPOSITORY_ID" -e "GITHUB_ACTOR_ID" -e "GITHUB_ACTOR" -e "GITHUB_TRIGGERING_ACTOR" -e "GITHUB_WORKFLOW" -e "GITHUB_HEAD_REF" -e "GITHUB_BASE_REF" -e "GITHUB_EVENT_NAME" -e "GITHUB_SERVER_URL" -e "GITHUB_API_URL" -e "GITHUB_GRAPHQL_URL" -e "GITHUB_REF_NAME" -e "GITHUB_REF_PROTECTED" -e "GITHUB_REF_TYPE" -e "GITHUB_WORKFLOW_REF" -e "GITHUB_WORKFLOW_SHA" -e "GITHUB_WORKSPACE" -e "GITHUB_EVENT_PATH" -e "GITHUB_PATH" -e "GITHUB_ENV" -e "GITHUB_STEP_SUMMARY" -e "GITHUB_STATE" -e "GITHUB_OUTPUT" -e "GITHUB_ACTION" -e "GITHUB_ACTION_REPOSITORY" -e "GITHUB_ACTION_REF" -e "GITHUB_ACTION_PATH" -e "RUNNER_OS" -e "RUNNER_ARCH" -e "RUNNER_NAME" -e "RUNNER_ENVIRONMENT" -e "RUNNER_TOOL_CACHE" -e "RUNNER_TEMP" -e "RUNNER_WORKSPACE" -e "ACTIONS_RUNTIME_URL" -e "ACTIONS_RUNTIME_TOKEN" -e "ACTIONS_CACHE_URL" -e "ACTIONS_ID_TOKEN_REQUEST_URL" -e "ACTIONS_ID_TOKEN_REQUEST_TOKEN" -e "ACTIONS_RESULTS_URL" -e GITHUB_ACTIONS=true -e CI=true -v "/var/run/docker.sock":"/var/run/docker.sock" -v "/home/ubuntu/actions-runner/_work/_temp/_github_home":"/github/home" -v "/home/ubuntu/actions-runner/_work/_temp/_github_workflow":"/github/workflow" -v "/home/ubuntu/actions-runner/_work/_temp/_runner_file_commands":"/github/file_commands" -v "/home/ubuntu/actions-runner/_work/tt-metal/tt-metal":"/github/workspace" e8ee94:a42ea301cca041f39695332b6ab5e9e4 +2025-02-13T20:09:40.4846139Z WARNING! 
Your password will be stored unencrypted in /github/home/.docker/config.json. +2025-02-13T20:09:40.4847070Z Configure a credential helper to remove this warning. See +2025-02-13T20:09:40.4848005Z https://docs.docker.com/engine/reference/commandline/login/#credentials-store +2025-02-13T20:09:40.4848535Z +2025-02-13T20:09:40.4848704Z Login Succeeded +2025-02-13T20:09:41.8707179Z Collecting pip==21.2.4 +2025-02-13T20:09:41.9086089Z Downloading pip-21.2.4-py3-none-any.whl (1.6 MB) +2025-02-13T20:09:43.9493659Z Installing collected packages: pip +2025-02-13T20:09:44.7676809Z WARNING: The scripts pip, pip3 and pip3.8 are installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:09:44.7678109Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:09:44.7884819Z Successfully installed pip-21.2.4 +2025-02-13T20:09:45.9332595Z Requirement already satisfied: platformdirs<4.0.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 8)) (3.11.0) +2025-02-13T20:09:45.9424637Z Requirement already satisfied: pre-commit==3.0.4 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 9)) (3.0.4) +2025-02-13T20:09:45.9465705Z Requirement already satisfied: black==24.3.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 10)) (24.3.0) +2025-02-13T20:09:45.9632881Z Requirement already satisfied: clang-format==19.1.4 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 11)) (19.1.4) +2025-02-13T20:09:45.9646286Z Requirement already satisfied: build==0.10.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 12)) (0.10.0) +2025-02-13T20:09:45.9856398Z Requirement already satisfied: twine==4.0.2 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 13)) (4.0.2) +2025-02-13T20:09:45.9925743Z Requirement already satisfied: yamllint==1.32.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 14)) (1.32.0) +2025-02-13T20:09:45.9982080Z Requirement already satisfied: mypy==1.9.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 15)) (1.9.0) +2025-02-13T20:09:46.0052042Z Requirement already satisfied: docutils==0.18.1 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 1)) (0.18.1) +2025-02-13T20:09:46.0067165Z Requirement already satisfied: sphinx==7.1.2 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (7.1.2) +2025-02-13T20:09:46.0277738Z Requirement already satisfied: sphinx-rtd-theme==1.3.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 3)) (1.3.0) +2025-02-13T20:09:46.0341512Z Requirement already satisfied: sphinxcontrib-email==0.3.5 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 4)) (0.3.5) +2025-02-13T20:09:46.0365190Z Requirement already satisfied: lxml==4.9.4 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 5)) (4.9.4) +2025-02-13T20:09:46.0408620Z Requirement already satisfied: breathe==4.35.0 in 
/usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 6)) (4.35.0) +2025-02-13T20:09:46.0433082Z Requirement already satisfied: nbsphinx==0.9.3 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.9.3) +2025-02-13T20:09:46.0475499Z Requirement already satisfied: sphinxcontrib-jquery==4.1 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 8)) (4.1) +2025-02-13T20:09:46.0493785Z Requirement already satisfied: ipython==8.12.3 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (8.12.3) +2025-02-13T20:09:46.1017396Z Requirement already satisfied: pandoc==2.3 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 10)) (2.3) +2025-02-13T20:09:46.1037722Z Requirement already satisfied: tabulate==0.9.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 11)) (0.9.0) +2025-02-13T20:09:46.1071037Z Requirement already satisfied: myst-parser==3.0.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../docs/requirements-docs.txt (line 12)) (3.0.0) +2025-02-13T20:09:46.1293096Z Requirement already satisfied: elasticsearch in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 1)) (8.17.1) +2025-02-13T20:09:46.1526221Z Requirement already satisfied: termcolor in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 2)) (2.4.0) +2025-02-13T20:09:46.1550346Z Requirement already satisfied: beautifultable in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 3)) (1.1.0) +2025-02-13T20:09:46.1581854Z Requirement already satisfied: faster-fifo in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 4)) (1.4.7) +2025-02-13T20:09:46.1624387Z Requirement already satisfied: pytest==7.2.2 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 24)) (7.2.2) +2025-02-13T20:09:46.1750588Z Requirement already satisfied: pytest-timeout==2.2.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 25)) (2.2.0) +2025-02-13T20:09:46.1774882Z Requirement already satisfied: pytest-split==0.8.2 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 26)) (0.8.2) +2025-02-13T20:09:46.1797252Z Requirement already satisfied: pytest-xdist==3.6.1 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 27)) (3.6.1) +2025-02-13T20:09:46.1841815Z Requirement already satisfied: jsbeautifier==1.14.7 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 28)) (1.14.7) +2025-02-13T20:09:46.1866648Z Requirement already satisfied: datasets==2.9.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 29)) (2.9.0) +2025-02-13T20:09:46.2719882Z Requirement already satisfied: torch==2.2.1.0+cpu in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 30)) (2.2.1+cpu) +2025-02-13T20:09:46.2788318Z 
Requirement already satisfied: networkx==3.1 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 31)) (3.1) +2025-02-13T20:09:46.2947620Z Requirement already satisfied: torchvision==0.17.1+cpu in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 32)) (0.17.1+cpu) +2025-02-13T20:09:46.2983457Z Requirement already satisfied: torchmetrics==1.3.1 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 33)) (1.3.1) +2025-02-13T20:09:46.4009645Z Requirement already satisfied: torch-fidelity==0.3.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 34)) (0.3.0) +2025-02-13T20:09:46.4051328Z Requirement already satisfied: transformers==4.38.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 35)) (4.38.0) +2025-02-13T20:09:46.7982188Z Requirement already satisfied: xlsxwriter==3.0.8 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 36)) (3.0.8) +2025-02-13T20:09:46.7995801Z Requirement already satisfied: tiktoken==0.3.3 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 37)) (0.3.3) +2025-02-13T20:09:46.8026638Z Requirement already satisfied: tqdm==4.66.3 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 38)) (4.66.3) +2025-02-13T20:09:46.8118710Z Requirement already satisfied: enlighten==1.12.4 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 39)) (1.12.4) +2025-02-13T20:09:46.8156307Z Requirement already satisfied: sentencepiece==0.1.97 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 40)) (0.1.97) +2025-02-13T20:09:46.8168981Z Requirement already satisfied: numba>=0.58.1 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 41)) (0.58.1) +2025-02-13T20:09:46.8209440Z Requirement already satisfied: librosa==0.10.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 42)) (0.10.0) +2025-02-13T20:09:46.8437785Z Requirement already satisfied: timm==0.6.13 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 43)) (0.6.13) +2025-02-13T20:09:46.8478982Z Requirement already satisfied: opencv-python-headless==4.8.1.78 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 44)) (4.8.1.78) +2025-02-13T20:09:46.8589049Z Requirement already satisfied: diffusers==0.12.1 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 45)) (0.12.1) +2025-02-13T20:09:46.8997143Z Requirement already satisfied: accelerate==0.27.2 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 46)) (0.27.2) +2025-02-13T20:09:46.9616629Z Requirement already satisfied: ftfy==6.1.1 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 47)) (6.1.1) +2025-02-13T20:09:46.9636550Z Requirement already satisfied: gitpython==3.1.41 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 48)) (3.1.41) +2025-02-13T20:09:46.9751168Z Requirement already satisfied: einops==0.6.1 in /usr/local/lib/python3.8/dist-packages (from -r 
tt_metal/python_env/requirements-dev.txt (line 49)) (0.6.1) +2025-02-13T20:09:46.9768873Z Requirement already satisfied: multiprocess==0.70.14 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 53)) (0.70.14) +2025-02-13T20:09:46.9787959Z Requirement already satisfied: evaluate==0.4.0 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 54)) (0.4.0) +2025-02-13T20:09:47.0425126Z Requirement already satisfied: bert-score==0.3.12 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 55)) (0.3.12) +2025-02-13T20:09:47.0478694Z Requirement already satisfied: fsspec==2023.9.2 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 56)) (2023.9.2) +2025-02-13T20:09:47.0854229Z Requirement already satisfied: docopt==0.6.2 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 57)) (0.6.2) +2025-02-13T20:09:47.0872415Z Requirement already satisfied: blobfile==2.1.1 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 59)) (2.1.1) +2025-02-13T20:09:47.0910119Z Requirement already satisfied: numpy<2,>=1.24.4 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 60)) (1.24.4) +2025-02-13T20:09:47.0925110Z Requirement already satisfied: huggingface-hub==0.25.2 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 61)) (0.25.2) +2025-02-13T20:09:47.1714127Z Requirement already satisfied: pydantic==2.9.2 in /usr/local/lib/python3.8/dist-packages (from -r tt_metal/python_env/requirements-dev.txt (line 62)) (2.9.2) +2025-02-13T20:09:47.1819590Z Requirement already satisfied: pyyaml>=5.1 in /usr/lib/python3/dist-packages (from pre-commit==3.0.4->-r tt_metal/python_env/requirements-dev.txt (line 9)) (5.3.1) +2025-02-13T20:09:47.1828666Z Requirement already satisfied: identify>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from pre-commit==3.0.4->-r tt_metal/python_env/requirements-dev.txt (line 9)) (2.6.1) +2025-02-13T20:09:47.1848079Z Requirement already satisfied: cfgv>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from pre-commit==3.0.4->-r tt_metal/python_env/requirements-dev.txt (line 9)) (3.4.0) +2025-02-13T20:09:47.1863847Z Requirement already satisfied: nodeenv>=0.11.1 in /usr/local/lib/python3.8/dist-packages (from pre-commit==3.0.4->-r tt_metal/python_env/requirements-dev.txt (line 9)) (1.9.1) +2025-02-13T20:09:47.1886067Z Requirement already satisfied: virtualenv>=20.10.0 in /usr/local/lib/python3.8/dist-packages (from pre-commit==3.0.4->-r tt_metal/python_env/requirements-dev.txt (line 9)) (20.29.1) +2025-02-13T20:09:47.2088318Z Requirement already satisfied: typing-extensions>=4.0.1; python_version < "3.11" in /usr/local/lib/python3.8/dist-packages (from black==24.3.0->-r tt_metal/python_env/requirements-dev.txt (line 10)) (4.12.2) +2025-02-13T20:09:47.2104667Z Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.8/dist-packages (from black==24.3.0->-r tt_metal/python_env/requirements-dev.txt (line 10)) (8.1.8) +2025-02-13T20:09:47.2129022Z Requirement already satisfied: mypy-extensions>=0.4.3 in /usr/local/lib/python3.8/dist-packages (from black==24.3.0->-r tt_metal/python_env/requirements-dev.txt (line 10)) (1.0.0) +2025-02-13T20:09:47.2145662Z Requirement already satisfied: tomli>=1.1.0; python_version < "3.11" in 
/usr/local/lib/python3.8/dist-packages (from black==24.3.0->-r tt_metal/python_env/requirements-dev.txt (line 10)) (2.2.1) +2025-02-13T20:09:47.2159211Z Requirement already satisfied: packaging>=22.0 in /usr/local/lib/python3.8/dist-packages (from black==24.3.0->-r tt_metal/python_env/requirements-dev.txt (line 10)) (24.2) +2025-02-13T20:09:47.2170712Z Requirement already satisfied: pathspec>=0.9.0 in /usr/local/lib/python3.8/dist-packages (from black==24.3.0->-r tt_metal/python_env/requirements-dev.txt (line 10)) (0.12.1) +2025-02-13T20:09:47.2190637Z Requirement already satisfied: pyproject_hooks in /usr/local/lib/python3.8/dist-packages (from build==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 12)) (1.2.0) +2025-02-13T20:09:47.2200865Z Requirement already satisfied: requests>=2.20 in /usr/local/lib/python3.8/dist-packages (from twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (2.32.3) +2025-02-13T20:09:47.2259037Z Requirement already satisfied: requests-toolbelt!=0.9.0,>=0.8.0 in /usr/local/lib/python3.8/dist-packages (from twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (1.0.0) +2025-02-13T20:09:47.2285745Z Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.8/dist-packages (from twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (2.2.3) +2025-02-13T20:09:47.2347020Z Requirement already satisfied: rich>=12.0.0 in /usr/local/lib/python3.8/dist-packages (from twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (13.9.4) +2025-02-13T20:09:47.2404805Z Requirement already satisfied: rfc3986>=1.4.0 in /usr/local/lib/python3.8/dist-packages (from twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (2.0.0) +2025-02-13T20:09:47.2427196Z Requirement already satisfied: importlib-metadata>=3.6 in /usr/local/lib/python3.8/dist-packages (from twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (8.5.0) +2025-02-13T20:09:47.2604212Z Requirement already satisfied: keyring>=15.1 in /usr/local/lib/python3.8/dist-packages (from twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (25.5.0) +2025-02-13T20:09:47.2837870Z Requirement already satisfied: readme-renderer>=35.0 in /usr/local/lib/python3.8/dist-packages (from twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (43.0) +2025-02-13T20:09:47.2874633Z Requirement already satisfied: pkginfo>=1.8.1 in /usr/local/lib/python3.8/dist-packages (from twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (1.12.0) +2025-02-13T20:09:47.2908756Z Requirement already satisfied: sphinxcontrib-applehelp in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (1.0.4) +2025-02-13T20:09:47.2947998Z Requirement already satisfied: sphinxcontrib-serializinghtml>=1.1.5 in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (1.1.5) +2025-02-13T20:09:47.2988249Z Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (0.7.13) +2025-02-13T20:09:47.2998749Z Requirement already satisfied: sphinxcontrib-jsmath in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (1.0.1) +2025-02-13T20:09:47.3030343Z Requirement already satisfied: imagesize>=1.3 in 
/usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (1.4.1) +2025-02-13T20:09:47.3041792Z Requirement already satisfied: sphinxcontrib-devhelp in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (1.0.2) +2025-02-13T20:09:47.3080514Z Requirement already satisfied: Jinja2>=3.0 in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (3.1.5) +2025-02-13T20:09:47.3106738Z Requirement already satisfied: sphinxcontrib-htmlhelp>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (2.0.1) +2025-02-13T20:09:47.3150276Z Requirement already satisfied: sphinxcontrib-qthelp in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (1.0.3) +2025-02-13T20:09:47.3187866Z Requirement already satisfied: snowballstemmer>=2.0 in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (2.2.0) +2025-02-13T20:09:47.3201671Z Requirement already satisfied: Pygments>=2.13 in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (2.19.1) +2025-02-13T20:09:47.3221618Z Requirement already satisfied: babel>=2.9 in /usr/local/lib/python3.8/dist-packages (from sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (2.17.0) +2025-02-13T20:09:47.3311585Z Requirement already satisfied: nbconvert!=5.4 in /usr/local/lib/python3.8/dist-packages (from nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (7.16.6) +2025-02-13T20:09:47.3617666Z Requirement already satisfied: nbformat in /usr/local/lib/python3.8/dist-packages (from nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (5.10.4) +2025-02-13T20:09:47.3713056Z Requirement already satisfied: traitlets>=5 in /usr/local/lib/python3.8/dist-packages (from nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (5.14.3) +2025-02-13T20:09:47.3789504Z Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.19.2) +2025-02-13T20:09:47.4046686Z Requirement already satisfied: decorator in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (5.1.1) +2025-02-13T20:09:47.4058151Z Requirement already satisfied: pickleshare in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.7.5) +2025-02-13T20:09:47.4079986Z Requirement already satisfied: pexpect>4.3; sys_platform != "win32" in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (4.9.0) +2025-02-13T20:09:47.4097063Z Requirement already satisfied: backcall in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.2.0) +2025-02-13T20:09:47.4106184Z Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r 
tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.1.7) +2025-02-13T20:09:47.4122825Z Requirement already satisfied: stack-data in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.6.3) +2025-02-13T20:09:47.4193180Z Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /usr/local/lib/python3.8/dist-packages (from ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (3.0.50) +2025-02-13T20:09:47.4208673Z Requirement already satisfied: plumbum in /usr/local/lib/python3.8/dist-packages (from pandoc==2.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 10)) (1.9.0) +2025-02-13T20:09:47.4363796Z Requirement already satisfied: ply in /usr/local/lib/python3.8/dist-packages (from pandoc==2.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 10)) (3.11) +2025-02-13T20:09:47.4374702Z Requirement already satisfied: markdown-it-py~=3.0 in /usr/local/lib/python3.8/dist-packages (from myst-parser==3.0.0->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 12)) (3.0.0) +2025-02-13T20:09:47.4568830Z Requirement already satisfied: mdit-py-plugins~=0.4 in /usr/local/lib/python3.8/dist-packages (from myst-parser==3.0.0->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 12)) (0.4.2) +2025-02-13T20:09:47.4637480Z Requirement already satisfied: elastic-transport<9,>=8.15.1 in /usr/local/lib/python3.8/dist-packages (from elasticsearch->-r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 1)) (8.17.0) +2025-02-13T20:09:47.4768961Z Requirement already satisfied: wcwidth in /usr/local/lib/python3.8/dist-packages (from beautifultable->-r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 3)) (0.2.13) +2025-02-13T20:09:47.4796422Z Requirement already satisfied: setuptools>=45.2.0 in /usr/lib/python3/dist-packages (from faster-fifo->-r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 4)) (45.2.0) +2025-02-13T20:09:47.4808606Z Requirement already satisfied: cython>=0.29 in /usr/local/lib/python3.8/dist-packages (from faster-fifo->-r tt_metal/python_env/../../tests/sweep_framework/requirements-sweeps.txt (line 4)) (3.0.11) +2025-02-13T20:09:47.4826035Z Requirement already satisfied: exceptiongroup>=1.0.0rc8; python_version < "3.11" in /usr/local/lib/python3.8/dist-packages (from pytest==7.2.2->-r tt_metal/python_env/requirements-dev.txt (line 24)) (1.2.2) +2025-02-13T20:09:47.4844344Z Requirement already satisfied: iniconfig in /usr/local/lib/python3.8/dist-packages (from pytest==7.2.2->-r tt_metal/python_env/requirements-dev.txt (line 24)) (2.0.0) +2025-02-13T20:09:47.4858007Z Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.8/dist-packages (from pytest==7.2.2->-r tt_metal/python_env/requirements-dev.txt (line 24)) (25.1.0) +2025-02-13T20:09:47.5274582Z Requirement already satisfied: pluggy<2.0,>=0.12 in /usr/local/lib/python3.8/dist-packages (from pytest==7.2.2->-r tt_metal/python_env/requirements-dev.txt (line 24)) (1.5.0) +2025-02-13T20:09:47.5314334Z Requirement already satisfied: execnet>=2.1 in /usr/local/lib/python3.8/dist-packages (from pytest-xdist==3.6.1->-r tt_metal/python_env/requirements-dev.txt (line 27)) (2.1.1) +2025-02-13T20:09:47.5351934Z Requirement already satisfied: six>=1.13.0 in /usr/lib/python3/dist-packages (from jsbeautifier==1.14.7->-r tt_metal/python_env/requirements-dev.txt (line 28)) (1.14.0) 
+2025-02-13T20:09:47.5362595Z Requirement already satisfied: editorconfig>=0.12.2 in /usr/local/lib/python3.8/dist-packages (from jsbeautifier==1.14.7->-r tt_metal/python_env/requirements-dev.txt (line 28)) (0.17.0) +2025-02-13T20:09:47.5374278Z Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (2.0.3) +2025-02-13T20:09:47.6322312Z Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (3.10.11) +2025-02-13T20:09:47.6431361Z Requirement already satisfied: dill<0.3.7 in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (0.3.6) +2025-02-13T20:09:47.6453807Z Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (0.18.0) +2025-02-13T20:09:47.6545447Z Requirement already satisfied: xxhash in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (3.5.0) +2025-02-13T20:09:47.6563748Z Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (17.0.0) +2025-02-13T20:09:47.6611907Z Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1.0+cpu->-r tt_metal/python_env/requirements-dev.txt (line 30)) (3.16.1) +2025-02-13T20:09:47.6723547Z Requirement already satisfied: sympy in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1.0+cpu->-r tt_metal/python_env/requirements-dev.txt (line 30)) (1.13.3) +2025-02-13T20:09:47.6765588Z Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.8/dist-packages (from torchvision==0.17.1+cpu->-r tt_metal/python_env/requirements-dev.txt (line 32)) (10.4.0) +2025-02-13T20:09:47.6924568Z Requirement already satisfied: lightning-utilities>=0.8.0 in /usr/local/lib/python3.8/dist-packages (from torchmetrics==1.3.1->-r tt_metal/python_env/requirements-dev.txt (line 33)) (0.11.9) +2025-02-13T20:09:47.6988120Z Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from torch-fidelity==0.3.0->-r tt_metal/python_env/requirements-dev.txt (line 34)) (1.10.1) +2025-02-13T20:09:47.7184548Z Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.8/dist-packages (from transformers==4.38.0->-r tt_metal/python_env/requirements-dev.txt (line 35)) (0.5.2) +2025-02-13T20:09:47.7458079Z Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.8/dist-packages (from transformers==4.38.0->-r tt_metal/python_env/requirements-dev.txt (line 35)) (0.15.2) +2025-02-13T20:09:47.7539796Z Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers==4.38.0->-r tt_metal/python_env/requirements-dev.txt (line 35)) (2024.11.6) +2025-02-13T20:09:47.7569431Z Requirement already satisfied: prefixed>=0.3.2 in /usr/local/lib/python3.8/dist-packages (from enlighten==1.12.4->-r tt_metal/python_env/requirements-dev.txt (line 39)) (0.9.0) +2025-02-13T20:09:47.7584244Z Requirement already satisfied: blessed>=1.17.7 in /usr/local/lib/python3.8/dist-packages (from enlighten==1.12.4->-r tt_metal/python_env/requirements-dev.txt (line 39)) (1.20.0) +2025-02-13T20:09:47.7634465Z 
Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /usr/local/lib/python3.8/dist-packages (from numba>=0.58.1->-r tt_metal/python_env/requirements-dev.txt (line 41)) (0.41.1) +2025-02-13T20:09:47.7647720Z Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.4.2) +2025-02-13T20:09:47.7660725Z Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (3.0.1) +2025-02-13T20:09:47.7681471Z Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (0.3.7) +2025-02-13T20:09:47.7732296Z Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (0.4) +2025-02-13T20:09:47.7788562Z Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.3.2) +2025-02-13T20:09:47.8071319Z Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.1.0) +2025-02-13T20:09:47.8087796Z Requirement already satisfied: soundfile>=0.12.1 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (0.13.1) +2025-02-13T20:09:47.8114666Z Requirement already satisfied: pooch>=1.0 in /usr/local/lib/python3.8/dist-packages (from librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.8.2) +2025-02-13T20:09:47.8168151Z Requirement already satisfied: psutil in /usr/local/lib/python3.8/dist-packages (from accelerate==0.27.2->-r tt_metal/python_env/requirements-dev.txt (line 46)) (6.1.1) +2025-02-13T20:09:47.8328852Z Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.8/dist-packages (from gitpython==3.1.41->-r tt_metal/python_env/requirements-dev.txt (line 48)) (4.0.12) +2025-02-13T20:09:47.8347627Z Requirement already satisfied: matplotlib in /usr/local/lib/python3.8/dist-packages (from bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (3.7.5) +2025-02-13T20:09:47.8430241Z Requirement already satisfied: pycryptodomex~=3.8 in /usr/local/lib/python3.8/dist-packages (from blobfile==2.1.1->-r tt_metal/python_env/requirements-dev.txt (line 59)) (3.21.0) +2025-02-13T20:09:47.8444452Z Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.8/dist-packages (from pydantic==2.9.2->-r tt_metal/python_env/requirements-dev.txt (line 62)) (2.23.4) +2025-02-13T20:09:47.8466762Z Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.8/dist-packages (from pydantic==2.9.2->-r tt_metal/python_env/requirements-dev.txt (line 62)) (0.7.0) +2025-02-13T20:09:47.8500924Z Requirement already satisfied: distlib<1,>=0.3.7 in /usr/local/lib/python3.8/dist-packages (from virtualenv>=20.10.0->pre-commit==3.0.4->-r tt_metal/python_env/requirements-dev.txt (line 9)) (0.3.9) +2025-02-13T20:09:47.8516244Z Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.8/dist-packages (from requests>=2.20->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (3.4.1) +2025-02-13T20:09:47.8538786Z 
Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests>=2.20->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (2019.11.28) +2025-02-13T20:09:47.8550807Z Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.20->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (2.8) +2025-02-13T20:09:47.8564553Z Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.8/dist-packages (from importlib-metadata>=3.6->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (3.20.2) +2025-02-13T20:09:47.8722153Z Requirement already satisfied: jaraco.classes in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (3.4.0) +2025-02-13T20:09:47.8828307Z Requirement already satisfied: importlib-resources; python_version < "3.9" in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (6.4.5) +2025-02-13T20:09:47.8970628Z Requirement already satisfied: jeepney>=0.4.2; sys_platform == "linux" in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (0.8.0) +2025-02-13T20:09:47.9038155Z Requirement already satisfied: jaraco.context in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (6.0.1) +2025-02-13T20:09:47.9168640Z Requirement already satisfied: SecretStorage>=3.2; sys_platform == "linux" in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (3.3.3) +2025-02-13T20:09:47.9190980Z Requirement already satisfied: jaraco.functools in /usr/local/lib/python3.8/dist-packages (from keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (4.1.0) +2025-02-13T20:09:47.9308884Z Requirement already satisfied: nh3>=0.2.14 in /usr/local/lib/python3.8/dist-packages (from readme-renderer>=35.0->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (0.2.20) +2025-02-13T20:09:47.9322058Z Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.8/dist-packages (from Jinja2>=3.0->sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (2.1.5) +2025-02-13T20:09:47.9337746Z Requirement already satisfied: pytz>=2015.7; python_version < "3.9" in /usr/local/lib/python3.8/dist-packages (from babel>=2.9->sphinx==7.1.2->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 2)) (2025.1) +2025-02-13T20:09:47.9360584Z Requirement already satisfied: bleach[css]!=5.0.0 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (6.1.0) +2025-02-13T20:09:47.9416383Z Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.3.0) +2025-02-13T20:09:47.9426736Z Requirement already satisfied: defusedxml in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.7.1) +2025-02-13T20:09:47.9454069Z Requirement already satisfied: jupyter-core>=4.7 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r 
tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (5.7.2) +2025-02-13T20:09:47.9558510Z Requirement already satisfied: mistune<4,>=2.0.3 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (3.1.1) +2025-02-13T20:09:47.9584874Z Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.10.1) +2025-02-13T20:09:47.9813009Z Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (1.5.1) +2025-02-13T20:09:47.9828666Z Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.8/dist-packages (from nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (4.13.3) +2025-02-13T20:09:47.9885223Z Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.8/dist-packages (from nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (2.21.1) +2025-02-13T20:09:47.9947439Z Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.8/dist-packages (from nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (4.23.0) +2025-02-13T20:09:48.0123056Z Requirement already satisfied: parso<0.9.0,>=0.8.4 in /usr/local/lib/python3.8/dist-packages (from jedi>=0.16->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.8.4) +2025-02-13T20:09:48.0173278Z Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.8/dist-packages (from pexpect>4.3; sys_platform != "win32"->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.7.0) +2025-02-13T20:09:48.0182408Z Requirement already satisfied: pure-eval in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (0.2.3) +2025-02-13T20:09:48.0202921Z Requirement already satisfied: executing>=1.2.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (2.2.0) +2025-02-13T20:09:48.0268030Z Requirement already satisfied: asttokens>=2.1.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython==8.12.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 9)) (3.0.0) +2025-02-13T20:09:48.0314831Z Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.8/dist-packages (from markdown-it-py~=3.0->myst-parser==3.0.0->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 12)) (0.1.2) +2025-02-13T20:09:48.0327439Z Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (2.9.0.post0) +2025-02-13T20:09:48.0347336Z Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (2025.1) +2025-02-13T20:09:48.0360600Z Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (6.1.0) +2025-02-13T20:09:48.0396764Z 
Requirement already satisfied: async-timeout<6.0,>=4.0; python_version < "3.11" in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (5.0.1) +2025-02-13T20:09:48.0410561Z Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (1.5.0) +2025-02-13T20:09:48.0428343Z Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (1.3.1) +2025-02-13T20:09:48.0447290Z Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (2.4.4) +2025-02-13T20:09:48.0463268Z Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (1.15.2) +2025-02-13T20:09:48.0519504Z Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.8/dist-packages (from sympy->torch==2.2.1.0+cpu->-r tt_metal/python_env/requirements-dev.txt (line 30)) (1.3.0) +2025-02-13T20:09:48.0596785Z Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.20.0->librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (3.5.0) +2025-02-13T20:09:48.0615869Z Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.8/dist-packages (from soundfile>=0.12.1->librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (1.17.1) +2025-02-13T20:09:48.0638456Z Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.8/dist-packages (from gitdb<5,>=4.0.1->gitpython==3.1.41->-r tt_metal/python_env/requirements-dev.txt (line 48)) (5.0.2) +2025-02-13T20:09:48.0652856Z Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (1.4.7) +2025-02-13T20:09:48.0670244Z Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (3.1.4) +2025-02-13T20:09:48.0702904Z Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (0.12.1) +2025-02-13T20:09:48.0776416Z Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (4.55.8) +2025-02-13T20:09:48.1230994Z Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->bert-score==0.3.12->-r tt_metal/python_env/requirements-dev.txt (line 55)) (1.1.1) +2025-02-13T20:09:48.1419122Z Requirement already satisfied: more-itertools in /usr/local/lib/python3.8/dist-packages (from jaraco.classes->keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (10.5.0) +2025-02-13T20:09:48.1445136Z Requirement already satisfied: backports.tarfile; python_version < "3.12" in /usr/local/lib/python3.8/dist-packages (from jaraco.context->keyring>=15.1->twine==4.0.2->-r 
tt_metal/python_env/requirements-dev.txt (line 13)) (1.2.0) +2025-02-13T20:09:48.1554127Z Requirement already satisfied: cryptography>=2.0 in /usr/local/lib/python3.8/dist-packages (from SecretStorage>=3.2; sys_platform == "linux"->keyring>=15.1->twine==4.0.2->-r tt_metal/python_env/requirements-dev.txt (line 13)) (44.0.0) +2025-02-13T20:09:48.1802852Z Requirement already satisfied: webencodings in /usr/local/lib/python3.8/dist-packages (from bleach[css]!=5.0.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.5.1) +2025-02-13T20:09:48.1820406Z Requirement already satisfied: tinycss2<1.3,>=1.1.0; extra == "css" in /usr/local/lib/python3.8/dist-packages (from bleach[css]!=5.0.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (1.2.1) +2025-02-13T20:09:48.1872721Z Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.8/dist-packages (from nbclient>=0.5.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (8.6.3) +2025-02-13T20:09:48.2045019Z Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.8/dist-packages (from beautifulsoup4->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (2.6) +2025-02-13T20:09:48.2058718Z Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=2.6->nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.20.1) +2025-02-13T20:09:48.2072279Z Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=2.6->nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (0.35.1) +2025-02-13T20:09:48.2097625Z Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=2.6->nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (2023.12.1) +2025-02-13T20:09:48.2135926Z Requirement already satisfied: pkgutil-resolve-name>=1.3.10; python_version < "3.9" in /usr/local/lib/python3.8/dist-packages (from jsonschema>=2.6->nbformat->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (1.3.10) +2025-02-13T20:09:48.2148128Z Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.8/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets==2.9.0->-r tt_metal/python_env/requirements-dev.txt (line 29)) (0.2.0) +2025-02-13T20:09:48.2161213Z Requirement already satisfied: pycparser in /usr/local/lib/python3.8/dist-packages (from cffi>=1.0->soundfile>=0.12.1->librosa==0.10.0->-r tt_metal/python_env/requirements-dev.txt (line 42)) (2.22) +2025-02-13T20:09:48.2176545Z Requirement already satisfied: pyzmq>=23.0 in /usr/local/lib/python3.8/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (26.2.1) +2025-02-13T20:09:48.2195280Z Requirement already satisfied: tornado>=6.2 in /usr/local/lib/python3.8/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert!=5.4->nbsphinx==0.9.3->-r tt_metal/python_env/../../docs/requirements-docs.txt (line 7)) (6.4.2) +2025-02-13T20:09:49.2632523Z Obtaining file:///home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:09:49.2655327Z Installing build dependencies: 
started +2025-02-13T20:09:54.3787445Z Installing build dependencies: finished with status 'done' +2025-02-13T20:09:54.3789740Z Getting requirements to build wheel: started +2025-02-13T20:09:57.6407952Z Getting requirements to build wheel: finished with status 'done' +2025-02-13T20:09:57.6434349Z Preparing wheel metadata: started +2025-02-13T20:10:00.8829398Z Preparing wheel metadata: finished with status 'done' +2025-02-13T20:10:01.2419294Z Collecting bokeh==3.1.1 +2025-02-13T20:10:01.2810319Z Downloading bokeh-3.1.1-py3-none-any.whl (8.3 MB) +2025-02-13T20:10:01.8505048Z Collecting seaborn==0.13.2 +2025-02-13T20:10:01.8579362Z Downloading seaborn-0.13.2-py3-none-any.whl (294 kB) +2025-02-13T20:10:01.9632300Z Collecting click==8.1.7 +2025-02-13T20:10:01.9708228Z Downloading click-8.1.7-py3-none-any.whl (97 kB) +2025-02-13T20:10:01.9820251Z Requirement already satisfied: numpy<2,>=1.24.4 in /usr/local/lib/python3.8/dist-packages (from ttnn==0.0.dev1+any) (1.24.4) +2025-02-13T20:10:02.0359278Z Collecting toolz==0.12.0 +2025-02-13T20:10:02.0434258Z Downloading toolz-0.12.0-py3-none-any.whl (55 kB) +2025-02-13T20:10:02.9253737Z Collecting Pillow==10.3.0 +2025-02-13T20:10:02.9407904Z Downloading pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB) +2025-02-13T20:10:03.7461934Z Collecting matplotlib==3.7.1 +2025-02-13T20:10:03.7546709Z Downloading matplotlib-3.7.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.2 MB) +2025-02-13T20:10:04.2713712Z Collecting plotly==5.18.0 +2025-02-13T20:10:04.2798103Z Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB) +2025-02-13T20:10:06.3094654Z Collecting dash==2.15.0 +2025-02-13T20:10:06.3171563Z Downloading dash-2.15.0-py3-none-any.whl (10.2 MB) +2025-02-13T20:10:06.8869814Z Collecting ipywidgets==8.1.1 +2025-02-13T20:10:06.8957858Z Downloading ipywidgets-8.1.1-py3-none-any.whl (139 kB) +2025-02-13T20:10:06.9985977Z Collecting loguru==0.6.0 +2025-02-13T20:10:07.0062510Z Downloading loguru-0.6.0-py3-none-any.whl (58 kB) +2025-02-13T20:10:07.0570203Z Requirement already satisfied: pandas==2.0.3 in /usr/local/lib/python3.8/dist-packages (from ttnn==0.0.dev1+any) (2.0.3) +2025-02-13T20:10:07.1327784Z Requirement already satisfied: networkx==3.1 in /usr/local/lib/python3.8/dist-packages (from ttnn==0.0.dev1+any) (3.1) +2025-02-13T20:10:07.5145363Z Collecting jupyterlab==4.2.5 +2025-02-13T20:10:07.5254456Z Downloading jupyterlab-4.2.5-py3-none-any.whl (11.6 MB) +2025-02-13T20:10:08.0594501Z Requirement already satisfied: torch==2.2.1+cpu in /usr/local/lib/python3.8/dist-packages (from ttnn==0.0.dev1+any) (2.2.1+cpu) +2025-02-13T20:10:08.2238733Z Collecting pyyaml>=5.4 +2025-02-13T20:10:08.2316890Z Downloading PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (746 kB) +2025-02-13T20:10:08.3380164Z Collecting graphviz==0.20.3 +2025-02-13T20:10:08.3458146Z Downloading graphviz-0.20.3-py3-none-any.whl (47 kB) +2025-02-13T20:10:08.3930126Z Requirement already satisfied: packaging>=16.8 in /usr/local/lib/python3.8/dist-packages (from bokeh==3.1.1->ttnn==0.0.dev1+any) (24.2) +2025-02-13T20:10:08.3941952Z Requirement already satisfied: Jinja2>=2.9 in /usr/local/lib/python3.8/dist-packages (from bokeh==3.1.1->ttnn==0.0.dev1+any) (3.1.5) +2025-02-13T20:10:08.3968932Z Requirement already satisfied: contourpy>=1 in /usr/local/lib/python3.8/dist-packages (from bokeh==3.1.1->ttnn==0.0.dev1+any) (1.1.1) +2025-02-13T20:10:08.4117440Z Requirement already satisfied: tornado>=5.1 in /usr/local/lib/python3.8/dist-packages 
(from bokeh==3.1.1->ttnn==0.0.dev1+any) (6.4.2) +2025-02-13T20:10:08.4739305Z Collecting xyzservices>=2021.09.1 +2025-02-13T20:10:08.4861178Z Downloading xyzservices-2025.1.0-py3-none-any.whl (88 kB) +2025-02-13T20:10:08.5254333Z Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (3.1.4) +2025-02-13T20:10:08.5280415Z Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (2.9.0.post0) +2025-02-13T20:10:08.5303724Z Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (1.4.7) +2025-02-13T20:10:08.5316267Z Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (4.55.8) +2025-02-13T20:10:08.5665884Z Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (0.12.1) +2025-02-13T20:10:08.5729360Z Requirement already satisfied: importlib-resources>=3.2.0; python_version < "3.10" in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.7.1->ttnn==0.0.dev1+any) (6.4.5) +2025-02-13T20:10:08.6533721Z Collecting tenacity>=6.2.0 +2025-02-13T20:10:08.6612167Z Downloading tenacity-9.0.0-py3-none-any.whl (28 kB) +2025-02-13T20:10:08.6995331Z Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from dash==2.15.0->ttnn==0.0.dev1+any) (2.32.3) +2025-02-13T20:10:08.7057971Z Requirement already satisfied: importlib-metadata; python_version >= "3.7" in /usr/local/lib/python3.8/dist-packages (from dash==2.15.0->ttnn==0.0.dev1+any) (8.5.0) +2025-02-13T20:10:08.7242004Z Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.8/dist-packages (from dash==2.15.0->ttnn==0.0.dev1+any) (4.12.2) +2025-02-13T20:10:08.7981718Z Collecting nest-asyncio +2025-02-13T20:10:08.8085302Z Downloading nest_asyncio-1.6.0-py3-none-any.whl (5.2 kB) +2025-02-13T20:10:08.8858258Z Collecting retrying +2025-02-13T20:10:08.8938672Z Downloading retrying-1.3.4-py3-none-any.whl (11 kB) +2025-02-13T20:10:09.0127533Z Collecting dash-core-components==2.0.0 +2025-02-13T20:10:09.0213373Z Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB) +2025-02-13T20:10:09.1110259Z Collecting dash-html-components==2.0.0 +2025-02-13T20:10:09.1187451Z Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB) +2025-02-13T20:10:09.1520703Z Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from dash==2.15.0->ttnn==0.0.dev1+any) (45.2.0) +2025-02-13T20:10:09.2327403Z Collecting Flask<3.1,>=1.0.4 +2025-02-13T20:10:09.2411810Z Downloading flask-3.0.3-py3-none-any.whl (101 kB) +2025-02-13T20:10:09.3961502Z Collecting Werkzeug<3.1 +2025-02-13T20:10:09.4040880Z Downloading werkzeug-3.0.6-py3-none-any.whl (227 kB) +2025-02-13T20:10:09.5289521Z Collecting dash-table==5.0.0 +2025-02-13T20:10:09.5391859Z Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB) +2025-02-13T20:10:09.5720596Z Requirement already satisfied: ipython>=6.1.0 in /usr/local/lib/python3.8/dist-packages (from ipywidgets==8.1.1->ttnn==0.0.dev1+any) (8.12.3) +2025-02-13T20:10:09.7371898Z Collecting widgetsnbextension~=4.0.9 +2025-02-13T20:10:09.7458577Z Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB) +2025-02-13T20:10:09.8606695Z Requirement already satisfied: traitlets>=4.3.1 in 
/usr/local/lib/python3.8/dist-packages (from ipywidgets==8.1.1->ttnn==0.0.dev1+any) (5.14.3) +2025-02-13T20:10:09.9516914Z Collecting jupyterlab-widgets~=3.0.9 +2025-02-13T20:10:09.9595019Z Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB) +2025-02-13T20:10:10.0464986Z Collecting comm>=0.1.3 +2025-02-13T20:10:10.0545946Z Downloading comm-0.2.2-py3-none-any.whl (7.2 kB) +2025-02-13T20:10:10.0883787Z Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.8/dist-packages (from pandas==2.0.3->ttnn==0.0.dev1+any) (2025.1) +2025-02-13T20:10:10.0896847Z Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.8/dist-packages (from pandas==2.0.3->ttnn==0.0.dev1+any) (2025.1) +2025-02-13T20:10:10.1481114Z Collecting jupyter-lsp>=2.0.0 +2025-02-13T20:10:10.1557546Z Downloading jupyter_lsp-2.2.5-py3-none-any.whl (69 kB) +2025-02-13T20:10:10.2002831Z Requirement already satisfied: tomli>=1.2.2; python_version < "3.11" in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.2.1) +2025-02-13T20:10:10.2022755Z Requirement already satisfied: jupyter-core in /usr/local/lib/python3.8/dist-packages (from jupyterlab==4.2.5->ttnn==0.0.dev1+any) (5.7.2) +2025-02-13T20:10:10.2963862Z Collecting httpx>=0.25.0 +2025-02-13T20:10:10.3044015Z Downloading httpx-0.28.1-py3-none-any.whl (73 kB) +2025-02-13T20:10:10.4889970Z Collecting ipykernel>=6.5.0 +2025-02-13T20:10:10.4970069Z Downloading ipykernel-6.29.5-py3-none-any.whl (117 kB) +2025-02-13T20:10:10.6112886Z Collecting notebook-shim>=0.2 +2025-02-13T20:10:10.6188634Z Downloading notebook_shim-0.2.4-py3-none-any.whl (13 kB) +2025-02-13T20:10:10.7685999Z Collecting jupyterlab-server<3,>=2.27.1 +2025-02-13T20:10:10.7768737Z Downloading jupyterlab_server-2.27.3-py3-none-any.whl (59 kB) +2025-02-13T20:10:10.8861566Z Collecting async-lru>=1.0.0 +2025-02-13T20:10:10.8937641Z Downloading async_lru-2.0.4-py3-none-any.whl (6.1 kB) +2025-02-13T20:10:11.0938489Z Collecting jupyter-server<3,>=2.4.0 +2025-02-13T20:10:11.1020286Z Downloading jupyter_server-2.14.2-py3-none-any.whl (383 kB) +2025-02-13T20:10:11.1971628Z Requirement already satisfied: fsspec in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1+cpu->ttnn==0.0.dev1+any) (2023.9.2) +2025-02-13T20:10:11.2270054Z Requirement already satisfied: sympy in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1+cpu->ttnn==0.0.dev1+any) (1.13.3) +2025-02-13T20:10:11.2307070Z Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from torch==2.2.1+cpu->ttnn==0.0.dev1+any) (3.16.1) +2025-02-13T20:10:11.2418959Z Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.8/dist-packages (from Jinja2>=2.9->bokeh==3.1.1->ttnn==0.0.dev1+any) (2.1.5) +2025-02-13T20:10:11.2433049Z Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7->matplotlib==3.7.1->ttnn==0.0.dev1+any) (1.14.0) +2025-02-13T20:10:11.2448773Z Requirement already satisfied: zipp>=3.1.0; python_version < "3.10" in /usr/local/lib/python3.8/dist-packages (from importlib-resources>=3.2.0; python_version < "3.10"->matplotlib==3.7.1->ttnn==0.0.dev1+any) (3.20.2) +2025-02-13T20:10:11.2614420Z Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests->dash==2.15.0->ttnn==0.0.dev1+any) (2019.11.28) +2025-02-13T20:10:11.2625050Z Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests->dash==2.15.0->ttnn==0.0.dev1+any) (2.8) 
+2025-02-13T20:10:11.2640342Z Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.8/dist-packages (from requests->dash==2.15.0->ttnn==0.0.dev1+any) (3.4.1) +2025-02-13T20:10:11.2665804Z Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->dash==2.15.0->ttnn==0.0.dev1+any) (2.2.3) +2025-02-13T20:10:11.3238714Z Collecting itsdangerous>=2.1.2 +2025-02-13T20:10:11.3323736Z Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB) +2025-02-13T20:10:11.4138278Z Collecting blinker>=1.6.2 +2025-02-13T20:10:11.4215580Z Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB) +2025-02-13T20:10:11.4546078Z Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.19.2) +2025-02-13T20:10:11.4808200Z Requirement already satisfied: pygments>=2.4.0 in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (2.19.1) +2025-02-13T20:10:11.4826007Z Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.1.7) +2025-02-13T20:10:11.4839636Z Requirement already satisfied: decorator in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (5.1.1) +2025-02-13T20:10:11.4860314Z Requirement already satisfied: pexpect>4.3; sys_platform != "win32" in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (4.9.0) +2025-02-13T20:10:11.4876966Z Requirement already satisfied: backcall in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.2.0) +2025-02-13T20:10:11.4884333Z Requirement already satisfied: pickleshare in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.7.5) +2025-02-13T20:10:11.4905953Z Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (3.0.50) +2025-02-13T20:10:11.4922934Z Requirement already satisfied: stack-data in /usr/local/lib/python3.8/dist-packages (from ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.6.3) +2025-02-13T20:10:11.4986586Z Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.8/dist-packages (from jupyter-core->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (3.11.0) +2025-02-13T20:10:11.5934748Z Collecting anyio +2025-02-13T20:10:11.6010262Z Downloading anyio-4.5.2-py3-none-any.whl (89 kB) +2025-02-13T20:10:11.7358773Z Collecting httpcore==1.* +2025-02-13T20:10:11.7433669Z Downloading httpcore-1.0.7-py3-none-any.whl (78 kB) +2025-02-13T20:10:11.7866423Z Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.8/dist-packages (from ipykernel>=6.5.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (8.6.3) +2025-02-13T20:10:12.2996910Z Collecting debugpy>=1.6.5 +2025-02-13T20:10:12.3083402Z Downloading debugpy-1.8.12-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB) +2025-02-13T20:10:12.4947620Z Requirement already satisfied: psutil in /usr/local/lib/python3.8/dist-packages (from ipykernel>=6.5.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (6.1.1) +2025-02-13T20:10:12.5111264Z Requirement already satisfied: pyzmq>=24 in /usr/local/lib/python3.8/dist-packages (from 
ipykernel>=6.5.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (26.2.1) +2025-02-13T20:10:12.5129423Z Requirement already satisfied: jsonschema>=4.18.0 in /usr/local/lib/python3.8/dist-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (4.23.0) +2025-02-13T20:10:12.5308670Z Requirement already satisfied: babel>=2.10 in /usr/local/lib/python3.8/dist-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.17.0) +2025-02-13T20:10:12.6120555Z Collecting json5>=0.9.0 +2025-02-13T20:10:12.6200574Z Downloading json5-0.10.0-py3-none-any.whl (34 kB) +2025-02-13T20:10:12.7213676Z Collecting overrides>=5.0 +2025-02-13T20:10:12.7287725Z Downloading overrides-7.7.0-py3-none-any.whl (17 kB) +2025-02-13T20:10:12.7628390Z Requirement already satisfied: nbformat>=5.3.0 in /usr/local/lib/python3.8/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (5.10.4) +2025-02-13T20:10:12.8675464Z Collecting argon2-cffi>=21.1 +2025-02-13T20:10:12.8758445Z Downloading argon2_cffi-23.1.0-py3-none-any.whl (15 kB) +2025-02-13T20:10:12.9723355Z Collecting send2trash>=1.8.2 +2025-02-13T20:10:12.9805497Z Downloading Send2Trash-1.8.3-py3-none-any.whl (18 kB) +2025-02-13T20:10:13.0712815Z Collecting jupyter-server-terminals>=0.4.4 +2025-02-13T20:10:13.0793000Z Downloading jupyter_server_terminals-0.5.3-py3-none-any.whl (13 kB) +2025-02-13T20:10:13.1910330Z Collecting terminado>=0.8.3 +2025-02-13T20:10:13.1987150Z Downloading terminado-0.18.1-py3-none-any.whl (14 kB) +2025-02-13T20:10:13.2404203Z Requirement already satisfied: nbconvert>=6.4.4 in /usr/local/lib/python3.8/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (7.16.6) +2025-02-13T20:10:13.3463664Z Collecting websocket-client>=1.7 +2025-02-13T20:10:13.3541695Z Downloading websocket_client-1.8.0-py3-none-any.whl (58 kB) +2025-02-13T20:10:13.4571579Z Collecting prometheus-client>=0.9 +2025-02-13T20:10:13.4656994Z Downloading prometheus_client-0.21.1-py3-none-any.whl (54 kB) +2025-02-13T20:10:13.5528837Z Collecting jupyter-events>=0.9.0 +2025-02-13T20:10:13.5605024Z Downloading jupyter_events-0.10.0-py3-none-any.whl (18 kB) +2025-02-13T20:10:13.6071127Z Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.8/dist-packages (from sympy->torch==2.2.1+cpu->ttnn==0.0.dev1+any) (1.3.0) +2025-02-13T20:10:13.6146949Z Requirement already satisfied: parso<0.9.0,>=0.8.4 in /usr/local/lib/python3.8/dist-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.8.4) +2025-02-13T20:10:13.6196335Z Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.8/dist-packages (from pexpect>4.3; sys_platform != "win32"->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.7.0) +2025-02-13T20:10:13.6205449Z Requirement already satisfied: wcwidth in /usr/local/lib/python3.8/dist-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.2.13) +2025-02-13T20:10:13.6230158Z Requirement already satisfied: asttokens>=2.1.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (3.0.0) +2025-02-13T20:10:13.6281275Z Requirement already satisfied: executing>=1.2.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (2.2.0) +2025-02-13T20:10:13.6342447Z Requirement already satisfied: pure-eval in /usr/local/lib/python3.8/dist-packages 
(from stack-data->ipython>=6.1.0->ipywidgets==8.1.1->ttnn==0.0.dev1+any) (0.2.3) +2025-02-13T20:10:13.6795386Z Collecting sniffio>=1.1 +2025-02-13T20:10:13.6870761Z Downloading sniffio-1.3.1-py3-none-any.whl (10 kB) +2025-02-13T20:10:13.7554738Z Requirement already satisfied: exceptiongroup>=1.0.2; python_version < "3.11" in /usr/local/lib/python3.8/dist-packages (from anyio->httpx>=0.25.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.2.2) +2025-02-13T20:10:13.8051582Z Collecting h11<0.15,>=0.13 +2025-02-13T20:10:13.8181072Z Downloading h11-0.14.0-py3-none-any.whl (58 kB) +2025-02-13T20:10:13.8557187Z Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.35.1) +2025-02-13T20:10:13.8588767Z Requirement already satisfied: pkgutil-resolve-name>=1.3.10; python_version < "3.9" in /usr/local/lib/python3.8/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.3.10) +2025-02-13T20:10:13.8598258Z Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (25.1.0) +2025-02-13T20:10:13.9028337Z Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2023.12.1) +2025-02-13T20:10:13.9061645Z Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.8/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.20.1) +2025-02-13T20:10:13.9074260Z Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.8/dist-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.21.1) +2025-02-13T20:10:13.9666564Z Collecting argon2-cffi-bindings +2025-02-13T20:10:13.9743492Z Downloading argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (86 kB) +2025-02-13T20:10:14.0135393Z Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.3.0) +2025-02-13T20:10:14.0149498Z Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.10.1) +2025-02-13T20:10:14.0372656Z Requirement already satisfied: defusedxml in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.7.1) +2025-02-13T20:10:14.0398063Z Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (4.13.3) +2025-02-13T20:10:14.0455908Z Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.5.1) +2025-02-13T20:10:14.0471767Z Requirement already satisfied: mistune<4,>=2.0.3 in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (3.1.1) +2025-02-13T20:10:14.0501607Z Requirement already satisfied: bleach[css]!=5.0.0 
in /usr/local/lib/python3.8/dist-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (6.1.0) +2025-02-13T20:10:14.0983652Z Collecting rfc3986-validator>=0.1.1 +2025-02-13T20:10:14.1059615Z Downloading rfc3986_validator-0.1.1-py2.py3-none-any.whl (4.2 kB) +2025-02-13T20:10:14.1962996Z Collecting python-json-logger>=2.0.4 +2025-02-13T20:10:14.2038931Z Downloading python_json_logger-3.2.1-py3-none-any.whl (14 kB) +2025-02-13T20:10:14.3038896Z Collecting rfc3339-validator +2025-02-13T20:10:14.3116007Z Downloading rfc3339_validator-0.1.4-py2.py3-none-any.whl (3.5 kB) +2025-02-13T20:10:14.3443515Z Requirement already satisfied: cffi>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.17.1) +2025-02-13T20:10:14.3460505Z Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.8/dist-packages (from beautifulsoup4->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.6) +2025-02-13T20:10:14.3470512Z Requirement already satisfied: webencodings in /usr/local/lib/python3.8/dist-packages (from bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (0.5.1) +2025-02-13T20:10:14.3488965Z Requirement already satisfied: tinycss2<1.3,>=1.1.0; extra == "css" in /usr/local/lib/python3.8/dist-packages (from bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (1.2.1) +2025-02-13T20:10:14.3539315Z Requirement already satisfied: pycparser in /usr/local/lib/python3.8/dist-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab==4.2.5->ttnn==0.0.dev1+any) (2.22) +2025-02-13T20:10:16.2354215Z Installing collected packages: Pillow, pyyaml, xyzservices, bokeh, matplotlib, seaborn, click, toolz, tenacity, plotly, nest-asyncio, retrying, dash-core-components, dash-html-components, Werkzeug, itsdangerous, blinker, Flask, dash-table, dash, widgetsnbextension, jupyterlab-widgets, comm, ipywidgets, loguru, overrides, argon2-cffi-bindings, argon2-cffi, send2trash, terminado, jupyter-server-terminals, websocket-client, prometheus-client, rfc3986-validator, python-json-logger, rfc3339-validator, jupyter-events, sniffio, anyio, jupyter-server, jupyter-lsp, h11, httpcore, httpx, debugpy, ipykernel, notebook-shim, json5, jupyterlab-server, async-lru, jupyterlab, graphviz, ttnn +2025-02-13T20:10:17.4120269Z WARNING: The script bokeh is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:17.4121453Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:10:29.4766857Z WARNING: The script flask is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:29.4768344Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:10:29.9677124Z WARNING: The scripts dash-generate-components, dash-update-components and renderer are installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:29.9678686Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. 
+2025-02-13T20:10:30.3195998Z WARNING: The script send2trash is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:30.3197216Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:10:30.3817797Z WARNING: The script wsdump is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:30.3819035Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:10:30.4598707Z WARNING: The script jupyter-events is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:30.4599894Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:10:30.6969374Z WARNING: The script jupyter-server is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:30.6970649Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:10:30.9023373Z WARNING: The script httpx is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:30.9024582Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:10:31.5992315Z WARNING: The script debugpy is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:31.5993680Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:10:31.7188846Z WARNING: The script pyjson5 is installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:31.7190552Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. +2025-02-13T20:10:32.3003967Z WARNING: The scripts jlpm, jupyter-lab, jupyter-labextension and jupyter-labhub are installed in '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.local/bin' which is not on PATH. +2025-02-13T20:10:32.3005359Z Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location. 
+2025-02-13T20:10:32.3676807Z Running setup.py develop for ttnn +2025-02-13T20:10:35.7195711Z Successfully installed Flask-3.0.3 Pillow-10.3.0 Werkzeug-3.0.6 anyio-4.5.2 argon2-cffi-23.1.0 argon2-cffi-bindings-21.2.0 async-lru-2.0.4 blinker-1.8.2 bokeh-3.1.1 click-8.1.7 comm-0.2.2 dash-2.15.0 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 debugpy-1.8.12 graphviz-0.20.3 h11-0.14.0 httpcore-1.0.7 httpx-0.28.1 ipykernel-6.29.5 ipywidgets-8.1.1 itsdangerous-2.2.0 json5-0.10.0 jupyter-events-0.10.0 jupyter-lsp-2.2.5 jupyter-server-2.14.2 jupyter-server-terminals-0.5.3 jupyterlab-4.2.5 jupyterlab-server-2.27.3 jupyterlab-widgets-3.0.13 loguru-0.6.0 matplotlib-3.7.1 nest-asyncio-1.6.0 notebook-shim-0.2.4 overrides-7.7.0 plotly-5.18.0 prometheus-client-0.21.1 python-json-logger-3.2.1 pyyaml-6.0.2 retrying-1.3.4 rfc3339-validator-0.1.4 rfc3986-validator-0.1.1 seaborn-0.13.2 send2trash-1.8.3 sniffio-1.3.1 tenacity-9.0.0 terminado-0.18.1 toolz-0.12.0 ttnn websocket-client-1.8.0 widgetsnbextension-4.0.13 xyzservices-2025.1.0 +2025-02-13T20:10:36.2716033Z Running watcher dump tool tests... +2025-02-13T20:10:36.2760373Z Running main() from gmock_main.cc +2025-02-13T20:10:36.2761033Z Note: Google Test filter = *PrintHanging +2025-02-13T20:10:36.2762042Z [==========] Running 1 test from 1 test suite. +2025-02-13T20:10:36.2762515Z [----------] Global test environment set-up. +2025-02-13T20:10:36.2762950Z [----------] 1 test from DPrintFixture +2025-02-13T20:10:36.2763404Z [ RUN ] DPrintFixture.TensixTestPrintHanging +2025-02-13T20:10:36.2764782Z  Test | INFO  | Running test using Fast Dispatch +2025-02-13T20:10:36.2818019Z +2025-02-13T20:10:36.2854274Z  Device | INFO  | Opening user mode device driver +2025-02-13T20:10:36.2881258Z 2025-02-13 20:10:36.287 | INFO  | SiliconDriver  - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled +2025-02-13T20:10:36.2893255Z 2025-02-13 20:10:36.288 | INFO  | SiliconDriver  - Detected PCI devices: [0] +2025-02-13T20:10:36.2894436Z 2025-02-13 20:10:36.288 | INFO  | SiliconDriver  - Using local chip ids: {0} and remote chip ids {} +2025-02-13T20:10:36.3036817Z 2025-02-13 20:10:36.303 | WARNING  | SiliconDriver  - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:36.3038648Z 2025-02-13 20:10:36.303 | WARNING  | SiliconDriver  - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:36.3057363Z 2025-02-13 20:10:36.305 | WARNING  | SiliconDriver  - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind. +2025-02-13T20:10:36.3060668Z 2025-02-13 20:10:36.305 | WARNING  | SiliconDriver  - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893). +2025-02-13T20:10:36.3124065Z 2025-02-13 20:10:36.311 | INFO  | SiliconDriver  - Software version 6.0.0, Ethernet FW version 6.10.0 (Device 0) +2025-02-13T20:10:36.3163804Z  Metal | INFO  | Initializing device 0. 
Program cache is NOT enabled +2025-02-13T20:10:36.3190418Z  BuildKernels | INFO  | Skipping deleting built cache +2025-02-13T20:10:36.3198592Z  Metal | INFO  | AI CLK for device 0 is: 1000 MHz +2025-02-13T20:10:36.7944611Z  Always | WARNING  | Dispatch Core Type = CoreType::WORKER +2025-02-13T20:10:36.7952135Z  Metal | INFO  | DPRINT enabled on device 0, worker worker cores. +2025-02-13T20:10:36.7955083Z  Metal | INFO  | DPRINT enabled on device 0, ethernet worker cores. +2025-02-13T20:10:36.7965673Z  Metal | INFO  | DPRINT Server attached device 0 +2025-02-13T20:10:37.6901566Z  Test | INFO  | Running test on device 0. +2025-02-13T20:10:37.9902451Z  Metal | WARNING  | Debug Print Server encountered an error: DPRINT server timed out on Device 0, worker core (x=0,y=0), riscv 4, waiting on a RAISE signal: 1 +2025-02-13T20:10:37.9903639Z +2025-02-13T20:10:37.9904814Z  Always | FATAL  | DPRINT server timed out on Device 0, worker core (x=0,y=0), riscv 4, waiting on a RAISE signal: 1 +2025-02-13T20:10:37.9905534Z +2025-02-13T20:10:38.0377328Z  Test | INFO  | Finished running test on device 0. +2025-02-13T20:10:38.0379528Z  Metal | INFO  | Closing device 0 +2025-02-13T20:10:38.0432815Z  Metal | INFO  | DPRINT Server dettached device 0 +2025-02-13T20:10:38.0450567Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:10:38.0455659Z [ OK ] DPrintFixture.TensixTestPrintHanging (1769 ms) +2025-02-13T20:10:38.0458275Z [----------] 1 test from DPrintFixture (1769 ms total) +2025-02-13T20:10:38.0458774Z +2025-02-13T20:10:38.0458999Z [----------] Global test environment tear-down +2025-02-13T20:10:38.0462986Z [==========] 1 test from 1 test suite ran. (1770 ms total) +2025-02-13T20:10:38.0463932Z [ PASSED ] 1 test. +2025-02-13T20:10:38.0471644Z  Device | INFO  | Closing user mode device drivers +2025-02-13T20:10:38.0572340Z Running watcher dump tool... +2025-02-13T20:10:38.0573126Z  Device | INFO  | Opening user mode device driver +2025-02-13T20:10:38.0627022Z +2025-02-13T20:10:38.0688115Z 2025-02-13 20:10:38.068 | INFO  | SiliconDriver  - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled +2025-02-13T20:10:38.0700894Z 2025-02-13 20:10:38.069 | INFO  | SiliconDriver  - Detected PCI devices: [0] +2025-02-13T20:10:38.0702130Z 2025-02-13 20:10:38.069 | INFO  | SiliconDriver  - Using local chip ids: {0} and remote chip ids {} +2025-02-13T20:10:38.0848207Z 2025-02-13 20:10:38.083 | WARNING  | SiliconDriver  - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:38.0849988Z 2025-02-13 20:10:38.084 | WARNING  | SiliconDriver  - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:38.0861990Z 2025-02-13 20:10:38.085 | WARNING  | SiliconDriver  - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind. +2025-02-13T20:10:38.0864661Z 2025-02-13 20:10:38.085 | WARNING  | SiliconDriver  - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893). 
+2025-02-13T20:10:38.0912419Z 2025-02-13 20:10:38.090 | INFO | SiliconDriver - Software version 6.0.0, Ethernet FW version 6.10.0 (Device 0)
+2025-02-13T20:10:38.0924798Z Dumping Command Queues into: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/command_queue_dump/
+2025-02-13T20:10:38.0926117Z Dumping Watcher Log into: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log
+2025-02-13T20:10:38.0957162Z Metal | INFO | Initializing device 0. Program cache is NOT enabled
+2025-02-13T20:10:38.0987886Z Metal | INFO | AI CLK for device 0 is: 1000 MHz
+2025-02-13T20:10:38.0992999Z Always | INFO | Reading Device 0 CQ 0, Completion Queue...
+2025-02-13T20:10:38.0999486Z [progress bar: 0% ... 100%]
+2025-02-13T20:10:38.1092321Z Always | INFO | Reading Device 0 CQ 0, Issue Queue...
+2025-02-13T20:10:38.1113062Z [progress bar: 0% ... 100%]
+2025-02-13T20:10:38.7891564Z LLRuntime | INFO | Watcher log file: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log
+2025-02-13T20:10:38.7893034Z LLRuntime | INFO | Watcher checking device 0
+2025-02-13T20:10:39.0636763Z Watcher dump tool finished.
+2025-02-13T20:10:39.0639703Z Device | INFO | Closing user mode device drivers
+2025-02-13T20:10:39.0716104Z Watcher dump minimal test - Pass
+2025-02-13T20:10:39.0760668Z Running main() from gmock_main.cc
+2025-02-13T20:10:39.0761552Z Note: Google Test filter = *WatcherAssertBrisc
+2025-02-13T20:10:39.0762367Z [==========] Running 1 test from 1 test suite.
+2025-02-13T20:10:39.0763075Z [----------] Global test environment set-up.
+2025-02-13T20:10:39.0763828Z [----------] 1 test from WatcherFixture +2025-02-13T20:10:39.0764589Z [ RUN ] WatcherFixture.TestWatcherAssertBrisc +2025-02-13T20:10:39.0766204Z  LLRuntime | INFO  | Watcher log file: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log +2025-02-13T20:10:39.0767857Z  Test | INFO  | Running test using Fast Dispatch +2025-02-13T20:10:39.0816906Z +2025-02-13T20:10:39.0853065Z  Device | INFO  | Opening user mode device driver +2025-02-13T20:10:39.0880935Z 2025-02-13 20:10:39.087 | INFO  | SiliconDriver  - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled +2025-02-13T20:10:39.0892903Z 2025-02-13 20:10:39.088 | INFO  | SiliconDriver  - Detected PCI devices: [0] +2025-02-13T20:10:39.0894184Z 2025-02-13 20:10:39.088 | INFO  | SiliconDriver  - Using local chip ids: {0} and remote chip ids {} +2025-02-13T20:10:39.1039701Z 2025-02-13 20:10:39.103 | WARNING  | SiliconDriver  - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:39.1041598Z 2025-02-13 20:10:39.103 | WARNING  | SiliconDriver  - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:39.1052365Z 2025-02-13 20:10:39.104 | WARNING  | SiliconDriver  - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind. +2025-02-13T20:10:39.1055208Z 2025-02-13 20:10:39.104 | WARNING  | SiliconDriver  - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893). +2025-02-13T20:10:39.1104594Z 2025-02-13 20:10:39.109 | INFO  | SiliconDriver  - Software version 6.0.0, Ethernet FW version 6.10.0 (Device 0) +2025-02-13T20:10:39.1142242Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:10:39.1168917Z  Metal | INFO  | AI CLK for device 0 is: 1000 MHz +2025-02-13T20:10:39.7399401Z  LLRuntime | INFO  | Watcher log file: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log +2025-02-13T20:10:39.7402555Z  LLRuntime | INFO  | Watcher attached device 0 LLRuntime | INFO  | Watcher server initialized, disabled features: None +2025-02-13T20:10:39.7404070Z +2025-02-13T20:10:39.9903596Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:40.5135446Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:40.5986020Z  Test | INFO  | Running test on device 0. +2025-02-13T20:10:40.5987015Z  Test | INFO  | Running test on device 0 core (x=18,y=18)... +2025-02-13T20:10:40.7943108Z  Test | INFO  | Running args that shouldn't assert... +2025-02-13T20:10:41.0444665Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:41.1918233Z  Test | INFO  | Args did not assert! +2025-02-13T20:10:41.1919240Z  Test | INFO  | Running args that should assert... +2025-02-13T20:10:41.1921533Z  Test | INFO  | Expected error: Device 0 worker core(x= 0,y= 0) virtual(x=18,y=18): brisc tripped an assert on line 57. Current kernel: tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp. Note that file name reporting is not yet implemented, and the reported line number for the assert may be from a different file. 
+2025-02-13T20:10:41.5677446Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:41.5710348Z  Always | WARNING  | Watcher stopped the device due to tripped assert, see watcher log for more details +2025-02-13T20:10:41.5712786Z  Always | WARNING  | Device 0 worker core(x= 0,y= 0) virtual(x=18,y=18): brisc tripped an assert on line 57. Current kernel: tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp. Note that file name reporting is not yet implemented, and the reported line number for the assert may be from a different file. +2025-02-13T20:10:41.5715241Z  Always | INFO  | Last waypoint: R, W, W, W, W +2025-02-13T20:10:41.5716154Z  Always | INFO  | Last ring buffer status: +2025-02-13T20:10:41.5716774Z debug_ring_buffer= +2025-02-13T20:10:41.5717153Z [0x00000003,0x00000003,0x00000004,0x00000003] +2025-02-13T20:10:41.5717956Z  Always | INFO  | While running kernels: +2025-02-13T20:10:41.5718993Z  Always | INFO  | brisc : tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp +2025-02-13T20:10:41.5720072Z  Always | INFO  | ncrisc: blank +2025-02-13T20:10:41.5720999Z  Always | INFO  | triscs: blank +2025-02-13T20:10:41.5723298Z  Test | INFO  | Reported error: Device 0 worker core(x= 0,y= 0) virtual(x=18,y=18): brisc tripped an assert on line 57. Current kernel: tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp. Note that file name reporting is not yet implemented, and the reported line number for the assert may be from a different file. +2025-02-13T20:10:41.5725329Z  Always | FATAL  | Watcher detected tripped assert and stopped device. +2025-02-13T20:10:41.5726508Z  Test | INFO  | Finished running test on device 0. +2025-02-13T20:10:41.5727460Z  LLRuntime | INFO  | Watcher thread stopped watching... +2025-02-13T20:10:41.8212898Z  LLRuntime | INFO  | Watcher log file: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log +2025-02-13T20:10:41.8214070Z  Metal | INFO  | Closing device 0 +2025-02-13T20:10:41.8271794Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:10:41.8274493Z [ OK ] WatcherFixture.TestWatcherAssertBrisc (2751 ms) +2025-02-13T20:10:41.8275231Z [----------] 1 test from WatcherFixture (2751 ms total) +2025-02-13T20:10:41.8275667Z +2025-02-13T20:10:41.8275877Z [----------] Global test environment tear-down +2025-02-13T20:10:41.8276447Z [==========] 1 test from 1 test suite ran. (2751 ms total) +2025-02-13T20:10:41.8276916Z [ PASSED ] 1 test. +2025-02-13T20:10:41.8282632Z  Device | INFO  | Closing user mode device drivers +2025-02-13T20:10:42.1196531Z ./tests/scripts/run_tools_tests.sh: line 66: 674 Aborted (core dumped) ./build/tools/watcher_dump -d=0 -w &> tmp.log +2025-02-13T20:10:42.1197328Z Above failure is expected. +2025-02-13T20:10:42.1210873Z Watcher dump all data test - Pass +2025-02-13T20:10:42.1257798Z Running main() from gmock_main.cc +2025-02-13T20:10:42.1258363Z Note: Google Test filter = *TestWatcherRingBufferBrisc +2025-02-13T20:10:42.1258889Z [==========] Running 1 test from 1 test suite. +2025-02-13T20:10:42.1259338Z [----------] Global test environment set-up. 
+2025-02-13T20:10:42.1259784Z [----------] 1 test from WatcherFixture +2025-02-13T20:10:42.1260257Z [ RUN ] WatcherFixture.TestWatcherRingBufferBrisc +2025-02-13T20:10:42.1261537Z  LLRuntime | INFO  | Watcher log file: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log +2025-02-13T20:10:42.1262788Z  Test | INFO  | Running test using Fast Dispatch +2025-02-13T20:10:42.1313882Z +2025-02-13T20:10:42.1350748Z  Device | INFO  | Opening user mode device driver +2025-02-13T20:10:42.1378957Z 2025-02-13 20:10:42.137 | INFO  | SiliconDriver  - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled +2025-02-13T20:10:42.1391132Z 2025-02-13 20:10:42.138 | INFO  | SiliconDriver  - Detected PCI devices: [0] +2025-02-13T20:10:42.1392390Z 2025-02-13 20:10:42.138 | INFO  | SiliconDriver  - Using local chip ids: {0} and remote chip ids {} +2025-02-13T20:10:42.1536195Z 2025-02-13 20:10:42.152 | WARNING  | SiliconDriver  - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:42.1538101Z 2025-02-13 20:10:42.153 | WARNING  | SiliconDriver  - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:42.1548073Z 2025-02-13 20:10:42.154 | WARNING  | SiliconDriver  - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind. +2025-02-13T20:10:42.1551132Z 2025-02-13 20:10:42.154 | WARNING  | SiliconDriver  - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893). +2025-02-13T20:10:42.1597790Z 2025-02-13 20:10:42.159 | INFO  | SiliconDriver  - Software version 6.0.0, Ethernet FW version 6.10.0 (Device 0) +2025-02-13T20:10:42.1633517Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:10:42.1657938Z  Metal | INFO  | AI CLK for device 0 is: 1000 MHz +2025-02-13T20:10:42.7908553Z  LLRuntime | INFO  | Watcher log file: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log +2025-02-13T20:10:42.7909941Z  LLRuntime | INFO  | Watcher attached device 0 +2025-02-13T20:10:42.7911051Z  LLRuntime | INFO  | Watcher server initialized, disabled features: None +2025-02-13T20:10:43.0408434Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:43.5943864Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:43.5958526Z  Test | INFO  | Running test on device 0. +2025-02-13T20:10:43.5959521Z  Test | INFO  | Running test on device 0 core (x=0,y=0)[(x=18,y=18)]... +2025-02-13T20:10:44.1526624Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:44.7049863Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:45.0055381Z  Always | INFO  | Checking file: generated/watcher/watcher.log +2025-02-13T20:10:45.0064003Z  Test | INFO  | Finished running test on device 0. +2025-02-13T20:10:45.2556974Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:45.2566054Z  LLRuntime | INFO  | Watcher log file: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log +2025-02-13T20:10:45.2567352Z  Metal | INFO  | Closing device 0 +2025-02-13T20:10:45.5569800Z  LLRuntime | INFO  | Watcher detached device 0 +2025-02-13T20:10:45.5571139Z  LLRuntime | INFO  | Watcher thread stopped watching... 
+2025-02-13T20:10:45.5581270Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:10:45.5582851Z [ OK ] WatcherFixture.TestWatcherRingBufferBrisc (3432 ms) +2025-02-13T20:10:45.5583638Z [----------] 1 test from WatcherFixture (3432 ms total) +2025-02-13T20:10:45.5584074Z +2025-02-13T20:10:45.5584299Z [----------] Global test environment tear-down +2025-02-13T20:10:45.5589406Z [==========] 1 test from 1 test suite ran. (3432 ms total) +2025-02-13T20:10:45.5590031Z [ PASSED ] 1 test. +2025-02-13T20:10:45.5598992Z  Device | INFO  | Closing user mode device drivers +2025-02-13T20:10:45.5732964Z Running watcher dump tool... +2025-02-13T20:10:45.5734177Z  Device | INFO  | Opening user mode device driver +2025-02-13T20:10:45.5791194Z +2025-02-13T20:10:45.5854800Z 2025-02-13 20:10:45.584 | INFO  | SiliconDriver  - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled +2025-02-13T20:10:45.5864645Z 2025-02-13 20:10:45.585 | INFO  | SiliconDriver  - Detected PCI devices: [0] +2025-02-13T20:10:45.5865851Z 2025-02-13 20:10:45.585 | INFO  | SiliconDriver  - Using local chip ids: {0} and remote chip ids {} +2025-02-13T20:10:45.6009127Z 2025-02-13 20:10:45.600 | WARNING  | SiliconDriver  - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:45.6010890Z 2025-02-13 20:10:45.600 | WARNING  | SiliconDriver  - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:45.6021511Z 2025-02-13 20:10:45.601 | WARNING  | SiliconDriver  - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind. +2025-02-13T20:10:45.6024114Z 2025-02-13 20:10:45.601 | WARNING  | SiliconDriver  - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893). +2025-02-13T20:10:45.6084893Z 2025-02-13 20:10:45.607 | INFO  | SiliconDriver  - Software version 6.0.0, Ethernet FW version 6.10.0 (Device 0) +2025-02-13T20:10:45.6094320Z Dumping Watcher Log into: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log +2025-02-13T20:10:45.6122744Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:10:45.6149069Z  Metal | INFO  | AI CLK for device 0 is: 1000 MHz +2025-02-13T20:10:45.6154233Z  LLRuntime | INFO  | Watcher log file: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/generated/watcher/watcher.log +2025-02-13T20:10:45.6155717Z  LLRuntime | INFO  | Watcher checking device 0 +2025-02-13T20:10:45.8883463Z Watcher dump tool finished. +2025-02-13T20:10:45.8884228Z  Device | INFO  | Closing user mode device drivers +2025-02-13T20:10:45.8976146Z Watcher stack usage test - Pass +2025-02-13T20:10:45.9001308Z Watcher dump tool tests finished... +2025-02-13T20:10:45.9001786Z Running clean init tests - FD-on-Tensix +2025-02-13T20:10:45.9002201Z First run, no teardown +2025-02-13T20:10:45.9046703Z  Always | INFO  | Running loopback test with no teardown, to see if we can recover next run. 
+2025-02-13T20:10:45.9047935Z  Device | INFO  | Opening user mode device driver +2025-02-13T20:10:45.9107193Z +2025-02-13T20:10:45.9170577Z 2025-02-13 20:10:45.916 | INFO  | SiliconDriver  - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled +2025-02-13T20:10:45.9182839Z 2025-02-13 20:10:45.917 | INFO  | SiliconDriver  - Detected PCI devices: [0] +2025-02-13T20:10:45.9184014Z 2025-02-13 20:10:45.917 | INFO  | SiliconDriver  - Using local chip ids: {0} and remote chip ids {} +2025-02-13T20:10:45.9329598Z 2025-02-13 20:10:45.932 | WARNING  | SiliconDriver  - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:45.9331443Z 2025-02-13 20:10:45.932 | WARNING  | SiliconDriver  - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:45.9340981Z 2025-02-13 20:10:45.933 | WARNING  | SiliconDriver  - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind. +2025-02-13T20:10:45.9343830Z 2025-02-13 20:10:45.933 | WARNING  | SiliconDriver  - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893). +2025-02-13T20:10:45.9399775Z 2025-02-13 20:10:45.939 | INFO  | SiliconDriver  - Software version 6.0.0, Ethernet FW version 6.10.0 (Device 0) +2025-02-13T20:10:45.9435519Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:10:45.9461716Z  Metal | INFO  | AI CLK for device 0 is: 1000 MHz +2025-02-13T20:10:47.5393797Z  Always | INFO  | Started program +2025-02-13T20:10:47.5395014Z libc++abi: terminating due to uncaught exception of type std::runtime_error: TT_THROW @ /work/tests/tt_metal/tt_metal/test_clean_init.cpp:143: tt::exception +2025-02-13T20:10:47.5395885Z info: +2025-02-13T20:10:47.5396189Z Skip teardown by throwing +2025-02-13T20:10:47.5396533Z backtrace: +2025-02-13T20:10:47.5396949Z --- ./build/test/tt_metal/test_clean_init(+0x11ae8) [0x555921e00ae8] +2025-02-13T20:10:47.5397572Z --- ./build/test/tt_metal/test_clean_init(main+0xa11) [0x555921dff3b1] +2025-02-13T20:10:47.5398255Z --- /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f168d435083] +2025-02-13T20:10:47.5398893Z --- ./build/test/tt_metal/test_clean_init(+0xf8de) [0x555921dfe8de] +2025-02-13T20:10:47.5399251Z +2025-02-13T20:10:47.5399660Z  Always | INFO  | Finished program +2025-02-13T20:10:47.5400861Z  Test | INFO  | Test Passed +2025-02-13T20:10:47.5401651Z  Always | FATAL  | Skip teardown by throwing +2025-02-13T20:10:47.7127751Z ./tests/scripts/run_tools_tests.sh: line 66: 892 Aborted (core dumped) ./build/test/tt_metal/test_clean_init --skip-teardown +2025-02-13T20:10:47.7128575Z Above failure is expected. 
+2025-02-13T20:10:47.7128947Z Second run, expect clean init +2025-02-13T20:10:47.7188056Z  Always | INFO  | Running loopback test with proper teardown +2025-02-13T20:10:47.7189242Z  Device | INFO  | Opening user mode device driver +2025-02-13T20:10:47.7246030Z +2025-02-13T20:10:47.7307648Z 2025-02-13 20:10:47.730 | INFO  | SiliconDriver  - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled +2025-02-13T20:10:47.7317099Z 2025-02-13 20:10:47.731 | INFO  | SiliconDriver  - Detected PCI devices: [0] +2025-02-13T20:10:47.7318324Z 2025-02-13 20:10:47.731 | INFO  | SiliconDriver  - Using local chip ids: {0} and remote chip ids {} +2025-02-13T20:10:47.7460985Z 2025-02-13 20:10:47.745 | WARNING  | SiliconDriver  - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:47.7463307Z 2025-02-13 20:10:47.745 | WARNING  | SiliconDriver  - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:47.7473842Z 2025-02-13 20:10:47.746 | WARNING  | SiliconDriver  - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind. +2025-02-13T20:10:47.7476670Z 2025-02-13 20:10:47.746 | WARNING  | SiliconDriver  - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893). +2025-02-13T20:10:47.7521080Z 2025-02-13 20:10:47.751 | INFO  | SiliconDriver  - Software version 6.0.0, Ethernet FW version 6.10.0 (Device 0) +2025-02-13T20:10:47.7552756Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:10:47.7577090Z  Metal | INFO  | AI CLK for device 0 is: 1000 MHz +2025-02-13T20:10:49.3002062Z  Always | INFO  | Started program +2025-02-13T20:10:49.3002905Z  Always | INFO  | Finished program +2025-02-13T20:10:49.3003682Z  Test | INFO  | Test Passed +2025-02-13T20:10:49.3004475Z  Metal | INFO  | Closing device 0 +2025-02-13T20:10:49.3008695Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:10:49.3017756Z  Device | INFO  | Closing user mode device drivers +2025-02-13T20:10:49.3082800Z Clean init tests - FD-on-Tensix passed! +2025-02-13T20:10:49.3083309Z Running clean init tests - FD-on-Eth +2025-02-13T20:10:49.3083744Z First run, no teardown +2025-02-13T20:10:49.3138256Z  Always | INFO  | Running loopback test with no teardown, to see if we can recover next run. 
+2025-02-13T20:10:49.3139490Z  Device | INFO  | Opening user mode device driver +2025-02-13T20:10:49.3194901Z +2025-02-13T20:10:49.3255978Z 2025-02-13 20:10:49.325 | INFO  | SiliconDriver  - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled +2025-02-13T20:10:49.3266673Z 2025-02-13 20:10:49.326 | INFO  | SiliconDriver  - Detected PCI devices: [0] +2025-02-13T20:10:49.3267810Z 2025-02-13 20:10:49.326 | INFO  | SiliconDriver  - Using local chip ids: {0} and remote chip ids {} +2025-02-13T20:10:49.3411883Z 2025-02-13 20:10:49.340 | WARNING  | SiliconDriver  - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:49.3413701Z 2025-02-13 20:10:49.340 | WARNING  | SiliconDriver  - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:49.3424124Z 2025-02-13 20:10:49.341 | WARNING  | SiliconDriver  - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind. +2025-02-13T20:10:49.3426976Z 2025-02-13 20:10:49.341 | WARNING  | SiliconDriver  - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893). +2025-02-13T20:10:49.3476643Z 2025-02-13 20:10:49.347 | INFO  | SiliconDriver  - Software version 6.0.0, Ethernet FW version 6.10.0 (Device 0) +2025-02-13T20:10:49.3516666Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:10:49.3543572Z  Metal | INFO  | AI CLK for device 0 is: 1000 MHz +2025-02-13T20:10:50.9357575Z  Always | INFO  | Started program +2025-02-13T20:10:50.9358472Z  Always | INFO  | Finished program +2025-02-13T20:10:50.9359354Z  Test | INFO  | Test Passed +2025-02-13T20:10:50.9360251Z  Always | FATAL  | Skip teardown by throwing +2025-02-13T20:10:50.9362886Z libc++abi: terminating due to uncaught exception of type std::runtime_error: TT_THROW @ /work/tests/tt_metal/tt_metal/test_clean_init.cpp:143: tt::exception +2025-02-13T20:10:50.9363859Z info: +2025-02-13T20:10:50.9364191Z Skip teardown by throwing +2025-02-13T20:10:50.9364573Z backtrace: +2025-02-13T20:10:50.9365018Z --- ./build/test/tt_metal/test_clean_init(+0x11ae8) [0x5644fba4eae8] +2025-02-13T20:10:50.9365732Z --- ./build/test/tt_metal/test_clean_init(main+0xa11) [0x5644fba4d3b1] +2025-02-13T20:10:50.9366424Z --- /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f50469ac083] +2025-02-13T20:10:50.9367108Z --- ./build/test/tt_metal/test_clean_init(+0xf8de) [0x5644fba4c8de] +2025-02-13T20:10:50.9367805Z +2025-02-13T20:10:51.1281326Z ./tests/scripts/run_tools_tests.sh: line 66: 1297 Aborted (core dumped) env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./build/test/tt_metal/test_clean_init --skip-teardown +2025-02-13T20:10:51.1282359Z Above failure is expected. 
+2025-02-13T20:10:51.1282721Z Second run, expect clean init +2025-02-13T20:10:51.1340284Z  Always | INFO  | Running loopback test with proper teardown +2025-02-13T20:10:51.1341245Z  Device | INFO  | Opening user mode device driver +2025-02-13T20:10:51.1400743Z +2025-02-13T20:10:51.1463141Z 2025-02-13 20:10:51.145 | INFO  | SiliconDriver  - Opened PCI device 0; KMD version: 1.29.0, IOMMU: disabled +2025-02-13T20:10:51.1475082Z 2025-02-13 20:10:51.146 | INFO  | SiliconDriver  - Detected PCI devices: [0] +2025-02-13T20:10:51.1476548Z 2025-02-13 20:10:51.146 | INFO  | SiliconDriver  - Using local chip ids: {0} and remote chip ids {} +2025-02-13T20:10:51.1619358Z 2025-02-13 20:10:51.161 | WARNING  | SiliconDriver  - init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:51.1621225Z 2025-02-13 20:10:51.161 | WARNING  | SiliconDriver  - Could not find NumaNodeSet for TT Device (physical_device_id: 0 pci_bus_id: 0000:07:00.0) +2025-02-13T20:10:51.1630761Z 2025-02-13 20:10:51.162 | WARNING  | SiliconDriver  - bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: 0. Skipping membind. +2025-02-13T20:10:51.1633738Z 2025-02-13 20:10:51.162 | WARNING  | SiliconDriver  - ---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: 0 ch: 0). Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893). +2025-02-13T20:10:51.1679654Z 2025-02-13 20:10:51.167 | INFO  | SiliconDriver  - Software version 6.0.0, Ethernet FW version 6.10.0 (Device 0) +2025-02-13T20:10:51.1714553Z  Metal | INFO  | Initializing device 0. Program cache is NOT enabled +2025-02-13T20:10:51.1738599Z  Metal | INFO  | AI CLK for device 0 is: 1000 MHz +2025-02-13T20:10:52.7476991Z  Always | INFO  | Started program +2025-02-13T20:10:52.7479421Z  Always | INFO  | Finished program +2025-02-13T20:10:52.7481593Z  Test | INFO  | Test Passed +2025-02-13T20:10:52.7482431Z  Metal | INFO  | Closing device 0 +2025-02-13T20:10:52.7483390Z  Metal | INFO  | Disabling and clearing program cache on device 0 +2025-02-13T20:10:52.7489820Z  Device | INFO  | Closing user mode device drivers +2025-02-13T20:10:52.7556886Z Clean init tests - FD-on-Eth passed! 
+2025-02-13T20:10:53.5051242Z Prepare all required actions +2025-02-13T20:10:53.5051755Z Getting action download info +2025-02-13T20:10:53.8034070Z Download action repository 'actions/upload-artifact@v4' (SHA:65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08) +2025-02-13T20:10:54.5497095Z ##[group]Run ./.github/actions/upload-artifact-with-job-uuid +2025-02-13T20:10:54.5497603Z with: +2025-02-13T20:10:54.5497919Z path: generated/test_reports/ + +2025-02-13T20:10:54.5498315Z prefix: test_reports_ +2025-02-13T20:10:54.5498656Z env: +2025-02-13T20:10:54.5498960Z ARCH_NAME: wormhole_b0 +2025-02-13T20:10:54.5499296Z LOGURU_LEVEL: INFO +2025-02-13T20:10:54.5499807Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5500843Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:10:54.5501634Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5502421Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5503172Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5503948Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:10:54.5504697Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:10:54.5505328Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:10:54.5506300Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:10:54.5507105Z RUNNER_UID: 1000 +2025-02-13T20:10:54.5507431Z RUNNER_GID: 1000 +2025-02-13T20:10:54.5507769Z ##[endgroup] +2025-02-13T20:10:54.5533741Z ##[group]Run uuid=$(uuidgen) +2025-02-13T20:10:54.5534116Z uuid=$(uuidgen) +2025-02-13T20:10:54.5534485Z artifact_name="test_reports_$uuid" +2025-02-13T20:10:54.5534959Z echo "[UPLOAD-ARTIFACT-UUID] $artifact_name" +2025-02-13T20:10:54.5535515Z echo "artifact-name=$artifact_name" >> "$GITHUB_OUTPUT" +2025-02-13T20:10:54.5557202Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:10:54.5557669Z env: +2025-02-13T20:10:54.5557974Z ARCH_NAME: wormhole_b0 +2025-02-13T20:10:54.5558335Z LOGURU_LEVEL: INFO +2025-02-13T20:10:54.5558848Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5559660Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:10:54.5560459Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5561184Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5561923Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5562644Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:10:54.5563393Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:10:54.5564035Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:10:54.5564888Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:10:54.5565683Z RUNNER_UID: 1000 +2025-02-13T20:10:54.5566015Z RUNNER_GID: 1000 +2025-02-13T20:10:54.5566334Z ##[endgroup] +2025-02-13T20:10:54.5623372Z [UPLOAD-ARTIFACT-UUID] test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6 
+2025-02-13T20:10:54.5671429Z ##[group]Run actions/upload-artifact@v4 +2025-02-13T20:10:54.5671844Z with: +2025-02-13T20:10:54.5672214Z name: test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6 +2025-02-13T20:10:54.5672735Z path: generated/test_reports/ + +2025-02-13T20:10:54.5673196Z if-no-files-found: warn +2025-02-13T20:10:54.5673598Z compression-level: 6 +2025-02-13T20:10:54.5673955Z overwrite: false +2025-02-13T20:10:54.5674303Z include-hidden-files: false +2025-02-13T20:10:54.5674889Z env: +2025-02-13T20:10:54.5675217Z ARCH_NAME: wormhole_b0 +2025-02-13T20:10:54.5675660Z LOGURU_LEVEL: INFO +2025-02-13T20:10:54.5676172Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5677021Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-13T20:10:54.5677856Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5678625Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5679413Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-13T20:10:54.5680196Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-13T20:10:54.5681128Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-13T20:10:54.5681825Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:10:54.5682805Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:10:54.5683641Z RUNNER_UID: 1000 +2025-02-13T20:10:54.5706744Z RUNNER_GID: 1000 +2025-02-13T20:10:54.5707103Z ##[endgroup] +2025-02-13T20:10:54.8406086Z With the provided path, there will be 3 files uploaded +2025-02-13T20:10:54.8411346Z Artifact name is valid! +2025-02-13T20:10:54.8413394Z Root directory input is valid! +2025-02-13T20:10:55.0505970Z Beginning upload of artifact content to blob storage +2025-02-13T20:10:55.2829367Z Uploaded bytes 1366 +2025-02-13T20:10:55.3429254Z Finished uploading artifact content to blob storage! +2025-02-13T20:10:55.3432596Z SHA256 hash of uploaded artifact zip is 1b5bb8564b74feb9ab672fdc271ee7e43af3814e36c2ff788348a2bf4334a421 +2025-02-13T20:10:55.3434466Z Finalizing artifact upload +2025-02-13T20:10:55.4484644Z Artifact test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6.zip successfully finalized. Artifact ID 2588481890 +2025-02-13T20:10:55.4485936Z Artifact test_reports_be17cde1-9f41-464b-988e-7df7929dd6a6 has been successfully uploaded! Final size is 1366 bytes. Artifact ID is 2588481890 +2025-02-13T20:10:55.4492634Z Artifact download URL: https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/artifacts/2588481890 +2025-02-13T20:10:55.4712755Z Post job cleanup. +2025-02-13T20:10:55.4770753Z Post job cleanup. 
+2025-02-13T20:10:55.5726918Z [command]/usr/bin/git version +2025-02-13T20:10:55.5768998Z git version 2.25.1 +2025-02-13T20:10:55.5813652Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/1ec9791e-9ac6-47d2-a41f-598590ea8bc5/.gitconfig' +2025-02-13T20:10:55.5825557Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/1ec9791e-9ac6-47d2-a41f-598590ea8bc5' before making global git config changes +2025-02-13T20:10:55.5826793Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:10:55.5831636Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:10:55.5862277Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:10:55.5890675Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:10:55.6163187Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:10:55.6208681Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:10:55.6256375Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:10:55.6301541Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:10:55.6344461Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:10:55.6389504Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:10:55.6441827Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:10:55.6517479Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:10:55.6536339Z http.https://github.com/.extraheader +2025-02-13T20:10:55.6549368Z [command]/usr/bin/git config --local --unset-all http.https://github.com/.extraheader +2025-02-13T20:10:55.6576560Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:10:55.6842468Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:10:55.6895661Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:10:55.6944577Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:10:55.6993084Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:10:55.7043650Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:10:55.7097578Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:10:55.7149866Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:10:55.7354034Z Post job cleanup. +2025-02-13T20:10:56.1191652Z [command]/usr/bin/docker logout https://ghcr.io +2025-02-13T20:10:56.1346087Z Removing login credentials for ghcr.io +2025-02-13T20:10:56.1406540Z ##[group]Post cache +2025-02-13T20:10:56.1407887Z State not set +2025-02-13T20:10:56.1409530Z ##[endgroup] +2025-02-13T20:10:56.1616377Z Post job cleanup. +2025-02-13T20:10:56.1683714Z Post job cleanup. +2025-02-13T20:10:56.2459666Z Post job cleanup. +2025-02-13T20:10:56.4169563Z Cache hit occurred on the primary key setup-venv-Linux-py-3.8.18-/home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/bin/python-509e0fbc74e4697ea036d8e6b4ed76321c253e4ffef8468c11ee556fb8e370e2-./create_venv.sh, not saving cache. +2025-02-13T20:10:56.4278236Z Post job cleanup. +2025-02-13T20:10:56.6302238Z Post job cleanup. +2025-02-13T20:10:56.6360190Z Post job cleanup. 
+2025-02-13T20:10:56.7521369Z [command]/usr/bin/git version +2025-02-13T20:10:56.7563394Z git version 2.25.1 +2025-02-13T20:10:56.7606360Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/b6a490ff-95c0-4d2f-8eeb-6a9b1d82f8ce/.gitconfig' +2025-02-13T20:10:56.7619353Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/b6a490ff-95c0-4d2f-8eeb-6a9b1d82f8ce' before making global git config changes +2025-02-13T20:10:56.7620781Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:10:56.7625471Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:10:56.7666529Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:10:56.7701022Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:10:56.8011457Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:10:56.8061670Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:10:56.8108156Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:10:56.8160523Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:10:56.8205595Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:10:56.8253779Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:10:56.8297751Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:10:56.8362041Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:10:56.8393693Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:10:56.8675090Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:10:56.8727440Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:10:56.8780237Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:10:56.8834982Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:10:56.8886335Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:10:56.8937666Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:10:56.8985336Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:10:56.9170826Z A job completed hook has been configured by the self-hosted runner administrator +2025-02-13T20:10:56.9204920Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/wormhole_b0/cleanup.sh' +2025-02-13T20:10:56.9220904Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:10:56.9221405Z ##[endgroup] +2025-02-13T20:10:56.9285302Z Current date / time is Thu Feb 13 20:10:56 UTC 2025 +2025-02-13T20:10:57.1169771Z Cleaning up orphan processes diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190251054.log b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190251054.log new file mode 100644 index 00000000000..752a3d4b85d --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190251054.log @@ -0,0 +1,690 @@ +2025-02-13T20:06:43.9939058Z Current runner version: '2.322.0' +2025-02-13T20:06:43.9946463Z Runner name: 'tt-metal-ci-vm-4' 
+2025-02-13T20:06:43.9947292Z Runner group name: 'Default' +2025-02-13T20:06:43.9948417Z Machine name: '3b996119-328e-4871-b980-bb63dcfbb963' +2025-02-13T20:06:43.9951789Z ##[group]GITHUB_TOKEN Permissions +2025-02-13T20:06:43.9953996Z Actions: read +2025-02-13T20:06:43.9954633Z Contents: write +2025-02-13T20:06:43.9955191Z Metadata: read +2025-02-13T20:06:43.9955753Z Packages: write +2025-02-13T20:06:43.9956428Z Pages: write +2025-02-13T20:06:43.9956998Z PullRequests: write +2025-02-13T20:06:43.9957548Z ##[endgroup] +2025-02-13T20:06:43.9960674Z Secret source: Actions +2025-02-13T20:06:43.9961452Z Prepare workflow directory +2025-02-13T20:06:44.0478922Z Prepare all required actions +2025-02-13T20:06:44.0521801Z Getting action download info +2025-02-13T20:06:44.2424400Z Download action repository 'tenstorrent/tt-metal@main' (SHA:ac426de3d4a9c274964843fdae6aa83ea3960a30) +2025-02-13T20:06:49.9329826Z Download action repository 'actions/download-artifact@v4' (SHA:fa0a91b85d4f404e444e00e005971372dc801d16) +2025-02-13T20:06:50.6590455Z Getting action download info +2025-02-13T20:06:50.8135276Z Download action repository 'actions/checkout@v4' (SHA:11bd71901bbe5b1630ceea73d27597364c9af683) +2025-02-13T20:06:51.3972999Z Uses: tenstorrent/tt-metal/.github/workflows/ttnn-post-commit.yaml@refs/heads/sagarwal/multi_page_buffer (ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70) +2025-02-13T20:06:51.3975742Z ##[group] Inputs +2025-02-13T20:06:51.3976133Z build-type: Release +2025-02-13T20:06:51.3976889Z with-retries: false +2025-02-13T20:06:51.3977163Z arch: grayskull +2025-02-13T20:06:51.3977461Z runner-label: E150 +2025-02-13T20:06:51.3978274Z timeout: 45 +2025-02-13T20:06:51.3978625Z num-groups: 12 +2025-02-13T20:06:51.3978933Z ##[endgroup] +2025-02-13T20:06:51.3979379Z Complete job name: ttnn-unit-tests (grayskull, E150) / ttnn group 1 grayskull E150 +2025-02-13T20:06:51.4512432Z A job started hook has been configured by the self-hosted runner administrator +2025-02-13T20:06:51.4644929Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/reset.sh' +2025-02-13T20:06:51.4660040Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:06:51.4660751Z ##[endgroup] +2025-02-13T20:06:51.4817916Z ++ date +2025-02-13T20:06:51.4818690Z + echo Current date / time is Thu Feb 13 20:06:51 UTC 2025 +2025-02-13T20:06:51.4819187Z + set_e_was_enabled=false +2025-02-13T20:06:51.4819732Z + [[ ehxB == *e* ]] +2025-02-13T20:06:51.4820127Z + set_e_was_enabled=true +2025-02-13T20:06:51.4820522Z + set +e +2025-02-13T20:06:51.4820912Z + docker image prune +2025-02-13T20:06:51.4821426Z Current date / time is Thu Feb 13 20:06:51 UTC 2025 +2025-02-13T20:06:51.4972870Z WARNING! This will remove all dangling images. +2025-02-13T20:06:51.4997750Z ++ df +2025-02-13T20:06:51.5001159Z ++ awk '{print $5}' +2025-02-13T20:06:51.5001985Z ++ sed s/%// +2025-02-13T20:06:51.5002542Z +++ findmnt -n -o SOURCE / +2025-02-13T20:06:51.5022449Z ++ grep -w '^/dev/vda1' +2025-02-13T20:06:51.5048046Z + disk_usage_before=60 +2025-02-13T20:06:51.5064415Z Are you sure you want to continue? 
[y/N] ::notice title=disk-usage-before-startup::Disk usage is 60 % +2025-02-13T20:06:51.5065702Z + echo '::notice title=disk-usage-before-startup::Disk usage is 60 %' +2025-02-13T20:06:51.5066166Z + '[' 60 -ge 90 ']' +2025-02-13T20:06:51.5066470Z ++ df +2025-02-13T20:06:51.5066951Z ++ awk '{print $5}' +2025-02-13T20:06:51.5067270Z ++ sed s/%// +2025-02-13T20:06:51.5068359Z +++ findmnt -n -o SOURCE / +2025-02-13T20:06:51.5086044Z ++ grep -w '^/dev/vda1' +2025-02-13T20:06:51.5104683Z + disk_usage_after=60 +2025-02-13T20:06:51.5129074Z ##[notice]Disk usage is 60 % +2025-02-13T20:06:51.5136880Z + echo '::notice title=disk-usage-after-startup::Disk usage is 60 %' +2025-02-13T20:06:51.5137397Z + '[' 60 -ge 90 ']' +2025-02-13T20:06:51.5137742Z ++ lsmod +2025-02-13T20:06:51.5144021Z + lsmod_output='Module Size Used by +2025-02-13T20:06:51.5145068Z wekafsio 70086656 1 +2025-02-13T20:06:51.5145530Z wekafsgw 40960 4 wekafsio +2025-02-13T20:06:51.5146654Z veth 28672 0 +2025-02-13T20:06:51.5147166Z uio_pci_generic 16384 0 +2025-02-13T20:06:51.5147683Z igb_uio 20480 0 +2025-02-13T20:06:51.5148198Z uio 20480 2 igb_uio,uio_pci_generic +2025-02-13T20:06:51.5148784Z xt_conntrack 16384 1 +2025-02-13T20:06:51.5149248Z xt_MASQUERADE 20480 1 +2025-02-13T20:06:51.5149725Z nf_conntrack_netlink 45056 0 +2025-02-13T20:06:51.5150246Z nfnetlink 16384 2 nf_conntrack_netlink +2025-02-13T20:06:51.5150830Z xfrm_user 36864 1 +2025-02-13T20:06:51.5151161Z xfrm_algo 16384 1 xfrm_user +2025-02-13T20:06:51.5151494Z iptable_nat 16384 1 +2025-02-13T20:06:51.5151845Z nf_nat 45056 2 iptable_nat,xt_MASQUERADE +2025-02-13T20:06:51.5152363Z nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE +2025-02-13T20:06:51.5152871Z nf_defrag_ipv6 24576 1 nf_conntrack +2025-02-13T20:06:51.5153279Z nf_defrag_ipv4 16384 1 nf_conntrack +2025-02-13T20:06:51.5153630Z xt_addrtype 16384 2 +2025-02-13T20:06:51.5153929Z iptable_filter 16384 1 +2025-02-13T20:06:51.5154226Z bpfilter 32768 0 +2025-02-13T20:06:51.5154851Z br_netfilter 28672 0 +2025-02-13T20:06:51.5155304Z bridge 176128 1 br_netfilter +2025-02-13T20:06:51.5155717Z stp 16384 1 bridge +2025-02-13T20:06:51.5156164Z llc 16384 2 bridge,stp +2025-02-13T20:06:51.5156501Z aufs 262144 0 +2025-02-13T20:06:51.5156797Z xfs 1286144 2 +2025-02-13T20:06:51.5157095Z overlay 118784 0 +2025-02-13T20:06:51.5157389Z rdma_ucm 28672 0 +2025-02-13T20:06:51.5157709Z rdma_cm 110592 1 rdma_ucm +2025-02-13T20:06:51.5158056Z iw_cm 49152 1 rdma_cm +2025-02-13T20:06:51.5158638Z ib_ipoib 131072 0 +2025-02-13T20:06:51.5158973Z ib_cm 114688 2 rdma_cm,ib_ipoib +2025-02-13T20:06:51.5159368Z ib_umad 28672 8 +2025-02-13T20:06:51.5159669Z nls_iso8859_1 16384 1 +2025-02-13T20:06:51.5160013Z dm_multipath 32768 0 +2025-02-13T20:06:51.5160338Z scsi_dh_rdac 16384 0 +2025-02-13T20:06:51.5160637Z scsi_dh_emc 16384 0 +2025-02-13T20:06:51.5160925Z scsi_dh_alua 20480 0 +2025-02-13T20:06:51.5161212Z mlx5_ib 397312 0 +2025-02-13T20:06:51.5161524Z ib_uverbs 139264 18 rdma_ucm,mlx5_ib +2025-02-13T20:06:51.5161869Z input_leds 16384 0 +2025-02-13T20:06:51.5162161Z serio_raw 20480 0 +2025-02-13T20:06:51.5162451Z kvm_amd 98304 0 +2025-02-13T20:06:51.5162744Z ccp 90112 1 kvm_amd +2025-02-13T20:06:51.5163060Z joydev 24576 0 +2025-02-13T20:06:51.5163357Z kvm 667648 1 kvm_amd +2025-02-13T20:06:51.5163864Z ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm +2025-02-13T20:06:51.5164326Z tenstorrent 32768 0 +2025-02-13T20:06:51.5164655Z sch_fq_codel 20480 45 +2025-02-13T20:06:51.5165037Z binfmt_misc 
24576 1 +2025-02-13T20:06:51.5165631Z msr 16384 0 +2025-02-13T20:06:51.5166051Z efi_pstore 16384 0 +2025-02-13T20:06:51.5166512Z virtio_rng 16384 0 +2025-02-13T20:06:51.5166979Z ip_tables 32768 2 iptable_filter,iptable_nat +2025-02-13T20:06:51.5167725Z x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE +2025-02-13T20:06:51.5168222Z autofs4 45056 2 +2025-02-13T20:06:51.5168510Z btrfs 1269760 0 +2025-02-13T20:06:51.5168820Z zstd_compress 167936 1 btrfs +2025-02-13T20:06:51.5169156Z raid10 61440 0 +2025-02-13T20:06:51.5169455Z raid456 155648 0 +2025-02-13T20:06:51.5169830Z async_raid6_recov 24576 1 raid456 +2025-02-13T20:06:51.5170285Z async_memcpy 20480 2 raid456,async_raid6_recov +2025-02-13T20:06:51.5171011Z async_pq 24576 2 raid456,async_raid6_recov +2025-02-13T20:06:51.5171496Z async_xor 20480 3 async_pq,raid456,async_raid6_recov +2025-02-13T20:06:51.5172031Z async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov +2025-02-13T20:06:51.5172667Z xor 24576 2 async_xor,btrfs +2025-02-13T20:06:51.5173101Z raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov +2025-02-13T20:06:51.5173594Z libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 +2025-02-13T20:06:51.5173998Z raid1 45056 0 +2025-02-13T20:06:51.5174344Z raid0 24576 0 +2025-02-13T20:06:51.5174665Z multipath 20480 0 +2025-02-13T20:06:51.5174956Z linear 20480 0 +2025-02-13T20:06:51.5175262Z hid_generic 16384 0 +2025-02-13T20:06:51.5175557Z crct10dif_pclmul 16384 1 +2025-02-13T20:06:51.5176103Z crc32_pclmul 16384 0 +2025-02-13T20:06:51.5176410Z usbhid 57344 0 +2025-02-13T20:06:51.5176745Z cirrus 16384 0 +2025-02-13T20:06:51.5177064Z ghash_clmulni_intel 16384 0 +2025-02-13T20:06:51.5177439Z hid 131072 2 usbhid,hid_generic +2025-02-13T20:06:51.5177803Z aesni_intel 372736 0 +2025-02-13T20:06:51.5178178Z mlx5_core 1626112 1 mlx5_ib +2025-02-13T20:06:51.5178578Z crypto_simd 16384 1 aesni_intel +2025-02-13T20:06:51.5178994Z drm_kms_helper 184320 3 cirrus +2025-02-13T20:06:51.5179377Z syscopyarea 16384 1 drm_kms_helper +2025-02-13T20:06:51.5179803Z sysfillrect 16384 1 drm_kms_helper +2025-02-13T20:06:51.5180236Z sysimgblt 16384 1 drm_kms_helper +2025-02-13T20:06:51.5180624Z fb_sys_fops 16384 1 drm_kms_helper +2025-02-13T20:06:51.5181154Z pci_hyperv_intf 16384 1 mlx5_core +2025-02-13T20:06:51.5181568Z mlxdevm 172032 1 mlx5_core +2025-02-13T20:06:51.5182006Z cryptd 24576 2 crypto_simd,ghash_clmulni_intel +2025-02-13T20:06:51.5182453Z auxiliary 16384 2 mlx5_ib,mlx5_core +2025-02-13T20:06:51.5182840Z glue_helper 16384 1 aesni_intel +2025-02-13T20:06:51.5183842Z mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core +2025-02-13T20:06:51.5184484Z virtio_blk 20480 3 +2025-02-13T20:06:51.5184787Z tls 73728 1 mlx5_core +2025-02-13T20:06:51.5185114Z ahci 40960 0 +2025-02-13T20:06:51.5185434Z drm 495616 3 drm_kms_helper,cirrus +2025-02-13T20:06:51.5185792Z psmouse 155648 0 +2025-02-13T20:06:51.5186109Z libahci 36864 1 ahci +2025-02-13T20:06:51.5186450Z mlxfw 32768 1 mlx5_core +2025-02-13T20:06:51.5186840Z psample 20480 1 mlx5_core' +2025-02-13T20:06:51.5187201Z + grep -q tenstorrent +2025-02-13T20:06:51.5197777Z + echo Module Size Used by wekafsio 70086656 1 wekafsgw 40960 4 wekafsio veth 28672 0 uio_pci_generic 16384 0 igb_uio 20480 0 uio 20480 2 igb_uio,uio_pci_generic xt_conntrack 16384 1 xt_MASQUERADE 20480 1 nf_conntrack_netlink 45056 0 nfnetlink 16384 2 nf_conntrack_netlink xfrm_user 36864 1 xfrm_algo 16384 1 xfrm_user 
iptable_nat 16384 1 nf_nat 45056 2 iptable_nat,xt_MASQUERADE nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE nf_defrag_ipv6 24576 1 nf_conntrack nf_defrag_ipv4 16384 1 nf_conntrack xt_addrtype 16384 2 iptable_filter 16384 1 bpfilter 32768 0 br_netfilter 28672 0 bridge 176128 1 br_netfilter stp 16384 1 bridge llc 16384 2 bridge,stp aufs 262144 0 xfs 1286144 2 overlay 118784 0 rdma_ucm 28672 0 rdma_cm 110592 1 rdma_ucm iw_cm 49152 1 rdma_cm ib_ipoib 131072 0 ib_cm 114688 2 rdma_cm,ib_ipoib ib_umad 28672 8 nls_iso8859_1 16384 1 dm_multipath 32768 0 scsi_dh_rdac 16384 0 scsi_dh_emc 16384 0 scsi_dh_alua 20480 0 mlx5_ib 397312 0 ib_uverbs 139264 18 rdma_ucm,mlx5_ib input_leds 16384 0 serio_raw 20480 0 kvm_amd 98304 0 ccp 90112 1 kvm_amd joydev 24576 0 kvm 667648 1 kvm_amd ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm tenstorrent 32768 0 sch_fq_codel 20480 45 binfmt_misc 24576 1 msr 16384 0 efi_pstore 16384 0 virtio_rng 16384 0 ip_tables 32768 2 iptable_filter,iptable_nat x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE autofs4 45056 2 btrfs 1269760 0 zstd_compress 167936 1 btrfs raid10 61440 0 raid456 155648 0 async_raid6_recov 24576 1 raid456 async_memcpy 20480 2 raid456,async_raid6_recov async_pq 24576 2 raid456,async_raid6_recov async_xor 20480 3 async_pq,raid456,async_raid6_recov async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov xor 24576 2 async_xor,btrfs raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 raid1 45056 0 raid0 24576 0 multipath 20480 0 linear 20480 0 hid_generic 16384 0 crct10dif_pclmul 16384 1 crc32_pclmul 16384 0 usbhid 57344 0 cirrus 16384 0 ghash_clmulni_intel 16384 0 hid 131072 2 usbhid,hid_generic aesni_intel 372736 0 mlx5_core 1626112 1 mlx5_ib crypto_simd 16384 1 aesni_intel drm_kms_helper 184320 3 cirrus syscopyarea 16384 1 drm_kms_helper sysfillrect 16384 1 drm_kms_helper sysimgblt 16384 1 drm_kms_helper fb_sys_fops 16384 1 drm_kms_helper pci_hyperv_intf 16384 1 mlx5_core mlxdevm 172032 1 mlx5_core cryptd 24576 2 crypto_simd,ghash_clmulni_intel auxiliary 16384 2 mlx5_ib,mlx5_core glue_helper 16384 1 aesni_intel mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core virtio_blk 20480 3 tls 73728 1 mlx5_core ahci 40960 0 drm 495616 3 drm_kms_helper,cirrus psmouse 155648 0 libahci 36864 1 ahci mlxfw 32768 1 mlx5_core psample 20480 1 mlx5_core +2025-02-13T20:06:51.5207605Z + [[ 0 -ne 0 ]] +2025-02-13T20:06:51.5207871Z ++ lsof -w /dev/tenstorrent/0 +2025-02-13T20:06:51.6508683Z + lsof_output= +2025-02-13T20:06:51.6509511Z + '[' -n '' ']' +2025-02-13T20:06:51.6511277Z ##[notice]Touching and printing out SMI info +2025-02-13T20:06:51.6512406Z + i=0 +2025-02-13T20:06:51.6512672Z + iter_limit=10 +2025-02-13T20:06:51.6513228Z + echo '::notice title=printing-smi-info-startup::Touching and printing out SMI info' +2025-02-13T20:06:51.6513795Z + sleep 20 +2025-02-13T20:07:11.6522967Z + sudo touch /opt/tt_metal_infra/smi.log +2025-02-13T20:07:11.6785253Z + sudo chown ubuntu /opt/tt_metal_infra/smi.log +2025-02-13T20:07:11.7011495Z + tt-smi-metal -s -f /opt/tt_metal_infra/smi.log +2025-02-13T20:07:12.0504887Z +2025-02-13T20:07:12.0529598Z  Detected Chips: 1 +2025-02-13T20:07:12.0530296Z  +2025-02-13T20:07:12.0530636Z  Detected Chips: 1 +2025-02-13T20:07:12.0530861Z +2025-02-13T20:07:12.0531040Z  Detecting ARC: | 
+2025-02-13T20:07:12.0531246Z +2025-02-13T20:07:12.0531431Z  Detecting DRAM: | +2025-02-13T20:07:12.0532122Z +2025-02-13T20:07:12.0534815Z [] ETH: | +2025-02-13T20:07:12.0593599Z Gathering Information ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 +2025-02-13T20:07:12.0665878Z  Saved tt-smi log to: /opt/tt_metal_infra/smi.log  +2025-02-13T20:07:12.1045527Z + cat /opt/tt_metal_infra/smi.log +2025-02-13T20:07:12.1054135Z { +2025-02-13T20:07:12.1055780Z + echo '::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first' +2025-02-13T20:07:12.1056556Z "time": "2025-02-13T20:07:12.053106", +2025-02-13T20:07:12.1056952Z "host_info": { +2025-02-13T20:07:12.1057260Z "OS": "Linux", +2025-02-13T20:07:12.1057587Z "Distro": "Ubuntu 20.04.3 LTS", +2025-02-13T20:07:12.1057972Z "Kernel": "5.4.0-205-generic", +2025-02-13T20:07:12.1058397Z "Hostname": "3b996119-328e-4871-b980-bb63dcfbb963", +2025-02-13T20:07:12.1058811Z "Platform": "x86_64", +2025-02-13T20:07:12.1059187Z "Python": "3.8.10", +2025-02-13T20:07:12.1059653Z "Memory": "47.14 GB", +2025-02-13T20:07:12.1060078Z "Driver": "TTKMD 1.26" +2025-02-13T20:07:12.1061255Z }, +2025-02-13T20:07:12.1061675Z "device_info": [ +2025-02-13T20:07:12.1062529Z + sleep 30 +2025-02-13T20:07:12.1062809Z { +2025-02-13T20:07:12.1063061Z "smbus_telem": { +2025-02-13T20:07:12.1063353Z "BOARD_ID": "0x10000331152302e", +2025-02-13T20:07:12.1063802Z "SMBUS_TX_ENUM_VERSION": "0xba5e0001", +2025-02-13T20:07:12.1064151Z "SMBUS_TX_DEVICE_ID": "0xfaca1e52", +2025-02-13T20:07:12.1064489Z "SMBUS_TX_ASIC_RO": null, +2025-02-13T20:07:12.1064817Z "SMBUS_TX_ASIC_IDD": null, +2025-02-13T20:07:12.1065396Z "SMBUS_TX_BOARD_ID_HIGH": "0x1000033", +2025-02-13T20:07:12.1065759Z "SMBUS_TX_BOARD_ID_LOW": "0x1152302e", +2025-02-13T20:07:12.1066109Z "SMBUS_TX_ARC0_FW_VERSION": "0x1070000", +2025-02-13T20:07:12.1066479Z "SMBUS_TX_ARC1_FW_VERSION": "0x1070000", +2025-02-13T20:07:12.1066824Z "SMBUS_TX_ARC2_FW_VERSION": null, +2025-02-13T20:07:12.1067171Z "SMBUS_TX_ARC3_FW_VERSION": "0x1070000", +2025-02-13T20:07:12.1067529Z "SMBUS_TX_SPIBOOTROM_FW_VERSION": null, +2025-02-13T20:07:12.1067875Z "SMBUS_TX_ETH_FW_VERSION": null, +2025-02-13T20:07:12.1068216Z "SMBUS_TX_M3_BL_FW_VERSION": null, +2025-02-13T20:07:12.1068559Z "SMBUS_TX_M3_APP_FW_VERSION": null, +2025-02-13T20:07:12.1068967Z "SMBUS_TX_DDR_SPEED": "0xe74", +2025-02-13T20:07:12.1069305Z "SMBUS_TX_DDR_STATUS": "0x111111", +2025-02-13T20:07:12.1069634Z "SMBUS_TX_ETH_STATUS0": null, +2025-02-13T20:07:12.1069952Z "SMBUS_TX_ETH_STATUS1": null, +2025-02-13T20:07:12.1070284Z "SMBUS_TX_PCIE_STATUS": "0x11040040", +2025-02-13T20:07:12.1070616Z "SMBUS_TX_FAULTS": null, +2025-02-13T20:07:12.1071161Z "SMBUS_TX_ARC0_HEALTH": "0xf2e2dcf", +2025-02-13T20:07:12.1071507Z "SMBUS_TX_ARC1_HEALTH": null, +2025-02-13T20:07:12.1071832Z "SMBUS_TX_ARC2_HEALTH": null, +2025-02-13T20:07:12.1072148Z "SMBUS_TX_ARC3_HEALTH": null, +2025-02-13T20:07:12.1072464Z "SMBUS_TX_FAN_SPEED": "0xff", +2025-02-13T20:07:12.1072781Z "SMBUS_TX_AICLK": "0x4b200fa", +2025-02-13T20:07:12.1073097Z "SMBUS_TX_AXICLK": "0x384", +2025-02-13T20:07:12.1073416Z "SMBUS_TX_ARCCLK": "0x21c", +2025-02-13T20:07:12.1073729Z "SMBUS_TX_THROTTLER": null, +2025-02-13T20:07:12.1074046Z "SMBUS_TX_VCORE": "0x2e4", +2025-02-13T20:07:12.1074381Z "SMBUS_TX_ASIC_TEMPERATURE": "0x2cf0204", +2025-02-13T20:07:12.1074731Z "SMBUS_TX_VREG_TEMPERATURE": null, +2025-02-13T20:07:12.1075071Z "SMBUS_TX_BOARD_TEMPERATURE": null, +2025-02-13T20:07:12.1075402Z "SMBUS_TX_TDP": "0xaa0016", 
+2025-02-13T20:07:12.1075717Z "SMBUS_TX_TDC": "0x12c001c", +2025-02-13T20:07:12.1076242Z "SMBUS_TX_VDD_LIMITS": "0x3a202e4", +2025-02-13T20:07:12.1076586Z "SMBUS_TX_THM_LIMITS": "0x53004b", +2025-02-13T20:07:12.1076917Z "SMBUS_TX_WH_FW_DATE": "0x3b171127", +2025-02-13T20:07:12.1077251Z "SMBUS_TX_ASIC_TMON0": "0x22220721", +2025-02-13T20:07:12.1077584Z "SMBUS_TX_ASIC_TMON1": "0x2121", +2025-02-13T20:07:12.1077917Z "SMBUS_TX_MVDDQ_POWER": null, +2025-02-13T20:07:12.1078253Z "SMBUS_TX_GDDR_TRAIN_TEMP0": null, +2025-02-13T20:07:12.1078673Z "SMBUS_TX_GDDR_TRAIN_TEMP1": null, +2025-02-13T20:07:12.1079014Z "SMBUS_TX_BOOT_DATE": "0x520c0b18", +2025-02-13T20:07:12.1079340Z "SMBUS_TX_RT_SECONDS": null, +2025-02-13T20:07:12.1079670Z "SMBUS_TX_AUX_STATUS": null, +2025-02-13T20:07:12.1080086Z "SMBUS_TX_ETH_DEBUG_STATUS0": null, +2025-02-13T20:07:12.1080440Z "SMBUS_TX_ETH_DEBUG_STATUS1": null, +2025-02-13T20:07:12.1080799Z "SMBUS_TX_TT_FLASH_VERSION": "0x50040000" +2025-02-13T20:07:12.1081226Z }, +2025-02-13T20:07:12.1081460Z "board_info": { +2025-02-13T20:07:12.1081736Z "bus_id": "0000:07:00.0", +2025-02-13T20:07:12.1082032Z "board_type": "e150", +2025-02-13T20:07:12.1082337Z "board_id": "010000331152302e", +2025-02-13T20:07:12.1082656Z "coords": "N/A", +2025-02-13T20:07:12.1083166Z "dram_status": true, +2025-02-13T20:07:12.1083485Z "dram_speed": "3700", +2025-02-13T20:07:12.1083925Z "pcie_speed": 4, +2025-02-13T20:07:12.1084212Z "pcie_width": 16 +2025-02-13T20:07:12.1084484Z }, +2025-02-13T20:07:12.1084715Z "telemetry": { +2025-02-13T20:07:12.1084989Z "voltage": "0.74", +2025-02-13T20:07:12.1085278Z "current": " 28.0", +2025-02-13T20:07:12.1085578Z "power": " 22.0", +2025-02-13T20:07:12.1085862Z "aiclk": " 250", +2025-02-13T20:07:12.1086157Z "asic_temperature": "32.2" +2025-02-13T20:07:12.1086462Z }, +2025-02-13T20:07:12.1086700Z "firmwares": { +2025-02-13T20:07:12.1086955Z "arc_fw": "1.7.0.0", +2025-02-13T20:07:12.1087254Z "arc_fw_date": "2023-11-23", +2025-02-13T20:07:12.1087562Z "eth_fw": "N/A", +2025-02-13T20:07:12.1087840Z "m3_bl_fw": "N/A", +2025-02-13T20:07:12.1088133Z "m3_app_fw": "N/A", +2025-02-13T20:07:12.1088435Z "tt_flash_version": "80.4.0.0" +2025-02-13T20:07:12.1088740Z }, +2025-02-13T20:07:12.1088969Z "limits": { +2025-02-13T20:07:12.1089216Z "vdd_min": "0.74", +2025-02-13T20:07:12.1089499Z "vdd_max": "0.93", +2025-02-13T20:07:12.1089782Z "tdp_limit": "170", +2025-02-13T20:07:12.1090173Z "tdc_limit": "300", +2025-02-13T20:07:12.1090472Z "asic_fmax": "1202", +2025-02-13T20:07:12.1090779Z "therm_trip_l1_limit": "83", +2025-02-13T20:07:12.1091094Z "thm_limit": "75", +2025-02-13T20:07:12.1091394Z "bus_peak_limit": null +2025-02-13T20:07:12.1091673Z } +2025-02-13T20:07:12.1091889Z } +2025-02-13T20:07:12.1092102Z ] +2025-02-13T20:07:12.1092704Z }::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first +2025-02-13T20:07:42.1070218Z + '[' 0 -lt 10 ']' +2025-02-13T20:07:42.1070759Z + (( i++ )) +2025-02-13T20:07:42.1072898Z ++ tt-smi-metal -r 0 +2025-02-13T20:07:42.5190532Z + reset_output=' Starting tensix reset on GS board at pci index 0  +2025-02-13T20:07:42.5191289Z  Lowering clks to safe value...  +2025-02-13T20:07:42.5191776Z  Beginning reset sequence...  +2025-02-13T20:07:42.5192248Z  Finishing reset sequence...  +2025-02-13T20:07:42.5192808Z  Returning clks to original values...  +2025-02-13T20:07:42.5193420Z  Finished tensix reset on GS board at pci index 0 +2025-02-13T20:07:42.5194119Z  +2025-02-13T20:07:42.5194576Z  Re-initializing boards after reset....  
+2025-02-13T20:07:42.5194911Z +2025-02-13T20:07:42.5195187Z  Detected Chips: 1 +2025-02-13T20:07:42.5195599Z  +2025-02-13T20:07:42.5196019Z  Detected Chips: 1 +2025-02-13T20:07:42.5196302Z +2025-02-13T20:07:42.5196488Z  Detecting ARC: | +2025-02-13T20:07:42.5196736Z +2025-02-13T20:07:42.5196913Z  Detecting DRAM: | +2025-02-13T20:07:42.5197154Z +2025-02-13T20:07:42.5197324Z [] ETH: |' +2025-02-13T20:07:42.5197685Z + [[ 0 -ne 0 ]] +2025-02-13T20:07:42.5198155Z + [[  Starting tensix reset on GS board at pci index 0  +2025-02-13T20:07:42.5198682Z  Lowering clks to safe value...  +2025-02-13T20:07:42.5199150Z  Beginning reset sequence...  +2025-02-13T20:07:42.5199582Z  Finishing reset sequence...  +2025-02-13T20:07:42.5200075Z  Returning clks to original values...  +2025-02-13T20:07:42.5201329Z  Finished tensix reset on GS board at pci index 0 +2025-02-13T20:07:42.5201780Z  +2025-02-13T20:07:42.5202333Z  Re-initializing boards after reset....  +2025-02-13T20:07:42.5202722Z +2025-02-13T20:07:42.5202908Z  Detected Chips: 1 +2025-02-13T20:07:42.5203301Z  +2025-02-13T20:07:42.5203651Z  Detected Chips: 1 +2025-02-13T20:07:42.5203860Z +2025-02-13T20:07:42.5204082Z  Detecting ARC: | +2025-02-13T20:07:42.5204286Z +2025-02-13T20:07:42.5204500Z  Detecting DRAM: | +2025-02-13T20:07:42.5204713Z +2025-02-13T20:07:42.5205009Z [] ETH: | == *\N\o\ \c\h\i\p\s\ \d\e\t\e\c\t\e\d* ]] +2025-02-13T20:07:42.5205449Z + break +2025-02-13T20:07:42.5205741Z + '[' 1 -eq 10 ']' +2025-02-13T20:07:42.5206446Z + echo '::notice title=reset-successful-startup::tt-smi reset was successful' +2025-02-13T20:07:42.5207051Z + check_hugepages_service_status=0 +2025-02-13T20:07:42.5207508Z + sudo systemctl status tenstorrent-hugepages.service +2025-02-13T20:07:42.5209472Z ##[notice]tt-smi reset was successful +2025-02-13T20:07:42.5476131Z Unit tenstorrent-hugepages.service could not be found. +2025-02-13T20:07:42.5483862Z + check_hugepages_service_status=4 +2025-02-13T20:07:42.5484332Z + '[' 4 -eq 4 ']' +2025-02-13T20:07:42.5485152Z + echo '::warning title=hugepages-service-not-found-startup::Hugepages service not found. Using old rc.local method' +2025-02-13T20:07:42.5485860Z + sudo /etc/rc.local +2025-02-13T20:07:42.5488441Z ##[warning]Hugepages service not found. Using old rc.local method +2025-02-13T20:08:12.5927706Z ++ date +%s +2025-02-13T20:08:12.5933031Z + hugepages_check_start=1739477292 +2025-02-13T20:08:12.5933635Z + hugepages_check_timeout=60 +2025-02-13T20:08:12.5936229Z ++ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages +2025-02-13T20:08:12.5944834Z + [[ 1 -eq 0 ]] +2025-02-13T20:08:12.5947550Z ##[notice]Hugepages is now setup. +2025-02-13T20:08:12.5950041Z Printing out cpu information... +2025-02-13T20:08:12.5950702Z + echo '::notice title=hugepages-setup-success-startup::Hugepages is now setup.' +2025-02-13T20:08:12.5951346Z + echo 'Printing out cpu information...' 
+2025-02-13T20:08:12.5951771Z + lscpu +2025-02-13T20:08:12.5973057Z Architecture: x86_64 +2025-02-13T20:08:12.5973514Z CPU op-mode(s): 32-bit, 64-bit +2025-02-13T20:08:12.5973974Z Byte Order: Little Endian +2025-02-13T20:08:12.5974474Z Address sizes: 40 bits physical, 48 bits virtual +2025-02-13T20:08:12.5974956Z CPU(s): 14 +2025-02-13T20:08:12.5975364Z On-line CPU(s) list: 0-13 +2025-02-13T20:08:12.5977022Z Thread(s) per core: 1 +2025-02-13T20:08:12.5977874Z Core(s) per socket: 1 +2025-02-13T20:08:12.5978395Z Socket(s): 14 +2025-02-13T20:08:12.5978858Z NUMA node(s): 2 +2025-02-13T20:08:12.5979365Z Vendor ID: AuthenticAMD +2025-02-13T20:08:12.5979830Z CPU family: 23 +2025-02-13T20:08:12.5980230Z Model: 49 +2025-02-13T20:08:12.5980696Z Model name: AMD EPYC-Rome Processor +2025-02-13T20:08:12.5981152Z Stepping: 0 +2025-02-13T20:08:12.5981552Z CPU MHz: 3000.000 +2025-02-13T20:08:12.5981976Z BogoMIPS: 6000.00 +2025-02-13T20:08:12.5982370Z Virtualization: AMD-V +2025-02-13T20:08:12.5982798Z Hypervisor vendor: KVM +2025-02-13T20:08:12.5983212Z Virtualization type: full +2025-02-13T20:08:12.5983631Z L1d cache: 448 KiB +2025-02-13T20:08:12.5984033Z L1i cache: 448 KiB +2025-02-13T20:08:12.5984461Z L2 cache: 7 MiB +2025-02-13T20:08:12.5984933Z L3 cache: 224 MiB +2025-02-13T20:08:12.5985325Z NUMA node0 CPU(s): 0-6 +2025-02-13T20:08:12.5985684Z NUMA node1 CPU(s): 7-13 +2025-02-13T20:08:12.5986416Z Vulnerability Gather data sampling: Not affected +2025-02-13T20:08:12.5986881Z Vulnerability Itlb multihit: Not affected +2025-02-13T20:08:12.5987328Z Vulnerability L1tf: Not affected +2025-02-13T20:08:12.5987772Z Vulnerability Mds: Not affected +2025-02-13T20:08:12.5988228Z Vulnerability Meltdown: Not affected +2025-02-13T20:08:12.5988682Z Vulnerability Mmio stale data: Not affected +2025-02-13T20:08:12.5989120Z Vulnerability Retbleed: Vulnerable +2025-02-13T20:08:12.5991748Z Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +2025-02-13T20:08:12.5992541Z Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +2025-02-13T20:08:12.5993578Z Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +2025-02-13T20:08:12.5994337Z Vulnerability Srbds: Not affected +2025-02-13T20:08:12.5994733Z Vulnerability Tsx async abort: Not affected +2025-02-13T20:08:12.5997044Z Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr wbnoinvd arat npt nrip_save umip rdpid +2025-02-13T20:08:12.6224654Z ##[group]Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main +2025-02-13T20:08:12.6225153Z with: +2025-02-13T20:08:12.6225573Z token: *** +2025-02-13T20:08:12.6225802Z fetch-depth: 1 +2025-02-13T20:08:12.6226070Z env: +2025-02-13T20:08:12.6226277Z LOGURU_LEVEL: INFO +2025-02-13T20:08:12.6226508Z ##[endgroup] +2025-02-13T20:08:12.6307520Z ##[group]Run set -x +2025-02-13T20:08:12.6307792Z set -x +2025-02-13T20:08:12.6308031Z ls -al +2025-02-13T20:08:12.6308325Z if 
[ -f "semicolon_delimited_script" ]; then +2025-02-13T20:08:12.6308699Z  file semicolon_delimited_script +2025-02-13T20:08:12.6309038Z  head semicolon_delimited_script +2025-02-13T20:08:12.6309334Z fi +2025-02-13T20:08:12.6309565Z sudo rm -rf deleteme +2025-02-13T20:08:12.6309854Z sudo rm -rf docker-job +2025-02-13T20:08:12.6310150Z if [ -d ".git" ]; then +2025-02-13T20:08:12.6310461Z  echo 'Cleaning repo' +2025-02-13T20:08:12.6310748Z  git clean -xffd +2025-02-13T20:08:12.6311032Z  echo 'Done git clean -xffd' +2025-02-13T20:08:12.6311383Z  echo 'Attempting to delete any lock files' +2025-02-13T20:08:12.6311766Z  find .git -type f -iname '*.lock' -delete +2025-02-13T20:08:12.6312135Z  echo 'Done deleting lock files' +2025-02-13T20:08:12.6312475Z  echo 'De-init-ing submodules' +2025-02-13T20:08:12.6312892Z  git submodule deinit -f --all +2025-02-13T20:08:12.6313256Z  echo 'Done de-initing submodules' +2025-02-13T20:08:12.6313580Z fi +2025-02-13T20:08:12.6327040Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:08:12.6327430Z env: +2025-02-13T20:08:12.6327663Z LOGURU_LEVEL: INFO +2025-02-13T20:08:12.6327910Z ##[endgroup] +2025-02-13T20:08:12.6363828Z + ls -al +2025-02-13T20:08:12.6382006Z total 359940 +2025-02-13T20:08:12.6382470Z drwxr-xr-x 16 ubuntu ubuntu 4096 Feb 13 20:06 . +2025-02-13T20:08:12.6383029Z + '[' -f semicolon_delimited_script ']' +2025-02-13T20:08:12.6383433Z + sudo rm -rf deleteme +2025-02-13T20:08:12.6383850Z drwxr-xr-x 3 ubuntu ubuntu 4096 Apr 15 2024 .. +2025-02-13T20:08:12.6384359Z -rw-r--r-- 1 ubuntu ubuntu 3966 Jan 28 07:20 .clang-format +2025-02-13T20:08:12.6385056Z -rw-r--r-- 1 ubuntu ubuntu 6268 Jan 28 07:20 .clang-format-ignore +2025-02-13T20:08:12.6385551Z -rw-r--r-- 1 ubuntu ubuntu 6374 Jan 28 07:20 .clang-tidy +2025-02-13T20:08:12.6385970Z -rw-r--r-- 1 ubuntu ubuntu 43 Jan 28 07:20 .clangd +2025-02-13T20:08:12.6386382Z -rw-r--r-- 1 ubuntu ubuntu 222 Jan 28 07:20 .gersemirc +2025-02-13T20:08:12.6386813Z drwxr-xr-x 9 ubuntu ubuntu 4096 Feb 13 20:06 .git +2025-02-13T20:08:12.6387273Z -rw-r--r-- 1 ubuntu ubuntu 239 Jan 28 07:20 .git-blame-ignore-revs +2025-02-13T20:08:12.6387840Z -rw-r--r-- 1 ubuntu ubuntu 35 Jan 28 07:20 .gitattributes +2025-02-13T20:08:12.6388307Z drwxr-xr-x 6 ubuntu ubuntu 4096 Feb 13 08:54 .github +2025-02-13T20:08:12.6388769Z -rw-r--r-- 1 ubuntu ubuntu 1730 Jan 28 07:20 .gitignore +2025-02-13T20:08:12.6389201Z -rw-r--r-- 1 ubuntu ubuntu 991 Feb 2 10:06 .gitmodules +2025-02-13T20:08:12.6389691Z -rw-r--r-- 1 ubuntu ubuntu 932 Jan 28 07:20 .pre-commit-config.yaml +2025-02-13T20:08:12.6390219Z -rw-r--r-- 1 ubuntu ubuntu 15813574 Feb 13 08:54 .test_durations +2025-02-13T20:08:12.6390721Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 20:06 .ttnn_runtime_artifacts +2025-02-13T20:08:12.6391211Z -rw-r--r-- 1 ubuntu ubuntu 213 Jan 28 07:20 .yamllint +2025-02-13T20:08:12.6391664Z -rw-r--r-- 1 ubuntu ubuntu 11086 Feb 13 08:54 CMakeLists.txt +2025-02-13T20:08:12.6392145Z -rw-r--r-- 1 ubuntu ubuntu 2231 Feb 2 10:06 CMakePresets.json +2025-02-13T20:08:12.6392599Z -rw-r--r-- 1 ubuntu ubuntu 11478 Feb 13 08:54 CODEOWNERS +2025-02-13T20:08:12.6393143Z -rw-r--r-- 1 ubuntu ubuntu 5253 Jan 28 07:20 CODE_OF_CONDUCT.md +2025-02-13T20:08:12.6393788Z -rw-r--r-- 1 ubuntu ubuntu 36527 Jan 28 07:20 CONTRIBUTING.md +2025-02-13T20:08:12.6394257Z -rw-r--r-- 1 ubuntu ubuntu 126373 Jan 28 07:20 Doxyfile +2025-02-13T20:08:12.6394706Z -rw-r--r-- 1 ubuntu ubuntu 6046 Feb 2 10:06 INSTALLING.md +2025-02-13T20:08:12.6395155Z -rw-r--r-- 1 ubuntu ubuntu 11825 Jan 28 
07:20 LICENSE +2025-02-13T20:08:12.6395594Z -rw-r--r-- 1 ubuntu ubuntu 1562 Jan 28 07:20 MANIFEST.in +2025-02-13T20:08:12.6396356Z -rw-r--r-- 1 ubuntu ubuntu 18372 Feb 13 08:54 METALIUM_GUIDE.md +2025-02-13T20:08:12.6397482Z -rw-r--r-- 1 ubuntu ubuntu 15526 Feb 13 08:54 README.md +2025-02-13T20:08:12.6398626Z -rwxr-xr-x 1 ubuntu ubuntu 11097 Feb 13 08:54 build_metal.sh +2025-02-13T20:08:12.6399770Z -rw-r--r-- 1 ubuntu ubuntu 1438 Jan 28 07:20 check_copyright_config.yaml +2025-02-13T20:08:12.6400809Z -rw-r--r-- 1 ubuntu ubuntu 1821 Jan 28 07:20 cloc.sh +2025-02-13T20:08:12.6401798Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 08:54 cmake +2025-02-13T20:08:12.6402839Z -rw-r--r-- 1 ubuntu ubuntu 23178 Feb 13 08:54 conftest.py +2025-02-13T20:08:12.6403866Z drwxr-xr-x 2 ubuntu ubuntu 4096 Jan 28 07:20 contributing +2025-02-13T20:08:12.6405017Z -rwxr-xr-x 1 ubuntu ubuntu 1420 Jan 28 07:20 create_venv.sh +2025-02-13T20:08:12.6406172Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 13 08:54 dependencies +2025-02-13T20:08:12.6407178Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 13 08:54 dockerfile +2025-02-13T20:08:12.6408167Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 7 17:42 docs +2025-02-13T20:08:12.6409222Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 5 15:56 infra +2025-02-13T20:08:12.6410248Z -rwxr-xr-x 1 ubuntu ubuntu 6885 Feb 13 08:54 install_dependencies.sh +2025-02-13T20:08:12.6411289Z -rw-r--r-- 1 ubuntu ubuntu 1042 Jan 28 07:20 pyproject.toml +2025-02-13T20:08:12.6412286Z -rw-r--r-- 1 ubuntu ubuntu 1200 Jan 28 07:20 pytest.ini +2025-02-13T20:08:12.6413572Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 08:54 scripts +2025-02-13T20:08:12.6414549Z -rw-r--r-- 1 ubuntu ubuntu 7551 Feb 5 15:56 setup.py +2025-02-13T20:08:12.6415551Z drwxr-xr-x 24 ubuntu ubuntu 4096 Jan 28 07:20 tech_reports +2025-02-13T20:08:12.6416582Z drwxr-xr-x 11 ubuntu ubuntu 4096 Feb 13 08:54 tests +2025-02-13T20:08:12.6417769Z drwxr-xr-x 11 ubuntu ubuntu 4096 Feb 13 08:54 tt-train +2025-02-13T20:08:12.6419272Z drwxr-xr-x 5 ubuntu ubuntu 4096 Feb 13 20:02 tt_fabric +2025-02-13T20:08:12.6420503Z -rw-r--r-- 1 ubuntu ubuntu 138013606 Feb 13 20:02 ttnn-0.56.0rc29.dev11+any-cp38-cp38-linux_x86_64.whl +2025-02-13T20:08:12.6421709Z -rw-r--r-- 1 ubuntu ubuntu 214282272 Feb 13 20:03 ttnn-0.56.0rc29.dev11+any.tar.gz +2025-02-13T20:08:12.6756401Z + sudo rm -rf docker-job +2025-02-13T20:08:12.7006618Z + '[' -d .git ']' +2025-02-13T20:08:12.7007386Z + echo 'Cleaning repo' +2025-02-13T20:08:12.7007731Z + git clean -xffd +2025-02-13T20:08:12.7008020Z Cleaning repo +2025-02-13T20:08:14.8382641Z Removing .ttnn_runtime_artifacts/ +2025-02-13T20:08:14.8383210Z Removing tests/end_to_end_tests/.pytest_cache/ +2025-02-13T20:08:14.8383810Z Removing tests/end_to_end_tests/.ttnn_runtime_artifacts/ +2025-02-13T20:08:14.8384245Z Removing tests/end_to_end_tests/__pycache__/ +2025-02-13T20:08:14.8384594Z Removing tests/end_to_end_tests/env/ +2025-02-13T20:08:14.8385014Z Removing ttnn-0.56.0rc29.dev11+any-cp38-cp38-linux_x86_64.whl +2025-02-13T20:08:14.8385521Z Removing ttnn-0.56.0rc29.dev11+any.tar.gz +2025-02-13T20:08:14.8413511Z + echo 'Done git clean -xffd' +2025-02-13T20:08:14.8413802Z Done git clean -xffd +2025-02-13T20:08:14.8414109Z + echo 'Attempting to delete any lock files' +2025-02-13T20:08:14.8414453Z + find .git -type f -iname '*.lock' -delete +2025-02-13T20:08:14.8414788Z Attempting to delete any lock files +2025-02-13T20:08:14.8620457Z + echo 'Done deleting lock files' +2025-02-13T20:08:14.8621148Z Done deleting lock files +2025-02-13T20:08:14.8621604Z + echo 'De-init-ing submodules' 
+2025-02-13T20:08:14.8622037Z + git submodule deinit -f --all +2025-02-13T20:08:14.8622414Z De-init-ing submodules +2025-02-13T20:08:14.8876944Z could not create empty submodule directory models/demos/t3000/llama2_70b/reference/llamaSubmodule 'models/demos/t3000/llama2_70b/reference/llama' (https://github.com/tenstorrent-metal/llama.git) unregistered for path 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:08:14.8878783Z Cleared directory 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:14.8890240Z Submodule '3rd_party/wandb-cpp' (https://github.com/yhisaki/wandb-cpp) unregistered for path 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:08:14.8904786Z could not create empty submodule directory tt_metal/third_party/tracySubmodule 'tt_metal/third_party/tracy' (https://github.com/tenstorrent-metal/tracy.git) unregistered for path 'tt_metal/third_party/tracy' +2025-02-13T20:08:14.8919094Z could not create empty submodule directory tt_metal/third_party/tt_llk_blackholeSubmodule 'tt_metal/third_party/tt_llk_blackhole' (https://github.com/tenstorrent/tt-llk-bh.git) unregistered for path 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:08:14.8931805Z could not create empty submodule directory tt_metal/third_party/tt_llk_grayskullSubmodule 'tt_metal/third_party/tt_llk_grayskull' (https://github.com/tenstorrent/tt-llk-gs.git) unregistered for path 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:08:14.8953630Z could not create empty submodule directory tt_metal/third_party/tt_llk_wormhole_b0Submodule 'tt_metal/third_party/tt_llk_wormhole_b0' (https://github.com/tenstorrent/tt-llk-wh-b0.git) unregistered for path 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:08:14.8968470Z could not create empty submodule directory tt_metal/third_party/umdSubmodule 'tt_metal/third_party/umd' (https://github.com/tenstorrent/tt-umd.git) unregistered for path 'tt_metal/third_party/umd' +2025-02-13T20:08:14.8977038Z + echo 'Done de-initing submodules' +2025-02-13T20:08:14.8977495Z Done de-initing submodules +2025-02-13T20:08:14.9078884Z ##[group]Run actions/checkout@v4 +2025-02-13T20:08:14.9079292Z with: +2025-02-13T20:08:14.9080019Z token: *** +2025-02-13T20:08:14.9080405Z fetch-depth: 1 +2025-02-13T20:08:14.9080719Z lfs: false +2025-02-13T20:08:14.9081017Z submodules: recursive +2025-02-13T20:08:14.9081337Z clean: true +2025-02-13T20:08:14.9081618Z repository: tenstorrent/tt-metal +2025-02-13T20:08:14.9082348Z ssh-strict: true +2025-02-13T20:08:14.9082657Z ssh-user: git +2025-02-13T20:08:14.9082973Z persist-credentials: true +2025-02-13T20:08:14.9083348Z sparse-checkout-cone-mode: true +2025-02-13T20:08:14.9083718Z fetch-tags: false +2025-02-13T20:08:14.9084022Z show-progress: true +2025-02-13T20:08:14.9084337Z set-safe-directory: true +2025-02-13T20:08:14.9084647Z env: +2025-02-13T20:08:14.9084932Z LOGURU_LEVEL: INFO +2025-02-13T20:08:14.9085248Z ##[endgroup] +2025-02-13T20:08:15.0188998Z Syncing repository: tenstorrent/tt-metal +2025-02-13T20:08:15.0190755Z ##[group]Getting Git version info +2025-02-13T20:08:15.0191365Z Working directory is '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal' +2025-02-13T20:08:15.0192083Z [command]/usr/bin/git version +2025-02-13T20:08:15.0192424Z git version 2.25.1 +2025-02-13T20:08:15.0194735Z ##[endgroup] +2025-02-13T20:08:15.0198678Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/34b257f9-c1d1-427f-a115-9614afa862b6/.gitconfig' +2025-02-13T20:08:15.0202362Z Temporarily overriding 
HOME='/home/ubuntu/actions-runner/_work/_temp/34b257f9-c1d1-427f-a115-9614afa862b6' before making global git config changes +2025-02-13T20:08:15.0203362Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:08:15.0207205Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:08:15.0246872Z [command]/usr/bin/git config --local --get remote.origin.url +2025-02-13T20:08:15.0272037Z https://github.com/tenstorrent/tt-metal +2025-02-13T20:08:15.0288939Z ##[group]Removing previously created refs, to avoid conflicts +2025-02-13T20:08:15.0291866Z [command]/usr/bin/git rev-parse --symbolic-full-name --verify --quiet HEAD +2025-02-13T20:08:15.0312179Z refs/heads/sagarwal/multi_page_buffer +2025-02-13T20:08:15.0319230Z [command]/usr/bin/git checkout --detach +2025-02-13T20:13:45.7280857Z FAILED tests/ttnn/unit_tests/test_to_layout.py::test_to_layout_wide_tensor[to_layout=Layout.ROW_MAJOR-from_layout=Layout.TILE-on_device=True-shape=(1, 1, 32, 131072)] +2025-02-13T20:13:45.7281011Z !!!!!!!!!!!!!!!!!!!!!!!!!! stopping after 1 failures !!!!!!!!!!!!!!!!!!!!!!!!!!! +2025-02-13T20:13:45.7281269Z = 1 failed, 628 passed, 1237 skipped, 60602 deselected, 637 warnings in 267.74s (0:04:27) = +2025-02-13T20:13:48.2361669Z  Always | WARNING  | Attempting to push work to Device 0 which is not initialized. Ignoring... +2025-02-13T20:13:49.9802749Z  Device | INFO  | Closing user mode device drivers +2025-02-13T20:13:50.6787954Z Prepare all required actions +2025-02-13T20:13:50.6788454Z Getting action download info +2025-02-13T20:13:50.9646782Z Download action repository 'slackapi/slack-github-action@v1.26.0' (SHA:70cd7be8e40a46e8b0eced40b0de447bdb42f68e) +2025-02-13T20:13:51.5017192Z ##[group]Run ./.github/actions/slack-report +2025-02-13T20:13:51.5017611Z with: +2025-02-13T20:13:51.5018348Z slack_webhook_url: *** +2025-02-13T20:13:51.5018696Z owner: U06CXU895AP +2025-02-13T20:13:51.5019003Z env: +2025-02-13T20:13:51.5019282Z LOGURU_LEVEL: INFO +2025-02-13T20:13:51.5019808Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:13:51.5020607Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:13:51.5021366Z RUNNER_UID: 1000 +2025-02-13T20:13:51.5021726Z RUNNER_GID: 1000 +2025-02-13T20:13:51.5022033Z ##[endgroup] +2025-02-13T20:13:51.5080093Z Prepare all required actions +2025-02-13T20:13:51.5080855Z Getting action download info +2025-02-13T20:13:51.6670633Z Download action repository 'actions/upload-artifact@v4' (SHA:65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08) +2025-02-13T20:13:52.4061967Z ##[group]Run ./.github/actions/upload-artifact-with-job-uuid +2025-02-13T20:13:52.4062422Z with: +2025-02-13T20:13:52.4062770Z path: generated/test_reports/ + +2025-02-13T20:13:52.4063233Z prefix: test_reports_ +2025-02-13T20:13:52.4063570Z env: +2025-02-13T20:13:52.4063890Z LOGURU_LEVEL: INFO +2025-02-13T20:13:52.4064241Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:13:52.4065048Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:13:52.4065776Z RUNNER_UID: 1000 +2025-02-13T20:13:52.4066110Z RUNNER_GID: 1000 +2025-02-13T20:13:52.4066434Z ##[endgroup] +2025-02-13T20:13:52.4140669Z ##[group]Run uuid=$(uuidgen) +2025-02-13T20:13:52.4141262Z uuid=$(uuidgen) +2025-02-13T20:13:52.4141680Z 
artifact_name="test_reports_$uuid" +2025-02-13T20:13:52.4142279Z echo "[UPLOAD-ARTIFACT-UUID] $artifact_name" +2025-02-13T20:13:52.4142780Z echo "artifact-name=$artifact_name" >> "$GITHUB_OUTPUT" +2025-02-13T20:13:52.4162505Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:13:52.4162980Z env: +2025-02-13T20:13:52.4163323Z LOGURU_LEVEL: INFO +2025-02-13T20:13:52.4163717Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:13:52.4164554Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:13:52.4165363Z RUNNER_UID: 1000 +2025-02-13T20:13:52.4165766Z RUNNER_GID: 1000 +2025-02-13T20:13:52.4166181Z ##[endgroup] +2025-02-13T20:13:52.4215775Z [UPLOAD-ARTIFACT-UUID] test_reports_36168e86-3fe3-4807-94c7-1f22471b0c56 +2025-02-13T20:13:52.4305300Z ##[group]Run actions/upload-artifact@v4 +2025-02-13T20:13:52.4305777Z with: +2025-02-13T20:13:52.4306168Z name: test_reports_36168e86-3fe3-4807-94c7-1f22471b0c56 +2025-02-13T20:13:52.4306649Z path: generated/test_reports/ + +2025-02-13T20:13:52.4307062Z if-no-files-found: warn +2025-02-13T20:13:52.4307435Z compression-level: 6 +2025-02-13T20:13:52.4307779Z overwrite: false +2025-02-13T20:13:52.4308075Z include-hidden-files: false +2025-02-13T20:13:52.4308426Z env: +2025-02-13T20:13:52.4308723Z LOGURU_LEVEL: INFO +2025-02-13T20:13:52.4309151Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:13:52.4309942Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:13:52.4310674Z RUNNER_UID: 1000 +2025-02-13T20:13:52.4310999Z RUNNER_GID: 1000 +2025-02-13T20:13:52.4311311Z ##[endgroup] +2025-02-13T20:13:52.6589096Z With the provided path, there will be 1 file uploaded +2025-02-13T20:13:52.6594696Z Artifact name is valid! +2025-02-13T20:13:52.6596333Z Root directory input is valid! +2025-02-13T20:13:52.8619294Z Beginning upload of artifact content to blob storage +2025-02-13T20:13:53.1968720Z Uploaded bytes 53753 +2025-02-13T20:13:53.2564320Z Finished uploading artifact content to blob storage! +2025-02-13T20:13:53.2568243Z SHA256 hash of uploaded artifact zip is 147b6c23147b7b96f86996e4301fc68550c8c4caf316bc389bfbb09dfa6a81e8 +2025-02-13T20:13:53.2570779Z Finalizing artifact upload +2025-02-13T20:13:53.3644936Z Artifact test_reports_36168e86-3fe3-4807-94c7-1f22471b0c56.zip successfully finalized. Artifact ID 2588499743 +2025-02-13T20:13:53.3646153Z Artifact test_reports_36168e86-3fe3-4807-94c7-1f22471b0c56 has been successfully uploaded! Final size is 53753 bytes. Artifact ID is 2588499743 +2025-02-13T20:13:53.3652984Z Artifact download URL: https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/artifacts/2588499743 +2025-02-13T20:13:53.3843083Z Post job cleanup. +2025-02-13T20:13:53.3918539Z Post job cleanup. 
+2025-02-13T20:13:53.4647879Z [command]/usr/bin/git version +2025-02-13T20:13:53.4684195Z git version 2.25.1 +2025-02-13T20:13:53.4732996Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/8186620d-6097-4768-bd9e-4577219272c4/.gitconfig' +2025-02-13T20:13:53.4745168Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/8186620d-6097-4768-bd9e-4577219272c4' before making global git config changes +2025-02-13T20:13:53.4747823Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:13:53.4753683Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:13:53.4783931Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:13:53.4806825Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:13:53.5069267Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:13:53.5109318Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:13:53.5148679Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:13:53.5194577Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:13:53.5249103Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:13:53.5288109Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:13:53.5326843Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:13:53.5376088Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:13:53.5389606Z http.https://github.com/.extraheader +2025-02-13T20:13:53.5399307Z [command]/usr/bin/git config --local --unset-all http.https://github.com/.extraheader +2025-02-13T20:13:53.5419228Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:13:53.5620648Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:13:53.5660238Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:13:53.5699241Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:13:53.5737861Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:13:53.5777059Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:13:53.5815839Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:13:53.5861700Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:13:53.6002745Z Post job cleanup. +2025-02-13T20:13:53.9238258Z [command]/usr/bin/docker logout https://ghcr.io +2025-02-13T20:13:53.9443448Z Removing login credentials for ghcr.io +2025-02-13T20:13:53.9497010Z ##[group]Post cache +2025-02-13T20:13:53.9497597Z State not set +2025-02-13T20:13:53.9499157Z ##[endgroup] +2025-02-13T20:13:53.9730290Z Post job cleanup. +2025-02-13T20:13:53.9791304Z Post job cleanup. 
+2025-02-13T20:13:54.0877791Z [command]/usr/bin/git version +2025-02-13T20:13:54.0914693Z git version 2.25.1 +2025-02-13T20:13:54.0954349Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/554da43a-162d-4688-aa02-dec2208f93f9/.gitconfig' +2025-02-13T20:13:54.0965730Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/554da43a-162d-4688-aa02-dec2208f93f9' before making global git config changes +2025-02-13T20:13:54.0967145Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:13:54.0971518Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:13:54.1002174Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:13:54.1039243Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:13:54.1296699Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:13:54.1338690Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:13:54.1378392Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:13:54.1416476Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:13:54.1459190Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:13:54.1500239Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:13:54.1537107Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:13:54.1590063Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:13:54.1621765Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:13:54.1852831Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:13:54.1898592Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:13:54.1946170Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:13:54.1991871Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:13:54.2036453Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:13:54.2080815Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:13:54.2124248Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:13:54.2281262Z A job completed hook has been configured by the self-hosted runner administrator +2025-02-13T20:13:54.2305857Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/cleanup.sh' +2025-02-13T20:13:54.2316327Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:13:54.2316749Z ##[endgroup] +2025-02-13T20:13:54.2369342Z Current date / time is Thu Feb 13 20:13:54 UTC 2025 +2025-02-13T20:13:54.4164967Z Cleaning up orphan processes diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190251054_annotations.json b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190251054_annotations.json new file mode 100644 index 00000000000..289a8468d0d --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190251054_annotations.json @@ -0,0 +1 @@ 
+[{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":72,"start_column":null,"end_line":72,"end_column":null,"annotation_level":"notice","title":"","message":"[DEPRECATION] This action is deprecated. Please migrate to reading the Docker image from the pipeline.","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":29,"start_column":null,"end_line":29,"end_column":null,"annotation_level":"notice","title":"disk-usage-after-startup","message":"Disk usage is 60 %","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":141,"start_column":null,"end_line":141,"end_column":null,"annotation_level":"notice","title":"printing-smi-info-startup","message":"Touching and printing out SMI info","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":313,"start_column":null,"end_line":313,"end_column":null,"annotation_level":"notice","title":"reset-successful-startup","message":"tt-smi reset was successful","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":319,"start_column":null,"end_line":319,"end_column":null,"annotation_level":"warning","title":"hugepages-service-not-found-startup","message":"Hugepages service not found. Using old rc.local method","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70/.github","start_line":325,"start_column":null,"end_line":325,"end_column":null,"annotation_level":"notice","title":"hugepages-setup-success-startup","message":"Hugepages is now setup.","raw_details":""}] diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190252200.log b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190252200.log new file mode 100644 index 00000000000..e821f531285 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/13315815702/logs/37190252200.log @@ -0,0 +1,568 @@ +2025-02-13T20:06:57.8754450Z Current runner version: '2.322.0' +2025-02-13T20:06:57.8761837Z Runner name: 'tt-metal-ci-vm-104' +2025-02-13T20:06:57.8762796Z Runner group name: 'Default' +2025-02-13T20:06:57.8763953Z Machine name: 'tt-metal-ci-vm-104' +2025-02-13T20:06:57.8768040Z ##[group]GITHUB_TOKEN Permissions +2025-02-13T20:06:57.8770709Z Actions: read +2025-02-13T20:06:57.8771418Z Contents: write +2025-02-13T20:06:57.8772101Z Metadata: read +2025-02-13T20:06:57.8772765Z Packages: write +2025-02-13T20:06:57.8773451Z Pages: write +2025-02-13T20:06:57.8774111Z PullRequests: write +2025-02-13T20:06:57.8774839Z ##[endgroup] +2025-02-13T20:06:57.8778275Z Secret source: Actions +2025-02-13T20:06:57.8779147Z Prepare workflow directory +2025-02-13T20:06:58.1109299Z Prepare all required actions +2025-02-13T20:06:58.1167893Z Getting action download info +2025-02-13T20:06:58.2890142Z Download action repository 'tenstorrent/tt-metal@main' (SHA:ac426de3d4a9c274964843fdae6aa83ea3960a30) +2025-02-13T20:07:05.3312643Z Download action repository 'actions/download-artifact@v4' 
(SHA:fa0a91b85d4f404e444e00e005971372dc801d16) +2025-02-13T20:07:06.0408809Z Getting action download info +2025-02-13T20:07:06.1857784Z Download action repository 'actions/checkout@v4' (SHA:11bd71901bbe5b1630ceea73d27597364c9af683) +2025-02-13T20:07:06.8814094Z Uses: tenstorrent/tt-metal/.github/workflows/ttnn-post-commit.yaml@refs/heads/sagarwal/multi_page_buffer (ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70) +2025-02-13T20:07:06.8817782Z ##[group] Inputs +2025-02-13T20:07:06.8818271Z build-type: Release +2025-02-13T20:07:06.8819167Z with-retries: false +2025-02-13T20:07:06.8819614Z arch: grayskull +2025-02-13T20:07:06.8820034Z runner-label: E150 +2025-02-13T20:07:06.8821041Z timeout: 45 +2025-02-13T20:07:06.8821450Z num-groups: 12 +2025-02-13T20:07:06.8821864Z ##[endgroup] +2025-02-13T20:07:06.8822507Z Complete job name: ttnn-unit-tests (grayskull, E150) / ttnn group 2 grayskull E150 +2025-02-13T20:07:06.9570275Z A job started hook has been configured by the self-hosted runner administrator +2025-02-13T20:07:06.9729512Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/reset.sh' +2025-02-13T20:07:06.9751078Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:07:06.9752059Z ##[endgroup] +2025-02-13T20:07:06.9936139Z ++ date +2025-02-13T20:07:06.9936631Z Current date / time is Thu Feb 13 20:07:06 UTC 2025 +2025-02-13T20:07:06.9937319Z + echo Current date / time is Thu Feb 13 20:07:06 UTC 2025 +2025-02-13T20:07:06.9937897Z + set_e_was_enabled=false +2025-02-13T20:07:06.9938318Z + [[ ehxB == *e* ]] +2025-02-13T20:07:06.9938678Z + set_e_was_enabled=true +2025-02-13T20:07:06.9939066Z + set +e +2025-02-13T20:07:06.9939447Z + docker image prune +2025-02-13T20:07:07.0060453Z WARNING! This will remove all dangling images. +2025-02-13T20:07:07.0085932Z ++ df +2025-02-13T20:07:07.0089776Z ++ awk '{print $5}' +2025-02-13T20:07:07.0091349Z +++ findmnt -n -o SOURCE / +2025-02-13T20:07:07.0092334Z ++ sed s/%// +2025-02-13T20:07:07.0114988Z ++ grep -w '^/dev/vda3' +2025-02-13T20:07:07.0137779Z + disk_usage_before=75 +2025-02-13T20:07:07.0150535Z + echo '::notice title=disk-usage-before-startup::Disk usage is 75 %' +2025-02-13T20:07:07.0151847Z + '[' 75 -ge 90 ']' +2025-02-13T20:07:07.0154899Z Are you sure you want to continue? 
[y/N] ::notice title=disk-usage-before-startup::Disk usage is 75 % +2025-02-13T20:07:07.0155661Z ++ df +2025-02-13T20:07:07.0155974Z ++ sed s/%// +2025-02-13T20:07:07.0156332Z +++ findmnt -n -o SOURCE / +2025-02-13T20:07:07.0156735Z ++ awk '{print $5}' +2025-02-13T20:07:07.0170022Z ++ grep -w '^/dev/vda3' +2025-02-13T20:07:07.0192959Z + disk_usage_after=75 +2025-02-13T20:07:07.0218541Z ##[notice]Disk usage is 75 % +2025-02-13T20:07:07.0226683Z + echo '::notice title=disk-usage-after-startup::Disk usage is 75 %' +2025-02-13T20:07:07.0227496Z + '[' 75 -ge 90 ']' +2025-02-13T20:07:07.0227834Z ++ lsmod +2025-02-13T20:07:07.0247198Z + lsmod_output='Module Size Used by +2025-02-13T20:07:07.0247947Z veth 28672 0 +2025-02-13T20:07:07.0248412Z wekafsio 70086656 2 +2025-02-13T20:07:07.0249600Z wekafsgw 40960 8 wekafsio +2025-02-13T20:07:07.0250113Z uio_pci_generic 16384 0 +2025-02-13T20:07:07.0250594Z igb_uio 20480 0 +2025-02-13T20:07:07.0251102Z uio 20480 2 igb_uio,uio_pci_generic +2025-02-13T20:07:07.0251603Z xt_conntrack 16384 1 +2025-02-13T20:07:07.0252054Z xt_MASQUERADE 20480 1 +2025-02-13T20:07:07.0252493Z nf_conntrack_netlink 45056 0 +2025-02-13T20:07:07.0253045Z nfnetlink 16384 2 nf_conntrack_netlink +2025-02-13T20:07:07.0253589Z xfrm_user 36864 1 +2025-02-13T20:07:07.0254062Z xfrm_algo 16384 1 xfrm_user +2025-02-13T20:07:07.0254568Z iptable_nat 16384 1 +2025-02-13T20:07:07.0255073Z nf_nat 45056 2 iptable_nat,xt_MASQUERADE +2025-02-13T20:07:07.0255788Z nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE +2025-02-13T20:07:07.0256536Z nf_defrag_ipv6 24576 1 nf_conntrack +2025-02-13T20:07:07.0257029Z nf_defrag_ipv4 16384 1 nf_conntrack +2025-02-13T20:07:07.0257516Z xt_addrtype 16384 2 +2025-02-13T20:07:07.0257971Z iptable_filter 16384 1 +2025-02-13T20:07:07.0258402Z bpfilter 32768 0 +2025-02-13T20:07:07.0258834Z br_netfilter 28672 0 +2025-02-13T20:07:07.0259296Z bridge 176128 1 br_netfilter +2025-02-13T20:07:07.0259779Z stp 16384 1 bridge +2025-02-13T20:07:07.0260258Z llc 16384 2 bridge,stp +2025-02-13T20:07:07.0260696Z aufs 262144 0 +2025-02-13T20:07:07.0261125Z xfs 1286144 2 +2025-02-13T20:07:07.0261559Z overlay 118784 0 +2025-02-13T20:07:07.0262002Z rdma_ucm 28672 0 +2025-02-13T20:07:07.0262469Z rdma_cm 110592 1 rdma_ucm +2025-02-13T20:07:07.0262952Z iw_cm 49152 1 rdma_cm +2025-02-13T20:07:07.0263713Z ib_ipoib 131072 0 +2025-02-13T20:07:07.0264195Z ib_cm 114688 2 rdma_cm,ib_ipoib +2025-02-13T20:07:07.0264657Z ib_umad 28672 8 +2025-02-13T20:07:07.0265106Z nls_iso8859_1 16384 1 +2025-02-13T20:07:07.0266318Z dm_multipath 32768 0 +2025-02-13T20:07:07.0266739Z scsi_dh_rdac 16384 0 +2025-02-13T20:07:07.0267172Z scsi_dh_emc 16384 0 +2025-02-13T20:07:07.0267602Z scsi_dh_alua 20480 0 +2025-02-13T20:07:07.0268036Z mlx5_ib 397312 0 +2025-02-13T20:07:07.0268463Z kvm_amd 98304 0 +2025-02-13T20:07:07.0268901Z ccp 90112 1 kvm_amd +2025-02-13T20:07:07.0269408Z ib_uverbs 139264 24 rdma_ucm,mlx5_ib +2025-02-13T20:07:07.0269921Z kvm 667648 1 kvm_amd +2025-02-13T20:07:07.0270361Z input_leds 16384 0 +2025-02-13T20:07:07.0270795Z joydev 24576 0 +2025-02-13T20:07:07.0271458Z ib_core 348160 10 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm +2025-02-13T20:07:07.0272092Z serio_raw 20480 0 +2025-02-13T20:07:07.0272530Z tenstorrent 49152 0 +2025-02-13T20:07:07.0272969Z sch_fq_codel 20480 45 +2025-02-13T20:07:07.0273413Z binfmt_misc 24576 1 +2025-02-13T20:07:07.0273837Z msr 16384 0 +2025-02-13T20:07:07.0274245Z efi_pstore 16384 0 +2025-02-13T20:07:07.0274686Z virtio_rng 16384 0 
+2025-02-13T20:07:07.0275217Z ip_tables 32768 2 iptable_filter,iptable_nat +2025-02-13T20:07:07.0275991Z x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE +2025-02-13T20:07:07.0276652Z autofs4 45056 2 +2025-02-13T20:07:07.0277086Z btrfs 1269760 0 +2025-02-13T20:07:07.0277560Z zstd_compress 167936 1 btrfs +2025-02-13T20:07:07.0278044Z raid10 61440 0 +2025-02-13T20:07:07.0278470Z raid456 155648 0 +2025-02-13T20:07:07.0278895Z async_raid6_recov 24576 1 raid456 +2025-02-13T20:07:07.0279447Z async_memcpy 20480 2 raid456,async_raid6_recov +2025-02-13T20:07:07.0280197Z async_pq 24576 2 raid456,async_raid6_recov +2025-02-13T20:07:07.0280822Z async_xor 20480 3 async_pq,raid456,async_raid6_recov +2025-02-13T20:07:07.0281632Z async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov +2025-02-13T20:07:07.0282288Z xor 24576 2 async_xor,btrfs +2025-02-13T20:07:07.0282898Z raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov +2025-02-13T20:07:07.0283607Z libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 +2025-02-13T20:07:07.0284177Z raid1 45056 0 +2025-02-13T20:07:07.0284597Z raid0 24576 0 +2025-02-13T20:07:07.0285035Z multipath 20480 0 +2025-02-13T20:07:07.0285440Z linear 20480 0 +2025-02-13T20:07:07.0285876Z hid_generic 16384 0 +2025-02-13T20:07:07.0286336Z crct10dif_pclmul 16384 1 +2025-02-13T20:07:07.0286798Z crc32_pclmul 16384 0 +2025-02-13T20:07:07.0287247Z usbhid 57344 0 +2025-02-13T20:07:07.0287666Z ghash_clmulni_intel 16384 0 +2025-02-13T20:07:07.0288186Z hid 131072 2 usbhid,hid_generic +2025-02-13T20:07:07.0288711Z mlx5_core 1626112 1 mlx5_ib +2025-02-13T20:07:07.0289156Z cirrus 16384 0 +2025-02-13T20:07:07.0289562Z drm_kms_helper 184320 3 cirrus +2025-02-13T20:07:07.0290066Z pci_hyperv_intf 16384 1 mlx5_core +2025-02-13T20:07:07.0290549Z aesni_intel 372736 0 +2025-02-13T20:07:07.0290977Z mlxdevm 172032 1 mlx5_core +2025-02-13T20:07:07.0291490Z syscopyarea 16384 1 drm_kms_helper +2025-02-13T20:07:07.0292011Z sysfillrect 16384 1 drm_kms_helper +2025-02-13T20:07:07.0292527Z crypto_simd 16384 1 aesni_intel +2025-02-13T20:07:07.0293247Z auxiliary 16384 2 mlx5_ib,mlx5_core +2025-02-13T20:07:07.0293796Z sysimgblt 16384 1 drm_kms_helper +2025-02-13T20:07:07.0294656Z mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core +2025-02-13T20:07:07.0295536Z cryptd 24576 2 crypto_simd,ghash_clmulni_intel +2025-02-13T20:07:07.0296165Z tls 73728 1 mlx5_core +2025-02-13T20:07:07.0296642Z glue_helper 16384 1 aesni_intel +2025-02-13T20:07:07.0297122Z ahci 40960 0 +2025-02-13T20:07:07.0297605Z fb_sys_fops 16384 1 drm_kms_helper +2025-02-13T20:07:07.0298103Z virtio_blk 20480 3 +2025-02-13T20:07:07.0298542Z psmouse 155648 0 +2025-02-13T20:07:07.0298966Z mlxfw 32768 1 mlx5_core +2025-02-13T20:07:07.0299463Z libahci 36864 1 ahci +2025-02-13T20:07:07.0299984Z drm 495616 3 drm_kms_helper,cirrus +2025-02-13T20:07:07.0300542Z psample 20480 1 mlx5_core' +2025-02-13T20:07:07.0301048Z + grep -q tenstorrent +2025-02-13T20:07:07.0313492Z + echo Module Size Used by veth 28672 0 wekafsio 70086656 2 wekafsgw 40960 8 wekafsio uio_pci_generic 16384 0 igb_uio 20480 0 uio 20480 2 igb_uio,uio_pci_generic xt_conntrack 16384 1 xt_MASQUERADE 20480 1 nf_conntrack_netlink 45056 0 nfnetlink 16384 2 nf_conntrack_netlink xfrm_user 36864 1 xfrm_algo 16384 1 xfrm_user iptable_nat 16384 1 nf_nat 45056 2 iptable_nat,xt_MASQUERADE nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE nf_defrag_ipv6 24576 
1 nf_conntrack nf_defrag_ipv4 16384 1 nf_conntrack xt_addrtype 16384 2 iptable_filter 16384 1 bpfilter 32768 0 br_netfilter 28672 0 bridge 176128 1 br_netfilter stp 16384 1 bridge llc 16384 2 bridge,stp aufs 262144 0 xfs 1286144 2 overlay 118784 0 rdma_ucm 28672 0 rdma_cm 110592 1 rdma_ucm iw_cm 49152 1 rdma_cm ib_ipoib 131072 0 ib_cm 114688 2 rdma_cm,ib_ipoib ib_umad 28672 8 nls_iso8859_1 16384 1 dm_multipath 32768 0 scsi_dh_rdac 16384 0 scsi_dh_emc 16384 0 scsi_dh_alua 20480 0 mlx5_ib 397312 0 kvm_amd 98304 0 ccp 90112 1 kvm_amd ib_uverbs 139264 24 rdma_ucm,mlx5_ib kvm 667648 1 kvm_amd input_leds 16384 0 joydev 24576 0 ib_core 348160 10 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm serio_raw 20480 0 tenstorrent 49152 0 sch_fq_codel 20480 45 binfmt_misc 24576 1 msr 16384 0 efi_pstore 16384 0 virtio_rng 16384 0 ip_tables 32768 2 iptable_filter,iptable_nat x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE autofs4 45056 2 btrfs 1269760 0 zstd_compress 167936 1 btrfs raid10 61440 0 raid456 155648 0 async_raid6_recov 24576 1 raid456 async_memcpy 20480 2 raid456,async_raid6_recov async_pq 24576 2 raid456,async_raid6_recov async_xor 20480 3 async_pq,raid456,async_raid6_recov async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov xor 24576 2 async_xor,btrfs raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 raid1 45056 0 raid0 24576 0 multipath 20480 0 linear 20480 0 hid_generic 16384 0 crct10dif_pclmul 16384 1 crc32_pclmul 16384 0 usbhid 57344 0 ghash_clmulni_intel 16384 0 hid 131072 2 usbhid,hid_generic mlx5_core 1626112 1 mlx5_ib cirrus 16384 0 drm_kms_helper 184320 3 cirrus pci_hyperv_intf 16384 1 mlx5_core aesni_intel 372736 0 mlxdevm 172032 1 mlx5_core syscopyarea 16384 1 drm_kms_helper sysfillrect 16384 1 drm_kms_helper crypto_simd 16384 1 aesni_intel auxiliary 16384 2 mlx5_ib,mlx5_core sysimgblt 16384 1 drm_kms_helper mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core cryptd 24576 2 crypto_simd,ghash_clmulni_intel tls 73728 1 mlx5_core glue_helper 16384 1 aesni_intel ahci 40960 0 fb_sys_fops 16384 1 drm_kms_helper virtio_blk 20480 3 psmouse 155648 0 mlxfw 32768 1 mlx5_core libahci 36864 1 ahci drm 495616 3 drm_kms_helper,cirrus psample 20480 1 mlx5_core +2025-02-13T20:07:07.0324965Z + [[ 0 -ne 0 ]] +2025-02-13T20:07:07.0325412Z ++ lsof -w /dev/tenstorrent/0 +2025-02-13T20:07:07.1678302Z + lsof_output= +2025-02-13T20:07:07.1678809Z + '[' -n '' ']' +2025-02-13T20:07:07.1679144Z + i=0 +2025-02-13T20:07:07.1679520Z + iter_limit=10 +2025-02-13T20:07:07.1680194Z + echo '::notice title=printing-smi-info-startup::Touching and printing out SMI info' +2025-02-13T20:07:07.1680887Z + sleep 20 +2025-02-13T20:07:07.1682689Z ##[notice]Touching and printing out SMI info +2025-02-13T20:07:27.1694945Z + sudo touch /opt/tt_metal_infra/smi.log +2025-02-13T20:07:27.1919884Z + sudo chown ubuntu /opt/tt_metal_infra/smi.log +2025-02-13T20:07:27.2135116Z + tt-smi-metal -s -f /opt/tt_metal_infra/smi.log +2025-02-13T20:07:27.6260621Z +2025-02-13T20:07:27.6262656Z  Detected Chips: 1 +2025-02-13T20:07:27.6282320Z  +2025-02-13T20:07:27.6283053Z  Detected Chips: 1 +2025-02-13T20:07:27.6283319Z +2025-02-13T20:07:27.6283537Z  Detecting ARC: | +2025-02-13T20:07:27.6283770Z +2025-02-13T20:07:27.6283984Z  Detecting DRAM: | +2025-02-13T20:07:27.6287158Z +2025-02-13T20:07:27.6287698Z [] ETH: | +2025-02-13T20:07:27.6349249Z 
Gathering Information ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 +2025-02-13T20:07:27.6390667Z  Saved tt-smi log to: /opt/tt_metal_infra/smi.log  +2025-02-13T20:07:27.7097632Z + cat /opt/tt_metal_infra/smi.log +2025-02-13T20:07:27.7103510Z { +2025-02-13T20:07:27.7103867Z "time": "2025-02-13T20:07:27.628657", +2025-02-13T20:07:27.7104293Z "host_info": { +2025-02-13T20:07:27.7104616Z "OS": "Linux", +2025-02-13T20:07:27.7104956Z "Distro": "Ubuntu 20.04.6 LTS", +2025-02-13T20:07:27.7105360Z "Kernel": "5.4.0-205-generic", +2025-02-13T20:07:27.7106021Z "Hostname": "tt-metal-ci-vm-104", +2025-02-13T20:07:27.7106456Z "Platform": "x86_64", +2025-02-13T20:07:27.7106805Z "Python": "3.8.10", +2025-02-13T20:07:27.7107153Z "Memory": "47.14 GB", +2025-02-13T20:07:27.7107562Z "Driver": "TTKMD 1.29" +2025-02-13T20:07:27.7107957Z }, +2025-02-13T20:07:27.7108316Z "device_info": [ +2025-02-13T20:07:27.7108759Z { +2025-02-13T20:07:27.7109598Z + echo '::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first' +2025-02-13T20:07:27.7110920Z + sleep 30 +2025-02-13T20:07:27.7111223Z "smbus_telem": { +2025-02-13T20:07:27.7111621Z "BOARD_ID": "0x10000361160a012", +2025-02-13T20:07:27.7112121Z "ENUM_VERSION": "0xba5e0001", +2025-02-13T20:07:27.7112534Z "DEVICE_ID": "0xfaca1e52", +2025-02-13T20:07:27.7112963Z "ASIC_RO": null, +2025-02-13T20:07:27.7113326Z "ASIC_IDD": null, +2025-02-13T20:07:27.7113710Z "BOARD_ID_HIGH": "0x1000036", +2025-02-13T20:07:27.7114125Z "BOARD_ID_LOW": "0x1160a012", +2025-02-13T20:07:27.7114543Z "ARC0_FW_VERSION": "0x1070000", +2025-02-13T20:07:27.7114962Z "ARC1_FW_VERSION": "0x1070000", +2025-02-13T20:07:27.7115379Z "ARC2_FW_VERSION": null, +2025-02-13T20:07:27.7115788Z "ARC3_FW_VERSION": "0x1070000", +2025-02-13T20:07:27.7116219Z "SPIBOOTROM_FW_VERSION": null, +2025-02-13T20:07:27.7116654Z "ETH_FW_VERSION": null, +2025-02-13T20:07:27.7117057Z "M3_BL_FW_VERSION": null, +2025-02-13T20:07:27.7117465Z "M3_APP_FW_VERSION": null, +2025-02-13T20:07:27.7117877Z "DDR_SPEED": "0xe74", +2025-02-13T20:07:27.7118267Z "DDR_STATUS": "0x111111", +2025-02-13T20:07:27.7118664Z "ETH_STATUS0": null, +2025-02-13T20:07:27.7119039Z "ETH_STATUS1": null, +2025-02-13T20:07:27.7119446Z "PCIE_STATUS": "0x11040040", +2025-02-13T20:07:27.7119844Z "FAULTS": null, +2025-02-13T20:07:27.7120225Z "ARC0_HEALTH": "0x1dfceedb", +2025-02-13T20:07:27.7120625Z "ARC1_HEALTH": null, +2025-02-13T20:07:27.7121003Z "ARC2_HEALTH": null, +2025-02-13T20:07:27.7121566Z "ARC3_HEALTH": null, +2025-02-13T20:07:27.7121950Z "FAN_SPEED": "0xff", +2025-02-13T20:07:27.7122332Z "AICLK": "0x4b200fa", +2025-02-13T20:07:27.7122710Z "AXICLK": "0x384", +2025-02-13T20:07:27.7123080Z "ARCCLK": "0x21c", +2025-02-13T20:07:27.7123451Z "THROTTLER": null, +2025-02-13T20:07:27.7123818Z "VCORE": "0x2e4", +2025-02-13T20:07:27.7124206Z "ASIC_TEMPERATURE": "0x2e00246", +2025-02-13T20:07:27.7124635Z "VREG_TEMPERATURE": null, +2025-02-13T20:07:27.7125045Z "BOARD_TEMPERATURE": null, +2025-02-13T20:07:27.7125447Z "TDP": "0xaa0011", +2025-02-13T20:07:27.7125806Z "TDC": "0x12c0016", +2025-02-13T20:07:27.7126189Z "VDD_LIMITS": "0x3a202e4", +2025-02-13T20:07:27.7126593Z "THM_LIMITS": "0x53004b", +2025-02-13T20:07:27.7126995Z "WH_FW_DATE": "0x45011317", +2025-02-13T20:07:27.7127405Z "ASIC_TMON0": "0x25262523", +2025-02-13T20:07:27.7127801Z "ASIC_TMON1": "0x2524", +2025-02-13T20:07:27.7128189Z "MVDDQ_POWER": null, +2025-02-13T20:07:27.7128586Z "GDDR_TRAIN_TEMP0": null, +2025-02-13T20:07:27.7128980Z "GDDR_TRAIN_TEMP1": null, 
+2025-02-13T20:07:27.7129383Z "BOOT_DATE": "0x520b0531", +2025-02-13T20:07:27.7129785Z "RT_SECONDS": null, +2025-02-13T20:07:27.7130170Z "AUX_STATUS": null, +2025-02-13T20:07:27.7130560Z "ETH_DEBUG_STATUS0": null, +2025-02-13T20:07:27.7130973Z "ETH_DEBUG_STATUS1": null, +2025-02-13T20:07:27.7131394Z "TT_FLASH_VERSION": "0x30100", +2025-02-13T20:07:27.7131823Z "FW_BUNDLE_VERSION": "0x50090000" +2025-02-13T20:07:27.7132208Z }, +2025-02-13T20:07:27.7132513Z "board_info": { +2025-02-13T20:07:27.7132866Z "bus_id": "0000:07:00.0", +2025-02-13T20:07:27.7133259Z "board_type": "e150", +2025-02-13T20:07:27.7133721Z "board_id": "10000361160a012", +2025-02-13T20:07:27.7134151Z "coords": "N/A", +2025-02-13T20:07:27.7134624Z "dram_status": true, +2025-02-13T20:07:27.7135013Z "dram_speed": "3700", +2025-02-13T20:07:27.7135402Z "pcie_speed": 4, +2025-02-13T20:07:27.7135778Z "pcie_width": "16" +2025-02-13T20:07:27.7136215Z }, +2025-02-13T20:07:27.7136528Z "telemetry": { +2025-02-13T20:07:27.7136883Z "voltage": "0.74", +2025-02-13T20:07:27.7137258Z "current": " 22.0", +2025-02-13T20:07:27.7137638Z "power": " 17.0", +2025-02-13T20:07:27.7138014Z "aiclk": " 250", +2025-02-13T20:07:27.7138399Z "asic_temperature": "36.4" +2025-02-13T20:07:27.7138782Z }, +2025-02-13T20:07:27.7139099Z "firmwares": { +2025-02-13T20:07:27.7139486Z "fw_bundle_version": "80.9.0.0", +2025-02-13T20:07:27.7139934Z "tt_flash_version": "0.3.1.0", +2025-02-13T20:07:27.7140356Z "cm_fw": "1.7.0.0", +2025-02-13T20:07:27.7140776Z "cm_fw_date": "2024-05-01", +2025-02-13T20:07:27.7141173Z "eth_fw": "N/A", +2025-02-13T20:07:27.7141552Z "bm_bl_fw": "N/A", +2025-02-13T20:07:27.7141941Z "bm_app_fw": "N/A" +2025-02-13T20:07:27.7142314Z }, +2025-02-13T20:07:27.7142627Z "limits": { +2025-02-13T20:07:27.7143003Z "vdd_min": "0.74", +2025-02-13T20:07:27.7143435Z "vdd_max": "0.93", +2025-02-13T20:07:27.7143831Z "tdp_limit": "170", +2025-02-13T20:07:27.7146062Z "tdc_limit": "300", +2025-02-13T20:07:27.7146464Z "asic_fmax": "1202", +2025-02-13T20:07:27.7146883Z "therm_trip_l1_limit": "83", +2025-02-13T20:07:27.7147312Z "thm_limit": "75", +2025-02-13T20:07:27.7147716Z "bus_peak_limit": null +2025-02-13T20:07:27.7148244Z } +2025-02-13T20:07:27.7148563Z } +2025-02-13T20:07:27.7148855Z ] +2025-02-13T20:07:27.7149373Z }::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first +2025-02-13T20:07:57.7123742Z + '[' 0 -lt 10 ']' +2025-02-13T20:07:57.7124235Z + (( i++ )) +2025-02-13T20:07:57.7125541Z ++ tt-smi-metal -r 0 +2025-02-13T20:07:58.2546127Z + reset_output=' Starting Tensix reset on GS board at PCI index 0  +2025-02-13T20:07:58.2548527Z ##[notice]tt-smi reset was successful +2025-02-13T20:07:58.2551122Z  Lowering clks to safe value...  +2025-02-13T20:07:58.2551784Z  Beginning reset sequence...  +2025-02-13T20:07:58.2552425Z  Finishing reset sequence...  +2025-02-13T20:07:58.2553094Z  Returning clks to original values...  +2025-02-13T20:07:58.2553843Z  Finished Tensix reset on GS board at PCI index 0 +2025-02-13T20:07:58.2554455Z  +2025-02-13T20:07:58.2554972Z  Re-initializing boards after reset....  
+2025-02-13T20:07:58.2555429Z +2025-02-13T20:07:58.2555702Z  Detected Chips: 1 +2025-02-13T20:07:58.2556203Z  +2025-02-13T20:07:58.2556733Z  Detected Chips: 1 +2025-02-13T20:07:58.2557180Z +2025-02-13T20:07:58.2557574Z  Detecting ARC: | +2025-02-13T20:07:58.2557894Z +2025-02-13T20:07:58.2558143Z  Detecting DRAM: | +2025-02-13T20:07:58.2558456Z +2025-02-13T20:07:58.2558692Z [] ETH: |' +2025-02-13T20:07:58.2559150Z + [[ 0 -ne 0 ]] +2025-02-13T20:07:58.2559775Z + [[  Starting Tensix reset on GS board at PCI index 0  +2025-02-13T20:07:58.2560516Z  Lowering clks to safe value...  +2025-02-13T20:07:58.2561147Z  Beginning reset sequence...  +2025-02-13T20:07:58.2561765Z  Finishing reset sequence...  +2025-02-13T20:07:58.2562459Z  Returning clks to original values...  +2025-02-13T20:07:58.2563195Z  Finished Tensix reset on GS board at PCI index 0 +2025-02-13T20:07:58.2563915Z  +2025-02-13T20:07:58.2564520Z  Re-initializing boards after reset....  +2025-02-13T20:07:58.2564936Z +2025-02-13T20:07:58.2565191Z  Detected Chips: 1 +2025-02-13T20:07:58.2566359Z  +2025-02-13T20:07:58.2566850Z  Detected Chips: 1 +2025-02-13T20:07:58.2567155Z +2025-02-13T20:07:58.2567412Z  Detecting ARC: | +2025-02-13T20:07:58.2567722Z +2025-02-13T20:07:58.2567979Z  Detecting DRAM: | +2025-02-13T20:07:58.2568295Z +2025-02-13T20:07:58.2568715Z [] ETH: | == *\N\o\ \c\h\i\p\s\ \d\e\t\e\c\t\e\d* ]] +2025-02-13T20:07:58.2569306Z + break +2025-02-13T20:07:58.2569653Z + '[' 1 -eq 10 ']' +2025-02-13T20:07:58.2570300Z + echo '::notice title=reset-successful-startup::tt-smi reset was successful' +2025-02-13T20:07:58.2571073Z + check_hugepages_service_status=0 +2025-02-13T20:07:58.2571685Z + sudo systemctl status tenstorrent-hugepages.service +2025-02-13T20:07:58.2898808Z ● tenstorrent-hugepages.service - Script that configures hugepages for Tenstorrent ASICs +2025-02-13T20:07:58.2900346Z Loaded: loaded (/lib/systemd/system/tenstorrent-hugepages.service; enabled; vendor preset: enabled) +2025-02-13T20:07:58.2901510Z Active: inactive (dead) since Thu 2025-02-13 19:49:54 UTC; 18min ago +2025-02-13T20:07:58.2902584Z Process: 1919773 ExecStart=/opt/tenstorrent/bin/hugepages-setup.sh (code=exited, status=0/SUCCESS) +2025-02-13T20:07:58.2903698Z Main PID: 1919773 (code=exited, status=0/SUCCESS) +2025-02-13T20:07:58.2904103Z +2025-02-13T20:07:58.2904667Z Feb 13 19:49:54 tt-metal-ci-vm-104 systemd[1]: Started Script that configures hugepages for Tenstorrent ASICs. +2025-02-13T20:07:58.2905971Z Feb 13 19:49:54 tt-metal-ci-vm-104 hugepages-setup.sh[1919773]: Node 0 hugepages before: 1 +2025-02-13T20:07:58.2907060Z Feb 13 19:49:54 tt-metal-ci-vm-104 hugepages-setup.sh[1919773]: Node 0 hugepages needed: 1 +2025-02-13T20:07:58.2908136Z Feb 13 19:49:54 tt-metal-ci-vm-104 hugepages-setup.sh[1919773]: Node 0 hugepages after: 1 +2025-02-13T20:07:58.2909432Z Feb 13 19:49:54 tt-metal-ci-vm-104 hugepages-setup.sh[1919773]: Completed hugepage setup +2025-02-13T20:07:58.2910457Z Feb 13 19:49:54 tt-metal-ci-vm-104 systemd[1]: tenstorrent-hugepages.service: Succeeded. +2025-02-13T20:07:58.2911296Z + check_hugepages_service_status=3 +2025-02-13T20:07:58.2911790Z + '[' 3 -eq 4 ']' +2025-02-13T20:07:58.2913087Z + echo '::notice title=hugepages-service-found-startup::Hugepages service found. Command returned with exit code 3. Restarting it so we can ensure hugepages are available' +2025-02-13T20:07:58.2914519Z + sudo systemctl restart tenstorrent-hugepages.service +2025-02-13T20:07:58.2916981Z ##[notice]Hugepages service found. Command returned with exit code 3. 
Restarting it so we can ensure hugepages are available +2025-02-13T20:07:58.3196373Z ++ date +%s +2025-02-13T20:07:58.3200672Z + hugepages_check_start=1739477278 +2025-02-13T20:07:58.3213288Z + hugepages_check_timeout=60 +2025-02-13T20:07:58.3214449Z ++ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages +2025-02-13T20:07:58.3219397Z ##[notice]Hugepages is now setup. +2025-02-13T20:07:58.3220860Z + [[ 1 -eq 0 ]] +2025-02-13T20:07:58.3221676Z + echo '::notice title=hugepages-setup-success-startup::Hugepages is now setup.' +2025-02-13T20:07:58.3222522Z + echo 'Printing out cpu information...' +2025-02-13T20:07:58.3223024Z + lscpu +2025-02-13T20:07:58.3223469Z Printing out cpu information... +2025-02-13T20:07:58.3263284Z Architecture: x86_64 +2025-02-13T20:07:58.3264000Z CPU op-mode(s): 32-bit, 64-bit +2025-02-13T20:07:58.3264513Z Byte Order: Little Endian +2025-02-13T20:07:58.3265055Z Address sizes: 40 bits physical, 48 bits virtual +2025-02-13T20:07:58.3265751Z CPU(s): 14 +2025-02-13T20:07:58.3266215Z On-line CPU(s) list: 0-13 +2025-02-13T20:07:58.3266658Z Thread(s) per core: 1 +2025-02-13T20:07:58.3267069Z Core(s) per socket: 1 +2025-02-13T20:07:58.3267461Z Socket(s): 14 +2025-02-13T20:07:58.3267870Z NUMA node(s): 2 +2025-02-13T20:07:58.3268458Z Vendor ID: AuthenticAMD +2025-02-13T20:07:58.3269389Z CPU family: 23 +2025-02-13T20:07:58.3269787Z Model: 49 +2025-02-13T20:07:58.3270287Z Model name: AMD EPYC-Rome Processor +2025-02-13T20:07:58.3270770Z Stepping: 0 +2025-02-13T20:07:58.3271225Z CPU MHz: 2300.000 +2025-02-13T20:07:58.3271683Z BogoMIPS: 4600.00 +2025-02-13T20:07:58.3272113Z Virtualization: AMD-V +2025-02-13T20:07:58.3272549Z Hypervisor vendor: KVM +2025-02-13T20:07:58.3272969Z Virtualization type: full +2025-02-13T20:07:58.3273370Z L1d cache: 448 KiB +2025-02-13T20:07:58.3273784Z L1i cache: 448 KiB +2025-02-13T20:07:58.3274196Z L2 cache: 7 MiB +2025-02-13T20:07:58.3274611Z L3 cache: 224 MiB +2025-02-13T20:07:58.3275018Z NUMA node0 CPU(s): 0-6 +2025-02-13T20:07:58.3275439Z NUMA node1 CPU(s): 7-13 +2025-02-13T20:07:58.3275890Z Vulnerability Gather data sampling: Not affected +2025-02-13T20:07:58.3276378Z Vulnerability Itlb multihit: Not affected +2025-02-13T20:07:58.3276836Z Vulnerability L1tf: Not affected +2025-02-13T20:07:58.3277307Z Vulnerability Mds: Not affected +2025-02-13T20:07:58.3277780Z Vulnerability Meltdown: Not affected +2025-02-13T20:07:58.3278242Z Vulnerability Mmio stale data: Not affected +2025-02-13T20:07:58.3278739Z Vulnerability Retbleed: Vulnerable +2025-02-13T20:07:58.3279530Z Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +2025-02-13T20:07:58.3280500Z Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +2025-02-13T20:07:58.3281845Z Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +2025-02-13T20:07:58.3282833Z Vulnerability Srbds: Not affected +2025-02-13T20:07:58.3283323Z Vulnerability Tsx async abort: Not affected +2025-02-13T20:07:58.3286095Z Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd ibpb stibp 
vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr wbnoinvd arat npt nrip_save umip rdpid +2025-02-13T20:07:58.3537874Z ##[group]Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main +2025-02-13T20:07:58.3538511Z with: +2025-02-13T20:07:58.3538953Z token: *** +2025-02-13T20:07:58.3539288Z fetch-depth: 1 +2025-02-13T20:07:58.3539588Z env: +2025-02-13T20:07:58.3539863Z LOGURU_LEVEL: INFO +2025-02-13T20:07:58.3540182Z ##[endgroup] +2025-02-13T20:07:58.3630960Z ##[group]Run set -x +2025-02-13T20:07:58.3631369Z set -x +2025-02-13T20:07:58.3631732Z ls -al +2025-02-13T20:07:58.3632110Z if [ -f "semicolon_delimited_script" ]; then +2025-02-13T20:07:58.3632599Z  file semicolon_delimited_script +2025-02-13T20:07:58.3633046Z  head semicolon_delimited_script +2025-02-13T20:07:58.3633438Z fi +2025-02-13T20:07:58.3633755Z sudo rm -rf deleteme +2025-02-13T20:07:58.3634136Z sudo rm -rf docker-job +2025-02-13T20:07:58.3634526Z if [ -d ".git" ]; then +2025-02-13T20:07:58.3634966Z  echo 'Cleaning repo' +2025-02-13T20:07:58.3635355Z  git clean -xffd +2025-02-13T20:07:58.3635721Z  echo 'Done git clean -xffd' +2025-02-13T20:07:58.3636186Z  echo 'Attempting to delete any lock files' +2025-02-13T20:07:58.3636931Z  find .git -type f -iname '*.lock' -delete +2025-02-13T20:07:58.3637412Z  echo 'Done deleting lock files' +2025-02-13T20:07:58.3637871Z  echo 'De-init-ing submodules' +2025-02-13T20:07:58.3638314Z  git submodule deinit -f --all +2025-02-13T20:07:58.3638761Z  echo 'Done de-initing submodules' +2025-02-13T20:07:58.3639176Z fi +2025-02-13T20:07:58.3659241Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:07:58.3659726Z env: +2025-02-13T20:07:58.3660024Z LOGURU_LEVEL: INFO +2025-02-13T20:07:58.3660342Z ##[endgroup] +2025-02-13T20:07:58.3698050Z + ls -al +2025-02-13T20:07:58.3718745Z total 359748 +2025-02-13T20:09:27.8862319Z SKIPPED [5347] tests/ttnn/unit_tests/operations/test_batch_norm.py:16: Unsupported dtype for Grayskull +2025-02-13T20:09:27.8862732Z SKIPPED [64] tests/ttnn/unit_tests/operations/test_batch_norm.py:124: Unsupported dtype for Grayskull +2025-02-13T20:09:27.8863153Z SKIPPED [40] tests/ttnn/unit_tests/operations/test_batch_norm.py:168: Unsupported dtype for Grayskull +2025-02-13T20:09:27.8863403Z ============= 5451 skipped, 59950 deselected, 1 warning in 16.65s ============== +2025-02-13T20:09:31.4542530Z Prepare all required actions +2025-02-13T20:09:31.4543153Z Getting action download info +2025-02-13T20:09:31.7353367Z Download action repository 'actions/upload-artifact@v4' (SHA:65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08) +2025-02-13T20:09:32.4838363Z ##[group]Run ./.github/actions/upload-artifact-with-job-uuid +2025-02-13T20:09:32.4838877Z with: +2025-02-13T20:09:32.4839210Z path: generated/test_reports/ + +2025-02-13T20:09:32.4839609Z prefix: test_reports_ +2025-02-13T20:09:32.4839949Z env: +2025-02-13T20:09:32.4840224Z LOGURU_LEVEL: INFO +2025-02-13T20:09:32.4840626Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.4841502Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.4842485Z RUNNER_UID: 1000 +2025-02-13T20:09:32.4842810Z RUNNER_GID: 1000 +2025-02-13T20:09:32.4843133Z ##[endgroup] +2025-02-13T20:09:32.4862765Z ##[group]Run uuid=$(uuidgen) +2025-02-13T20:09:32.4863249Z uuid=$(uuidgen) +2025-02-13T20:09:32.4863647Z artifact_name="test_reports_$uuid" 
+2025-02-13T20:09:32.4864148Z echo "[UPLOAD-ARTIFACT-UUID] $artifact_name" +2025-02-13T20:09:32.4864705Z echo "artifact-name=$artifact_name" >> "$GITHUB_OUTPUT" +2025-02-13T20:09:32.4884612Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:09:32.4885083Z env: +2025-02-13T20:09:32.4885380Z LOGURU_LEVEL: INFO +2025-02-13T20:09:32.4885787Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.4886653Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.4887452Z RUNNER_UID: 1000 +2025-02-13T20:09:32.4887774Z RUNNER_GID: 1000 +2025-02-13T20:09:32.4888238Z ##[endgroup] +2025-02-13T20:09:32.4942668Z [UPLOAD-ARTIFACT-UUID] test_reports_e6d768c7-44f2-4bd8-a96a-4277c643d4a3 +2025-02-13T20:09:32.5051562Z ##[group]Run actions/upload-artifact@v4 +2025-02-13T20:09:32.5052217Z with: +2025-02-13T20:09:32.5052725Z name: test_reports_e6d768c7-44f2-4bd8-a96a-4277c643d4a3 +2025-02-13T20:09:32.5053400Z path: generated/test_reports/ + +2025-02-13T20:09:32.5053941Z if-no-files-found: warn +2025-02-13T20:09:32.5054393Z compression-level: 6 +2025-02-13T20:09:32.5054835Z overwrite: false +2025-02-13T20:09:32.5055274Z include-hidden-files: false +2025-02-13T20:09:32.5055736Z env: +2025-02-13T20:09:32.5056083Z LOGURU_LEVEL: INFO +2025-02-13T20:09:32.5056583Z BUILD_TAG: 3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.5057818Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:3a3dbaa78a8197befe10ef2aca8c49f9e5b087c6 +2025-02-13T20:09:32.5058894Z RUNNER_UID: 1000 +2025-02-13T20:09:32.5059296Z RUNNER_GID: 1000 +2025-02-13T20:09:32.5059708Z ##[endgroup] +2025-02-13T20:09:32.7761763Z With the provided path, there will be 1 file uploaded +2025-02-13T20:09:32.7767089Z Artifact name is valid! +2025-02-13T20:09:32.7768565Z Root directory input is valid! +2025-02-13T20:09:32.9975679Z Beginning upload of artifact content to blob storage +2025-02-13T20:09:33.2919054Z Uploaded bytes 22119 +2025-02-13T20:09:33.3510874Z Finished uploading artifact content to blob storage! +2025-02-13T20:09:33.3514353Z SHA256 hash of uploaded artifact zip is 051588680eed12cf7f233260b71a6626661f275197d8034e35605feab6280ab7 +2025-02-13T20:09:33.3516662Z Finalizing artifact upload +2025-02-13T20:09:33.4642314Z Artifact test_reports_e6d768c7-44f2-4bd8-a96a-4277c643d4a3.zip successfully finalized. Artifact ID 2588473193 +2025-02-13T20:09:33.4644130Z Artifact test_reports_e6d768c7-44f2-4bd8-a96a-4277c643d4a3 has been successfully uploaded! Final size is 22119 bytes. Artifact ID is 2588473193 +2025-02-13T20:09:33.4651339Z Artifact download URL: https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/artifacts/2588473193 +2025-02-13T20:09:33.4843680Z Post job cleanup. +2025-02-13T20:09:33.4891474Z Post job cleanup. 
+2025-02-13T20:09:33.6474990Z [command]/usr/bin/git version +2025-02-13T20:09:33.6565105Z git version 2.25.1 +2025-02-13T20:09:33.6724847Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/f3e2eb27-7869-48fd-9834-873430944f47' before making global git config changes +2025-02-13T20:09:33.6726287Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:09:33.6730255Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:09:33.6761287Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:09:33.6801358Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:09:33.7071705Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:09:33.7123024Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:09:33.7171136Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:09:33.7221607Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:09:33.7268527Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:09:33.7315490Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:09:33.7362617Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:09:33.7430643Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:09:33.7448462Z http.https://github.com/.extraheader +2025-02-13T20:09:33.7460996Z [command]/usr/bin/git config --local --unset-all http.https://github.com/.extraheader +2025-02-13T20:09:33.7488123Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:09:33.7750482Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:09:33.7796013Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:09:33.7841391Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:09:33.7882326Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:09:33.7928440Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:09:33.7982134Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:09:33.8025110Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:09:33.8222917Z Post job cleanup. +2025-02-13T20:09:34.1759609Z [command]/usr/bin/docker logout https://ghcr.io +2025-02-13T20:09:34.1901924Z Removing login credentials for ghcr.io +2025-02-13T20:09:34.1941525Z ##[group]Post cache +2025-02-13T20:09:34.1942367Z State not set +2025-02-13T20:09:34.1943448Z ##[endgroup] +2025-02-13T20:09:34.2103972Z Post job cleanup. +2025-02-13T20:09:34.2154293Z Post job cleanup. 
+2025-02-13T20:09:34.3587081Z [command]/usr/bin/git version +2025-02-13T20:09:34.3640088Z git version 2.25.1 +2025-02-13T20:09:34.3698301Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/e16338fd-9124-4d05-9f91-652afdebf105' before making global git config changes +2025-02-13T20:09:34.3699672Z Adding repository directory to the temporary git global config as a safe directory +2025-02-13T20:09:34.3705406Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-13T20:09:34.3743010Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-13T20:09:34.3790258Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-13T20:09:34.4037920Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:09:34.4086737Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:09:34.4131044Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:09:34.4182526Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:09:34.4229109Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:09:34.4271807Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:09:34.4316627Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:09:34.4387989Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-13T20:09:34.4421665Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-13T20:09:34.4675380Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-13T20:09:34.4721802Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-13T20:09:34.4769401Z Entering 'tt_metal/third_party/tracy' +2025-02-13T20:09:34.4819952Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-13T20:09:34.4869509Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-13T20:09:34.4919126Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-13T20:09:34.4966842Z Entering 'tt_metal/third_party/umd' +2025-02-13T20:09:34.5127031Z A job completed hook has been configured by the self-hosted runner administrator +2025-02-13T20:09:34.5156984Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/cleanup.sh' +2025-02-13T20:09:34.5170442Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-13T20:09:34.5170926Z ##[endgroup] +2025-02-13T20:09:34.5221985Z Current date / time is Thu Feb 13 20:09:34 UTC 2025 +2025-02-13T20:09:34.7349453Z Cleaning up orphan processes diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow.json b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow.json new file mode 100644 index 00000000000..e81018bac7b --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow.json @@ -0,0 +1 @@ +{"id":13315815702,"name":"All post-commit tests","node_id":"WFR_kwLOI9Wqc88AAAADGa85Fg","head_branch":"sagarwal/multi_page_buffer","head_sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","path":".github/workflows/all-post-commit-workflows.yaml","display_title":"All post-commit 
tests","run_number":25760,"event":"workflow_dispatch","status":"completed","conclusion":"failure","workflow_id":67993574,"check_suite_id":34361313627,"check_suite_node_id":"CS_kwDOI9Wqc88AAAAIABgJWw","url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702","html_url":"https://github.com/tenstorrent/tt-metal/actions/runs/13315815702","pull_requests":[{"url":"https://api.github.com/repos/tenstorrent/tt-metal/pulls/17677","id":2320845259,"number":17677,"head":{"ref":"sagarwal/multi_page_buffer","sha":"beb03d9f2d6634e1cec437dcda5abbfe0608740e","repo":{"id":601205363,"url":"https://api.github.com/repos/tenstorrent/tt-metal","name":"tt-metal"}},"base":{"ref":"main","sha":"ac426de3d4a9c274964843fdae6aa83ea3960a30","repo":{"id":601205363,"url":"https://api.github.com/repos/tenstorrent/tt-metal","name":"tt-metal"}}}],"created_at":"2025-02-13T19:45:29Z","updated_at":"2025-02-13T20:35:13Z","actor":{"login":"sagarwalTT","id":174518297,"node_id":"U_kgDOCmbwGQ","avatar_url":"https://avatars.githubusercontent.com/u/174518297?v=4","gravatar_id":"","url":"https://api.github.com/users/sagarwalTT","html_url":"https://github.com/sagarwalTT","followers_url":"https://api.github.com/users/sagarwalTT/followers","following_url":"https://api.github.com/users/sagarwalTT/following{/other_user}","gists_url":"https://api.github.com/users/sagarwalTT/gists{/gist_id}","starred_url":"https://api.github.com/users/sagarwalTT/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/sagarwalTT/subscriptions","organizations_url":"https://api.github.com/users/sagarwalTT/orgs","repos_url":"https://api.github.com/users/sagarwalTT/repos","events_url":"https://api.github.com/users/sagarwalTT/events{/privacy}","received_events_url":"https://api.github.com/users/sagarwalTT/received_events","type":"User","user_view_type":"public","site_admin":false},"run_attempt":1,"referenced_workflows":[{"path":"tenstorrent/tt-metal/.github/workflows/build-docker-artifact.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/run-profiler-regression.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/fast-dispatch-build-and-unit-tests.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/cpp-post-commit.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/code-analysis.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/_test-wheels-impl.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/build-artifact.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/tt-train-post-commit.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67
b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/all-static-checks.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/docs-latest-public.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/ttnn-post-commit.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/models-post-commit.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"},{"path":"tenstorrent/tt-metal/.github/workflows/build-and-unit-tests.yaml@ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","sha":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","ref":"refs/heads/sagarwal/multi_page_buffer"}],"run_started_at":"2025-02-13T19:45:29Z","triggering_actor":{"login":"sagarwalTT","id":174518297,"node_id":"U_kgDOCmbwGQ","avatar_url":"https://avatars.githubusercontent.com/u/174518297?v=4","gravatar_id":"","url":"https://api.github.com/users/sagarwalTT","html_url":"https://github.com/sagarwalTT","followers_url":"https://api.github.com/users/sagarwalTT/followers","following_url":"https://api.github.com/users/sagarwalTT/following{/other_user}","gists_url":"https://api.github.com/users/sagarwalTT/gists{/gist_id}","starred_url":"https://api.github.com/users/sagarwalTT/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/sagarwalTT/subscriptions","organizations_url":"https://api.github.com/users/sagarwalTT/orgs","repos_url":"https://api.github.com/users/sagarwalTT/repos","events_url":"https://api.github.com/users/sagarwalTT/events{/privacy}","received_events_url":"https://api.github.com/users/sagarwalTT/received_events","type":"User","user_view_type":"public","site_admin":false},"jobs_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702/attempts/1/jobs","logs_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702/attempts/1/logs","check_suite_url":"https://api.github.com/repos/tenstorrent/tt-metal/check-suites/34361313627","artifacts_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702/artifacts","cancel_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702/cancel","rerun_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702/rerun","previous_attempt_url":null,"workflow_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/workflows/67993574","head_commit":{"id":"ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70","tree_id":"0bba29a25189a3d6496517c4e177d355b7cdffda","message":"Fixing merge conflict","timestamp":"2025-02-13T19:41:52Z","author":{"name":"Samarth Agarwal","email":"sagarwal@tenstorrent.com"},"committer":{"name":"Samarth 
Agarwal","email":"sagarwal@tenstorrent.com"}},"repository":{"id":601205363,"node_id":"R_kgDOI9Wqcw","name":"tt-metal","full_name":"tenstorrent/tt-metal","private":false,"owner":{"login":"tenstorrent","id":64161552,"node_id":"MDEyOk9yZ2FuaXphdGlvbjY0MTYxNTUy","avatar_url":"https://avatars.githubusercontent.com/u/64161552?v=4","gravatar_id":"","url":"https://api.github.com/users/tenstorrent","html_url":"https://github.com/tenstorrent","followers_url":"https://api.github.com/users/tenstorrent/followers","following_url":"https://api.github.com/users/tenstorrent/following{/other_user}","gists_url":"https://api.github.com/users/tenstorrent/gists{/gist_id}","starred_url":"https://api.github.com/users/tenstorrent/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/tenstorrent/subscriptions","organizations_url":"https://api.github.com/users/tenstorrent/orgs","repos_url":"https://api.github.com/users/tenstorrent/repos","events_url":"https://api.github.com/users/tenstorrent/events{/privacy}","received_events_url":"https://api.github.com/users/tenstorrent/received_events","type":"Organization","user_view_type":"public","site_admin":false},"html_url":"https://github.com/tenstorrent/tt-metal","description":":metal: TT-NN operator library, and TT-Metalium low level kernel programming model.","fork":false,"url":"https://api.github.com/repos/tenstorrent/tt-metal","forks_url":"https://api.github.com/repos/tenstorrent/tt-metal/forks","keys_url":"https://api.github.com/repos/tenstorrent/tt-metal/keys{/key_id}","collaborators_url":"https://api.github.com/repos/tenstorrent/tt-metal/collaborators{/collaborator}","teams_url":"https://api.github.com/repos/tenstorrent/tt-metal/teams","hooks_url":"https://api.github.com/repos/tenstorrent/tt-metal/hooks","issue_events_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues/events{/number}","events_url":"https://api.github.com/repos/tenstorrent/tt-metal/events","assignees_url":"https://api.github.com/repos/tenstorrent/tt-metal/assignees{/user}","branches_url":"https://api.github.com/repos/tenstorrent/tt-metal/branches{/branch}","tags_url":"https://api.github.com/repos/tenstorrent/tt-metal/tags","blobs_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/blobs{/sha}","git_tags_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/tags{/sha}","git_refs_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/refs{/sha}","trees_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/trees{/sha}","statuses_url":"https://api.github.com/repos/tenstorrent/tt-metal/statuses/{sha}","languages_url":"https://api.github.com/repos/tenstorrent/tt-metal/languages","stargazers_url":"https://api.github.com/repos/tenstorrent/tt-metal/stargazers","contributors_url":"https://api.github.com/repos/tenstorrent/tt-metal/contributors","subscribers_url":"https://api.github.com/repos/tenstorrent/tt-metal/subscribers","subscription_url":"https://api.github.com/repos/tenstorrent/tt-metal/subscription","commits_url":"https://api.github.com/repos/tenstorrent/tt-metal/commits{/sha}","git_commits_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/commits{/sha}","comments_url":"https://api.github.com/repos/tenstorrent/tt-metal/comments{/number}","issue_comment_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues/comments{/number}","contents_url":"https://api.github.com/repos/tenstorrent/tt-metal/contents/{+path}","compare_url":"https://api.github.com/repos/tenstorrent/tt-metal/compare/{base}...{head}","merges_url":"https://api.github.c
om/repos/tenstorrent/tt-metal/merges","archive_url":"https://api.github.com/repos/tenstorrent/tt-metal/{archive_format}{/ref}","downloads_url":"https://api.github.com/repos/tenstorrent/tt-metal/downloads","issues_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues{/number}","pulls_url":"https://api.github.com/repos/tenstorrent/tt-metal/pulls{/number}","milestones_url":"https://api.github.com/repos/tenstorrent/tt-metal/milestones{/number}","notifications_url":"https://api.github.com/repos/tenstorrent/tt-metal/notifications{?since,all,participating}","labels_url":"https://api.github.com/repos/tenstorrent/tt-metal/labels{/name}","releases_url":"https://api.github.com/repos/tenstorrent/tt-metal/releases{/id}","deployments_url":"https://api.github.com/repos/tenstorrent/tt-metal/deployments"},"head_repository":{"id":601205363,"node_id":"R_kgDOI9Wqcw","name":"tt-metal","full_name":"tenstorrent/tt-metal","private":false,"owner":{"login":"tenstorrent","id":64161552,"node_id":"MDEyOk9yZ2FuaXphdGlvbjY0MTYxNTUy","avatar_url":"https://avatars.githubusercontent.com/u/64161552?v=4","gravatar_id":"","url":"https://api.github.com/users/tenstorrent","html_url":"https://github.com/tenstorrent","followers_url":"https://api.github.com/users/tenstorrent/followers","following_url":"https://api.github.com/users/tenstorrent/following{/other_user}","gists_url":"https://api.github.com/users/tenstorrent/gists{/gist_id}","starred_url":"https://api.github.com/users/tenstorrent/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/tenstorrent/subscriptions","organizations_url":"https://api.github.com/users/tenstorrent/orgs","repos_url":"https://api.github.com/users/tenstorrent/repos","events_url":"https://api.github.com/users/tenstorrent/events{/privacy}","received_events_url":"https://api.github.com/users/tenstorrent/received_events","type":"Organization","user_view_type":"public","site_admin":false},"html_url":"https://github.com/tenstorrent/tt-metal","description":":metal: TT-NN operator library, and TT-Metalium low level kernel programming 
model.","fork":false,"url":"https://api.github.com/repos/tenstorrent/tt-metal","forks_url":"https://api.github.com/repos/tenstorrent/tt-metal/forks","keys_url":"https://api.github.com/repos/tenstorrent/tt-metal/keys{/key_id}","collaborators_url":"https://api.github.com/repos/tenstorrent/tt-metal/collaborators{/collaborator}","teams_url":"https://api.github.com/repos/tenstorrent/tt-metal/teams","hooks_url":"https://api.github.com/repos/tenstorrent/tt-metal/hooks","issue_events_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues/events{/number}","events_url":"https://api.github.com/repos/tenstorrent/tt-metal/events","assignees_url":"https://api.github.com/repos/tenstorrent/tt-metal/assignees{/user}","branches_url":"https://api.github.com/repos/tenstorrent/tt-metal/branches{/branch}","tags_url":"https://api.github.com/repos/tenstorrent/tt-metal/tags","blobs_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/blobs{/sha}","git_tags_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/tags{/sha}","git_refs_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/refs{/sha}","trees_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/trees{/sha}","statuses_url":"https://api.github.com/repos/tenstorrent/tt-metal/statuses/{sha}","languages_url":"https://api.github.com/repos/tenstorrent/tt-metal/languages","stargazers_url":"https://api.github.com/repos/tenstorrent/tt-metal/stargazers","contributors_url":"https://api.github.com/repos/tenstorrent/tt-metal/contributors","subscribers_url":"https://api.github.com/repos/tenstorrent/tt-metal/subscribers","subscription_url":"https://api.github.com/repos/tenstorrent/tt-metal/subscription","commits_url":"https://api.github.com/repos/tenstorrent/tt-metal/commits{/sha}","git_commits_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/commits{/sha}","comments_url":"https://api.github.com/repos/tenstorrent/tt-metal/comments{/number}","issue_comment_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues/comments{/number}","contents_url":"https://api.github.com/repos/tenstorrent/tt-metal/contents/{+path}","compare_url":"https://api.github.com/repos/tenstorrent/tt-metal/compare/{base}...{head}","merges_url":"https://api.github.com/repos/tenstorrent/tt-metal/merges","archive_url":"https://api.github.com/repos/tenstorrent/tt-metal/{archive_format}{/ref}","downloads_url":"https://api.github.com/repos/tenstorrent/tt-metal/downloads","issues_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues{/number}","pulls_url":"https://api.github.com/repos/tenstorrent/tt-metal/pulls{/number}","milestones_url":"https://api.github.com/repos/tenstorrent/tt-metal/milestones{/number}","notifications_url":"https://api.github.com/repos/tenstorrent/tt-metal/notifications{?since,all,participating}","labels_url":"https://api.github.com/repos/tenstorrent/tt-metal/labels{/name}","releases_url":"https://api.github.com/repos/tenstorrent/tt-metal/releases{/id}","deployments_url":"https://api.github.com/repos/tenstorrent/tt-metal/deployments"}} diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow_jobs.json b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow_jobs.json new file mode 100644 index 00000000000..28236d3da2e --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow_jobs.json @@ -0,0 +1,657 @@ +{ + "total_count": 199, + "jobs": [ + { + "id": 37190230023, + "run_id": 13315815702, + 
"workflow_name": "All post-commit tests", + "head_branch": "sagarwal/multi_page_buffer", + "run_url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702", + "run_attempt": 1, + "node_id": "CR_kwDOI9Wqc88AAAAIqLXgBw", + "head_sha": "ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70", + "url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/jobs/37190230023", + "html_url": "https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/job/37190230023", + "status": "completed", + "conclusion": "success", + "created_at": "2025-02-13T20:01:20Z", + "started_at": "2025-02-13T20:07:35Z", + "completed_at": "2025-02-13T20:11:00Z", + "name": "cpp-unit-tests (wormhole_b0, N150) / tools wormhole_b0 N150", + "steps": [ + { + "name": "Set up job", + "status": "completed", + "conclusion": "success", + "number": 1, + "started_at": "2025-02-13T20:07:34Z", + "completed_at": "2025-02-13T20:07:41Z" + }, + { + "name": "Set up runner", + "status": "completed", + "conclusion": "success", + "number": 2, + "started_at": "2025-02-13T20:07:41Z", + "completed_at": "2025-02-13T20:08:43Z" + }, + { + "name": "Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 3, + "started_at": "2025-02-13T20:08:43Z", + "completed_at": "2025-02-13T20:08:47Z" + }, + { + "name": "Run /./.github/actions/prepare-metal-run", + "status": "completed", + "conclusion": "success", + "number": 4, + "started_at": "2025-02-13T20:08:47Z", + "completed_at": "2025-02-13T20:09:27Z" + }, + { + "name": "tools tests", + "status": "completed", + "conclusion": "success", + "number": 5, + "started_at": "2025-02-13T20:09:27Z", + "completed_at": "2025-02-13T20:10:53Z" + }, + { + "name": "Run /./.github/actions/slack-report", + "status": "completed", + "conclusion": "skipped", + "number": 6, + "started_at": "2025-02-13T20:10:53Z", + "completed_at": "2025-02-13T20:10:53Z" + }, + { + "name": "Run /./.github/actions/upload-artifact-with-job-uuid", + "status": "completed", + "conclusion": "success", + "number": 7, + "started_at": "2025-02-13T20:10:53Z", + "completed_at": "2025-02-13T20:10:55Z" + }, + { + "name": "Generate system logs on failure", + "status": "completed", + "conclusion": "skipped", + "number": 8, + "started_at": "2025-02-13T20:10:55Z", + "completed_at": "2025-02-13T20:10:55Z" + }, + { + "name": "Post tools tests", + "status": "completed", + "conclusion": "success", + "number": 13, + "started_at": "2025-02-13T20:10:55Z", + "completed_at": "2025-02-13T20:10:56Z" + }, + { + "name": "Post Run /./.github/actions/prepare-metal-run", + "status": "completed", + "conclusion": "success", + "number": 14, + "started_at": "2025-02-13T20:10:56Z", + "completed_at": "2025-02-13T20:10:56Z" + }, + { + "name": "Post Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 15, + "started_at": "2025-02-13T20:10:56Z", + "completed_at": "2025-02-13T20:10:56Z" + }, + { + "name": "Complete runner", + "status": "completed", + "conclusion": "success", + "number": 16, + "started_at": "2025-02-13T20:11:00Z", + "completed_at": "2025-02-13T20:11:00Z" + }, + { + "name": "Complete job", + "status": "completed", + "conclusion": "success", + "number": 17, + "started_at": "2025-02-13T20:10:57Z", + "completed_at": "2025-02-13T20:10:57Z" + } + ], + "check_run_url": "https://api.github.com/repos/tenstorrent/tt-metal/check-runs/37190230023", + "labels": [ + "N150", + "cloud-virtual-machine", 
+ "in-service" + ], + "runner_id": 387, + "runner_name": "tt-metal-ci-vm-27", + "runner_group_id": 1, + "runner_group_name": "Default" + }, + { + "id": 37190213375, + "run_id": 13315815702, + "workflow_name": "All post-commit tests", + "head_branch": "sagarwal/multi_page_buffer", + "run_url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702", + "run_attempt": 1, + "node_id": "CR_kwDOI9Wqc88AAAAIqLWe_w", + "head_sha": "ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70", + "url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/jobs/37190213375", + "html_url": "https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/job/37190213375", + "status": "completed", + "conclusion": "failure", + "created_at": "2025-02-13T20:01:03Z", + "started_at": "2025-02-13T20:00:51Z", + "completed_at": "2025-02-13T20:04:26Z", + "name": "sd-unit-tests (grayskull, E150) / grayskull E150 api", + "steps": [ + { + "name": "Set up job", + "status": "completed", + "conclusion": "success", + "number": 1, + "started_at": "2025-02-13T20:00:51Z", + "completed_at": "2025-02-13T20:00:58Z" + }, + { + "name": "Set up runner", + "status": "completed", + "conclusion": "success", + "number": 2, + "started_at": "2025-02-13T20:00:58Z", + "completed_at": "2025-02-13T20:01:50Z" + }, + { + "name": "Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 3, + "started_at": "2025-02-13T20:01:50Z", + "completed_at": "2025-02-13T20:01:55Z" + }, + { + "name": "Run /./.github/actions/prepare-metal-run", + "status": "completed", + "conclusion": "success", + "number": 4, + "started_at": "2025-02-13T20:01:55Z", + "completed_at": "2025-02-13T20:02:41Z" + }, + { + "name": "api tests", + "status": "completed", + "conclusion": "failure", + "number": 5, + "started_at": "2025-02-13T20:02:41Z", + "completed_at": "2025-02-13T20:04:17Z" + }, + { + "name": "Run /./.github/actions/slack-report", + "status": "completed", + "conclusion": "success", + "number": 6, + "started_at": "2025-02-13T20:04:18Z", + "completed_at": "2025-02-13T20:04:18Z" + }, + { + "name": "Run /./.github/actions/upload-artifact-with-job-uuid", + "status": "completed", + "conclusion": "success", + "number": 7, + "started_at": "2025-02-13T20:04:18Z", + "completed_at": "2025-02-13T20:04:20Z" + }, + { + "name": "Generate system logs on failure", + "status": "completed", + "conclusion": "success", + "number": 8, + "started_at": "2025-02-13T20:04:20Z", + "completed_at": "2025-02-13T20:04:22Z" + }, + { + "name": "Post api tests", + "status": "completed", + "conclusion": "success", + "number": 13, + "started_at": "2025-02-13T20:04:22Z", + "completed_at": "2025-02-13T20:04:23Z" + }, + { + "name": "Post Run /./.github/actions/prepare-metal-run", + "status": "completed", + "conclusion": "success", + "number": 14, + "started_at": "2025-02-13T20:04:23Z", + "completed_at": "2025-02-13T20:04:23Z" + }, + { + "name": "Post Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 15, + "started_at": "2025-02-13T20:04:23Z", + "completed_at": "2025-02-13T20:04:23Z" + }, + { + "name": "Complete runner", + "status": "completed", + "conclusion": "success", + "number": 16, + "started_at": "2025-02-13T20:04:26Z", + "completed_at": "2025-02-13T20:04:26Z" + }, + { + "name": "Complete job", + "status": "completed", + "conclusion": "success", + "number": 17, + "started_at": "2025-02-13T20:04:23Z", + 
"completed_at": "2025-02-13T20:04:23Z" + } + ], + "check_run_url": "https://api.github.com/repos/tenstorrent/tt-metal/check-runs/37190213375", + "labels": [ + "E150", + "cloud-virtual-machine", + "in-service" + ], + "runner_id": 434, + "runner_name": "tt-metal-ci-vm-160", + "runner_group_id": 1, + "runner_group_name": "Default" + }, + { + "id": 37190251054, + "run_id": 13315815702, + "workflow_name": "All post-commit tests", + "head_branch": "sagarwal/multi_page_buffer", + "run_url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702", + "run_attempt": 1, + "node_id": "CR_kwDOI9Wqc88AAAAIqLYyLg", + "head_sha": "ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70", + "url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/jobs/37190251054", + "html_url": "https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/job/37190251054", + "status": "completed", + "conclusion": "failure", + "created_at": "2025-02-13T20:01:41Z", + "started_at": "2025-02-13T20:06:44Z", + "completed_at": "2025-02-13T20:13:57Z", + "name": "ttnn-unit-tests (grayskull, E150) / ttnn group 1 grayskull E150", + "steps": [ + { + "name": "Set up job", + "status": "completed", + "conclusion": "success", + "number": 1, + "started_at": "2025-02-13T20:06:43Z", + "completed_at": "2025-02-13T20:06:51Z" + }, + { + "name": "Set up runner", + "status": "completed", + "conclusion": "success", + "number": 2, + "started_at": "2025-02-13T20:06:51Z", + "completed_at": "2025-02-13T20:08:12Z" + }, + { + "name": "Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 3, + "started_at": "2025-02-13T20:08:13Z", + "completed_at": "2025-02-13T20:08:17Z" + }, + { + "name": "Run actions/download-artifact@v4", + "status": "completed", + "conclusion": "success", + "number": 4, + "started_at": "2025-02-13T20:08:17Z", + "completed_at": "2025-02-13T20:08:30Z" + }, + { + "name": "Set ttnn fast runtime if exists in config", + "status": "completed", + "conclusion": "skipped", + "number": 5, + "started_at": "2025-02-13T20:08:30Z", + "completed_at": "2025-02-13T20:08:30Z" + }, + { + "name": "ttnn group 1 tests", + "status": "completed", + "conclusion": "failure", + "number": 6, + "started_at": "2025-02-13T20:08:30Z", + "completed_at": "2025-02-13T20:13:50Z" + }, + { + "name": "Run /./.github/actions/slack-report", + "status": "completed", + "conclusion": "success", + "number": 7, + "started_at": "2025-02-13T20:13:50Z", + "completed_at": "2025-02-13T20:13:51Z" + }, + { + "name": "Run /./.github/actions/upload-artifact-with-job-uuid", + "status": "completed", + "conclusion": "success", + "number": 8, + "started_at": "2025-02-13T20:13:51Z", + "completed_at": "2025-02-13T20:13:53Z" + }, + { + "name": "Post ttnn group 1 tests", + "status": "completed", + "conclusion": "success", + "number": 14, + "started_at": "2025-02-13T20:13:53Z", + "completed_at": "2025-02-13T20:13:53Z" + }, + { + "name": "Post Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 15, + "started_at": "2025-02-13T20:13:54Z", + "completed_at": "2025-02-13T20:13:54Z" + }, + { + "name": "Complete runner", + "status": "completed", + "conclusion": "success", + "number": 16, + "started_at": "2025-02-13T20:13:54Z", + "completed_at": "2025-02-13T20:13:54Z" + }, + { + "name": "Complete job", + "status": "completed", + "conclusion": "success", + "number": 17, + "started_at": "2025-02-13T20:13:54Z", + 
"completed_at": "2025-02-13T20:13:54Z" + } + ], + "check_run_url": "https://api.github.com/repos/tenstorrent/tt-metal/check-runs/37190251054", + "labels": [ + "E150", + "in-service" + ], + "runner_id": 55, + "runner_name": "tt-metal-ci-vm-4", + "runner_group_id": 1, + "runner_group_name": "Default" + }, + { + "id": 37190252200, + "run_id": 13315815702, + "workflow_name": "All post-commit tests", + "head_branch": "sagarwal/multi_page_buffer", + "run_url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702", + "run_attempt": 1, + "node_id": "CR_kwDOI9Wqc88AAAAIqLY2qA", + "head_sha": "ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70", + "url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/jobs/37190252200", + "html_url": "https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/job/37190252200", + "status": "completed", + "conclusion": "success", + "created_at": "2025-02-13T20:01:42Z", + "started_at": "2025-02-13T20:06:58Z", + "completed_at": "2025-02-13T20:09:37Z", + "name": "ttnn-unit-tests (grayskull, E150) / ttnn group 2 grayskull E150", + "steps": [ + { + "name": "Set up job", + "status": "completed", + "conclusion": "success", + "number": 1, + "started_at": "2025-02-13T20:06:57Z", + "completed_at": "2025-02-13T20:07:06Z" + }, + { + "name": "Set up runner", + "status": "completed", + "conclusion": "success", + "number": 2, + "started_at": "2025-02-13T20:07:06Z", + "completed_at": "2025-02-13T20:07:58Z" + }, + { + "name": "Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 3, + "started_at": "2025-02-13T20:07:58Z", + "completed_at": "2025-02-13T20:08:01Z" + }, + { + "name": "Run actions/download-artifact@v4", + "status": "completed", + "conclusion": "success", + "number": 4, + "started_at": "2025-02-13T20:08:02Z", + "completed_at": "2025-02-13T20:08:15Z" + }, + { + "name": "Set ttnn fast runtime if exists in config", + "status": "completed", + "conclusion": "skipped", + "number": 5, + "started_at": "2025-02-13T20:08:15Z", + "completed_at": "2025-02-13T20:08:15Z" + }, + { + "name": "ttnn group 2 tests", + "status": "completed", + "conclusion": "success", + "number": 6, + "started_at": "2025-02-13T20:08:15Z", + "completed_at": "2025-02-13T20:09:31Z" + }, + { + "name": "Run /./.github/actions/slack-report", + "status": "completed", + "conclusion": "skipped", + "number": 7, + "started_at": "2025-02-13T20:09:31Z", + "completed_at": "2025-02-13T20:09:31Z" + }, + { + "name": "Run /./.github/actions/upload-artifact-with-job-uuid", + "status": "completed", + "conclusion": "success", + "number": 8, + "started_at": "2025-02-13T20:09:31Z", + "completed_at": "2025-02-13T20:09:33Z" + }, + { + "name": "Post ttnn group 2 tests", + "status": "completed", + "conclusion": "success", + "number": 14, + "started_at": "2025-02-13T20:09:34Z", + "completed_at": "2025-02-13T20:09:34Z" + }, + { + "name": "Post Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 15, + "started_at": "2025-02-13T20:09:34Z", + "completed_at": "2025-02-13T20:09:34Z" + }, + { + "name": "Complete runner", + "status": "completed", + "conclusion": "success", + "number": 16, + "started_at": "2025-02-13T20:09:34Z", + "completed_at": "2025-02-13T20:09:34Z" + }, + { + "name": "Complete job", + "status": "completed", + "conclusion": "success", + "number": 17, + "started_at": "2025-02-13T20:09:34Z", + "completed_at": 
"2025-02-13T20:09:34Z" + } + ], + "check_run_url": "https://api.github.com/repos/tenstorrent/tt-metal/check-runs/37190252200", + "labels": [ + "E150", + "in-service" + ], + "runner_id": 411, + "runner_name": "tt-metal-ci-vm-104", + "runner_group_id": 1, + "runner_group_name": "Default" + }, + { + "id": 37190219113, + "run_id": 13315815702, + "workflow_name": "All post-commit tests", + "head_branch": "sagarwal/multi_page_buffer", + "run_url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13315815702", + "run_attempt": 1, + "node_id": "CR_kwDOI9Wqc88AAAAIqLW1aQ", + "head_sha": "ac8ce51fedfe3a43fc1ee309ca81e8c67b736d70", + "url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/jobs/37190219113", + "html_url": "https://github.com/tenstorrent/tt-metal/actions/runs/13315815702/job/37190219113", + "status": "completed", + "conclusion": "success", + "created_at": "2025-02-13T20:01:09Z", + "started_at": "2025-02-13T20:00:53Z", + "completed_at": "2025-02-13T20:04:18Z", + "name": "sd-unit-tests (wormhole_b0, N150) / wormhole_b0 N150 device", + "steps": [ + { + "name": "Set up job", + "status": "completed", + "conclusion": "success", + "number": 1, + "started_at": "2025-02-13T20:00:52Z", + "completed_at": "2025-02-13T20:01:00Z" + }, + { + "name": "Set up runner", + "status": "completed", + "conclusion": "success", + "number": 2, + "started_at": "2025-02-13T20:01:00Z", + "completed_at": "2025-02-13T20:02:01Z" + }, + { + "name": "Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 3, + "started_at": "2025-02-13T20:02:02Z", + "completed_at": "2025-02-13T20:02:08Z" + }, + { + "name": "Run /./.github/actions/prepare-metal-run", + "status": "completed", + "conclusion": "success", + "number": 4, + "started_at": "2025-02-13T20:02:08Z", + "completed_at": "2025-02-13T20:02:59Z" + }, + { + "name": "device tests", + "status": "completed", + "conclusion": "success", + "number": 5, + "started_at": "2025-02-13T20:03:00Z", + "completed_at": "2025-02-13T20:04:12Z" + }, + { + "name": "Run /./.github/actions/slack-report", + "status": "completed", + "conclusion": "skipped", + "number": 6, + "started_at": "2025-02-13T20:04:12Z", + "completed_at": "2025-02-13T20:04:12Z" + }, + { + "name": "Run /./.github/actions/upload-artifact-with-job-uuid", + "status": "completed", + "conclusion": "success", + "number": 7, + "started_at": "2025-02-13T20:04:12Z", + "completed_at": "2025-02-13T20:04:14Z" + }, + { + "name": "Generate system logs on failure", + "status": "completed", + "conclusion": "skipped", + "number": 8, + "started_at": "2025-02-13T20:04:14Z", + "completed_at": "2025-02-13T20:04:14Z" + }, + { + "name": "Post device tests", + "status": "completed", + "conclusion": "success", + "number": 13, + "started_at": "2025-02-13T20:04:14Z", + "completed_at": "2025-02-13T20:04:14Z" + }, + { + "name": "Post Run /./.github/actions/prepare-metal-run", + "status": "completed", + "conclusion": "success", + "number": 14, + "started_at": "2025-02-13T20:04:14Z", + "completed_at": "2025-02-13T20:04:15Z" + }, + { + "name": "Post Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 15, + "started_at": "2025-02-13T20:04:15Z", + "completed_at": "2025-02-13T20:04:15Z" + }, + { + "name": "Complete runner", + "status": "completed", + "conclusion": "success", + "number": 16, + "started_at": "2025-02-13T20:04:18Z", + "completed_at": 
"2025-02-13T20:04:18Z" + }, + { + "name": "Complete job", + "status": "completed", + "conclusion": "success", + "number": 17, + "started_at": "2025-02-13T20:04:15Z", + "completed_at": "2025-02-13T20:04:15Z" + } + ], + "check_run_url": "https://api.github.com/repos/tenstorrent/tt-metal/check-runs/37190219113", + "labels": [ + "N150", + "cloud-virtual-machine", + "in-service" + ], + "runner_id": 94, + "runner_name": "tt-metal-ci-vm-68", + "runner_group_id": 1, + "runner_group_name": "Default" + } + ] +} diff --git a/infra/tests/data_collection/test_cicd.py b/infra/tests/data_collection/test_cicd.py index bd47c10fb37..99a97230ef2 100644 --- a/infra/tests/data_collection/test_cicd.py +++ b/infra/tests/data_collection/test_cicd.py @@ -174,3 +174,53 @@ def test_create_pipeline_json_for_timeout_bad_testcase(workflow_run_gh_environme for job in pipeline.jobs: if job.github_job_id == 36492361640: assert len(job.tests) > 0 + + +def test_create_pipeline_json_for_gtest_testcases(workflow_run_gh_environment): + github_runner_environment = workflow_run_gh_environment + github_pipeline_json_filename = ( + "tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow.json" + ) + github_jobs_json_filename = ( + "tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/workflow_jobs.json" + ) + + workflow_outputs_dir = pathlib.Path( + "tests/_data/data_collection/cicd/all_post_commit_gtest_testcases_13315815702/" + ).resolve() + assert workflow_outputs_dir.is_dir() + assert workflow_outputs_dir.exists() + + pipeline = create_cicd_json_for_data_analysis( + workflow_outputs_dir, + github_runner_environment, + github_pipeline_json_filename, + github_jobs_json_filename, + ) + + assert pipeline.github_pipeline_id == 13315815702 + + for job in pipeline.jobs: + # passing gtest testcase + if job.github_job_id == 37190230023: + assert len(job.tests) > 0 + assert job.job_success is True + # failing gtest testcase + if job.github_job_id == 37190213375: + assert len(job.tests) > 0 + assert job.job_success is False + # check that there are failing gtests stored in the pydantic testcase list + assert len([x for x in job.tests if not x.success]) > 0 + # passing pytest testcase + if job.github_job_id == 37190252200: + assert len(job.tests) > 0 + assert job.job_success is True + # failing pytest testcase + if job.github_job_id == 37190251054: + assert len(job.tests) > 0 + assert job.job_success is False + # check that there are failing pytests stored in the pydantic testcase list + assert len([x for x in job.tests if not x.success]) > 0 + + # fails validation, job is expected be skipped + assert len([x for x in pipeline.jobs if x.github_job_id == 37190219113]) == 0 From 3ebf8a899fbd0cdf8cfb74440fc6702299067ed9 Mon Sep 17 00:00:00 2001 From: Jason Davies Date: Wed, 19 Feb 2025 15:41:05 +0000 Subject: [PATCH 168/316] Fix typos. (#15365) --- tech_reports/GEMM_FLOPS/GEMM_FLOPS.md | 40 +++++++-------- tech_reports/matrix_engine/matrix_engine.md | 55 ++++++++++----------- 2 files changed, 46 insertions(+), 49 deletions(-) diff --git a/tech_reports/GEMM_FLOPS/GEMM_FLOPS.md b/tech_reports/GEMM_FLOPS/GEMM_FLOPS.md index 1940bf2af26..d42ef32da64 100644 --- a/tech_reports/GEMM_FLOPS/GEMM_FLOPS.md +++ b/tech_reports/GEMM_FLOPS/GEMM_FLOPS.md @@ -55,27 +55,27 @@ For more details please refer to the tech reports [Matrix Engine](../matrix_engi For example, when changing the precision of the matrix, for a given size of matrix the output performance is expected to be different. 
-![A simple bar chart of the TFLOPS on WH when changing the precision of matrcies](images/effects_of_precision.png "Variance in performance of TFLOPS on WH from SRAM due to changing precision") +![A simple bar chart of the TFLOPS on WH when changing the precision of matrices](images/effects_of_precision.png "Variance in performance of TFLOPS on WH from SRAM due to changing precision") ## MicroBenchmarks -### Matrix Multiplication TFLOPs on Wormhole (WH) +### Matrix Multiplication TFLOPS on Wormhole (WH) The WH matrix engine performs 8x16 x 16x16 = 8x16 in a single cycle. - This is 2*8\*16\*16 = 4096 muladds in a single cycle. -- At 1GHz, this is 4 TFLOPs per matrix engine. +- At 1GHz, this is 4 TFLOPS per matrix engine. - The 8x16 is the smallest matrix that can be fed into in0, and 16x16 is the smallest matrix that can be fed into in1. If the input matrices fed into the engine are "shorter" than 8x16, for example 1x16, the engine will still perform 8x16 x 16x16 = 8x16, but the effective throughput will be 1/8. -Thus, for 1x16 x 16x16 matrices, the effective throughput is 0.5 TFLOP per matrix engine. +Thus, for 1x16 x 16x16 matrices, the effective throughput is 0.5 TFLOPS per matrix engine. -MATH_FIDELITY is used for higher precision, and TFLOPs are calculated by dividing by the MATH_FIDELITY value. -- LoFi -> ~4 TFLOPs -- HiFi2 -> ~2 TFLOPs -- HiFi3 -> ~1.33 TFLOPs -- HiFi4 -> ~1 TFLOPs +MATH_FIDELITY is used for higher precision, and TFLOPS are calculated by dividing by the MATH_FIDELITY value. +- LoFi -> ~4 TFLOPS +- HiFi2 -> ~2 TFLOPS +- HiFi3 -> ~1.33 TFLOPS +- HiFi4 -> ~1 TFLOPS ### Utilization derivation formula @@ -90,7 +90,7 @@ Ideal cycles = (m * k * n) / (tile_height * tile_width * tile_height) * (cycle_p ### Manually tuned Performance -Here we show the peak results we can get based on manually selected matmul configuturations, including packer l1 enablement, math fidelity, input output sharding, and input ouput L1/DRAM selection. +Here we show the peak results we can get based on manually selected matmul configurations, including packer l1 enablement, math fidelity, input output sharding, and input output L1/DRAM selection. #### Peak FLOPS @@ -100,7 +100,7 @@ Below is the results generated from running the benchmark script, showcasing the We also show the results with and without trace (see [AdvancedPerformanceOptimizationsForModels](../AdvancedPerformanceOptimizationsForModels/AdvancedPerformanceOptimizationsForModels.md) for details of trace). With trace, we can minimize the overhead of host which can reflect the actual device performance better. -Finally, we present the results in terms of device time, device throughput in TFLOPs, device utilization compared to the user-specified grid size and device utilization compared to the full grid size (8x8 in Wormhole). Utilization is calculated with +Finally, we present the results in terms of device time, device throughput in TFLOPS, device utilization compared to the user-specified grid size and device utilization compared to the full grid size (8x8 in Wormhole). 
Utilization is calculated with #### TFLOPS plot across all matrix sizes and configurations @@ -108,7 +108,7 @@ Finally, we present the results in terms of device time, device throughput in TF ![](images/matmul_tflops_5_exp.png) -#### Utilization plot across all matrix sizes and configurations, based on the Chip TFLOPs calculated per each Math Fidelity +#### Utilization plot across all matrix sizes and configurations, based on the Chip TFLOPS calculated per each Math Fidelity ![](images/matmul_utilization_5_exp.png) @@ -123,7 +123,7 @@ Finally, we present the results in terms of device time, device throughput in TF ![](images/matmul_utilization_table_5_exp.png) -#### TFLOPS ratio between the results with trace and without-trace. The trace mode has signficiant impact (i.e. higher ratio) when running a sequence of smaller/faster OPs, because the OP dispatch time will be comparable to the OP device runtime. +#### TFLOPS ratio between the results with trace and without-trace. The trace mode has significant impact (i.e. higher ratio) when running a sequence of smaller/faster OPS, because the OP dispatch time will be comparable to the OP device runtime. ![](images/mamtul_trace_nontrace_ratio_5_exp.png) @@ -131,7 +131,7 @@ Finally, we present the results in terms of device time, device throughput in TF #### The full results table -| m | k | n | use_trace | grid_size | in0_sharded | out_sharded | in0_storage_type | in1_storage_type | out_storage_type | dtype | math_fidelity | inference_time_avg (ns) | TFLOPs (avg) | Utilization (vs user grid) | Utilization (vs 8x8 full grid) | +| m | k | n | use_trace | grid_size | in0_sharded | out_sharded | in0_storage_type | in1_storage_type | out_storage_type | dtype | math_fidelity | inference_time_avg (ns) | TFLOPS (avg) | Utilization (vs user grid) | Utilization (vs 8x8 full grid) | |------:|------:|------:|:------------|:------------|:--------------|:--------------|:-------------------|:-------------------|:-------------------|:-------------------|:-------------------|--------------------------:|---------------:|:-----------------------------|:---------------------------------| | 512 | 512 | 512 | False | (8, 8) | True | True | L1 | DRAM | L1 | DataType.BFLOAT16 | MathFidelity.HiFi2 | 378654 | 0.71 | 0.54% | 0.54% | | 512 | 1024 | 1024 | False | (8, 8) | True | True | L1 | DRAM | L1 | DataType.BFLOAT16 | MathFidelity.HiFi2 | 363193 | 2.96 | 2.26% | 2.26% | @@ -289,7 +289,7 @@ Finally, we present the results in terms of device time, device throughput in TF For most hardware, peak performance is achieved with square matrices that best align with the underlying hardware, for example WH performs best when using Square input matrices, we achieve highest device utilization with bfloat16 and HiFi4. -![A simple bar chart of the TFLOPS on WH when using various square matrcies](images/TFLOPS_WH_SQUARE.png "Square Matrix TFLOPS on WH from SRAM") +![A simple bar chart of the TFLOPS on WH when using various square matrices](images/TFLOPS_WH_SQUARE.png "Square Matrix TFLOPS on WH from SRAM") #### Rectangular matrices @@ -297,23 +297,23 @@ When deviating from Square matrices, the total balance of compute can be thrown Given input matrix A of 512x1024 and B of 1024x2048 to produce output matrix 512x2048 requires the same amount of computation as if the input matrices were of dimensions 1024^2. 
However, the performance results are measurably different: -| m | k | n | use_trace | grid_size | in0_sharded | out_sharded | in0_storage_type | in1_storage_type | out_storage_type | dtype | math_fidelity | inference_time_avg (ns) | TFLOPs (avg) | Utilization (vs user grid) | Utilization (vs 8x8 full grid) | +| m | k | n | use_trace | grid_size | in0_sharded | out_sharded | in0_storage_type | in1_storage_type | out_storage_type | dtype | math_fidelity | inference_time_avg (ns) | TFLOPS (avg) | Utilization (vs user grid) | Utilization (vs 8x8 full grid) | |------:|------:|------:|:------------|:------------|:--------------|:--------------|:-------------------|:-------------------|:-------------------|:-------------------|:-------------------|--------------------------:|---------------:|:-----------------------------|:---------------------------------| | 512 | 1024 | 2048 | True | (8, 8) | True | True | L1 | DRAM | L1 | DataType.BFLOAT16 | MathFidelity.HiFi2 | 52824 | 40.65 | 31.02% | 31.02% | | 1024 | 1024 | 1024 | True | (8, 8) | True | True | L1 | DRAM | L1 | DataType.BFLOAT16 | MathFidelity.HiFi2 | 36845.2 | 58.28 | 44.47% | 44.47% -![A simple bar chart of the TFLOPS on WH when using square vs rectangular matrcies](images/effects_of_shapes.png "Square vs rectangular Matrix TFLOPS on WH from SRAM") +![A simple bar chart of the TFLOPS on WH when using square vs rectangular matrices](images/effects_of_shapes.png "Square vs rectangular Matrix TFLOPS on WH from SRAM") ### Out of Box performance -We also show the peak results we can get based on auto-selected matmul configuturations, which the matmul op itself chooses the configuraitons. It currently is not perfect and we'll continue improve it so that it can match or even surpass the manually selected ones. We show the results from 512x512x512 to 4096x4096x4096. The reason we are not testing shapes larger is due to the wrong selections of matmul configuturations. +We also show the peak results we can get based on auto-selected matmul configurations, which the matmul op itself chooses the configurations. It currently is not perfect and we'll continue improve it so that it can match or even surpass the manually selected ones. We show the results from 512x512x512 to 4096x4096x4096. The reason we are not testing shapes larger is due to the wrong selections of matmul configurations. -As we can see, the results are comprable to the manutally selected. +As we can see, the results are comparable to the manually selected. 
#### The full results table -| m | k | n | use_trace | grid_size | in0_storage_type | in1_storage_type | out_storage_type | dtype | math_fidelity | inference_time_avg (ns) | TFLOPs (avg) | Utilization (vs user grid) | Utilization (vs 8x8 full grid) | +| m | k | n | use_trace | grid_size | in0_storage_type | in1_storage_type | out_storage_type | dtype | math_fidelity | inference_time_avg (ns) | TFLOPS (avg) | Utilization (vs user grid) | Utilization (vs 8x8 full grid) | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | 512 | 512 | 512 | False | (8, 8) | DRAM | DRAM | DRAM | DataType.BFLOAT16 | MathFidelity.HiFi2 | 400640.96 | 0.67 | 0.51% | 0.51% | | 512 | 1024 | 1024 | False | (8, 8) | DRAM | DRAM | DRAM | DataType.BFLOAT16 | MathFidelity.HiFi2 | 296726.23 | 3.62 | 2.76% | 2.76% | diff --git a/tech_reports/matrix_engine/matrix_engine.md b/tech_reports/matrix_engine/matrix_engine.md index 30653fe0be1..2784826ea72 100644 --- a/tech_reports/matrix_engine/matrix_engine.md +++ b/tech_reports/matrix_engine/matrix_engine.md @@ -2,47 +2,47 @@ ## Introduction -The matrix engine supports the following operations: matrix mult, reduction, eltwise add/sub/mul, and tranpose_xy. +The matrix engine supports the following operations: matrix mult, reduction, eltwise add/sub/mul, and transpose_xy. ## Operations -### Matrix Mult +### Matrix Mult The WH matrix engine performs 8x16 x 16x16 = 8x16 in a single cycle. \ -This is 2*8\*16\*16 = 4096 muladds in a single cycle. At 1GHz, this is 4 TFLOPs per matrix engine. \ -The 8x16 is the smallest matrix that can be fed into in0, and 16x16 is the +This is 2*8\*16\*16 = 4096 muladds in a single cycle. At 1GHz, this is 4 TFLOPS per matrix engine. \ +The 8x16 is the smallest matrix that can be fed into in0, and 16x16 is the smallest matrix that can be fed into in1. -If the input matrices fed into the engine are "shorter" than 8x16, for example 1x16, the engine will still perform 8x16 x 16x16 = 8x16, but the effective throughput will be 1/8. -Thus, for 1x16 x 16x16 matricies, the effective throughput is 0.5 TFLOP per matrix engine. +If the input matrices fed into the engine are "shorter" than 8x16, for example 1x16, the engine will still perform 8x16 x 16x16 = 8x16, but the effective throughput will be 1/8. +Thus, for 1x16 x 16x16 matrices, the effective throughput is 0.5 TFLOPS per matrix engine. -MATH_FIDELITY is used for higher precision, and TFLOPs are calculated by dividing by the MATH_FIDELITY value. +MATH_FIDELITY is used for higher precision, and TFLOPS are calculated by dividing by the MATH_FIDELITY value. -LoFi -> 4 TFLOPs \ -HiFi2 -> 2 TFLOPs \ -HiFi3 -> 1.33 TFLOPs \ -HiFi4 -> 1 TFLOPs +LoFi -> 4 TFLOPS \ +HiFi2 -> 2 TFLOPS \ +HiFi3 -> 1.33 TFLOPS \ +HiFi4 -> 1 TFLOPS -### Reduction: Addition and Max -The WH matrix engine performs 16x16 reduce max/average operations in a single cycle. \ -This is 2*16\*16 multiply + adds in a single cycle. At 1GHz, this is 0.512 TFLOPs per matrix engine. +### Reduction: Max/Average/Sum +The WH matrix engine performs 16x16 reduce max/average/sum operations in a single cycle. \ +This is 2*16\*16 multiply + adds in a single cycle. At 1GHz, this is 0.512 TFLOPS per matrix engine. -Reduce max does not use MATH_FIDELITY; however reduce average does use MATH_FIDELITY for higher precision, and TFLOPs are calculated by dividing by the MATH_FIDELITY value. 
+Reduce max does not use MATH_FIDELITY; however reduce average/sum does use MATH_FIDELITY for higher precision, and TFLOPS are calculated by dividing by the MATH_FIDELITY value. -LoFi -> 0.512 TFLOPs \ -HiFi2 -> 0.256 TFLOPs \ -HiFi3 -> 0.171 TFLOPs \ -HiFi4 -> 0.128 TFLOPs +LoFi -> 0.512 TFLOPS \ +HiFi2 -> 0.256 TFLOPS \ +HiFi3 -> 0.171 TFLOPS \ +HiFi4 -> 0.128 TFLOPS ### Eltwise: Add, Sub, Mul The WH matrix engine performs 8x16 elementwise addition/subtraction/multiplication in a single cycle. \ -This is 8\*16 (multiply or adds, not both) in a single cycle. At 1Ghz, this is 0.128 TFLOPs per matrix engine. \ -Elementwise addition and subtraction do not use MATH_FIDELITY; however, Elementwise multiplication does use MATH_FIDELITY for higher precision, and TFLOPs are calculated by dividing by the MATH_FIDELITY value. +This is 8\*16 (multiply or adds, not both) in a single cycle. At 1GHz, this is 0.128 TFLOPS per matrix engine. \ +Elementwise addition and subtraction do not use MATH_FIDELITY; however, elementwise multiplication does use MATH_FIDELITY for higher precision, and TFLOPS are calculated by dividing by the MATH_FIDELITY value. -LoFi -> 0.128 TFLOPs \ -HiFi2 -> 0.064 TFLOPs \ -HiFi3 -> 0.043 TFLOPs \ -HiFi4 -> 0.032 TFLOPs +LoFi -> 0.128 TFLOPS \ +HiFi2 -> 0.064 TFLOPS \ +HiFi3 -> 0.043 TFLOPS \ +HiFi4 -> 0.032 TFLOPS ## Configurations @@ -65,7 +65,7 @@ Math Fidelity specifies the number of times an operation is run to consume the f LoFi -> SrcA register: uses 1 hidden bit + 4 most significant bits of the mantissa (MSB of the mantissa), SrcB register: uses 1 hidden bit + 6 MSB of the mantissa \ HiFi2 -> SrcA register: uses 1 hidden bit + next 4 bits of LSBs of the mantissa, SrcB register: uses 1 hidden bit + 6 MSB of the mantissa \ HiFi3 -> SrcA register: uses 1 hidden bit + 4 MSB of the mantissa, SrcB register: Uses 1 hidden bit + next 6 LSB of the mantissa \ -HiFi4 -> SrcA register: uses 1 hidden bit + next 4 bits of LSBs of the mantissa, SrcB register: Uses 1 hidden bit + next 6 LSB of the mantissa +HiFi4 -> SrcA register: uses 1 hidden bit + next 4 bits of LSBs of the mantissa, SrcB register: Uses 1 hidden bit + next 6 LSB of the mantissa ### Math Approx Mode @@ -84,6 +84,3 @@ Warning: If this flag is set, the math destination register can fit as half as m Wormhole has the ability to do accumulation in the L1 memory, the packer will read the input address, and accumulate it with the values read from dest, then write back into the same address. This feature is useful for accumulations in higher precision, and then a final pack call can be done to convert into lower precision (for example accumulate in fp32, then final output as float16_b). In order to enable this feature, `packer_l1_acc` must be set. 
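As a compact illustration of how the per-engine throughput figures in this report relate to each other, the following sketch assumes the 1 GHz clock used above and simply divides the quoted ops-per-cycle by the number of math-fidelity passes; it is illustrative only, not code from the repository. Note that reduce max and eltwise add/sub do not scale with fidelity, as stated above.

```python
# Peak per-engine throughput = ops_per_cycle * clock / math-fidelity passes.
# Ops-per-cycle values are the ones quoted above for each operation class.
CLOCK_HZ = 1e9
OPS_PER_CYCLE = {
    "matmul": 2 * 8 * 16 * 16,   # 4096 muladds per cycle
    "reduce": 2 * 16 * 16,       # 512 ops per cycle
    "eltwise": 8 * 16,           # 128 ops per cycle
}
FIDELITY_PASSES = {"LoFi": 1, "HiFi2": 2, "HiFi3": 3, "HiFi4": 4}

def peak_tflops(op: str, fidelity: str) -> float:
    return OPS_PER_CYCLE[op] * CLOCK_HZ / FIDELITY_PASSES[fidelity] / 1e12

print(peak_tflops("matmul", "LoFi"))    # ~4 TFLOPS, as quoted above
print(peak_tflops("reduce", "HiFi4"))   # 0.128 TFLOPS
print(peak_tflops("eltwise", "HiFi2"))  # 0.064 TFLOPS
```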
- - - From 01d33fcf6cd568d85e43985b94812e97ca6c75a9 Mon Sep 17 00:00:00 2001 From: Rashid Kaleem Date: Wed, 19 Feb 2025 11:08:18 -0600 Subject: [PATCH 169/316] [skip ci] Fix memory usage for repack script for Mixtral and Llama3 (#18008) --- models/demos/llama3/scripts/repack_weights.py | 3 +++ models/demos/t3000/mixtral8x7b/scripts/repack_weights.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/models/demos/llama3/scripts/repack_weights.py b/models/demos/llama3/scripts/repack_weights.py index e92c9b74570..2b3944328df 100644 --- a/models/demos/llama3/scripts/repack_weights.py +++ b/models/demos/llama3/scripts/repack_weights.py @@ -30,6 +30,9 @@ def repack_mixtral_weights(ckpt_dir, repack_dir): ) } + # clear the state dict to lower the memory footprint + state_dict.clear() + base_address = "feed_forward." for l in range(model_args.n_layers): print(f"Updating layer {l}...") diff --git a/models/demos/t3000/mixtral8x7b/scripts/repack_weights.py b/models/demos/t3000/mixtral8x7b/scripts/repack_weights.py index e92c9b74570..2b3944328df 100644 --- a/models/demos/t3000/mixtral8x7b/scripts/repack_weights.py +++ b/models/demos/t3000/mixtral8x7b/scripts/repack_weights.py @@ -30,6 +30,9 @@ def repack_mixtral_weights(ckpt_dir, repack_dir): ) } + # clear the state dict to lower the memory footprint + state_dict.clear() + base_address = "feed_forward." for l in range(model_args.n_layers): print(f"Updating layer {l}...") From 608e76ffa0d94c52c7f6a645cf28b7dcb02ac923 Mon Sep 17 00:00:00 2001 From: John Bauman Date: Sat, 1 Feb 2025 19:52:06 +0000 Subject: [PATCH 170/316] Unit test insert_write_packed_payloads Move insert_write_packed_payloads to DeviceCommandCalculator so it can be unit tested. Add a random test of it, templated over both subcommand types. --- .../dispatch_util/test_device_command.cpp | 67 +++++++++++++--- tt_metal/impl/CMakeLists.txt | 1 + .../dispatch/device_command_calculator.cpp | 72 +++++++++++++++++ .../dispatch/device_command_calculator.hpp | 18 +++++ tt_metal/impl/program/dispatch.cpp | 77 ++++--------------- 5 files changed, 164 insertions(+), 71 deletions(-) create mode 100644 tt_metal/impl/dispatch/device_command_calculator.cpp diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_device_command.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_device_command.cpp index acb99427b8f..8a5c67497ba 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_device_command.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_device_command.cpp @@ -176,29 +176,33 @@ TEST(DeviceCommandTest, AddPrefetchRelayPagedPacked) { EXPECT_EQ(command.size_bytes(), command.write_offset_bytes()); } -TEST(DeviceCommandTest, AddDispatchWritePacked) { +template +class WritePackedCommandTest : public ::testing::Test {}; + +using TestTypes = testing::Types; +TYPED_TEST_SUITE(WritePackedCommandTest, TestTypes); + +TYPED_TEST(WritePackedCommandTest, AddDispatchWritePacked) { { DeviceCommandCalculator calculator; - calculator.add_dispatch_write_packed(2, 5, 100, /*no_stride*/ false); + calculator.add_dispatch_write_packed(2, 5, 100, /*no_stride*/ false); HostMemDeviceCommand command(calculator.write_offset_bytes()); - std::vector sub_cmds(2); + std::vector sub_cmds(2); uint32_t data[1] = {}; std::vector> data_collection{{data, 4}, {data, 4}}; - command.add_dispatch_write_packed( - 2, 0, 5, 0, sub_cmds, data_collection, 100, 0, false); + command.add_dispatch_write_packed(2, 0, 5, 0, sub_cmds, data_collection, 100, 0, false); EXPECT_EQ(command.size_bytes(), 
command.write_offset_bytes()); } { DeviceCommandCalculator calculator; - calculator.add_dispatch_write_packed(2, 5, 100, /*no_stride*/ true); + calculator.add_dispatch_write_packed(2, 5, 100, /*no_stride*/ true); HostMemDeviceCommand command(calculator.write_offset_bytes()); - std::vector sub_cmds(2); + std::vector sub_cmds(2); uint32_t data[1] = {}; std::vector> data_collection{{data, 4}}; - command.add_dispatch_write_packed( - 2, 0, 5, 0, sub_cmds, data_collection, 100, 0, true); + command.add_dispatch_write_packed(2, 0, 5, 0, sub_cmds, data_collection, 100, 0, true); EXPECT_EQ(command.size_bytes(), command.write_offset_bytes()); } } @@ -226,3 +230,48 @@ TEST(DeviceCommandTest, AddDispatchWritePackedLarge) { EXPECT_EQ(command.size_bytes(), command.write_offset_bytes()); } } + +TYPED_TEST(WritePackedCommandTest, RandomAddDispatchWritePacked) { + srand(0); + for (size_t i = 0; i < 100; i++) { + DeviceCommandCalculator calculator; + uint32_t random_start = (rand() % 4) % 32; + calculator.add_data(random_start); + uint32_t num_sub_cmds = rand() % 100 + 1; + uint32_t sub_cmd_sizeB = rand() % 2000 + 1; + uint32_t max_prefetch_command_size = 16384; + uint32_t packed_write_max_unicast_sub_cmds = 64; + + std::vector> packed_cmd_payloads; + calculator.insert_write_packed_payloads( + num_sub_cmds, + sub_cmd_sizeB, + max_prefetch_command_size, + packed_write_max_unicast_sub_cmds, + packed_cmd_payloads); + + uint32_t data[2001] = {}; + std::vector> data_collection; + for (size_t j = 0; j < num_sub_cmds; j++) { + data_collection.push_back({data, sub_cmd_sizeB}); + } + + HostMemDeviceCommand command(calculator.write_offset_bytes()); + command.add_data(nullptr, 0, random_start); + uint32_t curr_sub_cmd_idx = 0; + for (const auto& [sub_cmd_ct, payload_size] : packed_cmd_payloads) { + std::vector sub_cmds(sub_cmd_ct); + command.add_dispatch_write_packed( + sub_cmd_ct, + 0, + sub_cmd_sizeB, + payload_size, + sub_cmds, + data_collection, + packed_write_max_unicast_sub_cmds, + curr_sub_cmd_idx); + curr_sub_cmd_idx += sub_cmd_ct; + } + EXPECT_EQ(command.size_bytes(), command.write_offset_bytes()); + } +} diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index 7af67d6bada..7cd2d6bc3cf 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -23,6 +23,7 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/program/dispatch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/debug_tools.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/host_runtime_commands.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/device_command_calculator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/dispatch_query_manager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/dispatch_core_common.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/dispatch_core_manager.cpp diff --git a/tt_metal/impl/dispatch/device_command_calculator.cpp b/tt_metal/impl/dispatch/device_command_calculator.cpp new file mode 100644 index 00000000000..6760353715c --- /dev/null +++ b/tt_metal/impl/dispatch/device_command_calculator.cpp @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "device_command_calculator.hpp" + +namespace tt::tt_metal { + +template +uint32_t DeviceCommandCalculator::get_max_write_packed_sub_cmds( + uint32_t data_size, + uint32_t max_prefetch_cmd_size, + uint32_t packed_write_max_unicast_sub_cmds, + bool no_stride) const { + static_assert( + std::is_same::value or + std::is_same::value); + constexpr bool is_unicast = std::is_same::value; + uint32_t sub_cmd_sizeB = + is_unicast ? 
sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); + // Approximate calculation due to alignment + uint32_t max_prefetch_size = max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - this->pcie_alignment - + sizeof(CQDispatchCmd) - this->l1_alignment; + uint32_t max_prefetch_num_packed_cmds = + no_stride ? (max_prefetch_size - tt::align(data_size * sizeof(uint32_t), l1_alignment)) / sub_cmd_sizeB + : max_prefetch_size / (tt::align(data_size * sizeof(uint32_t), l1_alignment) + sub_cmd_sizeB); + + uint32_t packed_write_max_multicast_sub_cmds = + get_packed_write_max_multicast_sub_cmds(packed_write_max_unicast_sub_cmds); + return std::min( + max_prefetch_num_packed_cmds, + is_unicast ? packed_write_max_unicast_sub_cmds : packed_write_max_multicast_sub_cmds); +}; + +// Explicit template instantiations +template uint32_t DeviceCommandCalculator::get_max_write_packed_sub_cmds( + uint32_t, uint32_t, uint32_t, bool) const; +template uint32_t DeviceCommandCalculator::get_max_write_packed_sub_cmds( + uint32_t, uint32_t, uint32_t, bool) const; + +template +void DeviceCommandCalculator::insert_write_packed_payloads( + const uint32_t num_sub_cmds, + const uint32_t sub_cmd_sizeB, + const uint32_t max_prefetch_command_size, + const uint32_t packed_write_max_unicast_sub_cmds, + std::vector>& packed_cmd_payloads) { + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + const uint32_t aligned_sub_cmd_sizeB = tt::align(sub_cmd_sizeB, l1_alignment); + const uint32_t max_packed_sub_cmds_per_cmd = get_max_write_packed_sub_cmds( + aligned_sub_cmd_sizeB, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); + uint32_t rem_num_sub_cmds = num_sub_cmds; + while (rem_num_sub_cmds != 0) { + const uint32_t num_sub_cmds_in_cmd = std::min(max_packed_sub_cmds_per_cmd, rem_num_sub_cmds); + const uint32_t aligned_data_sizeB = aligned_sub_cmd_sizeB * num_sub_cmds_in_cmd; + const uint32_t dispatch_cmd_sizeB = + tt::align(sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd), l1_alignment); + packed_cmd_payloads.emplace_back(num_sub_cmds_in_cmd, dispatch_cmd_sizeB + aligned_data_sizeB); + rem_num_sub_cmds -= num_sub_cmds_in_cmd; + this->add_dispatch_write_packed( + num_sub_cmds_in_cmd, sub_cmd_sizeB, packed_write_max_unicast_sub_cmds); + } +} + +// Explicit template instantiations +template void DeviceCommandCalculator::insert_write_packed_payloads( + uint32_t, uint32_t, uint32_t, uint32_t, std::vector>&); + +template void DeviceCommandCalculator::insert_write_packed_payloads( + uint32_t, uint32_t, uint32_t, uint32_t, std::vector>&); + +} // namespace tt::tt_metal diff --git a/tt_metal/impl/dispatch/device_command_calculator.hpp b/tt_metal/impl/dispatch/device_command_calculator.hpp index 97f9ffa5aef..ccc4de7ce77 100644 --- a/tt_metal/impl/dispatch/device_command_calculator.hpp +++ b/tt_metal/impl/dispatch/device_command_calculator.hpp @@ -4,6 +4,7 @@ #include "hal.hpp" #include "tt_align.hpp" +#include namespace tt::tt_metal { class DeviceCommandCalculator { @@ -172,6 +173,23 @@ class DeviceCommandCalculator { this->cmd_write_offsetB = tt::align(this->cmd_write_offsetB, this->pcie_alignment); } + template + uint32_t get_max_write_packed_sub_cmds( + uint32_t data_size, + uint32_t max_prefetch_cmd_size, + uint32_t packed_write_max_unicast_sub_cmds, + bool no_stride) const; + + // Divide the sub commands into multiple dispatch commands if the number of sub commands exceeds the maximum number + // of sub commands that can be written in a single dispatch 
command. + template + void insert_write_packed_payloads( + const uint32_t num_sub_cmds, + const uint32_t sub_cmd_sizeB, + const uint32_t max_prefetch_command_size, + const uint32_t packed_write_max_unicast_sub_cmds, + std::vector>& packed_cmd_payloads); + private: void add_prefetch_relay_inline() { this->cmd_write_offsetB += sizeof(CQPrefetchCmd); } uint32_t cmd_write_offsetB = 0; diff --git a/tt_metal/impl/program/dispatch.cpp b/tt_metal/impl/program/dispatch.cpp index fdf9e4ee5ab..4dee6c4b520 100644 --- a/tt_metal/impl/program/dispatch.cpp +++ b/tt_metal/impl/program/dispatch.cpp @@ -429,30 +429,6 @@ void insert_stall_cmds(ProgramCommandSequence& program_command_sequence, SubDevi false, dispatch_message_addr, 0); } -template -uint32_t get_max_write_packed_sub_cmds( - uint32_t data_size, uint32_t max_prefetch_cmd_size, uint32_t packed_write_max_unicast_sub_cmds, bool no_stride) { - static_assert( - std::is_same::value or - std::is_same::value); - constexpr bool is_unicast = std::is_same::value; - uint32_t sub_cmd_sizeB = - is_unicast ? sizeof(CQDispatchWritePackedUnicastSubCmd) : sizeof(CQDispatchWritePackedMulticastSubCmd); - // Approximate calculation due to alignment - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - uint32_t max_prefetch_size = max_prefetch_cmd_size - sizeof(CQPrefetchCmd) - hal.get_alignment(HalMemType::HOST) - - sizeof(CQDispatchCmd) - l1_alignment; - uint32_t max_prefetch_num_packed_cmds = - no_stride ? (max_prefetch_size - tt::align(data_size * sizeof(uint32_t), l1_alignment)) / sub_cmd_sizeB - : max_prefetch_size / (tt::align(data_size * sizeof(uint32_t), l1_alignment) + sub_cmd_sizeB); - - uint32_t packed_write_max_multicast_sub_cmds = - get_packed_write_max_multicast_sub_cmds(packed_write_max_unicast_sub_cmds); - return std::min( - max_prefetch_num_packed_cmds, - is_unicast ? 
packed_write_max_unicast_sub_cmds : packed_write_max_multicast_sub_cmds); -}; - template void generate_runtime_args_cmds( std::vector& runtime_args_command_sequences, @@ -493,7 +469,8 @@ void generate_runtime_args_cmds( constexpr bool unicast = std::is_same::value; uint32_t num_packed_cmds_in_seq = sub_cmds.size(); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( + DeviceCommandCalculator calculator; + uint32_t max_packed_cmds = calculator.get_max_write_packed_sub_cmds( max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, no_stride); uint32_t offset_idx = 0; if (no_stride) { @@ -568,6 +545,7 @@ void assemble_runtime_args_commands( program_command_sequence.runtime_args_command_sequences = {}; uint32_t command_count = 0; + const DeviceCommandCalculator calculator; // Unique RTAs for (uint32_t programmable_core_type_index = 0; @@ -581,8 +559,9 @@ void assemble_runtime_args_commands( if (kg->total_rta_size != 0) { uint32_t num_sub_cmds = kg->core_ranges.num_cores(); uint32_t max_runtime_args_len = kg->total_rta_size / sizeof(uint32_t); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); + uint32_t max_packed_cmds = + calculator.get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); command_count += div_up(num_sub_cmds, max_packed_cmds); } } @@ -605,13 +584,15 @@ void assemble_runtime_args_commands( CoreType core_type = hal.get_core_type(programmable_core_type_index); if (core_type == CoreType::ETH) { uint32_t num_sub_cmds = kernel->logical_cores().size(); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); + uint32_t max_packed_cmds = + calculator.get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); command_count += div_up(num_sub_cmds, max_packed_cmds); } else { uint32_t num_sub_cmds = kernel->logical_coreranges().size(); - uint32_t max_packed_cmds = get_max_write_packed_sub_cmds( - max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); + uint32_t max_packed_cmds = + calculator.get_max_write_packed_sub_cmds( + max_runtime_args_len, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, true); command_count += div_up(num_sub_cmds, max_packed_cmds); } } @@ -788,31 +769,6 @@ void assemble_runtime_args_commands( program_command_sequence.runtime_args_fetch_size_bytes = runtime_args_fetch_size_bytes; } -template -void insert_write_packed_payloads( - DeviceCommandCalculator& calculator, - const uint32_t num_sub_cmds, - const uint32_t sub_cmd_sizeB, - const uint32_t max_prefetch_command_size, - const uint32_t packed_write_max_unicast_sub_cmds, - std::vector>& packed_cmd_payloads) { - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - const uint32_t aligned_sub_cmd_sizeB = tt::align(sub_cmd_sizeB, l1_alignment); - const uint32_t max_packed_sub_cmds_per_cmd = get_max_write_packed_sub_cmds( - aligned_sub_cmd_sizeB, max_prefetch_command_size, packed_write_max_unicast_sub_cmds, false); - uint32_t rem_num_sub_cmds = num_sub_cmds; - while (rem_num_sub_cmds != 0) { - const uint32_t num_sub_cmds_in_cmd = std::min(max_packed_sub_cmds_per_cmd, rem_num_sub_cmds); - const uint32_t aligned_data_sizeB = aligned_sub_cmd_sizeB * num_sub_cmds_in_cmd; - const uint32_t dispatch_cmd_sizeB = 
- tt::align(sizeof(CQDispatchCmd) + num_sub_cmds_in_cmd * sizeof(PackedSubCmd), l1_alignment); - packed_cmd_payloads.emplace_back(num_sub_cmds_in_cmd, dispatch_cmd_sizeB + aligned_data_sizeB); - rem_num_sub_cmds -= num_sub_cmds_in_cmd; - calculator.add_dispatch_write_packed( - num_sub_cmds_in_cmd, sub_cmd_sizeB, packed_write_max_unicast_sub_cmds); - } -} - void assemble_device_commands( ProgramCommandSequence& program_command_sequence, Program& program, IDevice* device, SubDeviceId sub_device_id) { DeviceCommandCalculator calculator; @@ -890,8 +846,7 @@ void assemble_device_commands( transfer_info.data.data(), transfer_info.data.size() * sizeof(uint32_t)); } } - insert_write_packed_payloads( - calculator, + calculator.insert_write_packed_payloads( unicast_sem_sub_cmds[i].size(), unicast_sem_dst_size.back().second, max_prefetch_command_size, @@ -1196,8 +1151,7 @@ void assemble_device_commands( } } if (multicast_go_signal_sub_cmds.size() > 0) { - insert_write_packed_payloads( - calculator, + calculator.insert_write_packed_payloads( multicast_go_signal_sub_cmds.size(), go_signal_sizeB, max_prefetch_command_size, @@ -1233,8 +1187,7 @@ void assemble_device_commands( } if (unicast_go_signal_sub_cmds.size() > 0) { - insert_write_packed_payloads( - calculator, + calculator.insert_write_packed_payloads( unicast_go_signal_sub_cmds.size(), go_signal_sizeB, max_prefetch_command_size, From 20a4d36a8b76565c9c90c6a3202242d6df8d2d96 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Tue, 18 Feb 2025 18:59:40 +0000 Subject: [PATCH 171/316] #0: Remove client_interface from being a required global needed to be defined by the user We should be able to support multiple client interfaces to enable transfers in multiple directions --- .../tt_fabric_traffic_gen_rx_socket.cpp | 4 +- .../kernels/tt_fabric_traffic_gen_tx.cpp | 5 +- .../tt_fabric_traffic_gen_tx_socket.cpp | 6 +- .../routing/kernels/tt_fabric_tx_ubench.cpp | 8 ++- tt_fabric/hw/inc/tt_fabric_api.h | 70 ++++++++++++------- 5 files changed, 60 insertions(+), 33 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp index f2152656090..7431f98eb64 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp @@ -44,7 +44,8 @@ constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(13); volatile tt_l1_ptr chan_req_buf* client_pull_req_buf = reinterpret_cast(client_pull_req_buf_addr); -volatile fabric_client_interface_t* client_interface = (volatile fabric_client_interface_t*)client_interface_addr; +volatile tt_l1_ptr fabric_client_interface_t* client_interface = + (volatile tt_l1_ptr fabric_client_interface_t*)client_interface_addr; uint64_t xy_local_addr; socket_reader_state socket_reader; @@ -87,6 +88,7 @@ void kernel_main() { test_results[TT_FABRIC_MISC_INDEX] = 0xff000005; fabric_socket_open( + client_interface, // fabric client interface 3, // the network plane to use for this socket 2, // Temporal epoch for which the socket is being opened 1, // Socket Id to open diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index 2dac3ffaebe..af0c515e3dc 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -71,7 +71,8 @@ uint32_t max_packet_size_mask; auto input_queue_state = select_input_queue(); volatile local_pull_request_t *local_pull_request = (volatile local_pull_request_t *)(data_buffer_start_addr - 1024); volatile tt_l1_ptr fabric_router_l1_config_t* routing_table; -volatile fabric_client_interface_t* client_interface; +volatile tt_l1_ptr fabric_client_interface_t* client_interface = + (volatile tt_l1_ptr fabric_client_interface_t*)client_interface_addr; fvc_producer_state_t test_producer __attribute__((aligned(16))); fvcc_inbound_state_t fvcc_test_producer __attribute__((aligned(16))); @@ -456,7 +457,7 @@ void kernel_main() { uint32_t packet_count = 0; // initalize client - fabric_endpoint_init(client_interface_addr, outbound_eth_chan); + fabric_endpoint_init(client_interface, outbound_eth_chan); routing_table = reinterpret_cast(client_interface->routing_tables_l1_offset); while (true) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp index c4518f246b7..8253be83948 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp @@ -67,7 +67,8 @@ uint32_t max_packet_size_mask; auto input_queue_state = select_input_queue(); volatile local_pull_request_t* local_pull_request = (volatile local_pull_request_t*)(data_buffer_start_addr - 1024); volatile tt_l1_ptr fabric_router_l1_config_t* routing_table; -volatile fabric_client_interface_t* client_interface; +volatile tt_l1_ptr fabric_client_interface_t* client_interface = + (volatile tt_l1_ptr fabric_client_interface_t*)client_interface_addr; volatile tt_l1_ptr chan_req_buf* client_pull_req_buf = reinterpret_cast(client_pull_req_buf_addr); @@ -350,7 +351,7 @@ void kernel_main() { zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t)); // initalize client - fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + fabric_endpoint_init(client_interface, gk_interface_addr_l, gk_interface_addr_h); routing_table = reinterpret_cast( client_interface->routing_tables_l1_offset + sizeof(fabric_router_l1_config_t) * routing_plane); @@ -402,6 +403,7 @@ void kernel_main() { uint32_t packet_count = 0; socket_handle_t* socket_handle = fabric_socket_open( + client_interface_addr, // client interface address 3, // the network plane to use for this socket 2, // Temporal epoch for which the socket is being opened 1, // Socket Id to open diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp index ae1bebc19de..2cc881e93da 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp @@ -63,7 +63,8 @@ constexpr uint32_t w_depth = get_compile_time_arg_val(25); constexpr uint32_t n_depth = get_compile_time_arg_val(26); constexpr uint32_t s_depth = get_compile_time_arg_val(27); -volatile fabric_client_interface_t* client_interface; +volatile tt_l1_ptr fabric_client_interface_t* 
client_interface = + (volatile tt_l1_ptr fabric_client_interface_t*)client_interface_addr; uint64_t xy_local_addr; uint32_t target_address; @@ -136,7 +137,7 @@ void kernel_main() { } // initalize client - fabric_endpoint_init(client_interface_addr, outbound_eth_chan); + fabric_endpoint_init(client_interface, outbound_eth_chan); // notify the controller kernel that this worker is ready to proceed notify_traffic_controller(); @@ -147,6 +148,7 @@ void kernel_main() { while (*(volatile tt_l1_ptr uint32_t*)signal_address == 0); fabric_setup_pull_request( + client_interface, // fabric client interface data_buffer_start_addr, // source address in sender’s memory max_packet_size_words * 16 // number of bytes to write to remote destination ); @@ -157,6 +159,7 @@ void kernel_main() { client_interface->local_pull_request.pull_request.words_read = 0; if constexpr (mcast_data) { fabric_async_write_multicast( + client_interface, 0, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory dest_device >> 16, @@ -169,6 +172,7 @@ void kernel_main() { s_depth); } else { fabric_async_write( + client_interface, 0, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory dest_device >> 16, diff --git a/tt_fabric/hw/inc/tt_fabric_api.h b/tt_fabric/hw/inc/tt_fabric_api.h index b3c63d1da4f..d34eccb07c5 100644 --- a/tt_fabric/hw/inc/tt_fabric_api.h +++ b/tt_fabric/hw/inc/tt_fabric_api.h @@ -13,8 +13,6 @@ using namespace tt::tt_fabric; -extern volatile fabric_client_interface_t* client_interface; - #define ASYNC_WR_ADD_PR 1 #define ASYNC_WR_SEND 2 #define ASYNC_WR_ADD_HEADER 4 @@ -25,7 +23,11 @@ enum RoutingType : uint8_t { ROUTER_XY, }; -inline uint32_t get_next_hop_router_noc_xy(uint32_t routing_plane, uint32_t dst_mesh_id, uint32_t dst_dev_id) { +inline uint32_t get_next_hop_router_noc_xy( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, + uint32_t routing_plane, + uint32_t dst_mesh_id, + uint32_t dst_dev_id) { ASSERT(routing_plane < client_interface->num_routing_planes); fabric_router_l1_config_t* routing_table = (fabric_router_l1_config_t*)client_interface->routing_tables_l1_offset; if (dst_mesh_id != routing_table[routing_plane].my_mesh_id) { @@ -37,7 +39,8 @@ inline uint32_t get_next_hop_router_noc_xy(uint32_t routing_plane, uint32_t dst_ } } -inline void fabric_setup_pull_request(uint32_t src_addr, uint32_t size) { +inline void fabric_setup_pull_request( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t src_addr, uint32_t size) { uint32_t size_in_words = (size + PACKET_WORD_SIZE_BYTES - 1) >> 4; client_interface->local_pull_request.pull_request.wr_ptr = size_in_words; client_interface->local_pull_request.pull_request.rd_ptr = 0; @@ -52,10 +55,14 @@ inline void fabric_setup_pull_request(uint32_t src_addr, uint32_t size) { } template -inline void fabric_send_pull_request(uint32_t routing, uint16_t dst_mesh_id, uint16_t dst_dev_id) { +inline void fabric_send_pull_request( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, + uint32_t routing, + uint16_t dst_mesh_id, + uint16_t dst_dev_id) { uint64_t router_addr; if constexpr (routing_type == RoutingType::ROUTING_TABLE) { - router_addr = ((uint64_t)get_next_hop_router_noc_xy(routing, dst_mesh_id, dst_dev_id) << 32) | + router_addr = ((uint64_t)get_next_hop_router_noc_xy(client_interface, routing, dst_mesh_id, dst_dev_id) << 32) | FABRIC_ROUTER_REQ_QUEUE_START; } else { router_addr = 
get_noc_addr_helper(routing, FABRIC_ROUTER_REQ_QUEUE_START); @@ -63,7 +70,8 @@ inline void fabric_send_pull_request(uint32_t routing, uint16_t dst_mesh_id, uin tt_fabric_send_pull_request(router_addr, (volatile local_pull_request_t*)&client_interface->local_pull_request); } -FORCE_INLINE void fabric_wait_for_pull_request_words_flushed(uint32_t words) { +FORCE_INLINE void fabric_wait_for_pull_request_words_flushed( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t words) { while (client_interface->local_pull_request.pull_request.words_read < words) { #pragma GCC unroll 4 for (int i = 0; i < 4; i++) { @@ -72,14 +80,15 @@ FORCE_INLINE void fabric_wait_for_pull_request_words_flushed(uint32_t words) { } } -inline void fabric_wait_for_pull_request_bytes_flushed(uint32_t size) { +inline void fabric_wait_for_pull_request_bytes_flushed( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t size) { uint32_t size_in_words = (size + PACKET_WORD_SIZE_BYTES - 1) >> 4; - fabric_wait_for_pull_request_words_flushed(size_in_words); + fabric_wait_for_pull_request_words_flushed(client_interface, size_in_words); } -inline void fabric_wait_for_pull_request_flushed() { +inline void fabric_wait_for_pull_request_flushed(volatile tt_l1_ptr fabric_client_interface_t* client_interface) { uint32_t words_written = client_interface->local_pull_request.pull_request.words_written; - fabric_wait_for_pull_request_words_flushed(words_written); + fabric_wait_for_pull_request_words_flushed(client_interface, words_written); } inline void fabric_async_write_add_header( @@ -104,6 +113,7 @@ inline void fabric_async_write_add_header( // Packet is at src_addr in sender L1. template inline void fabric_async_write( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t routing, // the network plane to use for this transaction uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, @@ -116,11 +126,11 @@ inline void fabric_async_write( } if constexpr (mode & ASYNC_WR_ADD_PR) { - fabric_setup_pull_request(src_addr, size); + fabric_setup_pull_request(client_interface, src_addr, size); } if constexpr (mode & ASYNC_WR_SEND) { - fabric_send_pull_request(routing, dst_mesh_id, dst_dev_id); + fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } @@ -152,6 +162,7 @@ inline void fabric_async_write_multicast_add_header( // Packet is at src_addr in sender L1. template inline void fabric_async_write_multicast( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t routing_plane, // the network plane to use for this transaction uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, @@ -168,11 +179,11 @@ inline void fabric_async_write_multicast( } if constexpr (mode & ASYNC_WR_ADD_PR) { - fabric_setup_pull_request(src_addr, size); + fabric_setup_pull_request(client_interface, src_addr, size); } if constexpr (mode & ASYNC_WR_SEND) { - fabric_send_pull_request(routing_plane, dst_mesh_id, dst_dev_id); + fabric_send_pull_request(client_interface, routing_plane, dst_mesh_id, dst_dev_id); } } @@ -200,6 +211,7 @@ inline void fabric_atomic_inc_add_header( // Packet is at src_addr in sender L1. 
template inline void fabric_atomic_inc( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t routing, // the network plane to use for this transaction uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, @@ -212,11 +224,11 @@ inline void fabric_atomic_inc( } if constexpr (mode & ASYNC_WR_ADD_PR) { - fabric_setup_pull_request(src_addr, PACKET_HEADER_SIZE_BYTES); + fabric_setup_pull_request(client_interface, src_addr, PACKET_HEADER_SIZE_BYTES); } if constexpr (mode & ASYNC_WR_SEND) { - fabric_send_pull_request(routing, dst_mesh_id, dst_dev_id); + fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } @@ -246,6 +258,7 @@ inline void fabric_async_write_atomic_inc_add_header( // Packet is at src_addr in sender L1. template inline void fabric_async_write_atomic_inc( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t routing, // the network plane to use for this transaction uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, @@ -260,15 +273,15 @@ inline void fabric_async_write_atomic_inc( } if constexpr (mode & ASYNC_WR_ADD_PR) { - fabric_setup_pull_request(src_addr, size); + fabric_setup_pull_request(client_interface, src_addr, size); } if constexpr (mode & ASYNC_WR_SEND) { - fabric_send_pull_request(routing, dst_mesh_id, dst_dev_id); + fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } -inline void send_message_to_gk() { +inline void send_message_to_gk(volatile tt_l1_ptr fabric_client_interface_t* client_interface) { uint64_t gk_noc_base = client_interface->gk_msg_buf_addr; uint64_t noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, wrptr); noc_fast_atomic_increment( @@ -298,6 +311,7 @@ inline void send_message_to_gk() { } inline socket_handle_t* fabric_socket_open( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t routing_plane, // the network plane to use for this socket uint16_t epoch_id, // Temporal epoch for which the socket is being opened uint16_t socket_id, // Socket Id to open @@ -332,11 +346,12 @@ inline socket_handle_t* fabric_socket_open( client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_direction = direction; client_interface->gk_message.packet_header.packet_parameters.socket_parameters.routing_plane = routing_plane; tt_fabric_add_header_checksum((packet_header_t*)&client_interface->gk_message.packet_header); - send_message_to_gk(); + send_message_to_gk(client_interface); return socket_handle; } -inline void fabric_socket_close(socket_handle_t* socket_handle) { +inline void fabric_socket_close( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, socket_handle_t* socket_handle) { packet_header_t* packet_header = (packet_header_t*)&client_interface->gk_message.packet_header; uint32_t dst_mesh_id = socket_handle->rcvr_mesh_id; uint32_t dst_dev_id = socket_handle->rcvr_dev_id; @@ -355,7 +370,8 @@ inline void fabric_socket_close(socket_handle_t* socket_handle) { dst[i] = src[i]; } uint64_t dest_addr = - ((uint64_t)get_next_hop_router_noc_xy(socket_handle->routing_plane, dst_mesh_id, dst_dev_id) << 32) | + ((uint64_t)get_next_hop_router_noc_xy(client_interface, socket_handle->routing_plane, dst_mesh_id, dst_dev_id) + << 32) | FABRIC_ROUTER_REQ_QUEUE_START; tt_fabric_send_pull_request(dest_addr, (volatile local_pull_request_t*)&client_interface->local_pull_request); } @@ -368,10 +384,12 @@ inline void fabric_socket_connect(socket_handle_t* socket_handle) { } 
template -inline void fabric_endpoint_init(uint32_t base_address, uint32_t outbound_eth_chan) { +inline void fabric_endpoint_init( + volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t outbound_eth_chan) { tt_fabric_init(); - client_interface = (volatile fabric_client_interface_t*)base_address; - uint32_t routing_tables_offset = base_address + sizeof(fabric_client_interface_t); + // TODO: Should not assume routing tables are immediately after the client interface + // This should be a separate address we take in + uint32_t routing_tables_offset = (uint32_t)client_interface + sizeof(fabric_client_interface_t); zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); client_interface->routing_tables_l1_offset = routing_tables_offset; From a59ca64b3619e755ea22fcd72af27234ad08f0e5 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Tue, 18 Feb 2025 21:14:55 +0000 Subject: [PATCH 172/316] #0: Remove dependency on tt_fabric.h and global xy_local_addr from tt_fabric_api.h --- .../kernels/tt_fabric_traffic_gen_rx_socket.cpp | 1 + .../routing/kernels/tt_fabric_traffic_gen_tx.cpp | 1 + .../kernels/tt_fabric_traffic_gen_tx_socket.cpp | 1 + .../routing/kernels/tt_fabric_tx_ubench.cpp | 1 - tt_fabric/hw/inc/tt_fabric.h | 7 +------ tt_fabric/hw/inc/tt_fabric_api.h | 11 +++++++---- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp index 7431f98eb64..98061fbe385 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp @@ -81,6 +81,7 @@ void kernel_main() { test_results[TT_FABRIC_MISC_INDEX] = 0xff000004; // make sure fabric node gatekeeper is available. 
+ tt_fabric_init(); fabric_endpoint_init(); socket_reader.init(data_buffer_start_addr, data_buffer_size_words); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index af0c515e3dc..7783c84645f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -457,6 +457,7 @@ void kernel_main() { uint32_t packet_count = 0; // initalize client + tt_fabric_init(); fabric_endpoint_init(client_interface, outbound_eth_chan); routing_table = reinterpret_cast(client_interface->routing_tables_l1_offset); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp index 8253be83948..c46c85e4a7b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp @@ -351,6 +351,7 @@ void kernel_main() { zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t)); // initalize client + tt_fabric_init(); fabric_endpoint_init(client_interface, gk_interface_addr_l, gk_interface_addr_h); routing_table = reinterpret_cast( client_interface->routing_tables_l1_offset + sizeof(fabric_router_l1_config_t) * routing_plane); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp index 2cc881e93da..bd042ff4ae3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp @@ -66,7 +66,6 @@ constexpr uint32_t s_depth = get_compile_time_arg_val(27); volatile tt_l1_ptr fabric_client_interface_t* client_interface = (volatile tt_l1_ptr fabric_client_interface_t*)client_interface_addr; -uint64_t xy_local_addr; uint32_t target_address; uint32_t noc_offset; uint32_t controller_noc_offset; diff --git a/tt_fabric/hw/inc/tt_fabric.h b/tt_fabric/hw/inc/tt_fabric.h index 02ae486c69d..313f0933d66 100644 --- a/tt_fabric/hw/inc/tt_fabric.h +++ b/tt_fabric/hw/inc/tt_fabric.h @@ -1591,9 +1591,4 @@ inline uint64_t tt_fabric_send_pull_request(uint64_t dest_addr, volatile local_p return words_written_addr; } -inline void tt_fabric_init() { - uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(noc_index, 0, NOC_CFG(NOC_ID_LOGICAL)); - uint32_t my_x = noc_id_reg & NOC_NODE_ID_MASK; - uint32_t my_y = (noc_id_reg >> NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK; - xy_local_addr = NOC_XY_ADDR(my_x, my_y, 0); -} +inline void tt_fabric_init() { xy_local_addr = get_noc_addr(0); } diff --git a/tt_fabric/hw/inc/tt_fabric_api.h b/tt_fabric/hw/inc/tt_fabric_api.h index d34eccb07c5..964fe971155 100644 --- a/tt_fabric/hw/inc/tt_fabric_api.h +++ b/tt_fabric/hw/inc/tt_fabric_api.h @@ -5,13 +5,12 @@ #pragma once #include "risc_attribs.h" -#include #include "dataflow_api.h" #include "noc_overlay_parameters.h" #include "ethernet/dataflow_api.h" #include "tt_fabric_interface.h" -using namespace tt::tt_fabric; +namespace tt::tt_fabric { #define ASYNC_WR_ADD_PR 1 #define ASYNC_WR_SEND 2 @@ -42,6 +41,9 @@ inline uint32_t get_next_hop_router_noc_xy( inline void 
fabric_setup_pull_request( volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t src_addr, uint32_t size) { uint32_t size_in_words = (size + PACKET_WORD_SIZE_BYTES - 1) >> 4; + // TODO: Could return this value to the user and take this as an arg to avoid repeated lookup + // Added here to avoid user having to declare globals + uint64_t xy_local_addr = get_noc_addr(0); client_interface->local_pull_request.pull_request.wr_ptr = size_in_words; client_interface->local_pull_request.pull_request.rd_ptr = 0; client_interface->local_pull_request.pull_request.size = size; @@ -338,7 +340,7 @@ inline socket_handle_t* fabric_socket_open( client_interface->gk_message.packet_header.session.command = SOCKET_OPEN; client_interface->gk_message.packet_header.session.target_offset_h = client_interface->pull_req_buf_addr >> 32; client_interface->gk_message.packet_header.session.target_offset_l = (uint32_t)client_interface->pull_req_buf_addr; - client_interface->gk_message.packet_header.session.ack_offset_h = xy_local_addr >> 32; + client_interface->gk_message.packet_header.session.ack_offset_h = NOC_XY_ENCODING(my_x[noc_index], my_y[noc_index]); client_interface->gk_message.packet_header.session.ack_offset_l = (uint32_t)socket_handle; client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_id = socket_id; client_interface->gk_message.packet_header.packet_parameters.socket_parameters.epoch_id = epoch_id; @@ -386,7 +388,6 @@ inline void fabric_socket_connect(socket_handle_t* socket_handle) { template inline void fabric_endpoint_init( volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t outbound_eth_chan) { - tt_fabric_init(); // TODO: Should not assume routing tables are immediately after the client interface // This should be a separate address we take in uint32_t routing_tables_offset = (uint32_t)client_interface + sizeof(fabric_client_interface_t); @@ -403,3 +404,5 @@ inline void fabric_endpoint_init( noc_async_read_barrier(); } } + +} // namespace tt::tt_fabric From 6dea8e6f0f6fcb081622e5c39e70e511bd86a7de Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Tue, 11 Feb 2025 07:00:34 +0000 Subject: [PATCH 173/316] #0: increase test vc/mux demux thresholds --- .../tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp | 4 ++-- .../perf_microbenchmark/routing/test_vc_mux_demux.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp index 05a35add66a..f267a746382 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp @@ -553,10 +553,10 @@ int main(int argc, char **argv) { && (demux_queue_size_bytes >= 0x20000)) { double target_bandwidth = 0; if (max_packet_size_words >= 1024) { - target_bandwidth = 10; + target_bandwidth = 13; log_info(LogTest, "Perf check for pkt size >= 1024 words"); } else if (max_packet_size_words >= 256) { - target_bandwidth = 3; + target_bandwidth = 4; log_info(LogTest, "Perf check for pkt size >= 256 words"); } if (mux_bw < target_bandwidth) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp index 11eda9992de..805ea48ca01 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp @@ -593,10 +593,10 @@ int main(int argc, char **argv) { && (demux_queue_size_bytes >= 0x20000)) { double target_bandwidth = 0; if (max_packet_size_words >= 1024) { - target_bandwidth = 11; + target_bandwidth = 17; log_info(LogTest, "Perf check for pkt size >= 1024 words"); } else if (max_packet_size_words >= 256) { - target_bandwidth = 3; + target_bandwidth = 7; log_info(LogTest, "Perf check for pkt size >= 256 words"); } if (mux_bw < target_bandwidth) { From 22fd7c5b4eb7535cfbc7f616b57c81196cd89c8d Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Wed, 19 Feb 2025 16:40:51 -0500 Subject: [PATCH 174/316] upsize EDM fabric channel buffer slots to be able to fit 4 bfp8 tiles per packet (#18000) The current default EDM buffer slot size is 4096 which can only store 3 bfp8 tiles. There is enough space in erisc L1 unreserved space such that all channels can have a power of 2 buffer slot count and also have a slot size of 4 bfp8 tiles. There is inefficient space for 5 bfp8 tiles per slot. This commit bumps up the buffer slot size to fit 4 bfp8 tiles per packet, which is preferable for workloads with bfp8 tiles sent over fabric. --- ...fabric_erisc_data_mover_loopback_with_workers.cpp | 12 ++++++++---- .../ttnn/operations/ccl/erisc_datamover_builder.hpp | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index 1ab121ffec7..e45aa9d9395 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -427,7 +427,8 @@ bool RunLoopbackTest( // EDM Builder Setup //////////////////////////////////////////////////////////////////////////// - static constexpr std::size_t edm_buffer_size = 4096 + PACKET_HEADER_SIZE_BYTES; + static constexpr std::size_t edm_buffer_size = + ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES; auto chip0_worker_fabric_connection = chip_0_edm_builder.build_connection_to_worker_channel(); //////////////////////////////////////////////////////////////////////////// @@ -910,7 +911,8 @@ bool RunLineFabricTest( std::size_t page_plus_header_size = page_size + sizeof(tt::fabric::PacketHeader); std::size_t tensor_size_bytes = num_pages_total * page_size; - static constexpr std::size_t edm_buffer_size = 4096 + PACKET_HEADER_SIZE_BYTES; + static constexpr std::size_t edm_buffer_size = + ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES; const size_t local_chip_id = 0; const size_t remote_chip_id = 1; auto program_ptrs = std::vector(devices.size()); @@ -1237,7 +1239,8 @@ int TestLoopbackEntrypoint( IDevice* sender_device = device_0; IDevice* receiver_device = device_1; - static constexpr std::size_t edm_buffer_size = 4096 + PACKET_HEADER_SIZE_BYTES; + static constexpr std::size_t edm_buffer_size = + ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES; const chip_id_t local_chip_id = 0; const chip_id_t remote_chip_id = 1; auto const& edm_config = ttnn::ccl::FabricEriscDatamoverConfig(edm_buffer_size, 1, 2); @@ -2988,7 +2991,8 @@ void RunWriteThroughputStabilityTestWithPersistentFabric( static constexpr uint32_t source_payload_cb_index = 
tt::CB::c_in1; static constexpr size_t packet_header_cb_size_in_headers = 4; static constexpr bool enable_persistent_fabric_mode = true; - static constexpr size_t packet_payload_size_bytes = 4096; + static constexpr size_t packet_payload_size_bytes = + ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes; static constexpr size_t dest_buffer_size = packet_payload_size_bytes * 4; static constexpr tt::DataFormat cb_df = tt::DataFormat::Bfp8; diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp index a9d1a076ba6..b271f19ac52 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp @@ -183,7 +183,7 @@ class FabricEriscDatamoverBuilder { public: static constexpr size_t default_firmware_context_switch_interval = 200000; // payload only, no header - static constexpr size_t default_packet_payload_size_bytes = 4096; + static constexpr size_t default_packet_payload_size_bytes = tt::tile_size(tt::DataFormat::Bfp8_b) * 4; FabricEriscDatamoverBuilder( const CoreCoord& my_eth_core_logical, From 18433246b99fea337b5abcee1d09eb574159c1fd Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Wed, 19 Feb 2025 17:20:47 -0500 Subject: [PATCH 175/316] Set a timeout for TG Demo (#18054) ### Ticket None ### Problem description Sometimes this hangs and clogs the runner for 3h before timing out. A successful run seems to run in ~20m. ### What's changed Set a timeout. --- .github/workflows/tg-demo-tests-impl.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tg-demo-tests-impl.yaml b/.github/workflows/tg-demo-tests-impl.yaml index b5547d2abd6..492ad10f199 100644 --- a/.github/workflows/tg-demo-tests-impl.yaml +++ b/.github/workflows/tg-demo-tests-impl.yaml @@ -5,6 +5,7 @@ on: jobs: tg-demo-tests: + timeout-minutes: 30 strategy: fail-fast: false matrix: From 2faeab94151eae8a69dfee1c1d91cf8ae0d51956 Mon Sep 17 00:00:00 2001 From: Brian Beggs Date: Wed, 19 Feb 2025 14:39:22 -0800 Subject: [PATCH 176/316] [skip ci] Update matrix_engine.md (#18046) ### Ticket N/A ### Problem description Need document to be ready for BH release. ### What's changed Made use of WH and Wormhole consistent. Added note that numbers and figures apply to Blackhole as well. 
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- tech_reports/matrix_engine/matrix_engine.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tech_reports/matrix_engine/matrix_engine.md b/tech_reports/matrix_engine/matrix_engine.md index 2784826ea72..74179505b33 100644 --- a/tech_reports/matrix_engine/matrix_engine.md +++ b/tech_reports/matrix_engine/matrix_engine.md @@ -6,9 +6,12 @@ The matrix engine supports the following operations: matrix mult, reduction, elt ## Operations +>[!NOTE] +>All numbers and values apply to both Wormhole and Blackhole devices. + ### Matrix Mult -The WH matrix engine performs 8x16 x 16x16 = 8x16 in a single cycle. \ +The Wormhole matrix engine performs 8x16 x 16x16 = 8x16 in a single cycle. \ This is 2*8\*16\*16 = 4096 muladds in a single cycle. At 1GHz, this is 4 TFLOPS per matrix engine. \ The 8x16 is the smallest matrix that can be fed into in0, and 16x16 is the smallest matrix that can be fed into in1. @@ -24,7 +27,7 @@ HiFi3 -> 1.33 TFLOPS \ HiFi4 -> 1 TFLOPS ### Reduction: Max/Average/Sum -The WH matrix engine performs 16x16 reduce max/average/sum operations in a single cycle. \ +The Wormhole matrix engine performs 16x16 reduce max/average/sum operations in a single cycle. \ This is 2*16\*16 multiply + adds in a single cycle. At 1GHz, this is 0.512 TFLOPS per matrix engine. Reduce max does not use MATH_FIDELITY; however reduce average/sum does use MATH_FIDELITY for higher precision, and TFLOPS are calculated by dividing by the MATH_FIDELITY value. 
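A small sketch of the "short input" effect described in the matrix engine section above (illustrative only; it assumes throughput scales linearly with how many of the 8 in0 rows are occupied, since the engine still issues a full 8x16 x 16x16 operation):

```python
# Effective matmul throughput when in0 has fewer than 8 rows:
# the engine still performs a full 8x16 x 16x16 op, so utilization is rows/8.
def effective_tflops(rows_in0: int, fidelity_peak_tflops: float = 4.0) -> float:
    return fidelity_peak_tflops * min(rows_in0, 8) / 8

print(effective_tflops(1))  # 0.5 TFLOPS for 1x16 x 16x16, as noted above
print(effective_tflops(8))  # 4.0 TFLOPS at full height (LoFi)
```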
From 7b4857eb477981cc4ee4ccf0e9b6db850f62f051 Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Wed, 19 Feb 2025 18:35:49 +0000 Subject: [PATCH 177/316] #17992: fix overflow on TG - Col dispatch had a kernel overflow on the TG system by several bytes - Remove some unused data - Moved some members to type traits to be resolved at compiletime - Comment out all the DPRINT code --- .../impl/dispatch/kernels/packet_demux.cpp | 2 +- tt_metal/impl/dispatch/kernels/packet_mux.cpp | 2 +- .../impl/dispatch/kernels/packet_queue.hpp | 95 ++++++++++--------- .../impl/dispatch/kernels/vc_eth_tunneler.cpp | 3 +- .../dispatch/kernels/vc_packet_router.cpp | 4 +- 5 files changed, 56 insertions(+), 50 deletions(-) diff --git a/tt_metal/impl/dispatch/kernels/packet_demux.cpp b/tt_metal/impl/dispatch/kernels/packet_demux.cpp index cbe88e1dbef..36b01a59d3c 100644 --- a/tt_metal/impl/dispatch/kernels/packet_demux.cpp +++ b/tt_metal/impl/dispatch/kernels/packet_demux.cpp @@ -202,7 +202,7 @@ void kernel_main() { for (uint32_t i = 0; i < demux_fan_out; i++) { output_queues[i].init(i + 1, remote_tx_queue_start_addr_words[i], remote_tx_queue_size_words[i], remote_tx_x[i], remote_tx_y[i], remote_tx_queue_id[i], remote_tx_network_type[i], - &input_queue, 1, + &input_queue, output_depacketize[i], output_depacketize_log_page_size[i], output_depacketize_local_sem[i], output_depacketize_downstream_sem[i], output_depacketize_remove_header[i]); diff --git a/tt_metal/impl/dispatch/kernels/packet_mux.cpp b/tt_metal/impl/dispatch/kernels/packet_mux.cpp index 931e3997c85..c1e8777ec84 100644 --- a/tt_metal/impl/dispatch/kernels/packet_mux.cpp +++ b/tt_metal/impl/dispatch/kernels/packet_mux.cpp @@ -157,7 +157,7 @@ void kernel_main() { output_queue.init(mux_fan_in, remote_tx_queue_start_addr_words, remote_tx_queue_size_words, remote_tx_x, remote_tx_y, remote_tx_queue_id, tx_network_type, - input_queues, mux_fan_in, + input_queues, output_depacketize, output_depacketize_log_page_size, output_depacketize_downstream_sem, output_depacketize_local_sem, output_depacketize_remove_header); diff --git a/tt_metal/impl/dispatch/kernels/packet_queue.hpp b/tt_metal/impl/dispatch/kernels/packet_queue.hpp index 33eeec9232a..38d54fc9be5 100644 --- a/tt_metal/impl/dispatch/kernels/packet_queue.hpp +++ b/tt_metal/impl/dispatch/kernels/packet_queue.hpp @@ -16,6 +16,8 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "debug/dprint.h" +#define ENABLE_DPRINTS true + constexpr ProgrammableCoreType fd_core_type = static_cast(FD_CORE_TYPE); constexpr uint32_t NUM_WR_CMD_BUFS = 4; @@ -26,6 +28,40 @@ constexpr uint32_t DEFAULT_MAX_ETH_SEND_WORDS = 2*1024; constexpr uint32_t NUM_PTR_REGS_PER_INPUT_QUEUE = 1; constexpr uint32_t NUM_PTR_REGS_PER_OUTPUT_QUEUE = 2; +template +struct MaxSendWords { + static_assert(std::is_enum_v, + "NetworkTraits requires DispatchRemoteNetworkType enum"); + static_assert(std::is_void_v, "Unknown DispatchRemoteNetworkType"); +}; + +template<> +struct MaxSendWords { + static constexpr uint32_t max_send_words = 0; +}; + +template<> +struct MaxSendWords { + static constexpr uint32_t max_send_words = 0; +}; + +template<> +struct MaxSendWords { + static constexpr uint32_t max_send_words = DEFAULT_MAX_ETH_SEND_WORDS; +}; + +template<> +struct MaxSendWords { + static constexpr uint32_t max_send_words = DEFAULT_MAX_NOC_SEND_WORDS; +}; + +template<> +struct MaxSendWords { + static constexpr uint32_t max_send_words = DEFAULT_MAX_NOC_SEND_WORDS; +}; + +template +inline constexpr uint32_t max_send_words_v = 
MaxSendWords::max_send_words; inline uint64_t get_timestamp() { uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); @@ -43,21 +79,18 @@ void zero_l1_buf(tt_l1_ptr uint32_t* buf, uint32_t size_bytes) { } } -static FORCE_INLINE void write_test_results(tt_l1_ptr uint32_t* const buf, uint32_t i, uint32_t val) { if (buf != nullptr) { buf[i] = val; } } -static FORCE_INLINE void write_kernel_status(tt_l1_ptr uint32_t* const buf, uint32_t i, uint32_t val) { if (buf != nullptr) { buf[i] = val; } } -static FORCE_INLINE void set_64b_result(uint32_t* buf, uint64_t val, uint32_t index = 0) { if (buf != nullptr) { buf[index] = val >> 32; @@ -174,6 +207,8 @@ class packet_queue_state_t { this->queue_id = queue_id; this->queue_start_addr_words = queue_start_addr_words; this->queue_size_words = queue_size_words; + this->ptr_offset_mask = queue_size_words - 1; + this->queue_size_mask = (queue_size_words << 1) - 1; this->queue_is_input = queue_is_input; this->remote_x = remote_x; this->remote_y = remote_y; @@ -475,11 +510,8 @@ class packet_queue_state_t { } } - void yield() { - // TODO: implement yield for ethernet here - } - void dprint_object() { +#if ENABLE_DPRINTS DPRINT << " id: " << DEC() << static_cast(this->queue_id) << ENDL(); DPRINT << " start_addr: 0x" << HEX() << static_cast(this->queue_start_addr_words*PACKET_WORD_SIZE_BYTES) << ENDL(); DPRINT << " size_bytes: 0x" << HEX() << static_cast(this->queue_size_words*PACKET_WORD_SIZE_BYTES) << ENDL(); @@ -491,6 +523,7 @@ class packet_queue_state_t { DPRINT << " local_wptr: 0x" << HEX() << this->get_queue_local_wptr() << ENDL(); DPRINT << " local_rptr_sent: 0x" << HEX() << this->get_queue_local_rptr_sent() << ENDL(); DPRINT << " local_rptr_cleared: 0x" << HEX() << this->get_queue_local_rptr_cleared() << ENDL(); +#endif } }; @@ -571,9 +604,6 @@ class packet_input_queue_state_t : public packet_queue_state_t { packetizer_input_remote_sem_id, packetizer_input_log_page_size); - tt_l1_ptr uint32_t* queue_ptr = - reinterpret_cast(queue_start_addr_words*PACKET_WORD_SIZE_BYTES); - this->packetizer_page_words_cleared = 0; if (packetizer_input) { @@ -583,8 +613,6 @@ class packet_input_queue_state_t : public packet_queue_state_t { this->curr_packet_tag = 0xabcd; } - this->ptr_offset_mask = queue_size_words - 1; - this->queue_size_mask = (queue_size_words << 1) - 1; this->curr_packet_valid = false; this->reset_queue_local_wptr(); this->reset_ready_flag(); @@ -725,6 +753,7 @@ class packet_input_queue_state_t : public packet_queue_state_t { } void dprint_object() { +#if ENABLE_DPRINTS DPRINT << "Input queue:" << ENDL(); packet_queue_state_t::dprint_object(); DPRINT << " packet_valid: " << DEC() << static_cast(this->curr_packet_valid) << ENDL(); @@ -734,17 +763,14 @@ class packet_input_queue_state_t : public packet_queue_state_t { DPRINT << " packet_flags: 0x" << HEX() << static_cast(this->curr_packet_flags) << ENDL(); DPRINT << " packet_size_words: " << DEC() << static_cast(this->curr_packet_size_words) << ENDL(); DPRINT << " packet_words_sent: " << DEC() << static_cast(this->curr_packet_words_sent) << ENDL(); +#endif } }; class packet_output_queue_state_t : public packet_queue_state_t { - protected: - - uint32_t output_max_send_words; - uint32_t unpacketizer_page_words_sent; bool unpacketizer_remove_header; @@ -758,10 +784,7 @@ class packet_output_queue_state_t : public packet_queue_state_t { uint32_t curr_output_total_words_in_flight; uint32_t prev_output_total_words_in_flight; - uint8_t num_input_queues; - - void init(packet_input_queue_state_t* 
input_queue_array, uint32_t num_input_queues) { - this->num_input_queues = num_input_queues; + void init(packet_input_queue_state_t* input_queue_array) { this->input_queue_array = input_queue_array; this->curr_input_queue_words_in_flight = &(this->input_queue_words_in_flight[0]); this->prev_input_queue_words_in_flight = &(this->input_queue_words_in_flight[MAX_SWITCH_FAN_IN]); @@ -810,6 +833,7 @@ class packet_output_queue_state_t : public packet_queue_state_t { } void dprint_object() { +#if ENABLE_DPRINTS DPRINT << " curr_output_total_words_in_flight: " << DEC() << this->curr_output_total_words_in_flight << ENDL(); for (uint32_t j = 0; j < MAX_SWITCH_FAN_IN; j++) { DPRINT << " from input queue id " << DEC() << @@ -824,6 +848,7 @@ class packet_output_queue_state_t : public packet_queue_state_t { << DEC() << this->prev_input_queue_words_in_flight[j] << ENDL(); } +#endif } } input_queue_status; @@ -846,7 +871,6 @@ class packet_output_queue_state_t : public packet_queue_state_t { uint8_t remote_queue_id, DispatchRemoteNetworkType remote_update_network_type, packet_input_queue_state_t* input_queue_array, - uint8_t num_input_queues, bool unpacketizer_output = false, uint16_t unpacketizer_output_log_page_size = 0, uint8_t unpacketizer_output_sem_id = 0, @@ -861,25 +885,7 @@ class packet_output_queue_state_t : public packet_queue_state_t { this->unpacketizer_remove_header = unpacketizer_output_remove_header; this->unpacketizer_page_words_sent = 0; - this->ptr_offset_mask = queue_size_words - 1; - this->queue_size_mask = (queue_size_words << 1) - 1; - this->input_queue_status.init(input_queue_array, num_input_queues); - switch (remote_update_network_type) { - case DispatchRemoteNetworkType::DISABLE_QUEUE: - case DispatchRemoteNetworkType::NONE: - this->output_max_send_words = 0; - break; - case DispatchRemoteNetworkType::ETH: - this->output_max_send_words = DEFAULT_MAX_ETH_SEND_WORDS; - break; - case DispatchRemoteNetworkType::NOC0: - case DispatchRemoteNetworkType::NOC1: - this->output_max_send_words = DEFAULT_MAX_NOC_SEND_WORDS; - break; - default: - ASSERT(false); - } - + this->input_queue_status.init(input_queue_array); this->reset_queue_local_rptr_sent(); this->reset_queue_local_rptr_cleared(); this->reset_ready_flag(); @@ -946,14 +952,13 @@ class packet_output_queue_state_t : public packet_queue_state_t { return false; } } - this->yield(); } this->input_queue_status.prev_words_in_flight_flush(); this->input_queue_status.prev_words_in_flight_flush(); return true; } - template + template inline uint32_t get_num_words_to_send(uint32_t input_queue_index) { packet_input_queue_state_t* input_queue_ptr = &(this->input_queue_status.input_queue_array[input_queue_index]); @@ -965,7 +970,7 @@ class packet_output_queue_state_t : public packet_queue_state_t { uint32_t output_buf_words_before_wptr_wrap = this->get_queue_words_before_wptr_wrap(); num_words_to_forward = std::min(num_words_to_forward, output_buf_words_before_wptr_wrap); - num_words_to_forward = std::min(num_words_to_forward, this->output_max_send_words); + num_words_to_forward = std::min(num_words_to_forward, max_send_words_v); return num_words_to_forward; } @@ -973,7 +978,7 @@ class packet_output_queue_state_t : public packet_queue_state_t { template inline uint32_t forward_data_from_input(uint32_t input_queue_index, bool& full_packet_sent, uint16_t end_of_cmd) { packet_input_queue_state_t* input_queue_ptr = &(this->input_queue_status.input_queue_array[input_queue_index]); - uint32_t num_words_to_forward = 
this->get_num_words_to_send(input_queue_index); + uint32_t num_words_to_forward = this->get_num_words_to_send(input_queue_index); full_packet_sent = (num_words_to_forward == input_queue_ptr->get_curr_packet_words_remaining()); if (num_words_to_forward == 0) { return 0; @@ -1019,9 +1024,11 @@ class packet_output_queue_state_t : public packet_queue_state_t { } void dprint_object() { +#if ENABLE_DPRINTS DPRINT << "Output queue:" << ENDL(); packet_queue_state_t::dprint_object(); this->input_queue_status.dprint_object(); +#endif } }; diff --git a/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp b/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp index e248c8b6d24..e61bfb2a3bb 100644 --- a/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp @@ -250,8 +250,7 @@ void kernel_main() { remote_receiver_y[i], remote_receiver_queue_id[i], remote_receiver_network_type[i], - &input_queues[i], - 1); + &input_queues[i]); } if (!wait_all_input_output_ready 0) { + if constexpr (timeout_cycles > 0) { uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp; if (cycles_since_progress > timeout_cycles) { timeout = true; From 60741ddbcfc562ec401d1ec9ec30d4ff13eed1c6 Mon Sep 17 00:00:00 2001 From: Allan Liu Date: Thu, 16 Jan 2025 19:05:06 +0000 Subject: [PATCH 178/316] Move fabric to tt_metal --- CMakeLists.txt | 10 --- CODEOWNERS | 6 +- tests/tt_metal/tt_fabric/CMakeLists.txt | 3 +- .../fabric_router/test_routing_tables.cpp | 25 ++++--- .../perf_microbenchmark/CMakeLists.txt | 3 +- .../routing/kernels/traffic_gen.hpp | 2 +- .../routing/kernels/traffic_gen_tx.cpp | 2 +- .../routing/kernels/tt_fabric_traffic_gen.hpp | 2 +- .../kernels/tt_fabric_traffic_gen_rx.cpp | 6 +- .../kernels/tt_fabric_traffic_gen_tx.cpp | 6 +- .../routing/test_common.hpp | 2 +- .../test_tt_fabric_multi_hop_sanity.cpp | 12 ++-- .../routing/test_tt_fabric_sanity.cpp | 12 ++-- .../routing/test_tt_fabric_socket_sanity.cpp | 8 +-- tt_fabric/CMakeLists.txt | 49 -------------- tt_fabric/routing_table_generator.hpp | 60 ----------------- tt_metal/CMakeLists.txt | 2 + .../api/tt-metalium}/control_plane.hpp | 2 +- tt_metal/api/tt-metalium/device_pool.hpp | 1 + .../api/tt-metalium/fabric_host_interface.h | 64 ++++++++++++++++++ .../api/tt-metalium}/mesh_graph.hpp | 65 +++++++++---------- .../tt-metalium/routing_table_generator.hpp | 60 +++++++++++++++++ tt_metal/fabric/CMakeLists.txt | 42 ++++++++++++ .../fabric}/control_plane.cpp | 0 .../fabric}/hw/inc/eth_chan_noc_mapping.h | 0 .../fabric}/hw/inc/routing_table.h | 14 ++-- .../fabric}/hw/inc/tt_fabric.h | 6 +- .../fabric}/hw/inc/tt_fabric_api.h | 0 .../fabric}/hw/inc/tt_fabric_interface.h | 1 + .../fabric}/hw/inc/tt_fabric_status.h | 0 .../impl/kernels/tt_fabric_gatekeeper.cpp | 4 +- .../fabric}/impl/kernels/tt_fabric_router.cpp | 6 +- {tt_fabric => tt_metal/fabric}/mesh_graph.cpp | 0 .../n300_mesh_graph_descriptor.yaml | 0 .../quanta_galaxy_mesh_graph_descriptor.yaml | 0 .../t3k_mesh_graph_descriptor.yaml | 0 .../tg_mesh_graph_descriptor.yaml | 0 .../fabric}/routing_table_generator.cpp | 0 tt_metal/impl/dispatch/topology.cpp | 1 + 39 files changed, 266 insertions(+), 210 deletions(-) delete mode 100644 tt_fabric/CMakeLists.txt delete mode 100644 tt_fabric/routing_table_generator.hpp rename {tt_fabric => tt_metal/api/tt-metalium}/control_plane.hpp (98%) create mode 100644 tt_metal/api/tt-metalium/fabric_host_interface.h rename {tt_fabric => tt_metal/api/tt-metalium}/mesh_graph.hpp (56%) create mode 100644 
tt_metal/api/tt-metalium/routing_table_generator.hpp create mode 100644 tt_metal/fabric/CMakeLists.txt rename {tt_fabric => tt_metal/fabric}/control_plane.cpp (100%) rename {tt_fabric => tt_metal/fabric}/hw/inc/eth_chan_noc_mapping.h (100%) rename {tt_fabric => tt_metal/fabric}/hw/inc/routing_table.h (88%) rename {tt_fabric => tt_metal/fabric}/hw/inc/tt_fabric.h (99%) rename {tt_fabric => tt_metal/fabric}/hw/inc/tt_fabric_api.h (100%) rename {tt_fabric => tt_metal/fabric}/hw/inc/tt_fabric_interface.h (99%) rename {tt_fabric => tt_metal/fabric}/hw/inc/tt_fabric_status.h (100%) rename {tt_fabric => tt_metal/fabric}/impl/kernels/tt_fabric_gatekeeper.cpp (99%) rename {tt_fabric => tt_metal/fabric}/impl/kernels/tt_fabric_router.cpp (98%) rename {tt_fabric => tt_metal/fabric}/mesh_graph.cpp (100%) rename {tt_fabric => tt_metal/fabric}/mesh_graph_descriptors/n300_mesh_graph_descriptor.yaml (100%) rename {tt_fabric => tt_metal/fabric}/mesh_graph_descriptors/quanta_galaxy_mesh_graph_descriptor.yaml (100%) rename {tt_fabric => tt_metal/fabric}/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml (100%) rename {tt_fabric => tt_metal/fabric}/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml (100%) rename {tt_fabric => tt_metal/fabric}/routing_table_generator.cpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21ffe59c943..57cf47858c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -257,7 +257,6 @@ include(tracy) # Build subdirectories ############################################################################################################################ -add_subdirectory(tt_fabric) add_subdirectory(tt_metal) add_subdirectory(ttnn) @@ -272,15 +271,6 @@ endif() ############################################################################################################################ # Install for build artifacts that will upload build/lib -install( - TARGETS - tt_fabric - ARCHIVE - DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY - DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT tar -) install( TARGETS tt_metal diff --git a/CODEOWNERS b/CODEOWNERS index f50e3bb6075..62994bfe05c 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -47,9 +47,6 @@ tests/scripts/t3000/ @tenstorrent/metalium-developers-infra tests/scripts/tg/ @tenstorrent/metalium-developers-infra tests/scripts/tgg/ @tenstorrent/metalium-developers-infra -# fabric -tt_fabric/ @ubcheema @aliuTT @aagarwalTT - # Metalium - public API tt_metal/api @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @cfjchu @omilyutin-tt @@ -59,6 +56,9 @@ tt_metal/host_api.hpp @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal tt_metal/impl/device/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @davorchap @cfjchu @omilyutin-tt tt_metal/**/requirements*.txt @tenstorrent/metalium-developers-infra +# fabric +tt_metal/fabric/ @ubcheema @aliuTT @aagarwalTT + # metal - dispatch tt_metal/impl/dispatch/kernels/packet_* @ubcheema @aliuTT tt_metal/impl/dispatch/kernels/eth_* @ubcheema @aliuTT diff --git a/tests/tt_metal/tt_fabric/CMakeLists.txt b/tests/tt_metal/tt_fabric/CMakeLists.txt index f18be1886d4..796577e524c 100644 --- a/tests/tt_metal/tt_fabric/CMakeLists.txt +++ b/tests/tt_metal/tt_fabric/CMakeLists.txt @@ -5,7 +5,7 @@ target_link_libraries( fabric_unit_tests PRIVATE tt_metal - tt_fabric + fabric test_common_libs ) @@ -13,7 +13,6 @@ target_include_directories( fabric_unit_tests PRIVATE ${UMD_HOME} - ${PROJECT_SOURCE_DIR}/tt_fabric ${PROJECT_SOURCE_DIR}/tests ${PROJECT_SOURCE_DIR}/tt_metal 
${CMAKE_CURRENT_SOURCE_DIR}/common diff --git a/tests/tt_metal/tt_fabric/fabric_router/test_routing_tables.cpp b/tests/tt_metal/tt_fabric/fabric_router/test_routing_tables.cpp index 9d335001d56..8b826ebcbac 100644 --- a/tests/tt_metal/tt_fabric/fabric_router/test_routing_tables.cpp +++ b/tests/tt_metal/tt_fabric/fabric_router/test_routing_tables.cpp @@ -4,9 +4,9 @@ #include #include "fabric_fixture.hpp" -#include "tt_fabric/control_plane.hpp" -#include "tt_fabric/mesh_graph.hpp" -#include "tt_fabric/routing_table_generator.hpp" +#include +#include +#include namespace tt::tt_fabric { namespace fabric_router_tests { @@ -14,21 +14,21 @@ namespace fabric_router_tests { TEST_F(ControlPlaneFixture, TestTGMeshGraphInit) { const std::filesystem::path tg_mesh_graph_desc_path = std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / - "tt_fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; + "tt_metal/fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; auto mesh_graph_desc = std::make_unique(tg_mesh_graph_desc_path.string()); } TEST_F(ControlPlaneFixture, TestTGControlPlaneInit) { const std::filesystem::path tg_mesh_graph_desc_path = std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / - "tt_fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; + "tt_metal/fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; auto control_plane = std::make_unique(tg_mesh_graph_desc_path.string()); } TEST_F(ControlPlaneFixture, TestTGFabricRoutes) { const std::filesystem::path tg_mesh_graph_desc_path = std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / - "tt_fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; + "tt_metal/fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; auto control_plane = std::make_unique(tg_mesh_graph_desc_path.string()); auto valid_chans = control_plane->get_valid_eth_chans_on_routing_plane(0, 0, 3); for (auto chan : valid_chans) { @@ -39,21 +39,21 @@ TEST_F(ControlPlaneFixture, TestTGFabricRoutes) { TEST_F(ControlPlaneFixture, TestT3kMeshGraphInit) { const std::filesystem::path t3k_mesh_graph_desc_path = std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / - "tt_fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml"; + "tt_metal/fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml"; auto mesh_graph_desc = std::make_unique(t3k_mesh_graph_desc_path.string()); } TEST_F(ControlPlaneFixture, TestT3kControlPlaneInit) { const std::filesystem::path t3k_mesh_graph_desc_path = std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / - "tt_fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml"; + "tt_metal/fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml"; auto control_plane = std::make_unique(t3k_mesh_graph_desc_path.string()); } TEST_F(ControlPlaneFixture, TestT3kFabricRoutes) { const std::filesystem::path t3k_mesh_graph_desc_path = std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / - "tt_fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml"; + "tt_metal/fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml"; auto control_plane = std::make_unique(t3k_mesh_graph_desc_path.string()); auto valid_chans = control_plane->get_valid_eth_chans_on_routing_plane(0, 0, 0); for (auto chan : valid_chans) { @@ -65,5 +65,12 @@ TEST_F(ControlPlaneFixture, TestT3kFabricRoutes) { } } +TEST_F(ControlPlaneFixture, TestQuantaGalaxyControlPlaneInit) { + const 
std::filesystem::path quanta_galaxy_mesh_graph_desc_path = + std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / + "tt_metal/fabric/mesh_graph_descriptors/quanta_galaxy_mesh_graph_descriptor.yaml"; + auto control_plane = std::make_unique(quanta_galaxy_mesh_graph_desc_path.string()); +} + } // namespace fabric_router_tests } // namespace tt::tt_fabric diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt index 7573ef25f91..e4178cba02b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt @@ -66,7 +66,7 @@ foreach(arch ${ARCHITECTURES}) test_metal_common_libs PRIVATE yaml-cpp::yaml-cpp - tt_fabric + fabric ) if(${TEST_SRC} STREQUAL "dispatch/test_pgm_dispatch.cpp") target_link_libraries(${TEST_TARGET} PRIVATE benchmark::benchmark) @@ -77,7 +77,6 @@ foreach(arch ${ARCHITECTURES}) PRIVATE ${PROJECT_SOURCE_DIR}/tt_metal/hw/inc/${arch} "$" - ${PROJECT_SOURCE_DIR}/tt_fabric ${PROJECT_SOURCE_DIR}/ttnn/cpp/ttnn/deprecated # this all should go away and be replaced with link to ttnn ${PROJECT_SOURCE_DIR}/tests ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp index 01b9dedaae2..76737d354b4 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp @@ -5,7 +5,7 @@ #pragma once #include "debug/dprint.h" -#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" inline uint32_t prng_next(uint32_t n) { uint32_t x = n; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp index 57812ccde36..2dd8613a562 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp @@ -5,7 +5,7 @@ #include "dataflow_api.h" #include "debug/dprint.h" #include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" -#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp" constexpr uint32_t src_endpoint_id = get_compile_time_arg_val(0); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp index 19fcdc79dbd..b7ceb0376ff 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp @@ -5,7 +5,7 @@ #pragma once #include "debug/dprint.h" -#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #define is_power_of_2(x) (((x) > 0) && (((x) & ((x) - 1)) == 0)) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx.cpp index 4c29d8b4ef9..b21e5a241ff 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx.cpp @@ -5,10 +5,10 @@ // clang-format off #include "debug/dprint.h" #include "dataflow_api.h" -#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_metal/fabric/hw/inc/tt_fabric.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" -#include "tt_fabric/hw/inc/tt_fabric_status.h" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" // clang-format on diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index 7783c84645f..9771420e537 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -5,10 +5,10 @@ // clang-format off #include "dataflow_api.h" #include "debug/dprint.h" -#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_metal/fabric/hw/inc/tt_fabric.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" -#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_api.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" // clang-format on diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp index f055d0a9833..ad6c6eff13b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp @@ -6,7 +6,7 @@ #include #include -#include "hw/inc/tt_fabric_status.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "llrt.hpp" static inline std::string to_string(pkt_dest_size_choices_t choice) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index d6aab9503dd..bacca186d10 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -6,13 +6,13 @@ #include #include #include -#include "tt_fabric/control_plane.hpp" +#include // #include // #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" #include "eth_l1_address_map.h" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" using std::vector; using namespace tt; @@ -233,7 +233,7 @@ int main(int argc, char** argv) { try { const std::filesystem::path tg_mesh_graph_desc_path = std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / - "tt_fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; + "tt_metal/fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; auto control_plane = std::make_unique(tg_mesh_graph_desc_path.string()); int num_devices = 
tt_metal::GetNumAvailableDevices(); @@ -360,7 +360,7 @@ int main(int argc, char** argv) { for (auto logical_core : device_router_cores) { auto router_kernel = tt_metal::CreateKernel( program_map[device.first], - "tt_fabric/impl/kernels/tt_fabric_router.cpp", + "tt_metal/fabric/impl/kernels/tt_fabric_router.cpp", logical_core, tt_metal::EthernetConfig{ .noc = tt_metal::NOC::NOC_0, .compile_args = router_compile_args, .defines = defines}); @@ -391,7 +391,7 @@ int main(int argc, char** argv) { auto kernel = tt_metal::CreateKernel( program_map[device.first], - "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp", + "tt_metal/fabric/impl/kernels/tt_fabric_gatekeeper.cpp", {gk_core}, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index eba9b2ed24e..abf891874ca 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -6,14 +6,14 @@ #include #include #include -#include "tt_fabric/control_plane.hpp" -#include "tt_fabric/mesh_graph.hpp" +#include +#include //#include //#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" #include "eth_l1_address_map.h" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" #include #include #include @@ -46,7 +46,7 @@ uint32_t tx_signal_address; uint32_t host_signal_address; // kernels -const std::string router_kernel_src = "tt_fabric/impl/kernels/tt_fabric_router.cpp"; +const std::string router_kernel_src = "tt_metal/fabric/impl/kernels/tt_fabric_router.cpp"; const std::string traffic_controller_src = "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_controller.cpp"; const std::string rx_kernel_src = @@ -171,7 +171,7 @@ typedef struct test_board { try { const std::filesystem::path mesh_graph_desc_path = std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / - "tt_fabric/mesh_graph_descriptors" / mesh_graph_descriptor; + "tt_metal/fabric/mesh_graph_descriptors" / mesh_graph_descriptor; control_plane = std::make_unique(mesh_graph_desc_path.string()); } catch (const std::exception& e) { log_fatal(e.what()); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp index b6b81e575e1..b6a5e0182c8 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp @@ -6,13 +6,13 @@ #include #include #include -#include "tt_fabric/control_plane.hpp" +#include // #include "tt_metal/impl/dispatch/cq_commands.hpp" // #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" #include "eth_l1_address_map.h" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" using std::vector; using namespace tt; @@ -395,7 +395,7 @@ int main(int argc, char** argv) { auto kernel = 
tt_metal::CreateKernel( program_map[device.first], - "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp", + "tt_metal/fabric/impl/kernels/tt_fabric_gatekeeper.cpp", {gk_core}, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, diff --git a/tt_fabric/CMakeLists.txt b/tt_fabric/CMakeLists.txt deleted file mode 100644 index aa32e36a7e9..00000000000 --- a/tt_fabric/CMakeLists.txt +++ /dev/null @@ -1,49 +0,0 @@ -add_library(tt_fabric) -add_library(TT::Fabric ALIAS tt_fabric) - -target_sources( - tt_fabric - PRIVATE - control_plane.cpp - routing_table_generator.cpp - mesh_graph.cpp -) - -target_include_directories( - tt_fabric - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal/api/tt-metalium -) - -target_link_libraries( - tt_fabric - PRIVATE - Metalium::Metal - Metalium::Metal::LLRT - umd::device - metal_common_libs - magic_enum::magic_enum - fmt::fmt-header-only - yaml-cpp::yaml-cpp -) - -target_precompile_headers( - tt_fabric - PRIVATE - - - - - - -) - -target_compile_options(tt_fabric PRIVATE -Wno-int-to-pointer-cast) - -set_target_properties( - tt_fabric - PROPERTIES - INSTALL_RPATH - "${PROJECT_BINARY_DIR}/lib" -) diff --git a/tt_fabric/routing_table_generator.hpp b/tt_fabric/routing_table_generator.hpp deleted file mode 100644 index 0034ad05a0d..00000000000 --- a/tt_fabric/routing_table_generator.hpp +++ /dev/null @@ -1,60 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once -#include -#include "mesh_graph.hpp" - -namespace tt::tt_fabric { - -using RoutingTable = - std::vector>>; // [mesh_id][chip_id][target_chip_or_mesh_id] - -class RoutingTableGenerator { - public: - explicit RoutingTableGenerator(const std::string& mesh_graph_desc_yaml_file); - ~RoutingTableGenerator() = default; - - void dump_to_yaml(); - void load_from_yaml(); - - void print_connectivity() const { this->mesh_graph_->print_connectivity(); } - - const IntraMeshConnectivity& get_intra_mesh_connectivity() const { - return this->mesh_graph_->get_intra_mesh_connectivity(); - } - const InterMeshConnectivity& get_inter_mesh_connectivity() const { - return this->mesh_graph_->get_inter_mesh_connectivity(); - } - const ChipSpec& get_chip_spec() const { return this->mesh_graph_->get_chip_spec(); } - - std::uint32_t get_mesh_ns_size(mesh_id_t mesh_id) const { return this->mesh_graph_->get_mesh_ns_size(mesh_id); } - std::uint32_t get_mesh_ew_size(mesh_id_t mesh_id) const { return this->mesh_graph_->get_mesh_ew_size(mesh_id); } - - RoutingTable get_intra_mesh_table() const { return this->intra_mesh_table_; } - RoutingTable get_inter_mesh_table() const { return this->inter_mesh_table_; } - - void print_routing_tables() const; - - private: - std::unique_ptr mesh_graph_; - // configurable in future architectures - const uint32_t max_nodes_in_mesh_ = 1024; - const uint32_t max_num_meshes_ = 1024; - - std::vector mesh_sizes; - - RoutingTable intra_mesh_table_; - RoutingTable inter_mesh_table_; - - std::vector>>> get_paths_to_all_meshes( - mesh_id_t src, const InterMeshConnectivity& inter_mesh_connectivity); - void generate_intramesh_routing_table(const IntraMeshConnectivity& intra_mesh_connectivity); - // when generating intermesh routing table, we use the intramesh connectivity table to find the shortest path to - // the exit chip - void generate_intermesh_routing_table( - const InterMeshConnectivity& inter_mesh_connectivity, const IntraMeshConnectivity& intra_mesh_connectivity); -}; - -} // namespace tt::tt_fabric 
diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index 11c36177fa9..46a372f85a8 100644 --- a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -37,6 +37,7 @@ target_link_libraries( llrt detail distributed + fabric HAL::grayskull HAL::wormhole HAL::blackhole @@ -157,6 +158,7 @@ add_subdirectory(impl) add_subdirectory(detail) add_subdirectory(distributed) add_subdirectory(tt_stl) +add_subdirectory(fabric) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23) install( diff --git a/tt_fabric/control_plane.hpp b/tt_metal/api/tt-metalium/control_plane.hpp similarity index 98% rename from tt_fabric/control_plane.hpp rename to tt_metal/api/tt-metalium/control_plane.hpp index 0ad16aca13a..7c62a0ef9e4 100644 --- a/tt_fabric/control_plane.hpp +++ b/tt_metal/api/tt-metalium/control_plane.hpp @@ -7,7 +7,7 @@ #include "routing_table_generator.hpp" #include #include -#include "hw/inc/routing_table.h" +#include namespace tt::tt_fabric { diff --git a/tt_metal/api/tt-metalium/device_pool.hpp b/tt_metal/api/tt-metalium/device_pool.hpp index 31dbd2bf839..fb2cf7159e5 100644 --- a/tt_metal/api/tt-metalium/device_pool.hpp +++ b/tt_metal/api/tt-metalium/device_pool.hpp @@ -17,6 +17,7 @@ #include "dispatch_core_common.hpp" #include "span.hpp" #include "umd/device/types/cluster_descriptor_types.h" +#include "control_plane.hpp" namespace tt { namespace tt_metal::detail { diff --git a/tt_metal/api/tt-metalium/fabric_host_interface.h b/tt_metal/api/tt-metalium/fabric_host_interface.h new file mode 100644 index 00000000000..4218365b143 --- /dev/null +++ b/tt_metal/api/tt-metalium/fabric_host_interface.h @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#if defined(KERNEL_BUILD) || defined(FW_BUILD) +#include "risc_attribs.h" +#else +#define tt_l1_ptr +#define tt_reg_ptr +#define FORCE_INLINE inline +#endif + +// TODO: move routing table here +namespace tt::tt_fabric { + +constexpr uint32_t GATEKEEPER_INFO_SIZE_BYTES = 848; + +using chan_id_t = std::uint8_t; +using routing_plane_id_t = std::uint8_t; + +static constexpr std::uint32_t MAX_MESH_SIZE = 1024; +static constexpr std::uint32_t MAX_NUM_MESHES = 1024; + +static constexpr std::uint32_t NUM_CHANNELS_PER_UINT32 = sizeof(std::uint32_t) / sizeof(chan_id_t); +static constexpr std::uint32_t LOG_BASE_2_NUM_CHANNELS_PER_UINT32 = 2; +static constexpr std::uint32_t MODULO_LOG_BASE_2 = (1 << LOG_BASE_2_NUM_CHANNELS_PER_UINT32) - 1; +static constexpr std::uint32_t NUM_TABLE_ENTRIES = MAX_MESH_SIZE >> LOG_BASE_2_NUM_CHANNELS_PER_UINT32; + +static_assert(MAX_MESH_SIZE == MAX_NUM_MESHES, "MAX_MESH_SIZE must be equal to MAX_NUM_MESHES"); +static_assert( + (sizeof(std::uint32_t) / sizeof(chan_id_t)) == NUM_CHANNELS_PER_UINT32, + "LOG_BASE_2_NUM_CHANNELS_PER_UINT32 must be equal to log2(sizeof(std::uint32_t) / sizeof(chan_id_t))"); + +enum eth_chan_magic_values { + INVALID_DIRECTION = 0xDD, + INVALID_ROUTING_TABLE_ENTRY = 0xFF, +}; + +struct routing_table_t { + chan_id_t dest_entry[MAX_MESH_SIZE]; +}; + +struct port_direction_t { + chan_id_t north; + chan_id_t south; + chan_id_t east; + chan_id_t west; +}; + +struct fabric_router_l1_config_t { + routing_table_t intra_mesh_table; + routing_table_t inter_mesh_table; + port_direction_t port_direction; + std::uint16_t my_mesh_id; // Do we need this if we tag routing tables with magic values for outbound eth channels + // and route to local NOC? + std::uint16_t my_device_id; + std::uint8_t padding[8]; // pad to 16-byte alignment. 
+} __attribute__((packed)); + +} // namespace tt::tt_fabric diff --git a/tt_fabric/mesh_graph.hpp b/tt_metal/api/tt-metalium/mesh_graph.hpp similarity index 56% rename from tt_fabric/mesh_graph.hpp rename to tt_metal/api/tt-metalium/mesh_graph.hpp index 1b9ac9c6359..829ce2214d6 100644 --- a/tt_fabric/mesh_graph.hpp +++ b/tt_metal/api/tt-metalium/mesh_graph.hpp @@ -57,39 +57,36 @@ using InterMeshConnectivity = std::vector>>; class MeshGraph { - public: - explicit MeshGraph(const std::string& mesh_graph_desc_file_path); - MeshGraph() = delete; - ~MeshGraph() = default; - - void print_connectivity() const; - - const IntraMeshConnectivity& get_intra_mesh_connectivity() const { return intra_mesh_connectivity_; } - const InterMeshConnectivity& get_inter_mesh_connectivity() const { return inter_mesh_connectivity_; } - - const ChipSpec& get_chip_spec() const { return chip_spec_; } - - std::uint32_t get_mesh_ns_size(mesh_id_t mesh_id) const { return mesh_shapes_[mesh_id].first; } - std::uint32_t get_mesh_ew_size(mesh_id_t mesh_id) const { return mesh_shapes_[mesh_id].second; } - - private: - std::unordered_map get_valid_connections( - chip_id_t src_chip_id, - std::uint32_t row_size, - std::uint32_t num_chips_in_mesh, - FabricType fabric_type) const; - void initialize_from_yaml(const std::string& mesh_graph_desc_file_path); - - void add_to_connectivity( - mesh_id_t src_mesh_id, - chip_id_t src_chip_id, - chip_id_t dest_mesh_id, - chip_id_t dest_chip_id, - RoutingDirection port_direction); - - ChipSpec chip_spec_; - std::vector> mesh_shapes_; - IntraMeshConnectivity intra_mesh_connectivity_; - InterMeshConnectivity inter_mesh_connectivity_; +public: + explicit MeshGraph(const std::string& mesh_graph_desc_file_path); + MeshGraph() = delete; + ~MeshGraph() = default; + + void print_connectivity() const; + + const IntraMeshConnectivity& get_intra_mesh_connectivity() const { return intra_mesh_connectivity_; } + const InterMeshConnectivity& get_inter_mesh_connectivity() const { return inter_mesh_connectivity_; } + + const ChipSpec& get_chip_spec() const { return chip_spec_; } + + std::uint32_t get_mesh_ns_size(mesh_id_t mesh_id) const { return mesh_shapes_[mesh_id].first; } + std::uint32_t get_mesh_ew_size(mesh_id_t mesh_id) const { return mesh_shapes_[mesh_id].second; } + +private: + std::unordered_map get_valid_connections( + chip_id_t src_chip_id, std::uint32_t row_size, std::uint32_t num_chips_in_mesh, FabricType fabric_type) const; + void initialize_from_yaml(const std::string& mesh_graph_desc_file_path); + + void add_to_connectivity( + mesh_id_t src_mesh_id, + chip_id_t src_chip_id, + chip_id_t dest_mesh_id, + chip_id_t dest_chip_id, + RoutingDirection port_direction); + + ChipSpec chip_spec_; + std::vector> mesh_shapes_; + IntraMeshConnectivity intra_mesh_connectivity_; + InterMeshConnectivity inter_mesh_connectivity_; }; } // namespace tt::tt_fabric diff --git a/tt_metal/api/tt-metalium/routing_table_generator.hpp b/tt_metal/api/tt-metalium/routing_table_generator.hpp new file mode 100644 index 00000000000..ac57204ef1e --- /dev/null +++ b/tt_metal/api/tt-metalium/routing_table_generator.hpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include "mesh_graph.hpp" + +namespace tt::tt_fabric { + +using RoutingTable = + std::vector>>; // [mesh_id][chip_id][target_chip_or_mesh_id] + +class RoutingTableGenerator { +public: + explicit RoutingTableGenerator(const std::string& mesh_graph_desc_yaml_file); + ~RoutingTableGenerator() = default; + + void dump_to_yaml(); + void load_from_yaml(); + + void print_connectivity() const { this->mesh_graph_->print_connectivity(); } + + const IntraMeshConnectivity& get_intra_mesh_connectivity() const { + return this->mesh_graph_->get_intra_mesh_connectivity(); + } + const InterMeshConnectivity& get_inter_mesh_connectivity() const { + return this->mesh_graph_->get_inter_mesh_connectivity(); + } + const ChipSpec& get_chip_spec() const { return this->mesh_graph_->get_chip_spec(); } + + std::uint32_t get_mesh_ns_size(mesh_id_t mesh_id) const { return this->mesh_graph_->get_mesh_ns_size(mesh_id); } + std::uint32_t get_mesh_ew_size(mesh_id_t mesh_id) const { return this->mesh_graph_->get_mesh_ew_size(mesh_id); } + + RoutingTable get_intra_mesh_table() const { return this->intra_mesh_table_; } + RoutingTable get_inter_mesh_table() const { return this->inter_mesh_table_; } + + void print_routing_tables() const; + +private: + std::unique_ptr mesh_graph_; + // configurable in future architectures + const uint32_t max_nodes_in_mesh_ = 1024; + const uint32_t max_num_meshes_ = 1024; + + std::vector mesh_sizes; + + RoutingTable intra_mesh_table_; + RoutingTable inter_mesh_table_; + + std::vector>>> get_paths_to_all_meshes( + mesh_id_t src, const InterMeshConnectivity& inter_mesh_connectivity); + void generate_intramesh_routing_table(const IntraMeshConnectivity& intra_mesh_connectivity); + // when generating intermesh routing table, we use the intramesh connectivity table to find the shortest path to + // the exit chip + void generate_intermesh_routing_table( + const InterMeshConnectivity& inter_mesh_connectivity, const IntraMeshConnectivity& intra_mesh_connectivity); +}; + +} // namespace tt::tt_fabric diff --git a/tt_metal/fabric/CMakeLists.txt b/tt_metal/fabric/CMakeLists.txt new file mode 100644 index 00000000000..5898839611a --- /dev/null +++ b/tt_metal/fabric/CMakeLists.txt @@ -0,0 +1,42 @@ +set(FABRIC_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/control_plane.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/routing_table_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mesh_graph.cpp +) + +add_library(fabric OBJECT ${FABRIC_SRC}) + +target_include_directories(fabric PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + +target_link_libraries( + fabric + PRIVATE + Metalium::Metal::LLRT + umd::device + metal_common_libs + magic_enum::magic_enum + fmt::fmt-header-only + yaml-cpp::yaml-cpp + Metalium::Metal::Impl + TT::Metalium::HostDevCommon +) + +target_precompile_headers( + fabric + PRIVATE + + + + + + +) + +target_compile_options(fabric PRIVATE -Wno-int-to-pointer-cast) + +#set_target_properties( +# fabric +# PROPERTIES +# INSTALL_RPATH +# "${PROJECT_BINARY_DIR}/lib" +#) diff --git a/tt_fabric/control_plane.cpp b/tt_metal/fabric/control_plane.cpp similarity index 100% rename from tt_fabric/control_plane.cpp rename to tt_metal/fabric/control_plane.cpp diff --git a/tt_fabric/hw/inc/eth_chan_noc_mapping.h b/tt_metal/fabric/hw/inc/eth_chan_noc_mapping.h similarity index 100% rename from tt_fabric/hw/inc/eth_chan_noc_mapping.h rename to tt_metal/fabric/hw/inc/eth_chan_noc_mapping.h diff --git a/tt_fabric/hw/inc/routing_table.h b/tt_metal/fabric/hw/inc/routing_table.h similarity 
index 88% rename from tt_fabric/hw/inc/routing_table.h rename to tt_metal/fabric/hw/inc/routing_table.h index 70c862cc009..2c24c76401c 100644 --- a/tt_fabric/hw/inc/routing_table.h +++ b/tt_metal/fabric/hw/inc/routing_table.h @@ -32,7 +32,9 @@ static constexpr std::uint32_t MODULO_LOG_BASE_2 = (1 << LOG_BASE_2_NUM_CHANNELS static constexpr std::uint32_t NUM_TABLE_ENTRIES = MAX_MESH_SIZE >> LOG_BASE_2_NUM_CHANNELS_PER_UINT32; static_assert(MAX_MESH_SIZE == MAX_NUM_MESHES, "MAX_MESH_SIZE must be equal to MAX_NUM_MESHES"); -static_assert((sizeof(std::uint32_t) / sizeof(chan_id_t)) == NUM_CHANNELS_PER_UINT32, "LOG_BASE_2_NUM_CHANNELS_PER_UINT32 must be equal to log2(sizeof(std::uint32_t) / sizeof(chan_id_t))"); +static_assert( + (sizeof(std::uint32_t) / sizeof(chan_id_t)) == NUM_CHANNELS_PER_UINT32, + "LOG_BASE_2_NUM_CHANNELS_PER_UINT32 must be equal to log2(sizeof(std::uint32_t) / sizeof(chan_id_t))"); enum eth_chan_magic_values { INVALID_DIRECTION = 0xDD, @@ -40,14 +42,14 @@ enum eth_chan_magic_values { }; struct routing_table_t { - chan_id_t dest_entry[MAX_MESH_SIZE]; + chan_id_t dest_entry[MAX_MESH_SIZE]; }; struct port_direction_t { - chan_id_t north; - chan_id_t south; - chan_id_t east; - chan_id_t west; + chan_id_t north; + chan_id_t south; + chan_id_t east; + chan_id_t west; }; struct fabric_router_l1_config_t { diff --git a/tt_fabric/hw/inc/tt_fabric.h b/tt_metal/fabric/hw/inc/tt_fabric.h similarity index 99% rename from tt_fabric/hw/inc/tt_fabric.h rename to tt_metal/fabric/hw/inc/tt_fabric.h index 313f0933d66..ac82650c3bb 100644 --- a/tt_fabric/hw/inc/tt_fabric.h +++ b/tt_metal/fabric/hw/inc/tt_fabric.h @@ -9,9 +9,9 @@ #include "dataflow_api.h" #include "noc_overlay_parameters.h" #include "ethernet/dataflow_api.h" -#include "tt_fabric/hw/inc/routing_table.h" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" -#include "tt_fabric/hw/inc/eth_chan_noc_mapping.h" +#include +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" +#include "tt_metal/fabric/hw/inc/eth_chan_noc_mapping.h" using namespace tt::tt_fabric; diff --git a/tt_fabric/hw/inc/tt_fabric_api.h b/tt_metal/fabric/hw/inc/tt_fabric_api.h similarity index 100% rename from tt_fabric/hw/inc/tt_fabric_api.h rename to tt_metal/fabric/hw/inc/tt_fabric_api.h diff --git a/tt_fabric/hw/inc/tt_fabric_interface.h b/tt_metal/fabric/hw/inc/tt_fabric_interface.h similarity index 99% rename from tt_fabric/hw/inc/tt_fabric_interface.h rename to tt_metal/fabric/hw/inc/tt_fabric_interface.h index 9f8c1daa949..951231cd47c 100644 --- a/tt_fabric/hw/inc/tt_fabric_interface.h +++ b/tt_metal/fabric/hw/inc/tt_fabric_interface.h @@ -6,6 +6,7 @@ #include "eth_l1_address_map.h" #include "noc/noc_parameters.h" +#include namespace tt::tt_fabric { diff --git a/tt_fabric/hw/inc/tt_fabric_status.h b/tt_metal/fabric/hw/inc/tt_fabric_status.h similarity index 100% rename from tt_fabric/hw/inc/tt_fabric_status.h rename to tt_metal/fabric/hw/inc/tt_fabric_status.h diff --git a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp b/tt_metal/fabric/impl/kernels/tt_fabric_gatekeeper.cpp similarity index 99% rename from tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp rename to tt_metal/fabric/impl/kernels/tt_fabric_gatekeeper.cpp index c211c6f0133..02d7cb2682b 100644 --- a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp +++ b/tt_metal/fabric/impl/kernels/tt_fabric_gatekeeper.cpp @@ -4,8 +4,8 @@ // clang-format off #include "dataflow_api.h" -#include "tt_fabric/hw/inc/tt_fabric.h" -#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include 
"tt_metal/fabric/hw/inc/tt_fabric.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "debug/dprint.h" // clang-format on diff --git a/tt_fabric/impl/kernels/tt_fabric_router.cpp b/tt_metal/fabric/impl/kernels/tt_fabric_router.cpp similarity index 98% rename from tt_fabric/impl/kernels/tt_fabric_router.cpp rename to tt_metal/fabric/impl/kernels/tt_fabric_router.cpp index 9cd08cbe2d8..5949c4bbbaf 100644 --- a/tt_fabric/impl/kernels/tt_fabric_router.cpp +++ b/tt_metal/fabric/impl/kernels/tt_fabric_router.cpp @@ -4,8 +4,8 @@ // clang-format off #include "dataflow_api.h" -#include "tt_fabric/hw/inc/tt_fabric.h" -#include "tt_fabric/hw/inc/tt_fabric_status.h" +#include "tt_metal/fabric/hw/inc/tt_fabric.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_status.h" // clang-format on using namespace tt::tt_fabric; @@ -33,7 +33,7 @@ bool terminated_slave_routers = false; // careful, may be null tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast(kernel_status_buf_addr_arg); -tt_l1_ptr volatile chan_req_buf* fvc_consumer_req_buf = +volatile tt_l1_ptr chan_req_buf* fvc_consumer_req_buf = reinterpret_cast(FABRIC_ROUTER_REQ_QUEUE_START); volatile tt_l1_ptr fabric_router_l1_config_t* routing_table = reinterpret_cast(eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); diff --git a/tt_fabric/mesh_graph.cpp b/tt_metal/fabric/mesh_graph.cpp similarity index 100% rename from tt_fabric/mesh_graph.cpp rename to tt_metal/fabric/mesh_graph.cpp diff --git a/tt_fabric/mesh_graph_descriptors/n300_mesh_graph_descriptor.yaml b/tt_metal/fabric/mesh_graph_descriptors/n300_mesh_graph_descriptor.yaml similarity index 100% rename from tt_fabric/mesh_graph_descriptors/n300_mesh_graph_descriptor.yaml rename to tt_metal/fabric/mesh_graph_descriptors/n300_mesh_graph_descriptor.yaml diff --git a/tt_fabric/mesh_graph_descriptors/quanta_galaxy_mesh_graph_descriptor.yaml b/tt_metal/fabric/mesh_graph_descriptors/quanta_galaxy_mesh_graph_descriptor.yaml similarity index 100% rename from tt_fabric/mesh_graph_descriptors/quanta_galaxy_mesh_graph_descriptor.yaml rename to tt_metal/fabric/mesh_graph_descriptors/quanta_galaxy_mesh_graph_descriptor.yaml diff --git a/tt_fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml b/tt_metal/fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml similarity index 100% rename from tt_fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml rename to tt_metal/fabric/mesh_graph_descriptors/t3k_mesh_graph_descriptor.yaml diff --git a/tt_fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml b/tt_metal/fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml similarity index 100% rename from tt_fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml rename to tt_metal/fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml diff --git a/tt_fabric/routing_table_generator.cpp b/tt_metal/fabric/routing_table_generator.cpp similarity index 100% rename from tt_fabric/routing_table_generator.cpp rename to tt_metal/fabric/routing_table_generator.cpp diff --git a/tt_metal/impl/dispatch/topology.cpp b/tt_metal/impl/dispatch/topology.cpp index b8eff2dd822..59d4c775dac 100644 --- a/tt_metal/impl/dispatch/topology.cpp +++ b/tt_metal/impl/dispatch/topology.cpp @@ -14,6 +14,7 @@ #include "kernel_config/demux.hpp" #include "kernel_config/eth_router.hpp" #include "kernel_config/eth_tunneler.hpp" +#include "fabric_host_interface.h" #include "tt_cluster.hpp" From bfa0f042c312a8d29ec32ca992fd68810f922fb3 Mon Sep 17 00:00:00 2001 From: Allan Liu Date: Tue, 18 Feb 2025 
19:13:11 +0000 Subject: [PATCH 179/316] Integrate fabric init infra to metal runtime --- .../routing/test_tt_fabric_sanity.cpp | 70 +++++++++----- tt_metal/api/tt-metalium/device.hpp | 1 + tt_metal/api/tt-metalium/device_impl.hpp | 5 + tt_metal/api/tt-metalium/device_pool.hpp | 16 +++- .../api/tt-metalium/dispatch_core_manager.hpp | 5 +- .../api/tt-metalium/fabric_host_interface.h | 4 +- tt_metal/api/tt-metalium/mesh_device.hpp | 1 + tt_metal/api/tt-metalium/tt_metal.hpp | 12 ++- tt_metal/distributed/mesh_device.cpp | 4 + tt_metal/fabric/control_plane.cpp | 2 +- tt_metal/fabric/hw/inc/tt_fabric_interface.h | 1 + tt_metal/impl/device/device.cpp | 29 ++++++ tt_metal/impl/device/device_pool.cpp | 93 +++++++++++++++++++ tt_metal/impl/dispatch/topology.cpp | 66 ++++++++++++- tt_metal/impl/dispatch/topology.hpp | 6 ++ tt_metal/llrt/tt_cluster.cpp | 35 +++++-- tt_metal/llrt/tt_cluster.hpp | 14 ++- tt_metal/tt_metal.cpp | 11 ++- 18 files changed, 333 insertions(+), 42 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index abf891874ca..f495c0b5e7b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -42,6 +43,11 @@ bool bidirectional_traffic; // benchmark test mode bool benchmark_mode; +// Metal fabric initialization level +// 0: No fabric initialization +// 1: Initialize metal fabric with default settings +uint32_t metal_fabric_init_level; + uint32_t tx_signal_address; uint32_t host_signal_address; @@ -85,9 +91,11 @@ typedef struct test_board { std::vector physical_chip_ids; std::vector>> tx_rx_map; std::map device_handle_map; - std::unique_ptr control_plane; + tt::tt_fabric::ControlPlane* control_plane; + std::unique_ptr cp_owning_ptr; uint32_t num_chips_to_use; std::string mesh_graph_descriptor; + tt::tt_metal::DispatchCoreType dispatch_core_type = tt::tt_metal::DispatchCoreType::WORKER; test_board(std::string& board_type_) { if ("n300" == board_type_) { @@ -129,8 +137,16 @@ typedef struct test_board { throw std::runtime_error("Odd number of chips detected, not supported currently"); } - device_handle_map = tt::tt_metal::detail::CreateDevices(available_chip_ids); - _init_control_plane(mesh_graph_descriptor); + if (metal_fabric_init_level != 0) { + tt::tt_metal::detail::InitializeFabricSetting(tt::tt_metal::detail::FabricSetting::FABRIC); + } + device_handle_map = + tt::tt_metal::detail::CreateDevices(available_chip_ids, 1, 0, 0, DispatchCoreConfig{dispatch_core_type}); + if (metal_fabric_init_level == 0) { + _init_control_plane(mesh_graph_descriptor); + } else { + control_plane = tt::DevicePool::instance().get_control_plane(); + } if (num_chips_to_use != available_chip_ids.size()) { // initialize partial board to get the set of physical chip IDs for fabric kernels @@ -172,7 +188,8 @@ typedef struct test_board { const std::filesystem::path mesh_graph_desc_path = std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / "tt_metal/fabric/mesh_graph_descriptors" / mesh_graph_descriptor; - control_plane = std::make_unique(mesh_graph_desc_path.string()); + cp_owning_ptr = std::make_unique(mesh_graph_desc_path.string()); + control_plane = cp_owning_ptr.get(); } catch (const std::exception& e) { log_fatal(e.what()); } @@ -1301,7 +1318,10 @@ int main(int 
argc, char **argv) { log_info( LogTest, " --device_id: Device on which the test will be run, default = {}", default_test_device_id_l); log_info( - LogTest, " --device_id_r: Device on which the test will be run, default = {}", default_test_device_id_r); + LogTest, " --device_id_r: DDevice on which the test will be run, default = {}", default_test_device_id_r); + + log_info( + LogTest, " --metal_fabric_init_level: use Metal runtime to load fabric, 0 is disable, 1 is enable", 0); return 0; } @@ -1402,6 +1422,7 @@ int main(int argc, char **argv) { if (mcast && bidirectional_traffic) { throw std::runtime_error("Bidirectional traffic is not supported for mcast"); } + metal_fabric_init_level = test_args::get_command_option_uint32(input_args, "--metal_fabric_init_level", 0); bool pass = true; uint32_t num_available_devices, num_allocated_devices = 0; @@ -1544,17 +1565,19 @@ int main(int argc, char **argv) { uint32_t worker_unreserved_base_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); - // create router kernels - std::vector router_compile_args = { - (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words - tunneler_test_results_addr, // 1: test_results_addr - tunneler_test_results_size, // 2: test_results_size - 0, // timeout_mcycles * 1000 * 1000 * 4, // 3: timeout_cycles - }; - for (auto& [chip_id, test_device] : test_devices) { - test_device->create_router_kernels(router_compile_args, defines); + if (metal_fabric_init_level == 0) { + // manual init fabric + // create router kernels + std::vector router_compile_args = { + (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words + tunneler_test_results_addr, // 1: test_results_addr + tunneler_test_results_size, // 2: test_results_size + 0, // timeout_mcycles * 1000 * 1000 * 4, // 3: timeout_cycles + }; + for (auto& [chip_id, test_device] : test_devices) { + test_device->create_router_kernels(router_compile_args, defines); + } } - if (check_txrx_timeout) { defines["CHECK_TIMEOUT"] = ""; } @@ -1624,9 +1647,11 @@ int main(int argc, char **argv) { tt_metal::detail::LaunchProgram(test_device->device_handle, test_device->program_handle, false); } - // wait for all routers to handshake with master router - for (auto& [chip_id, test_device] : test_devices) { - test_device->wait_for_router_sync(); + if (metal_fabric_init_level == 0) { + // wait for all routers to handshake with master router + for (auto& [chip_id, test_device] : test_devices) { + test_device->wait_for_router_sync(); + } } // notify tx controller to signal the tx workers @@ -1638,16 +1663,17 @@ int main(int argc, char **argv) { for (auto& traffic : fabric_traffic) { traffic.wait_for_rx_workers_to_finish(); } - // terminate fabric routers - for (auto& [chip_id, test_device] : test_devices) { - test_device->terminate_router_kernels(); + // terminate fabric routers if control plane is not managed by DevicePool + if (metal_fabric_init_level == 0) { + for (auto& [chip_id, test_device] : test_devices) { + test_device->terminate_router_kernels(); + } } // wait for programs to exit for (auto& [chip_id, test_device] : test_devices) { tt_metal::detail::WaitProgramDone(test_device->device_handle, test_device->program_handle); } - auto end = std::chrono::system_clock::now(); std::chrono::duration elapsed_seconds = (end-start); diff --git a/tt_metal/api/tt-metalium/device.hpp b/tt_metal/api/tt-metalium/device.hpp index 36df50bb957..fdc1cbef87d 100644 --- a/tt_metal/api/tt-metalium/device.hpp +++ b/tt_metal/api/tt-metalium/device.hpp @@ -157,6 +157,7 @@ 
class IDevice { virtual void init_command_queue_host() = 0; virtual void init_command_queue_device() = 0; + virtual void init_fabric() = 0; // Puts device into reset virtual bool close() = 0; diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 71cb322c39a..21d017789c0 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -150,6 +150,8 @@ class Device : public IDevice { void init_command_queue_host() override; void init_command_queue_device() override; + void init_fabric() override; + // Puts device into reset bool close() override; @@ -239,6 +241,9 @@ class Device : public IDevice { std::vector> command_queue_programs_; bool using_fast_dispatch_ = false; + // Fabric program includes ethernet router kernel and tensix gatekeeper kernel + std::unique_ptr fabric_program_; + // Work Executor for this device - can asynchronously process host side work for // all tasks scheduled on this device WorkExecutor work_executor_; diff --git a/tt_metal/api/tt-metalium/device_pool.hpp b/tt_metal/api/tt-metalium/device_pool.hpp index fb2cf7159e5..8087c1c3062 100644 --- a/tt_metal/api/tt-metalium/device_pool.hpp +++ b/tt_metal/api/tt-metalium/device_pool.hpp @@ -36,10 +36,12 @@ class DevicePool { DevicePool(DevicePool&& other) noexcept = delete; static DevicePool& instance() noexcept { - TT_ASSERT(_inst != nullptr, "Trying to get DevicePool without initializing it"); + TT_ASSERT((_inst != nullptr) and (_inst->initialized), "Trying to get DevicePool without initializing it"); return *_inst; } + static void initialize_fabric_setting(detail::FabricSetting fabric_setting) noexcept; + static void initialize( const std::vector& device_ids, const uint8_t num_hw_cqs, @@ -57,6 +59,8 @@ class DevicePool { void unregister_worker_thread_for_device(IDevice* device); const std::unordered_set& get_worker_thread_ids() const; + tt::tt_fabric::ControlPlane* get_control_plane() const; + private: ~DevicePool(); DevicePool(); @@ -77,6 +81,11 @@ class DevicePool { bool skip_remote_devices; std::unordered_set firmware_built_keys; + detail::FabricSetting fabric_setting = detail::FabricSetting::DEFAULT; + std::unique_ptr control_plane; + + bool initialized = false; + // Determine which CPU cores the worker threads need to be placed on for each device std::unordered_map worker_thread_to_cpu_core_map; std::unordered_map completion_queue_reader_to_cpu_core_map; @@ -85,7 +94,12 @@ class DevicePool { void activate_device(chip_id_t id); void initialize_device(IDevice* dev) const; void add_devices_to_pool(const std::vector& device_ids); + void wait_for_fabric_master_router_sync() const; IDevice* get_device(chip_id_t id) const; + + // Fabric setup helper functions + void initialize_control_plane(); + static DevicePool* _inst; }; diff --git a/tt_metal/api/tt-metalium/dispatch_core_manager.hpp b/tt_metal/api/tt-metalium/dispatch_core_manager.hpp index 2edda1f01ae..61af796f906 100644 --- a/tt_metal/api/tt-metalium/dispatch_core_manager.hpp +++ b/tt_metal/api/tt-metalium/dispatch_core_manager.hpp @@ -39,7 +39,7 @@ struct dispatch_core_placement_t { std::optional dispatcher_s = std::nullopt; std::optional mux_d = std::nullopt; // Mux std::optional demux_d = std::nullopt; // Demux - std::optional tunneler_d = std::nullopt; // ethernet tunneler + std::optional tunneler_d = std::nullopt; // ethernet tunneler }; class dispatch_core_manager { @@ -189,7 +189,8 @@ class dispatch_core_manager { // {device ID : {channel (hugepage) : {cq_id : dispatch 
assignment}}} // Each device has an assigned hugepage at a specific channel that holds (up to 2) hardware command queues (represented by cq_id) - std::unordered_map>> dispatch_core_assignments; + std::unordered_map>> + dispatch_core_assignments; std::unordered_map> available_dispatch_cores_by_device; std::unordered_map dispatch_core_config_by_device; //TODO: dispatch_core_type_by_device should probably be for all devices, not per device uint8_t num_hw_cqs; diff --git a/tt_metal/api/tt-metalium/fabric_host_interface.h b/tt_metal/api/tt-metalium/fabric_host_interface.h index 4218365b143..fac0ef01765 100644 --- a/tt_metal/api/tt-metalium/fabric_host_interface.h +++ b/tt_metal/api/tt-metalium/fabric_host_interface.h @@ -17,11 +17,11 @@ // TODO: move routing table here namespace tt::tt_fabric { -constexpr uint32_t GATEKEEPER_INFO_SIZE_BYTES = 848; - using chan_id_t = std::uint8_t; using routing_plane_id_t = std::uint8_t; +static constexpr std::uint32_t DEFAULT_ROUTER_RX_QUEUE_SIZE_BYTES = 0x8000; // maximum queue (power of 2); + static constexpr std::uint32_t MAX_MESH_SIZE = 1024; static constexpr std::uint32_t MAX_NUM_MESHES = 1024; diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index 1ff63629b16..a2fe85910da 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -163,6 +163,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this CreateDevices( // TODO: delete this in favour of DevicePool const std::vector& device_ids, @@ -327,7 +332,12 @@ bool WriteRegToDevice(IDevice* device, const CoreCoord& logical_core, uint32_t a * fit L1 buffer | Yes | */ bool ReadFromDeviceL1( - IDevice* device, const CoreCoord& logical_core, uint32_t address, uint32_t size, std::vector& host_buffer); + IDevice* device, + const CoreCoord& logical_core, + uint32_t address, + uint32_t size, + std::vector& host_buffer, + CoreType core_type = CoreType::WORKER); bool ReadRegFromDevice(IDevice* device, const CoreCoord& logical_core, uint32_t address, uint32_t& regval); diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 63cf7a6621a..5a693b152ae 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -636,6 +636,10 @@ void MeshDevice::init_command_queue_device() { TT_THROW("init_command_queue_device() is not supported on MeshDevice - use individual devices instead"); reference_device()->init_command_queue_device(); } +void MeshDevice::init_fabric() { + TT_THROW("init_fabric_program() is not supported on MeshDevice - use individual devices instead"); + reference_device()->init_fabric(); +} void MeshDevice::synchronize() { // Nothing to synchronize, as all work is executed by MeshDevice is synchronous. 
} diff --git a/tt_metal/fabric/control_plane.cpp b/tt_metal/fabric/control_plane.cpp index c4ba715a7dd..b8787ba29cc 100644 --- a/tt_metal/fabric/control_plane.cpp +++ b/tt_metal/fabric/control_plane.cpp @@ -486,9 +486,9 @@ void ControlPlane::write_routing_tables_to_chip(mesh_id_t mesh_id, chip_id_t chi tt_metal::hal.get_dev_addr( tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt_metal::HalL1MemAddrType::FABRIC_ROUTER_CONFIG), false); - tt::Cluster::instance().l1_barrier(physical_chip_id); } } + tt::Cluster::instance().l1_barrier(physical_chip_id); } std::pair ControlPlane::get_mesh_chip_id_from_physical_chip_id(chip_id_t physical_chip_id) const { diff --git a/tt_metal/fabric/hw/inc/tt_fabric_interface.h b/tt_metal/fabric/hw/inc/tt_fabric_interface.h index 951231cd47c..11cf5ebbaea 100644 --- a/tt_metal/fabric/hw/inc/tt_fabric_interface.h +++ b/tt_metal/fabric/hw/inc/tt_fabric_interface.h @@ -349,6 +349,7 @@ constexpr uint32_t FABRIC_ROUTER_MISC_START = eth_l1_mem::address_map::ERISC_L1_ constexpr uint32_t FABRIC_ROUTER_MISC_SIZE = 256; constexpr uint32_t FABRIC_ROUTER_SYNC_SEM = FABRIC_ROUTER_MISC_START; constexpr uint32_t FABRIC_ROUTER_SYNC_SEM_SIZE = 16; +static_assert(FABRIC_ROUTER_SYNC_SEM == eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); // Fabric Virtual Control Channel start/size constexpr uint32_t FVCC_OUT_BUF_START = FABRIC_ROUTER_MISC_START + FABRIC_ROUTER_MISC_SIZE; diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index e87352c4b59..8df3eb90854 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -900,6 +900,35 @@ void Device::init_command_queue_device() { } } +void Device::init_fabric() { + fabric_program_ = create_and_compile_fabric_program(this); + configure_fabric_cores(this); + + program_dispatch::finalize_program_offsets(*fabric_program_, this); + + detail::WriteRuntimeArgsToDevice(this, *fabric_program_); + detail::ConfigureDeviceWithProgram(this, *fabric_program_); + + // Note: the l1_barrier below is needed to be sure writes to cores that + // don't get the GO mailbox (eg, storage cores) have all landed + tt::Cluster::instance().l1_barrier(this->id()); + std::vector> logical_cores_used_in_program = fabric_program_->logical_cores(); + for (uint32_t programmable_core_type_index = 0; programmable_core_type_index < logical_cores_used_in_program.size(); + programmable_core_type_index++) { + CoreType core_type = hal.get_core_type(programmable_core_type_index); + for (const auto& logical_core : logical_cores_used_in_program[programmable_core_type_index]) { + launch_msg_t* msg = + &fabric_program_->kernels_on_core(logical_core, programmable_core_type_index)->launch_msg; + go_msg_t* go_msg = &fabric_program_->kernels_on_core(logical_core, programmable_core_type_index)->go_msg; + msg->kernel_config.host_assigned_id = fabric_program_->get_runtime_id(); + + auto physical_core = this->virtual_core_from_logical_core(logical_core, core_type); + tt::llrt::write_launch_msg_to_core( + this->id(), physical_core, msg, go_msg, this->get_dev_addr(physical_core, HalL1MemAddrType::LAUNCH)); + } + } +} + bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap, bool minimal) { ZoneScoped; log_info(tt::LogMetal, "Initializing device {}. Program cache is {}enabled", this->id_, this->program_cache_.is_enabled() ? 
"": "NOT "); diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index a269e823dd3..a9c9840a9f6 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -16,6 +16,7 @@ #include "dispatch_settings.hpp" #include "dprint_server.hpp" #include "host_api.hpp" +#include "control_plane.hpp" #include #include "tt_metal/impl/debug/noc_logging.hpp" #include "tt_metal/impl/debug/watcher_server.hpp" @@ -193,6 +194,14 @@ void DevicePool::init_profiler_devices() const { #endif } +void DevicePool::initialize_fabric_setting(detail::FabricSetting fabric_setting) noexcept { + if (_inst == nullptr) { + static DevicePool device_pool{}; + _inst = &device_pool; + } + _inst->fabric_setting = fabric_setting; +} + void DevicePool::initialize( const std::vector& device_ids, const uint8_t num_hw_cqs, @@ -221,6 +230,7 @@ void DevicePool::initialize( // modifying the state of this instance, for example those responsible for // (un)registering worker threads, can only be called in the creation thread _inst->device_pool_creation_thread_id = std::this_thread::get_id(); + _inst->initialized = true; // Never skip for TG Cluster bool skip = not tt::Cluster::instance().is_galaxy_cluster(); @@ -248,7 +258,9 @@ void DevicePool::initialize( _inst->add_devices_to_pool(device_ids); _inst->init_firmware_on_active_devices(); + tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(true, target_mmio_ids); + _inst->wait_for_fabric_master_router_sync(); _inst->init_profiler_devices(); } @@ -277,6 +289,11 @@ void DevicePool::initialize_device(IDevice* dev) const { watcher_attach(dev); + // TODO: add handling of EDM + if (this->fabric_setting == detail::FabricSetting::FABRIC) { + dev->init_fabric(); + } + // Set up HW command queues on device for FD if (this->using_fast_dispatch) { dev->init_command_queue_device(); @@ -373,12 +390,52 @@ void DevicePool::add_devices_to_pool(const std::vector& device_ids) { this->activate_device(device_id); } } + // Only can launch Fabric if all devices are active + if (this->fabric_setting == detail::FabricSetting::FABRIC) { + for (int i = 0; i < tt::Cluster::instance().number_of_devices(); i++) { + if (not _inst->is_device_active(i)) { + // Fabric currently requires all devices to be active + log_warning(tt::LogMetal, "Fabric is disabled because device {} is not active", i); + this->fabric_setting = detail::FabricSetting::DISABLED; + break; + } + } + } + + // TODO: add handling of EDM + if (this->fabric_setting == detail::FabricSetting::FABRIC) { + // Initialize control plane, which writes routing tables to all ethernet cores + _inst->initialize_control_plane(); + } this->using_fast_dispatch = (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr); if (this->using_fast_dispatch) { populate_fd_kernels(devices_to_activate, this->num_hw_cqs); } } +void DevicePool::wait_for_fabric_master_router_sync() const { + if (this->fabric_setting == detail::FabricSetting::FABRIC) { + auto fabric_router_sync_sem_addr = + hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::UNRESERVED); + + std::vector master_router_status{0}; + for (const auto& dev : this->get_all_active_devices()) { + auto fabric_master_router_core = *dev->get_active_ethernet_cores().begin(); // TODO: get this from a + // manager + std::uint32_t num_routers = dev->get_active_ethernet_cores().size(); + while (master_router_status[0] != num_routers) { + tt_metal::detail::ReadFromDeviceL1( + dev, + fabric_master_router_core, + 
fabric_router_sync_sem_addr, + 4, + master_router_status, + CoreType::ETH); + } + } + } +} + void DevicePool::register_worker_thread_for_device(IDevice* device, std::thread::id worker_thread_id) { TT_FATAL( std::this_thread::get_id() == this->device_pool_creation_thread_id, @@ -451,6 +508,30 @@ void DevicePool::init_firmware_on_active_devices() const { } } +void DevicePool::initialize_control_plane() { + // Default mode, auto select mesh graph descriptor. In future, we can add a way for user to specify custom + // descriptors + std::string mesh_graph_descriptor; + if (tt::Cluster::instance().get_cluster_type() == tt::ClusterType::N300) { + mesh_graph_descriptor = "n300_mesh_graph_descriptor.yaml"; + } else if (tt::Cluster::instance().get_cluster_type() == tt::ClusterType::T3K) { + mesh_graph_descriptor = "t3k_mesh_graph_descriptor.yaml"; + } else if (tt::Cluster::instance().get_cluster_type() == tt::ClusterType::GALAXY) { + mesh_graph_descriptor = "quanta_mesh_graph_descriptor.yaml"; + } else if (tt::Cluster::instance().get_cluster_type() == tt::ClusterType::TG) { + mesh_graph_descriptor = "tg_mesh_graph_descriptor.yaml"; + } else { + TT_FATAL(false, "Unknown cluster type"); + } + const std::filesystem::path mesh_graph_desc_path = + std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / + "tt_metal/fabric/mesh_graph_descriptors" / mesh_graph_descriptor; + + this->control_plane = std::make_unique(mesh_graph_desc_path.string()); +} + +tt::tt_fabric::ControlPlane* DevicePool::get_control_plane() const { return this->control_plane.get(); } + DevicePool::DevicePool() { ZoneScoped; log_debug(tt::LogMetal, "DevicePool constructor"); @@ -554,6 +635,18 @@ void DevicePool::close_devices(const std::vector& devices) { Synchronize(dev); // Synchronize device } + // Terminate fabric routers + if (this->fabric_setting == detail::FabricSetting::FABRIC) { + std::vector master_router_terminate(1, 0); + auto fabric_router_sync_sem_addr = + hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::UNRESERVED); + for (const auto& dev : this->get_all_active_devices()) { + auto fabric_master_router_core = *dev->get_active_ethernet_cores().begin(); // TODO: get this from a + // manager + tt_metal::detail::WriteToDeviceL1( + dev, fabric_master_router_core, fabric_router_sync_sem_addr, master_router_terminate, CoreType::ETH); + } + } tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(false); for (const auto& dev_id : devices_to_close) { auto dev = tt::DevicePool::instance().get_active_device(dev_id); diff --git a/tt_metal/impl/dispatch/topology.cpp b/tt_metal/impl/dispatch/topology.cpp index 59d4c775dac..a337f1421fe 100644 --- a/tt_metal/impl/dispatch/topology.cpp +++ b/tt_metal/impl/dispatch/topology.cpp @@ -6,6 +6,7 @@ #include "kernel_config/fd_kernel.hpp" #include #include +#include #include "kernel_config/fd_kernel.hpp" #include "kernel_config/prefetch.hpp" #include "kernel_config/dispatch.hpp" @@ -21,7 +22,7 @@ namespace tt::tt_metal { // For readablity, unset = x = -1 -#define x -1 +constexpr int x = -1; void increment_node_ids(DispatchKernelNode& node, uint32_t inc) { node.id += inc; @@ -751,4 +752,67 @@ void configure_dispatch_cores(IDevice* device) { } } +std::unique_ptr create_and_compile_fabric_program(IDevice* device) { + auto fabric_program_ptr = std::make_unique(); + std::uint32_t num_routers = device->get_active_ethernet_cores().size(); // TODO: should get this from control plane + + std::map router_defines = {}; + + // TODO: Manual clear of 
semaphore, move this to proper Metal sempahore apis + std::vector fabric_sem_zero_buf(1, 0); + + std::uint32_t router_mask = 0; + for (const auto& router_logical_core : device->get_active_ethernet_cores()) { + router_mask += 0x1 << router_logical_core.y; + } + + auto master_router_chan = (*device->get_active_ethernet_cores().begin()).y; + // setup runtime args + std::vector router_runtime_args = { + num_routers, // 0: number of active fabric routers + router_mask, // 1: active fabric router mask + master_router_chan, // 2: master router channel + }; + + // create router kernels + std::vector router_compile_args = { + (tt::tt_fabric::DEFAULT_ROUTER_RX_QUEUE_SIZE_BYTES >> 4), // 0: rx_queue_size_words + 0, // 1: test_results_addr + 0, // 2: test_results_size + 0, // 3: timeout_mcycles * 1000 * 1000 * 4, // 3: timeout_cycles + 0, // 4: is_master_router + }; + + for (const auto& router_logical_core : device->get_active_ethernet_cores()) { + if (master_router_chan == router_logical_core.y) { + router_compile_args[4] = 1; + } else { + router_compile_args[4] = 0; + } + auto kernel = tt_metal::CreateKernel( + *fabric_program_ptr, + "tt_metal/fabric/impl/kernels/tt_fabric_router.cpp", + router_logical_core, + tt_metal::EthernetConfig{ + .noc = tt_metal::NOC::NOC_0, .compile_args = router_compile_args, .defines = router_defines}); + + tt_metal::SetRuntimeArgs(*fabric_program_ptr, kernel, router_logical_core, router_runtime_args); + } + + detail::CompileProgram(device, *fabric_program_ptr, /*fd_bootloader_mode=*/true); + return fabric_program_ptr; +} + +void configure_fabric_cores(IDevice* device) { + std::vector router_zero_buf(1, 0); + + for (const auto& router_logical_core : device->get_active_ethernet_cores()) { + // initialize the semaphore + auto fabric_router_sync_sem_addr = + hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::UNRESERVED); + detail::WriteToDeviceL1( + device, router_logical_core, fabric_router_sync_sem_addr, router_zero_buf, CoreType::ETH); + } +} + } // namespace tt::tt_metal diff --git a/tt_metal/impl/dispatch/topology.hpp b/tt_metal/impl/dispatch/topology.hpp index 956c0b6644b..0da7b40472c 100644 --- a/tt_metal/impl/dispatch/topology.hpp +++ b/tt_metal/impl/dispatch/topology.hpp @@ -35,4 +35,10 @@ std::unique_ptr create_and_compile_cq_program(tt::tt_meta // Perform additional configuration (writing to specific L1 addresses, etc.) for FD kernels on this device. void configure_dispatch_cores(tt::tt_metal::IDevice* device); +// Compile fabric kernels needed to support scaleout systems. +std::unique_ptr create_and_compile_fabric_program(tt::tt_metal::IDevice* device); + +// Perform additional configuration (writing to specific L1 addresses, etc.) for fabric kernels on this device. 
+void configure_fabric_cores(tt::tt_metal::IDevice* device); + } // namespace tt::tt_metal diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 807dca854fb..785b3d1dcb2 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -130,9 +130,10 @@ void Cluster::detect_arch_and_target() { this->target_type_); } -bool Cluster::is_galaxy_cluster() const { - return this->is_tg_cluster_; -} +// TODO: remove this when we deprecate TG +bool Cluster::is_galaxy_cluster() const { return this->cluster_type_ == ClusterType::TG; } + +ClusterType Cluster::get_cluster_type() const { return this->cluster_type_; } BoardType Cluster::get_board_type(chip_id_t chip_id) const { return this->cluster_desc_->get_board_type(chip_id); @@ -145,12 +146,32 @@ void Cluster::generate_cluster_descriptor() { this->cluster_desc_ = tt_ClusterDescriptor::create_mock_cluster(tt_SimulationDevice::detect_available_device_ids(), this->arch_); } else { this->cluster_desc_ = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + + // Detect cluster type for (const auto &chip_id : this->cluster_desc_->get_all_chips()) { if (this->cluster_desc_->get_board_type(chip_id) == BoardType::GALAXY) { - this->is_tg_cluster_ = true; + this->cluster_type_ = ClusterType::TG; break; } } + bool all_n300 = true; + for (const auto& chip_id : this->cluster_desc_->get_all_chips()) { + if (this->cluster_desc_->get_board_type(chip_id) == BoardType::N300) { + all_n300 &= (this->cluster_desc_->get_board_type(chip_id) == BoardType::N300); + } + } + if (all_n300) { + if (this->cluster_desc_->get_all_chips().size() == 1) { + this->cluster_type_ = ClusterType::N300; + } else if (this->cluster_desc_->get_all_chips().size() == 8) { + this->cluster_type_ = ClusterType::T3K; + } + } + + if ((this->cluster_desc_->get_all_chips().size() == this->cluster_desc_->get_chips_with_mmio().size()) and + (this->cluster_desc_->get_all_chips().size() == 32)) { + this->cluster_type_ = ClusterType::GALAXY; + } } // Use cluster descriptor to map MMIO device id to all devices on the same card (including the MMIO device) @@ -168,7 +189,7 @@ void Cluster::generate_cluster_descriptor() { } uint32_t total_num_hugepages = tt::umd::get_num_hugepages(); - if (this->is_tg_cluster_) { + if (this->cluster_type_ == ClusterType::TG) { // TODO: don't think this check is correct, we want to have total num hugepages == num chips even for Galaxy TT_FATAL( this->arch_ == tt::ARCH::BLACKHOLE or total_num_hugepages >= this->cluster_desc_->get_all_chips().size()/4, @@ -177,8 +198,8 @@ void Cluster::generate_cluster_descriptor() { this->cluster_desc_->get_all_chips().size()/4, this->cluster_desc_->get_all_chips().size(), total_num_hugepages); - } else if (this->target_type_ != TargetDevice::Simulator){ - // TODO (abhullar): ignore hugepage set up for BH bringup + } else if (this->target_type_ != TargetDevice::Simulator) { + // TODO (abhullar): ignore hugepage set up for BH bringup TT_FATAL( this->arch_ == tt::ARCH::BLACKHOLE or total_num_hugepages >= this->cluster_desc_->get_all_chips().size(), "Machine setup error: Insufficient number of hugepages available, expected one per device ({}) but have {}. 
" diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 666e9fa4eed..927d39d5dfc 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -38,6 +38,14 @@ enum class TargetDevice : std::uint8_t { Invalid = 0xFF, }; +enum class ClusterType : std::uint8_t { + INVALID = 0, + N300 = 1, // Production N300 + T3K = 2, // Production T3K, built with 4 N300s + GALAXY = 3, // Production Galaxy, all chips with mmio + TG = 4, // Will be deprecated +}; + class Cluster { public: Cluster& operator=(const Cluster&) = delete; @@ -50,7 +58,7 @@ class Cluster { // For TG Galaxy systems, mmio chips are gateway chips that are only used for dispatc, so user_devices are meant for // user facing host apis size_t number_of_user_devices() const { - if (this->is_tg_cluster_) { + if (this->cluster_type_ == ClusterType::TG) { const auto& chips = this->cluster_desc_->get_all_chips(); return std::count_if(chips.begin(), chips.end(), [&](const auto& id) { return this->cluster_desc_->get_board_type(id) == BoardType::GALAXY; @@ -245,6 +253,8 @@ class Cluster { // Returns Wormhole chip board type. BoardType get_board_type(chip_id_t chip_id) const; + ClusterType get_cluster_type() const; + bool is_worker_core(const CoreCoord& core, chip_id_t chip_id) const; bool is_ethernet_core(const CoreCoord& core, chip_id_t chip_id) const; CoreCoord get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const; @@ -306,7 +316,7 @@ class Cluster { std::unordered_map> virtual_routing_to_profiler_flat_id_; // Flag to tell whether we are on a TG type of system. // If any device has to board type of GALAXY, we are on a TG cluster. - bool is_tg_cluster_; + ClusterType cluster_type_ = ClusterType::INVALID; // Tunnels setup in cluster std::map>> tunnels_from_mmio_device = {}; diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 59e6543a82e..45e09fe93f6 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -336,10 +336,11 @@ bool ReadFromDeviceL1( const CoreCoord& logical_core, uint32_t address, uint32_t size, - std::vector& host_buffer) { + std::vector& host_buffer, + CoreType core_type) { tt::Cluster::instance().l1_barrier(device->id()); - auto worker_core = device->worker_core_from_logical_core(logical_core); - host_buffer = llrt::read_hex_vec_from_core(device->id(), worker_core, address, size); + auto virtual_core = device->virtual_core_from_logical_core(logical_core, core_type); + host_buffer = llrt::read_hex_vec_from_core(device->id(), virtual_core, address, size); return true; } @@ -350,6 +351,10 @@ bool ReadRegFromDevice(IDevice* device, const CoreCoord& logical_core, uint32_t return true; } +void InitializeFabricSetting(detail::FabricSetting fabric_setting) { + tt::DevicePool::initialize_fabric_setting(detail::FabricSetting::FABRIC); +} + std::map CreateDevices( const std::vector& device_ids, const uint8_t num_hw_cqs, From e68353dfc72bd5bdf3827b93d5e301085f316857 Mon Sep 17 00:00:00 2001 From: Allan Liu Date: Wed, 19 Feb 2025 22:22:32 +0000 Subject: [PATCH 180/316] Add auto fabric init tests to CI --- tests/scripts/t3000/run_t3000_unit_tests.sh | 2 ++ tests/scripts/tg/run_tg_unit_tests.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index 3eff90e9879..0f849e9ec7f 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -47,11 +47,13 @@ run_t3000_ttfabric_tests() { TT_METAL_SLOW_DISPATCH_MODE=1 
./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 64 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 65 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --metal_fabric_init_level 1 # Line Mcast tests TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --e_depth 3 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --w_depth 3 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --n_depth 1 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --s_depth 1 + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --e_depth 3 --metal_fabric_init_level 1 # Record the end time end_time=$(date +%s) diff --git a/tests/scripts/tg/run_tg_unit_tests.sh b/tests/scripts/tg/run_tg_unit_tests.sh index f5b3752f840..433ba6fb784 100755 --- a/tests/scripts/tg/run_tg_unit_tests.sh +++ b/tests/scripts/tg/run_tg_unit_tests.sh @@ -118,11 +118,13 @@ run_tg_tests() { TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 64 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 65 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --metal_fabric_init_level 1 # Line Mcast tests TT_METAL_SLOW_DISPATCH_MODE=1 
./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --e_depth 7 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --w_depth 7 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --n_depth 3 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --s_depth 3 + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --n_depth 3 --metal_fabric_init_level 1 elif [[ "$1" == "llama3-70b" ]]; then run_tg_llama3.1-70b_tests From 98a147a954dbb351a840ff8d0d060358a78f785d Mon Sep 17 00:00:00 2001 From: Borys Bradel <164946524+bbradelTT@users.noreply.github.com> Date: Wed, 19 Feb 2025 22:01:47 -0500 Subject: [PATCH 181/316] #17277: switch run_without_autoformat to run in reduce_op.cpp (#18032) ### Ticket Link to Github Issue #17277 ### Problem description - operation::run_without_autoformat is being removed ### What's changed - switch to using operation::run ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13419982451 - [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/13419994272 - [x] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/13419999018/job/37490243801 failures exist in main https://github.com/tenstorrent/tt-metal/actions/runs/13418125200/job/37484190615 - [x] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/13419997336 - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [x] New/Existing tests provide coverage for changes --- .../ttnn/operations/reduction/generic/device/reduce_op.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp index b3397f35fb6..b793645b5da 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp @@ -222,7 +222,7 @@ Tensor reduce( ttnn::operations::experimental::auto_format::AutoFormat::format_input_tensor( input_tensor, device, input_tensor_pad_shape, pad_value, Layout::TILE); } - const Tensor output_tensor = 
operation::run_without_autoformat( + const Tensor output_tensor = operation::run( Reduce{ reduce_math, ReduceOpDim::W, @@ -232,7 +232,7 @@ Tensor reduce( config}, {formatted_input_tensor}) .at(0); - return operation::run_without_autoformat( + return operation::run( Reduce{ reduce_math, ReduceOpDim::H, From d29a5be1dbaf90a238e97ff862bce224a7abbac4 Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Wed, 19 Feb 2025 22:20:37 -0500 Subject: [PATCH 182/316] lower edm fabric switch interval (#18052) to account for less frequent idle counter increments because of recent addition of inner loop that doesn't ctx-switch. This is needed now because some systems are seeing excessively long teardown times due to teardown signals being blocked behind context switches to eth fw routing. The drop in context switch interval is roughly proportional to the inner loop count in the main EDM fabric control loop. --- ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp index b271f19ac52..58f369b1cd0 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp @@ -181,7 +181,7 @@ size_t log_worker_to_fabric_edm_sender_rt_args(std::vector const& args class FabricEriscDatamoverBuilder { public: - static constexpr size_t default_firmware_context_switch_interval = 200000; + static constexpr size_t default_firmware_context_switch_interval = 10000; // payload only, no header static constexpr size_t default_packet_payload_size_bytes = tt::tile_size(tt::DataFormat::Bfp8_b) * 4; From 8e4a6e05421aade8f7b0fd3861470088693efd41 Mon Sep 17 00:00:00 2001 From: Saad Jameel <163029024+sjameelTT@users.noreply.github.com> Date: Wed, 19 Feb 2025 22:47:17 -0500 Subject: [PATCH 183/316] Add row major eltwise binary_ng support (#17969) ### Ticket #17966 #17356 ### Problem description Eltwise currently has 0 row major support at all. Also need a test confirming that fused dtype works. ### What's changed As a first step I'm supporting it via untilize/tilize support to unblock any models going forward. Next step will be adding native kernel support. 
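A minimal usage sketch of the interim row-major path (illustrative only, not part of this diff; the device setup, shapes, and the choice of `add` are assumptions):

```python
import torch
import ttnn

# Assumes a single attached device; open_device/close_device are the usual ttnn helpers.
device = ttnn.open_device(device_id=0)

torch_a = torch.rand((1, 1, 64, 64), dtype=torch.bfloat16)
torch_b = torch.rand((1, 1, 64, 64), dtype=torch.bfloat16)

# Row-major inputs: binary_ng now tilizes them internally, runs the tiled kernel,
# and untilizes the result, so callers get a row-major output back.
a = ttnn.from_torch(torch_a, dtype=ttnn.bfloat16, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)
b = ttnn.from_torch(torch_b, dtype=ttnn.bfloat16, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)

out = ttnn.experimental.add(a, b)
result = ttnn.to_torch(out)

ttnn.close_device(device)
```

Note that the conversion back to row-major only happens when both inputs are row-major, matching the `to_layout` round trip in `binary_ng.cpp`.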
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13402699040 - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/13399500242 - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../eltwise/test_binary_ng_typecast.py | 284 +++++++++++++++++- .../eltwise/binary_ng/binary_ng.cpp | 66 +++- 2 files changed, 336 insertions(+), 14 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py index df8b8db740a..3c804597a06 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py @@ -9,6 +9,7 @@ from models.utility_functions import skip_for_grayskull, torch_random from functools import partial from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt +from tests.ttnn.utils_for_testing import assert_with_pcc binary_fns = { @@ -51,8 +52,12 @@ "dtype", ([ttnn.bfloat16]), ) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT]), +) # No typecast on inputs and optional output -def test_opt_output_no_typecast(input_shapes, dtype, ttnn_fn, device): +def test_opt_output_no_typecast(input_shapes, dtype, layout, ttnn_fn, device): torch.manual_seed(0) a_shape, b_shape, out_shape = input_shapes ttnn_op = getattr(ttnn.experimental, ttnn_fn) @@ -66,14 +71,12 @@ def test_opt_output_no_typecast(input_shapes, dtype, ttnn_fn, device): out = gen_func_with_cast_tt(partial(torch_random, low=0, high=1, dtype=torch.bfloat16), dtype)(out_shape) input_tensor_a = ttnn.from_torch( - torch_input_tensor_a, dtype=dtype, device=device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG + torch_input_tensor_a, dtype=dtype, device=device, layout=layout, memory_config=ttnn.DRAM_MEMORY_CONFIG ) input_tensor_b = ttnn.from_torch( - torch_input_tensor_b, dtype=dtype, device=device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG - ) - out_tt = ttnn.from_torch( - out, dtype=dtype, device=device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG + torch_input_tensor_b, dtype=dtype, device=device, layout=layout, memory_config=ttnn.DRAM_MEMORY_CONFIG ) + out_tt = ttnn.from_torch(out, dtype=dtype, device=device, layout=layout, memory_config=ttnn.DRAM_MEMORY_CONFIG) cq_id = 0 ttnn_op(input_tensor_a, input_tensor_b, queue_id=cq_id, output_tensor=out_tt) output_tensor = ttnn.to_torch(out_tt) @@ -660,3 +663,272 @@ def test_opt_output_scalar(input_shapes, ttnn_fn, scalar, device): status = ttnn.pearson_correlation_coefficient(torch_output_tensor, output_tensor) assert status >= 0.999 + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize("input_shape", [(1, 1, 1, 1), (3, 3, 15, 15), (3, 3, 17, 17), (3, 3, 33, 
33)]) +@pytest.mark.parametrize( + "memory_config", + ([ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]), +) +@pytest.mark.parametrize("scalar", [-0.25, -16.5, 0.0, 0.05, 1.7, 19.0]) +@pytest.mark.parametrize( + "ttnn_fn", + [ + "add", + "sub", + "mul", + "div", + "rsub", + "squared_difference", + ], +) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), +) +def test_edgecase_dims_eltwise_scalar_matrix_math(input_shape, scalar, ttnn_fn, memory_config, layout, device): + torch.manual_seed(0) + a_shape = input_shape + + ttnn_op = getattr(ttnn.experimental, ttnn_fn) + torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + output = ttnn_op(input_tensor_a, scalar) + tt_output_tensor = ttnn.to_torch(output) + + golden_fn = ttnn.get_golden_function(ttnn_op) + torch_output_tensor = golden_fn(torch_input_tensor_a, scalar) + + assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize("input_shape", [(1, 1, 1, 1), (3, 3, 15, 15), (3, 3, 17, 17), (3, 3, 33, 33)]) +@pytest.mark.parametrize( + "memory_config", + ([ttnn.DRAM_MEMORY_CONFIG]), +) +@pytest.mark.parametrize("scalar", [-0.25, -16.5, 0.0, 0.05, 1.7, 19.0]) +@pytest.mark.parametrize( + "ttnn_fn", + [ + "gt", + "lt", + "lte", + "gte", + "eq", + "ne", + ], +) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), +) +def test_edgecase_dims_eltwise_scalar_logical(input_shape, scalar, ttnn_fn, memory_config, layout, device): + torch.manual_seed(0) + a_shape = input_shape + + ttnn_op = getattr(ttnn.experimental, ttnn_fn) + torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) + # guarantee at least one equal value + if (ttnn_fn == "eq" or ttnn_fn == "ne" or ttnn_fn == "gte" or ttnn_fn == "lte") and input_shape != (1, 1, 1, 1): + torch_input_tensor_a[0, 0, 0, 0] = scalar + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + output = ttnn_op(input_tensor_a, scalar, dtype=ttnn.uint32) + tt_output_tensor = ttnn.to_torch(output) + + golden_fn = ttnn.get_golden_function(ttnn_op) + torch_output_tensor = golden_fn(torch_input_tensor_a, scalar) + + assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) + + +@pytest.mark.parametrize( + "input_shapes", + [ + ((1, 7, 1, 1), (7, 7, 33, 33)), + ((7, 1, 1, 1), (7, 7, 49, 49)), + ((7, 7, 65, 65), (7, 7, 65, 65)), + ((2, 2, 10, 1), (2, 2, 10, 2)), + ], +) +@pytest.mark.parametrize( + "memory_config", + ([ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]), +) +@pytest.mark.parametrize( + "ttnn_fn", + [ + "add", + "sub", + "mul", + "div", + "rsub", + "squared_difference", + ], +) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), +) +def test_edgecase_dims_eltwise_broadcast_matrix_math(input_shapes, ttnn_fn, memory_config, layout, device): + torch.manual_seed(0) + a_shape, b_shape = input_shapes + + ttnn_op = getattr(ttnn.experimental, ttnn_fn) + torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) + torch_input_tensor_b = torch.randn(b_shape, dtype=torch.bfloat16) + + if ttnn_fn == "div": + torch_input_tensor_b[torch_input_tensor_b.abs() < 0.001] = 0.001 + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, + dtype=ttnn.bfloat16, 
+ device=device, + layout=layout, + memory_config=memory_config, + ) + + input_tensor_b = ttnn.from_torch( + torch_input_tensor_b, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + output = ttnn_op(input_tensor_a, input_tensor_b, dtype=ttnn.float32) + tt_output_tensor = ttnn.to_torch(output) + + golden_fn = ttnn.get_golden_function(ttnn_op) + torch_output_tensor = golden_fn(torch_input_tensor_a, torch_input_tensor_b) + + assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize( + "input_shapes", + [ + ((1, 7, 1, 1), (7, 7, 33, 33)), + ((7, 1, 1, 1), (7, 7, 49, 49)), + ((7, 7, 65, 65), (7, 7, 65, 65)), + ], +) +@pytest.mark.parametrize( + "memory_config", + ([ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]), +) +@pytest.mark.parametrize( + "ttnn_fn", + [ + "gt", + "lt", + "lte", + "gte", + "eq", + "ne", + ], +) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), +) +def test_edgecase_dims_eltwise_broadcast_logical(input_shapes, ttnn_fn, memory_config, layout, device): + torch.manual_seed(0) + a_shape, b_shape = input_shapes + + ttnn_op = getattr(ttnn.experimental, ttnn_fn) + torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) + torch_input_tensor_b = torch.randn(b_shape, dtype=torch.bfloat16) + # guarantee at least one equal value + if ttnn_fn == "eq" or ttnn_fn == "ne" or ttnn_fn == "gte" or ttnn_fn == "lte": + torch_input_tensor_a[0, 0, 0, 0] = torch_input_tensor_b[0, 0, 0, 0] + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + input_tensor_b = ttnn.from_torch( + torch_input_tensor_b, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + output = ttnn_op(input_tensor_a, input_tensor_b, dtype=ttnn.float32) + tt_output_tensor = ttnn.to_torch(output) + + golden_fn = ttnn.get_golden_function(ttnn_op) + torch_output_tensor = golden_fn(torch_input_tensor_a, torch_input_tensor_b) + + assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) + + +@pytest.mark.parametrize( + "input_shape, input_layout, input_shard_grid, input_shard_orientation, input_sharding_scheme", + [ + ( + [1, 1, 64, 64], + ttnn.TILE_LAYOUT, + ttnn.CoreGrid(y=1, x=2), + ttnn.ShardOrientation.ROW_MAJOR, + ttnn.ShardStrategy.WIDTH, + ), + ], +) +@pytest.mark.parametrize("input_dtype", [ttnn.bfloat16, ttnn.float32]) +@pytest.mark.parametrize("output_dtype", [ttnn.float32, ttnn.bfloat16]) +def test_binary_div( + device, + input_shape, + input_layout, + input_shard_grid, + input_shard_orientation, + input_sharding_scheme, + input_dtype, + output_dtype, +): + memory_config = ttnn.create_sharded_memory_config( + input_shape, + core_grid=input_shard_grid, + strategy=input_sharding_scheme, + orientation=input_shard_orientation, + use_height_and_width_as_shard_shape=False, + ) + + torch_input_a = torch.rand(input_shape, dtype=torch.bfloat16) + 1 + torch_input_b = torch.rand(input_shape, dtype=torch.bfloat16) + 1 + torch_output = torch_input_a / torch_input_b + + input_tensor_a = ttnn.from_torch( + torch_input_a, layout=input_layout, memory_config=memory_config, dtype=input_dtype, device=device + ) + input_tensor_b = ttnn.from_torch( + torch_input_b, layout=input_layout, memory_config=memory_config, dtype=input_dtype, device=device + ) + output_tensor = ttnn.experimental.div(input_tensor_a, input_tensor_b, 
dtype=output_dtype) + assert_with_pcc(torch_output, ttnn.to_torch(output_tensor), 0.999) diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp index 99c1a77dab0..71a3f32b980 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp @@ -43,11 +43,28 @@ Tensor BinaryNg::invoke( bool typecast_b = needs_typecast_to_bfloat16(b_dtype); bool typecast_out = needs_typecast_to_bfloat16(out_dtype); + // RM is never BFLOAT8 or BFLOAT4 so we can assume it goes in here. if (!typecast_a && !typecast_b) { - return ttnn::prim::binary_ng( + bool input_a_rm = input_tensor_a.get_layout() == Layout::ROW_MAJOR; + bool input_b_rm = input_tensor_b.get_layout() == Layout::ROW_MAJOR; + Tensor input_a = + input_a_rm ? ttnn::to_layout(input_tensor_a, Layout::TILE, std::nullopt, std::nullopt, (IDevice*)nullptr) + : input_tensor_a; + Tensor input_b = + input_b_rm ? ttnn::to_layout(input_tensor_b, Layout::TILE, std::nullopt, std::nullopt, (IDevice*)nullptr) + : input_tensor_b; + + if (input_a_rm && input_b_rm) { + // we don't support to_layout with optional output tensor + TT_FATAL( + !output_preallocated, + "Optional output tensor with Row Major input is not supported right now for Elementwise operations"); + } + + Tensor result = ttnn::prim::binary_ng( queue_id, - input_tensor_a, - input_tensor_b, + input_a, + input_b, binary_op_type, out_dtype, output_preallocated ? optional_output_tensor->memory_config() @@ -56,6 +73,20 @@ Tensor BinaryNg::invoke( lhs_activations, rhs_activations, post_activations); + + // if both inputs are in row major, convert the output to row major + // since there's no consensus here, avoiding the conversion if we have an excuse to is likely the best option + // since it leads to better perf + if (input_a_rm && input_b_rm) { + result = ttnn::to_layout( + result, + Layout::ROW_MAJOR, + std::nullopt, + memory_config.value_or(input_tensor_a.memory_config()), + (IDevice*)nullptr); + } + + return result; } else { Tensor input_a = typecast_to(DataType::BFLOAT16, input_tensor_a); Tensor input_b = typecast_to(DataType::BFLOAT16, input_tensor_b); @@ -116,6 +147,8 @@ Tensor BinaryNg::invoke( const bool output_preallocated = optional_output_tensor.has_value(); const ttnn::DataType out_dtype = output_preallocated ? optional_output_tensor->get_dtype() : output_dtype.value_or(a_dtype); + const auto mem_config = output_preallocated ? optional_output_tensor->memory_config() + : memory_config.value_or(input_tensor_a.memory_config()); if (output_dtype.has_value() && output_preallocated) { TT_FATAL( @@ -127,18 +160,35 @@ Tensor BinaryNg::invoke( bool typecast_out = needs_typecast_to_bfloat16(out_dtype); if (!typecast_a) { - return ttnn::prim::binary_ng( + bool input_a_rm = input_tensor_a.get_layout() == Layout::ROW_MAJOR; + if (input_a_rm) { + // we don't support to_layout with optional output tensor + TT_FATAL( + !output_preallocated, + "Optional output tensor with Row Major input is not supported right now for Elementwise operations"); + } + Tensor input_a = + input_a_rm + ? ttnn::to_layout( + input_tensor_a, Layout::TILE, std::nullopt, input_tensor_a.memory_config(), (IDevice*)nullptr) + : input_tensor_a; + Tensor result = ttnn::prim::binary_ng( queue_id, - input_tensor_a, + input_a, scalar, binary_op_type, out_dtype, - output_preallocated ? 
optional_output_tensor->memory_config() - : memory_config.value_or(input_tensor_a.memory_config()), + mem_config, optional_output_tensor, lhs_activations, rhs_activations, post_activations); + + // if input is in row major, convert the output to row major + if (input_a_rm) { + result = ttnn::to_layout(result, Layout::ROW_MAJOR, std::nullopt, mem_config, (IDevice*)nullptr); + } + return result; } else { Tensor input_a = typecast_to(DataType::BFLOAT16, input_tensor_a); const auto output_tensor = output_preallocated and typecast_out @@ -151,7 +201,7 @@ Tensor BinaryNg::invoke( scalar, binary_op_type, input_a.get_dtype(), - input_a.memory_config(), + mem_config, output_tensor, lhs_activations, rhs_activations, From 705b94d287f432c043b792443e17c3ea1dd01104 Mon Sep 17 00:00:00 2001 From: Juan Camilo Vega Date: Wed, 19 Feb 2025 22:53:54 -0500 Subject: [PATCH 184/316] #17972 and #17975 Fixing PCC and Program Cache issues in Repeat and Expand (#18002) ### Ticket #17975 #17972 ### Problem description This PR closes two P0 errors by applying bug fixes to the repeat program factory and giving repeat program cache support. ### What's changed Program factory changes to repeat Adding Program Cache testing to the CI pipelines for repeat Removed redundant CI tests in Repeat to help improve CI pipeline times ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes. [Submitted](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) [rerun after PR changes](https://github.com/tenstorrent/tt-metal/actions/runs/13422573676) - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable). [Submitted](https://github.com/tenstorrent/tt-metal/actions/runs/13416833113) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes - [ ] T3K Demo. [Submitted](https://github.com/tenstorrent/tt-metal/actions/runs/13416778070) --- .../ttnn/unit_tests/operations/test_repeat.py | 128 +++++++++++++++++- .../device/host/repeat_program_factory.cpp | 39 +++++- 2 files changed, 157 insertions(+), 10 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_repeat.py b/tests/ttnn/unit_tests/operations/test_repeat.py index 73af42df968..c10efdff258 100644 --- a/tests/ttnn/unit_tests/operations/test_repeat.py +++ b/tests/ttnn/unit_tests/operations/test_repeat.py @@ -15,17 +15,12 @@ layouts = [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT] dtypes = [(torch.float32, ttnn.float32), (torch.bfloat16, ttnn.bfloat16), (torch.bfloat16, ttnn.bfloat8_b)] -shapes = [(1,), (2,), (2, 1), (2, 3), (2, 1, 3), (4, 16, 3, 2), (4, 3, 1, 2, 2)] +shapes = [(1,), (2,), (2, 3), (4, 16, 3, 1), (4, 3, 1, 2, 2)] repeat_shapes = [ (1,), - (2,), (1, 2), - (1, 4), - (2, 1, 3), - (1, 2, 3), (4, 3, 2, 1), (2, 3, 4, 5, 2), - (2, 1, 3, 1, 3, 1), (2048,), ] @@ -75,4 +70,123 @@ def test_repeat(device, layout, dtype, shape, repeat_shape): assert_with_pcc(torch_result, output, 0.9999) -# TODO! 
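The added coverage boils down to running the same repeat config more than once and checking that the program cache entry count stays flat; a condensed sketch of that pattern (illustrative only, not part of this diff; device setup and shapes are assumptions, and the program cache is assumed to be enabled as the `use_program_cache` fixture does):

```python
import torch
import ttnn

device = ttnn.open_device(device_id=0)  # program cache assumed enabled, as with the use_program_cache fixture

x = torch.rand((1, 1, 32, 32), dtype=torch.bfloat16)
x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)

# First run compiles the repeat program and populates the cache.
_ = ttnn.repeat(x_tt, ttnn.Shape([4, 1, 1, 1]))
entries = device.num_program_cache_entries()

# A second run with the same config should hit the cache: no new entries, and
# (with the runtime-arg override callback added below) still-correct output buffers.
out = ttnn.repeat(x_tt, ttnn.Shape([4, 1, 1, 1]))
assert device.num_program_cache_entries() == entries

ttnn.close_device(device)
```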
test program cache when it is implemented +@pytest.mark.parametrize("layout", layouts) +@pytest.mark.parametrize("shape", shapes) +@pytest.mark.parametrize("repeat_shape", repeat_shapes) +def test_pc_repeat(device, layout, shape, repeat_shape, use_program_cache): + # trying to avoid the `buffer not divisible by page size` error. Does this make sense? + if layout == ttnn.TILE_LAYOUT and ( + prod(shape) % ttnn.TILE_SIZE != 0 or _get_final_size(shape, repeat_shape) % ttnn.TILE_SIZE != 0 + ): + pytest.skip("Tensor not suitable for tile layout") + + if len(repeat_shape) < len(shape): + pytest.skip("PyTorch repeat dim must be >= tensor dim (although we can handle this).") + num_iters = 3 + input_tensors = [] + torch_results = [] + for i in range(num_iters): + torch_tensor = torch.rand(shape, dtype=torch.bfloat16) + torch_results.append(torch_tensor.repeat(repeat_shape)) + input_tensors.append(ttnn.from_torch(torch_tensor, layout=layout, device=device, dtype=ttnn.bfloat16)) + for i in range(num_iters): + output = ttnn.repeat(input_tensors[i], ttnn.Shape(repeat_shape)) + output = ttnn.to_torch(output) + assert ( + output.shape == torch_results[i].shape + ), f"Output shape {output.shape} does not match torch shape {torch_results[i].shape}" + + assert_with_pcc(torch_results[i], output, 0.9999) + if i == 0: + base_program_cache_entires = device.num_program_cache_entries() + else: + assert ( + device.num_program_cache_entries() == base_program_cache_entires, + "program cache entries differ on same configs", + ) + + +# 17975 test cases + + +def test_pc_with_different_shapes_in_sequence(device, use_program_cache): + y = torch.rand((1, 1, 256, 384), dtype=torch.bfloat16) + y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + base_program_cache_entires = device.num_program_cache_entries() + + x = torch.zeros((64, 1, 256, 384), dtype=torch.bfloat16) + x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + num_iters = 4 + z_tt = x_tt + y_tt + + for i in range(64): + z_torch = ttnn.to_torch(z_tt[i : i + 1]) + assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y" + for _ in range(num_iters): + y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + assert ( + device.num_program_cache_entries() == base_program_cache_entires, + "program cache entries differ on same configs", + ) + + x = torch.zeros((64, 1, 256, 384), dtype=torch.bfloat16) + x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + + z_tt = x_tt + y_tt + + for i in range(64): + z_torch = ttnn.to_torch(z_tt[i : i + 1]) + assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y" + y = torch.rand((1, 1, 32, 32), dtype=torch.bfloat16) + + y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + base_program_cache_entires = device.num_program_cache_entries() + + x = torch.zeros((4, 1, 32, 32), dtype=torch.bfloat16) + x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + + ttnn.repeat(y_tt, [4, 1, 1, 1]) + z_tt = ttnn.experimental.add(x_tt, y_tt) + # z_tt = x_tt + y_tt + + for i in range(num_iters): + z_torch = ttnn.to_torch(z_tt[i : i + 1]) + assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y" + for _ in range(num_iters): + y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + assert ( + device.num_program_cache_entries() == base_program_cache_entires, + "program cache entries 
differ on same configs", + ) + + x = torch.zeros((4, 1, 32, 32), dtype=torch.bfloat16) + x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + + ttnn.repeat(y_tt, [4, 1, 1, 1]) + z_tt = ttnn.experimental.add(x_tt, y_tt) + # z_tt = x_tt + y_tt + + for i in range(num_iters): + z_torch = ttnn.to_torch(z_tt[i : i + 1]) + assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y" + y = torch.rand((1, 1, 256, 384), dtype=torch.bfloat16) + + y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + base_program_cache_entires = device.num_program_cache_entries() + z_tt = ttnn.repeat(y_tt, ttnn.Shape([64, 1, 1, 1])) + + for i in range(64): + z_torch = ttnn.to_torch(z_tt[i : i + 1]) + assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y" + for _ in range(num_iters): + y = torch.rand((1, 1, 256, 384), dtype=torch.bfloat16) + y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + assert ( + device.num_program_cache_entries() == base_program_cache_entires, + "program cache entries differ on same configs", + ) + z_tt = ttnn.repeat(y_tt, ttnn.Shape([64, 1, 1, 1])) + + for i in range(64): + z_torch = ttnn.to_torch(z_tt[i : i + 1]) + assert torch.allclose(z_torch, y, atol=1e-2), f"z_torch[{i}] != y" diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/host/repeat_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/host/repeat_program_factory.cpp index e8266b2ee50..d726d53de79 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/host/repeat_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/host/repeat_program_factory.cpp @@ -119,7 +119,23 @@ tt::tt_metal::operation::ProgramWithCallbacks rm_repeater_last_dim( } } } - return {.program = std::move(program)}; + auto override_runtime_args_callback = [reader_kernel_id, total_cores]( + const void* operation, + const tt::tt_metal::Program& program, + const std::vector& input_tensors, + const std::vector>&, + const std::vector& output_tensors) { + auto input = input_tensors.at(0); + auto output = output_tensors.at(0); + auto& runtime_args_by_core = GetRuntimeArgs(program, reader_kernel_id); + for (const auto& core : total_cores) { + auto& runtime_args = runtime_args_by_core[core.x][core.y]; + runtime_args.at(0) = input.buffer()->address(); + runtime_args.at(1) = output.buffer()->address(); + } + }; + + return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_args_callback}; } tt::tt_metal::operation::ProgramWithCallbacks rm_repeater( @@ -162,15 +178,17 @@ tt::tt_metal::operation::ProgramWithCallbacks rm_repeater( uint32_t cb_size_bytes = READ_ALIGNMENT * 2 + page_size_bytes; uint32_t src0_cb_index = 0; uint32_t src1_cb_index = 1; + tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(cb_size_bytes, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, cb_size_bytes); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); + tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(cb_size_bytes, {{src1_cb_index, cb_data_format}}) .set_page_size(src1_cb_index, cb_size_bytes); - auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src1_config); + bool page_is_pow_2 = tt::tt_metal::is_power_of_two_at_least_32(page_size_bytes); uint32_t page_pow_2 = page_is_pow_2 ? 
(std::uint32_t)std::log2(page_size_bytes) : 0; std::vector compile_time_args = { @@ -245,7 +263,22 @@ tt::tt_metal::operation::ProgramWithCallbacks rm_repeater( } } } - return {.program = std::move(program)}; + auto override_runtime_args_callback = [reader_kernel_id, total_cores]( + const void* operation, + const tt::tt_metal::Program& program, + const std::vector& input_tensors, + const std::vector>&, + const std::vector& output_tensors) { + auto input = input_tensors.at(0); + auto output = output_tensors.at(0); + auto& runtime_args_by_core = GetRuntimeArgs(program, reader_kernel_id); + for (const auto& core : total_cores) { + auto& runtime_args = runtime_args_by_core[core.x][core.y]; + runtime_args.at(0) = input.buffer()->address(); + runtime_args.at(1) = output.buffer()->address(); + } + }; + return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_args_callback}; } tt::tt_metal::operation::ProgramWithCallbacks rm_repeat_program_factory( From 55343c84a2402f648605126dc594b3ec44f89db8 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Wed, 19 Feb 2025 19:55:40 +0000 Subject: [PATCH 185/316] #0: Flip default behaviour of fabric RoutingType template to be ROUTER_XY instead of ROUTING_TABLE --- .../tt_fabric_traffic_gen_rx_socket.cpp | 8 +-- .../kernels/tt_fabric_traffic_gen_tx.cpp | 2 +- .../tt_fabric_traffic_gen_tx_socket.cpp | 8 +-- .../routing/kernels/tt_fabric_tx_ubench.cpp | 12 ++-- tt_metal/fabric/hw/inc/tt_fabric_api.h | 65 ++++++++++--------- 5 files changed, 51 insertions(+), 44 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp index 98061fbe385..5232ef3fce5 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp @@ -5,10 +5,10 @@ // clang-format off #include "debug/dprint.h" #include "dataflow_api.h" -#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_metal/fabric/hw/inc/tt_fabric.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" -#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_api.h" // clang-format on using namespace tt::tt_fabric; @@ -82,7 +82,7 @@ void kernel_main() { // make sure fabric node gatekeeper is available. 
tt_fabric_init(); - fabric_endpoint_init(); + fabric_endpoint_init(); socket_reader.init(data_buffer_start_addr, data_buffer_size_words); DPRINT << "Socket open on " << dest_device << ENDL(); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index 9771420e537..9678fe4e0dc 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -458,7 +458,7 @@ void kernel_main() { // initalize client tt_fabric_init(); - fabric_endpoint_init(client_interface, outbound_eth_chan); + fabric_endpoint_init(client_interface, outbound_eth_chan); routing_table = reinterpret_cast(client_interface->routing_tables_l1_offset); while (true) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp index c46c85e4a7b..d63197ab70b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp @@ -5,10 +5,10 @@ // clang-format off #include "dataflow_api.h" #include "debug/dprint.h" -#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_metal/fabric/hw/inc/tt_fabric.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" -#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_api.h" // clang-format on using namespace tt::tt_fabric; @@ -352,7 +352,7 @@ void kernel_main() { // initalize client tt_fabric_init(); - fabric_endpoint_init(client_interface, gk_interface_addr_l, gk_interface_addr_h); + fabric_endpoint_init(client_interface, gk_interface_addr_l, gk_interface_addr_h); routing_table = reinterpret_cast( client_interface->routing_tables_l1_offset + sizeof(fabric_router_l1_config_t) * routing_plane); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp index bd042ff4ae3..a94d6185364 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp @@ -5,10 +5,10 @@ // clang-format off #include "dataflow_api.h" #include "debug/dprint.h" -#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_metal/fabric/hw/inc/tt_fabric.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" -#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_api.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" // clang-format on @@ -136,7 +136,7 @@ void kernel_main() { } // initalize client - fabric_endpoint_init(client_interface, outbound_eth_chan); + fabric_endpoint_init(client_interface, outbound_eth_chan); // notify the controller kernel that this worker is ready to proceed notify_traffic_controller(); @@ -157,7 +157,7 @@ void 
kernel_main() { while (true) { client_interface->local_pull_request.pull_request.words_read = 0; if constexpr (mcast_data) { - fabric_async_write_multicast( + fabric_async_write_multicast( client_interface, 0, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory @@ -170,7 +170,7 @@ void kernel_main() { n_depth, s_depth); } else { - fabric_async_write( + fabric_async_write( client_interface, 0, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory diff --git a/tt_metal/fabric/hw/inc/tt_fabric_api.h b/tt_metal/fabric/hw/inc/tt_fabric_api.h index 964fe971155..b36b5861025 100644 --- a/tt_metal/fabric/hw/inc/tt_fabric_api.h +++ b/tt_metal/fabric/hw/inc/tt_fabric_api.h @@ -12,10 +12,12 @@ namespace tt::tt_fabric { -#define ASYNC_WR_ADD_PR 1 -#define ASYNC_WR_SEND 2 -#define ASYNC_WR_ADD_HEADER 4 -#define ASYNC_WR_ALL ASYNC_WR_ADD_HEADER | ASYNC_WR_ADD_PR | ASYNC_WR_SEND +enum AsyncWriteMode : uint8_t { + ADD_PR = 0x01, + SEND = 0x02, + ADD_HEADER = 0x04, + ALL = ADD_HEADER | ADD_PR | SEND, +}; enum RoutingType : uint8_t { ROUTING_TABLE, @@ -56,10 +58,11 @@ inline void fabric_setup_pull_request( client_interface->local_pull_request.pull_request.flags = FORWARD; } -template +template inline void fabric_send_pull_request( volatile tt_l1_ptr fabric_client_interface_t* client_interface, - uint32_t routing, + uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, + // and the routing plane to use when using ROUTING_TABLE uint16_t dst_mesh_id, uint16_t dst_dev_id) { uint64_t router_addr; @@ -113,25 +116,26 @@ inline void fabric_async_write_add_header( // Write packetized data over fabric to dst_mesh, dst_dev. // Packet is at src_addr in sender L1. -template +template inline void fabric_async_write( volatile tt_l1_ptr fabric_client_interface_t* client_interface, - uint32_t routing, // the network plane to use for this transaction + uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, + // and the routing plane to use when using ROUTING_TABLE uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, uint16_t dst_dev_id, uint64_t dst_addr, uint32_t size // number of bytes to write to remote destination ) { - if constexpr (mode & ASYNC_WR_ADD_HEADER) { + if constexpr (mode & AsyncWriteMode::ADD_HEADER) { fabric_async_write_add_header(src_addr, dst_mesh_id, dst_dev_id, dst_addr, size); } - if constexpr (mode & ASYNC_WR_ADD_PR) { + if constexpr (mode & AsyncWriteMode::ADD_PR) { fabric_setup_pull_request(client_interface, src_addr, size); } - if constexpr (mode & ASYNC_WR_SEND) { + if constexpr (mode & AsyncWriteMode::SEND) { fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } @@ -162,11 +166,12 @@ inline void fabric_async_write_multicast_add_header( } // Write packetized data over fabric to dst_mesh, dst_dev. // Packet is at src_addr in sender L1. 
-template +template inline void fabric_async_write_multicast( volatile tt_l1_ptr fabric_client_interface_t* client_interface, - uint32_t routing_plane, // the network plane to use for this transaction - uint32_t src_addr, // source address in sender’s memory + uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, + // and the routing plane to use when using ROUTING_TABLE + uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, uint16_t dst_dev_id, uint64_t dst_addr, @@ -175,17 +180,17 @@ inline void fabric_async_write_multicast( uint16_t w_depth, uint16_t n_depth, uint16_t s_depth) { - if constexpr (mode & ASYNC_WR_ADD_HEADER) { + if constexpr (mode & AsyncWriteMode::ADD_HEADER) { fabric_async_write_multicast_add_header( src_addr, dst_mesh_id, dst_dev_id, dst_addr, size, e_depth, w_depth, n_depth, s_depth); } - if constexpr (mode & ASYNC_WR_ADD_PR) { + if constexpr (mode & AsyncWriteMode::ADD_PR) { fabric_setup_pull_request(client_interface, src_addr, size); } - if constexpr (mode & ASYNC_WR_SEND) { - fabric_send_pull_request(client_interface, routing_plane, dst_mesh_id, dst_dev_id); + if constexpr (mode & AsyncWriteMode::SEND) { + fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } @@ -211,25 +216,26 @@ inline void fabric_atomic_inc_add_header( // Write packetized data over fabric to dst_mesh, dst_dev. // Packet is at src_addr in sender L1. -template +template inline void fabric_atomic_inc( volatile tt_l1_ptr fabric_client_interface_t* client_interface, - uint32_t routing, // the network plane to use for this transaction + uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, + // and the routing plane to use when using ROUTING_TABLE uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, uint16_t dst_dev_id, uint64_t dst_addr, uint32_t atomic_inc, uint32_t wrap_boundary) { - if constexpr (mode & ASYNC_WR_ADD_HEADER) { + if constexpr (mode & AsyncWriteMode::ADD_HEADER) { fabric_atomic_inc_add_header(src_addr, dst_mesh_id, dst_dev_id, dst_addr, atomic_inc, wrap_boundary); } - if constexpr (mode & ASYNC_WR_ADD_PR) { + if constexpr (mode & AsyncWriteMode::ADD_PR) { fabric_setup_pull_request(client_interface, src_addr, PACKET_HEADER_SIZE_BYTES); } - if constexpr (mode & ASYNC_WR_SEND) { + if constexpr (mode & AsyncWriteMode::SEND) { fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } @@ -258,10 +264,11 @@ inline void fabric_async_write_atomic_inc_add_header( // Write packetized data over fabric to dst_mesh, dst_dev. // Packet is at src_addr in sender L1. 
-template +template inline void fabric_async_write_atomic_inc( volatile tt_l1_ptr fabric_client_interface_t* client_interface, - uint32_t routing, // the network plane to use for this transaction + uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, + // and the routing plane to use when using ROUTING_TABLE uint32_t src_addr, // source address in sender’s memory uint16_t dst_mesh_id, uint16_t dst_dev_id, @@ -269,16 +276,16 @@ inline void fabric_async_write_atomic_inc( uint64_t dst_atomic_addr, uint32_t size, // number of bytes to write to remote destination uint32_t atomic_inc) { - if constexpr (mode & ASYNC_WR_ADD_HEADER) { + if constexpr (mode & AsyncWriteMode::ADD_HEADER) { fabric_async_write_atomic_inc_add_header( src_addr, dst_mesh_id, dst_dev_id, dst_write_addr, dst_atomic_addr, size, atomic_inc); } - if constexpr (mode & ASYNC_WR_ADD_PR) { + if constexpr (mode & AsyncWriteMode::ADD_PR) { fabric_setup_pull_request(client_interface, src_addr, size); } - if constexpr (mode & ASYNC_WR_SEND) { + if constexpr (mode & AsyncWriteMode::SEND) { fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } @@ -385,7 +392,7 @@ inline void fabric_socket_connect(socket_handle_t* socket_handle) { while (((volatile socket_handle_t*)socket_handle)->socket_state != SocketState::ACTIVE); } -template +template inline void fabric_endpoint_init( volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t outbound_eth_chan) { // TODO: Should not assume routing tables are immediately after the client interface From f82aaa8e7155ca812362fb1fa0f4db898938ae18 Mon Sep 17 00:00:00 2001 From: Saad Jameel <163029024+sjameelTT@users.noreply.github.com> Date: Wed, 19 Feb 2025 23:39:58 -0500 Subject: [PATCH 186/316] Revert "Add row major eltwise binary_ng support" (#18074) Reverts tenstorrent/tt-metal#17969 --- .../eltwise/test_binary_ng_typecast.py | 284 +----------------- .../eltwise/binary_ng/binary_ng.cpp | 66 +--- 2 files changed, 14 insertions(+), 336 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py index 3c804597a06..df8b8db740a 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py @@ -9,7 +9,6 @@ from models.utility_functions import skip_for_grayskull, torch_random from functools import partial from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt -from tests.ttnn.utils_for_testing import assert_with_pcc binary_fns = { @@ -52,12 +51,8 @@ "dtype", ([ttnn.bfloat16]), ) -@pytest.mark.parametrize( - "layout", - ([ttnn.TILE_LAYOUT]), -) # No typecast on inputs and optional output -def test_opt_output_no_typecast(input_shapes, dtype, layout, ttnn_fn, device): +def test_opt_output_no_typecast(input_shapes, dtype, ttnn_fn, device): torch.manual_seed(0) a_shape, b_shape, out_shape = input_shapes ttnn_op = getattr(ttnn.experimental, ttnn_fn) @@ -71,12 +66,14 @@ def test_opt_output_no_typecast(input_shapes, dtype, layout, ttnn_fn, device): out = gen_func_with_cast_tt(partial(torch_random, low=0, high=1, dtype=torch.bfloat16), dtype)(out_shape) input_tensor_a = ttnn.from_torch( - torch_input_tensor_a, dtype=dtype, device=device, layout=layout, memory_config=ttnn.DRAM_MEMORY_CONFIG + torch_input_tensor_a, dtype=dtype, device=device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG ) 
input_tensor_b = ttnn.from_torch( - torch_input_tensor_b, dtype=dtype, device=device, layout=layout, memory_config=ttnn.DRAM_MEMORY_CONFIG + torch_input_tensor_b, dtype=dtype, device=device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG + ) + out_tt = ttnn.from_torch( + out, dtype=dtype, device=device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG ) - out_tt = ttnn.from_torch(out, dtype=dtype, device=device, layout=layout, memory_config=ttnn.DRAM_MEMORY_CONFIG) cq_id = 0 ttnn_op(input_tensor_a, input_tensor_b, queue_id=cq_id, output_tensor=out_tt) output_tensor = ttnn.to_torch(out_tt) @@ -663,272 +660,3 @@ def test_opt_output_scalar(input_shapes, ttnn_fn, scalar, device): status = ttnn.pearson_correlation_coefficient(torch_output_tensor, output_tensor) assert status >= 0.999 - - -@skip_for_grayskull("Requires wormhole_b0 to run") -@pytest.mark.parametrize("input_shape", [(1, 1, 1, 1), (3, 3, 15, 15), (3, 3, 17, 17), (3, 3, 33, 33)]) -@pytest.mark.parametrize( - "memory_config", - ([ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]), -) -@pytest.mark.parametrize("scalar", [-0.25, -16.5, 0.0, 0.05, 1.7, 19.0]) -@pytest.mark.parametrize( - "ttnn_fn", - [ - "add", - "sub", - "mul", - "div", - "rsub", - "squared_difference", - ], -) -@pytest.mark.parametrize( - "layout", - ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), -) -def test_edgecase_dims_eltwise_scalar_matrix_math(input_shape, scalar, ttnn_fn, memory_config, layout, device): - torch.manual_seed(0) - a_shape = input_shape - - ttnn_op = getattr(ttnn.experimental, ttnn_fn) - torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) - - input_tensor_a = ttnn.from_torch( - torch_input_tensor_a, - dtype=ttnn.bfloat16, - device=device, - layout=layout, - memory_config=memory_config, - ) - - output = ttnn_op(input_tensor_a, scalar) - tt_output_tensor = ttnn.to_torch(output) - - golden_fn = ttnn.get_golden_function(ttnn_op) - torch_output_tensor = golden_fn(torch_input_tensor_a, scalar) - - assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) - - -@skip_for_grayskull("Requires wormhole_b0 to run") -@pytest.mark.parametrize("input_shape", [(1, 1, 1, 1), (3, 3, 15, 15), (3, 3, 17, 17), (3, 3, 33, 33)]) -@pytest.mark.parametrize( - "memory_config", - ([ttnn.DRAM_MEMORY_CONFIG]), -) -@pytest.mark.parametrize("scalar", [-0.25, -16.5, 0.0, 0.05, 1.7, 19.0]) -@pytest.mark.parametrize( - "ttnn_fn", - [ - "gt", - "lt", - "lte", - "gte", - "eq", - "ne", - ], -) -@pytest.mark.parametrize( - "layout", - ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), -) -def test_edgecase_dims_eltwise_scalar_logical(input_shape, scalar, ttnn_fn, memory_config, layout, device): - torch.manual_seed(0) - a_shape = input_shape - - ttnn_op = getattr(ttnn.experimental, ttnn_fn) - torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) - # guarantee at least one equal value - if (ttnn_fn == "eq" or ttnn_fn == "ne" or ttnn_fn == "gte" or ttnn_fn == "lte") and input_shape != (1, 1, 1, 1): - torch_input_tensor_a[0, 0, 0, 0] = scalar - - input_tensor_a = ttnn.from_torch( - torch_input_tensor_a, - dtype=ttnn.bfloat16, - device=device, - layout=layout, - memory_config=memory_config, - ) - - output = ttnn_op(input_tensor_a, scalar, dtype=ttnn.uint32) - tt_output_tensor = ttnn.to_torch(output) - - golden_fn = ttnn.get_golden_function(ttnn_op) - torch_output_tensor = golden_fn(torch_input_tensor_a, scalar) - - assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) - - -@pytest.mark.parametrize( - "input_shapes", - [ - ((1, 7, 1, 
1), (7, 7, 33, 33)), - ((7, 1, 1, 1), (7, 7, 49, 49)), - ((7, 7, 65, 65), (7, 7, 65, 65)), - ((2, 2, 10, 1), (2, 2, 10, 2)), - ], -) -@pytest.mark.parametrize( - "memory_config", - ([ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]), -) -@pytest.mark.parametrize( - "ttnn_fn", - [ - "add", - "sub", - "mul", - "div", - "rsub", - "squared_difference", - ], -) -@pytest.mark.parametrize( - "layout", - ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), -) -def test_edgecase_dims_eltwise_broadcast_matrix_math(input_shapes, ttnn_fn, memory_config, layout, device): - torch.manual_seed(0) - a_shape, b_shape = input_shapes - - ttnn_op = getattr(ttnn.experimental, ttnn_fn) - torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) - torch_input_tensor_b = torch.randn(b_shape, dtype=torch.bfloat16) - - if ttnn_fn == "div": - torch_input_tensor_b[torch_input_tensor_b.abs() < 0.001] = 0.001 - - input_tensor_a = ttnn.from_torch( - torch_input_tensor_a, - dtype=ttnn.bfloat16, - device=device, - layout=layout, - memory_config=memory_config, - ) - - input_tensor_b = ttnn.from_torch( - torch_input_tensor_b, - dtype=ttnn.bfloat16, - device=device, - layout=layout, - memory_config=memory_config, - ) - - output = ttnn_op(input_tensor_a, input_tensor_b, dtype=ttnn.float32) - tt_output_tensor = ttnn.to_torch(output) - - golden_fn = ttnn.get_golden_function(ttnn_op) - torch_output_tensor = golden_fn(torch_input_tensor_a, torch_input_tensor_b) - - assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) - - -@skip_for_grayskull("Requires wormhole_b0 to run") -@pytest.mark.parametrize( - "input_shapes", - [ - ((1, 7, 1, 1), (7, 7, 33, 33)), - ((7, 1, 1, 1), (7, 7, 49, 49)), - ((7, 7, 65, 65), (7, 7, 65, 65)), - ], -) -@pytest.mark.parametrize( - "memory_config", - ([ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]), -) -@pytest.mark.parametrize( - "ttnn_fn", - [ - "gt", - "lt", - "lte", - "gte", - "eq", - "ne", - ], -) -@pytest.mark.parametrize( - "layout", - ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), -) -def test_edgecase_dims_eltwise_broadcast_logical(input_shapes, ttnn_fn, memory_config, layout, device): - torch.manual_seed(0) - a_shape, b_shape = input_shapes - - ttnn_op = getattr(ttnn.experimental, ttnn_fn) - torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) - torch_input_tensor_b = torch.randn(b_shape, dtype=torch.bfloat16) - # guarantee at least one equal value - if ttnn_fn == "eq" or ttnn_fn == "ne" or ttnn_fn == "gte" or ttnn_fn == "lte": - torch_input_tensor_a[0, 0, 0, 0] = torch_input_tensor_b[0, 0, 0, 0] - - input_tensor_a = ttnn.from_torch( - torch_input_tensor_a, - dtype=ttnn.bfloat16, - device=device, - layout=layout, - memory_config=memory_config, - ) - - input_tensor_b = ttnn.from_torch( - torch_input_tensor_b, - dtype=ttnn.bfloat16, - device=device, - layout=layout, - memory_config=memory_config, - ) - - output = ttnn_op(input_tensor_a, input_tensor_b, dtype=ttnn.float32) - tt_output_tensor = ttnn.to_torch(output) - - golden_fn = ttnn.get_golden_function(ttnn_op) - torch_output_tensor = golden_fn(torch_input_tensor_a, torch_input_tensor_b) - - assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) - - -@pytest.mark.parametrize( - "input_shape, input_layout, input_shard_grid, input_shard_orientation, input_sharding_scheme", - [ - ( - [1, 1, 64, 64], - ttnn.TILE_LAYOUT, - ttnn.CoreGrid(y=1, x=2), - ttnn.ShardOrientation.ROW_MAJOR, - ttnn.ShardStrategy.WIDTH, - ), - ], -) -@pytest.mark.parametrize("input_dtype", [ttnn.bfloat16, ttnn.float32]) 
-@pytest.mark.parametrize("output_dtype", [ttnn.float32, ttnn.bfloat16]) -def test_binary_div( - device, - input_shape, - input_layout, - input_shard_grid, - input_shard_orientation, - input_sharding_scheme, - input_dtype, - output_dtype, -): - memory_config = ttnn.create_sharded_memory_config( - input_shape, - core_grid=input_shard_grid, - strategy=input_sharding_scheme, - orientation=input_shard_orientation, - use_height_and_width_as_shard_shape=False, - ) - - torch_input_a = torch.rand(input_shape, dtype=torch.bfloat16) + 1 - torch_input_b = torch.rand(input_shape, dtype=torch.bfloat16) + 1 - torch_output = torch_input_a / torch_input_b - - input_tensor_a = ttnn.from_torch( - torch_input_a, layout=input_layout, memory_config=memory_config, dtype=input_dtype, device=device - ) - input_tensor_b = ttnn.from_torch( - torch_input_b, layout=input_layout, memory_config=memory_config, dtype=input_dtype, device=device - ) - output_tensor = ttnn.experimental.div(input_tensor_a, input_tensor_b, dtype=output_dtype) - assert_with_pcc(torch_output, ttnn.to_torch(output_tensor), 0.999) diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp index 71a3f32b980..99c1a77dab0 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp @@ -43,28 +43,11 @@ Tensor BinaryNg::invoke( bool typecast_b = needs_typecast_to_bfloat16(b_dtype); bool typecast_out = needs_typecast_to_bfloat16(out_dtype); - // RM is never BFLOAT8 or BFLOAT4 so we can assume it goes in here. if (!typecast_a && !typecast_b) { - bool input_a_rm = input_tensor_a.get_layout() == Layout::ROW_MAJOR; - bool input_b_rm = input_tensor_b.get_layout() == Layout::ROW_MAJOR; - Tensor input_a = - input_a_rm ? ttnn::to_layout(input_tensor_a, Layout::TILE, std::nullopt, std::nullopt, (IDevice*)nullptr) - : input_tensor_a; - Tensor input_b = - input_b_rm ? ttnn::to_layout(input_tensor_b, Layout::TILE, std::nullopt, std::nullopt, (IDevice*)nullptr) - : input_tensor_b; - - if (input_a_rm && input_b_rm) { - // we don't support to_layout with optional output tensor - TT_FATAL( - !output_preallocated, - "Optional output tensor with Row Major input is not supported right now for Elementwise operations"); - } - - Tensor result = ttnn::prim::binary_ng( + return ttnn::prim::binary_ng( queue_id, - input_a, - input_b, + input_tensor_a, + input_tensor_b, binary_op_type, out_dtype, output_preallocated ? optional_output_tensor->memory_config() @@ -73,20 +56,6 @@ Tensor BinaryNg::invoke( lhs_activations, rhs_activations, post_activations); - - // if both inputs are in row major, convert the output to row major - // since there's no consensus here, avoiding the conversion if we have an excuse to is likely the best option - // since it leads to better perf - if (input_a_rm && input_b_rm) { - result = ttnn::to_layout( - result, - Layout::ROW_MAJOR, - std::nullopt, - memory_config.value_or(input_tensor_a.memory_config()), - (IDevice*)nullptr); - } - - return result; } else { Tensor input_a = typecast_to(DataType::BFLOAT16, input_tensor_a); Tensor input_b = typecast_to(DataType::BFLOAT16, input_tensor_b); @@ -147,8 +116,6 @@ Tensor BinaryNg::invoke( const bool output_preallocated = optional_output_tensor.has_value(); const ttnn::DataType out_dtype = output_preallocated ? optional_output_tensor->get_dtype() : output_dtype.value_or(a_dtype); - const auto mem_config = output_preallocated ? 
optional_output_tensor->memory_config() - : memory_config.value_or(input_tensor_a.memory_config()); if (output_dtype.has_value() && output_preallocated) { TT_FATAL( @@ -160,35 +127,18 @@ Tensor BinaryNg::invoke( bool typecast_out = needs_typecast_to_bfloat16(out_dtype); if (!typecast_a) { - bool input_a_rm = input_tensor_a.get_layout() == Layout::ROW_MAJOR; - if (input_a_rm) { - // we don't support to_layout with optional output tensor - TT_FATAL( - !output_preallocated, - "Optional output tensor with Row Major input is not supported right now for Elementwise operations"); - } - Tensor input_a = - input_a_rm - ? ttnn::to_layout( - input_tensor_a, Layout::TILE, std::nullopt, input_tensor_a.memory_config(), (IDevice*)nullptr) - : input_tensor_a; - Tensor result = ttnn::prim::binary_ng( + return ttnn::prim::binary_ng( queue_id, - input_a, + input_tensor_a, scalar, binary_op_type, out_dtype, - mem_config, + output_preallocated ? optional_output_tensor->memory_config() + : memory_config.value_or(input_tensor_a.memory_config()), optional_output_tensor, lhs_activations, rhs_activations, post_activations); - - // if input is in row major, convert the output to row major - if (input_a_rm) { - result = ttnn::to_layout(result, Layout::ROW_MAJOR, std::nullopt, mem_config, (IDevice*)nullptr); - } - return result; } else { Tensor input_a = typecast_to(DataType::BFLOAT16, input_tensor_a); const auto output_tensor = output_preallocated and typecast_out @@ -201,7 +151,7 @@ Tensor BinaryNg::invoke( scalar, binary_op_type, input_a.get_dtype(), - mem_config, + input_a.memory_config(), output_tensor, lhs_activations, rhs_activations, From e56c9b5389862b7e7e5485a0625ba35329ba7e1c Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Wed, 19 Feb 2025 13:14:22 +0000 Subject: [PATCH 187/316] Add ttnn-pytorch and tt-forge conv2d/maxpool_2d sweeps to nightly --- .../sweep_utils/conv2d_common.py | 2 - .../operations/conv2d/test_conv2d_sweeps.py | 57 +++++++++++++++++++ .../max_pool2d/test_max_pool2d_sweeps.py | 51 +++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 tests/ttnn/nightly/unit_tests/operations/conv2d/test_conv2d_sweeps.py create mode 100644 tests/ttnn/nightly/unit_tests/operations/max_pool2d/test_max_pool2d_sweeps.py diff --git a/tests/sweep_framework/sweep_utils/conv2d_common.py b/tests/sweep_framework/sweep_utils/conv2d_common.py index eb3eb3056f2..1c18de54308 100644 --- a/tests/sweep_framework/sweep_utils/conv2d_common.py +++ b/tests/sweep_framework/sweep_utils/conv2d_common.py @@ -220,7 +220,6 @@ def run_conv2d_short_sweep( dilation_w, has_bias, ] = input_specs - print(input_specs) if is_forge_suite: torch_input_dtype = torch.bfloat16 if input_dtype == ttnn.DataType(ttnn.bfloat16) else torch.float32 @@ -317,7 +316,6 @@ def run_conv2d_short_sweep( torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2)) - print("End of test case") return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.985), e2e_perf] diff --git a/tests/ttnn/nightly/unit_tests/operations/conv2d/test_conv2d_sweeps.py b/tests/ttnn/nightly/unit_tests/operations/conv2d/test_conv2d_sweeps.py new file mode 100644 index 00000000000..7f8c3b40022 --- /dev/null +++ b/tests/ttnn/nightly/unit_tests/operations/conv2d/test_conv2d_sweeps.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +from tests.sweep_framework.sweep_utils.conv2d_common import run_conv2d_short_sweep +from tests.sweep_framework.sweeps.conv2d.short.conv2d_short_sweep import parameters as parameters_ttnn_pytorch +from tests.sweep_framework.sweeps.conv2d.short.conv2d_short_sweep import ( + failing_parameters as failing_parameters_ttnn_pytorch, +) + +from tests.sweep_framework.sweeps.conv2d.short.conv2d_ttforge_sweep import parameters as parameters_ttnn_forge +from tests.sweep_framework.sweeps.conv2d.short.conv2d_ttforge_sweep import ( + failing_parameters as failing_parameters_ttnn_forge, +) + +from models.utility_functions import ( + skip_for_grayskull, + is_wormhole_b0, +) + +import pytest + + +@skip_for_grayskull() +@pytest.mark.parametrize("input_spec", parameters_ttnn_pytorch["short_sweep_suite_conv2d"]["input_specs"]) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +def test_ttnn_pytorch_sweep(device, input_spec): + if device.core_grid.y != 8 and is_wormhole_b0(): + pytest.skip("Needs 8x8 grid for wormhole_b0") + + # Check if input_spec is in failing_parameters + if input_spec in failing_parameters_ttnn_pytorch: + pytest.skip(f"Skipping test for failing input_spec: {input_spec}") + + pcc, messsage = run_conv2d_short_sweep( + input_spec, + device, + )[0] + assert pcc, messsage + + +@skip_for_grayskull() +@pytest.mark.parametrize("input_spec", parameters_ttnn_forge["ttforge_sweep_conv2d"]["input_specs"]) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +def test_tt_forge_sweep(device, input_spec): + if device.core_grid.y != 8 and is_wormhole_b0(): + pytest.skip("Needs 8x8 grid for wormhole_b0") + + # Check if input_spec is in failing_parameters + if input_spec in failing_parameters_ttnn_forge: + pytest.skip(f"Skipping test for failing input_spec: {input_spec}") + + pcc, messsage = run_conv2d_short_sweep( + input_spec, + device, + )[0] + assert pcc, messsage diff --git a/tests/ttnn/nightly/unit_tests/operations/max_pool2d/test_max_pool2d_sweeps.py b/tests/ttnn/nightly/unit_tests/operations/max_pool2d/test_max_pool2d_sweeps.py new file mode 100644 index 00000000000..d8dcf39e8a8 --- /dev/null +++ b/tests/ttnn/nightly/unit_tests/operations/max_pool2d/test_max_pool2d_sweeps.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +from tests.sweep_framework.sweep_utils.max_pool2d_common import run_max_pool2d +from tests.sweep_framework.sweeps.max_pool2d.short.max_pool2d_short_sweep import parameters as parameters_ttnn_pytorch + +from models.utility_functions import skip_for_grayskull + +import pytest +import ttnn + + +@skip_for_grayskull() +@pytest.mark.parametrize("input_spec", parameters_ttnn_pytorch["max_pool2d_short_sweep_suite"]["input_specs"]) +@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.bfloat8_b]) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +def test_ttnn_pytorch_sweep(device, dtype, input_spec): + ( + in_n, + in_c, + in_h, + in_w, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + ceil_mode, + ) = input_spec + run_max_pool2d( + in_n, + in_c, + in_h, + in_w, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + dtype, + device, + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + ceil_mode, + ) From 790e53177a2bc40dffbbf98d8908f6a1ff119629 Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Sat, 25 Jan 2025 02:45:47 +0000 Subject: [PATCH 188/316] Fix bug in calculating erisc app sync base addr + update invalidate compile call for full barrier --- tt_metal/hw/firmware/src/active_erisc.cc | 4 +++- tt_metal/hw/firmware/src/active_erisck.cc | 21 +++++++++++++------ .../hw/inc/blackhole/eth_l1_address_map.h | 13 ++++++------ tt_metal/hw/inc/dataflow_api.h | 5 ++--- tt_metal/llrt/tt_cluster.cpp | 8 +++---- tt_metal/llrt/tt_cluster.hpp | 10 +++++++-- 6 files changed, 38 insertions(+), 23 deletions(-) diff --git a/tt_metal/hw/firmware/src/active_erisc.cc b/tt_metal/hw/firmware/src/active_erisc.cc index 2c50889f7f9..448144b0b0d 100644 --- a/tt_metal/hw/firmware/src/active_erisc.cc +++ b/tt_metal/hw/firmware/src/active_erisc.cc @@ -69,6 +69,9 @@ int main() { // put this into scratch space similar to idle erisc noc_bank_table_init(eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SCRATCH); + mailboxes->launch_msg_rd_ptr = 0; // Initialize the rdptr to 0 + noc_index = 0; + risc_init(); mailboxes->slave_sync.all = RUN_SYNC_MSG_ALL_SLAVES_DONE; @@ -79,7 +82,6 @@ int main() { } mailboxes->go_message.signal = RUN_MSG_DONE; - mailboxes->launch_msg_rd_ptr = 0; // Initialize the rdptr to 0 while (1) { // Wait... diff --git a/tt_metal/hw/firmware/src/active_erisck.cc b/tt_metal/hw/firmware/src/active_erisck.cc index 0e2c75d5008..9afc3e2f499 100644 --- a/tt_metal/hw/firmware/src/active_erisck.cc +++ b/tt_metal/hw/firmware/src/active_erisck.cc @@ -21,12 +21,7 @@ #include #include -extern uint32_t __kernel_init_local_l1_base[]; -extern uint32_t __fw_export_end_text[]; - void kernel_launch(uint32_t kernel_base_addr) { - DeviceZoneScopedMainChildN("ACTIVE-ERISC-KERNEL"); - extern uint32_t __kernel_init_local_l1_base[]; extern uint32_t __fw_export_end_text[]; do_crt1((uint32_t tt_l1_ptr*)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - @@ -34,5 +29,19 @@ void kernel_launch(uint32_t kernel_base_addr) { noc_local_state_init(NOC_INDEX); - kernel_main(); + { + DeviceZoneScopedMainChildN("ACTIVE-ERISC-KERNEL"); + kernel_main(); + if constexpr (NOC_MODE == DM_DEDICATED_NOC) { + WAYPOINT("NKFW"); + // Assert that no noc transactions are outstanding, to ensure that all reads and writes have landed and the + // NOC interface is in a known idle state for the next kernel. 
+ ASSERT(ncrisc_noc_reads_flushed(NOC_INDEX)); + ASSERT(ncrisc_noc_nonposted_writes_sent(NOC_INDEX)); + ASSERT(ncrisc_noc_nonposted_writes_flushed(NOC_INDEX)); + ASSERT(ncrisc_noc_nonposted_atomics_flushed(NOC_INDEX)); + ASSERT(ncrisc_noc_posted_writes_sent(NOC_INDEX)); + WAYPOINT("NKFD"); + } + } } diff --git a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h index 275bccce2e6..b83a2c9239c 100644 --- a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h @@ -27,7 +27,7 @@ struct address_map { static constexpr std::int32_t MAX_L1_LOADING_SIZE = MAX_SIZE; static constexpr std::int32_t FABRIC_ROUTER_CONFIG_BASE = MAX_SIZE; - static constexpr std::int32_t ERISC_APP_SYNC_INFO_BASE = FABRIC_ROUTER_CONFIG_BASE + FABRIC_ROUTER_CONFIG_BASE; + static constexpr std::int32_t ERISC_APP_SYNC_INFO_BASE = FABRIC_ROUTER_CONFIG_BASE + FABRIC_ROUTER_CONFIG_SIZE; static constexpr std::int32_t ERISC_APP_ROUTING_INFO_BASE = ERISC_APP_SYNC_INFO_BASE + ERISC_APP_SYNC_INFO_SIZE; static constexpr std::uint32_t ERISC_BARRIER_BASE = ERISC_APP_ROUTING_INFO_BASE + ERISC_APP_ROUTING_INFO_SIZE; @@ -56,8 +56,11 @@ struct address_map { static constexpr std::int32_t MEM_ERISC_STACK_BASE = RISC_LOCAL_MEM_BASE + MEM_ERISC_LOCAL_SIZE - MEM_ERISC_STACK_SIZE; - static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = - MEM_ERISC_INIT_LOCAL_L1_BASE_SCRATCH + MEM_ERISC_LOCAL_SIZE; + static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = 0; // don't need this - just to get things to compile + static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = (MEM_ERISC_MAP_END + (69 * 1024) + 63) & ~63; + static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = MAX_SIZE - ERISC_L1_UNRESERVED_BASE; + + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = ERISC_L1_UNRESERVED_BASE; // Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + // NUM_L1_BANKS) static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_XY_SIZE = 1024; @@ -66,10 +69,6 @@ struct address_map { static constexpr std::int32_t ERISC_MEM_BANK_OFFSET_SIZE = 1024; static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SIZE = ERISC_MEM_BANK_TO_NOC_XY_SIZE + ERISC_MEM_BANK_OFFSET_SIZE; - static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = 0; // don't need this - just to get things to compile - static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = (MEM_ERISC_MAP_END + (69 * 1024) + 63) & ~63; - static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = MAX_SIZE - ERISC_L1_UNRESERVED_BASE; - static_assert((ERISC_L1_UNRESERVED_BASE % 64) == 0); template diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 88038173b3f..7f16650e680 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -1802,10 +1802,9 @@ void noc_async_atomic_barrier(uint8_t noc_idx = noc_index) { */ FORCE_INLINE void noc_async_full_barrier(uint8_t noc_idx = noc_index) { + invalidate_l1_cache(); WAYPOINT("NFBW"); - do { - invalidate_l1_cache(); - } while (!ncrisc_noc_reads_flushed(noc_idx)); + while (!ncrisc_noc_reads_flushed(noc_idx)); WAYPOINT("NFCW"); while (!ncrisc_noc_nonposted_writes_sent(noc_idx)); WAYPOINT("NFDW"); diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 785b3d1dcb2..e35d4a2a4b4 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -501,16 +501,16 @@ int Cluster::get_device_aiclk(const chip_id_t &chip_id) const { return 0; } -void 
Cluster::deassert_risc_reset_at_core(const tt_cxy_pair &core) const { +void Cluster::deassert_risc_reset_at_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) const { const metal_SocDescriptor &soc_desc = this->get_soc_desc(core.chip); tt::umd::CoreCoord core_coord = soc_desc.get_coord_at(core, CoordSystem::TRANSLATED); - this->driver_->deassert_risc_reset_at_core(core.chip, core_coord); + this->driver_->deassert_risc_reset_at_core(core.chip, core_coord, soft_resets); } -void Cluster::assert_risc_reset_at_core(const tt_cxy_pair &core) const { +void Cluster::assert_risc_reset_at_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) const { const metal_SocDescriptor &soc_desc = this->get_soc_desc(core.chip); tt::umd::CoreCoord core_coord = soc_desc.get_coord_at(core, CoordSystem::TRANSLATED); - this->driver_->assert_risc_reset_at_core(core.chip, core_coord); + this->driver_->assert_risc_reset_at_core(core.chip, core_coord, soft_resets); } void Cluster::write_dram_vec(std::vector &vec, tt_target_dram dram, uint64_t addr, bool small_access) const { diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 927d39d5dfc..1b54e3a1213 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -95,8 +95,12 @@ class Cluster { //! device driver and misc apis void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions) const; - void deassert_risc_reset_at_core(const tt_cxy_pair& physical_chip_coord) const; - void assert_risc_reset_at_core(const tt_cxy_pair& physical_chip_coord) const; + void deassert_risc_reset_at_core( + const tt_cxy_pair& physical_chip_coord, + const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET) const; + void assert_risc_reset_at_core( + const tt_cxy_pair& physical_chip_coord, + const TensixSoftResetOptions& soft_resets = TENSIX_ASSERT_SOFT_RESET) const; void write_dram_vec( std::vector& vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const; @@ -172,6 +176,8 @@ class Cluster { // Returns set of logical active ethernet coordinates on chip // If skip_reserved_tunnel_cores is true, will return cores that dispatch is not using, // intended for users to grab available eth cores for testing + // `skip_reserved_tunnel_cores` is ignored on BH because there are no ethernet cores used for Fast Dispatch + // tunneling std::unordered_set get_active_ethernet_cores( chip_id_t chip_id, bool skip_reserved_tunnel_cores = false) const; From 9adb1c5d0dda0e1f6a019490b90e95e92ababfa2 Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Sat, 25 Jan 2025 02:48:19 +0000 Subject: [PATCH 189/316] Update checking eth txq status and slow it down to ensure cmd_ongoing bit is at a stable state --- tt_metal/hw/firmware/src/tt_eth_api.cpp | 5 +++-- tt_metal/hw/inc/ethernet/dataflow_api.h | 9 +++++++++ tt_metal/hw/inc/ethernet/erisc.h | 2 ++ tt_metal/hw/inc/ethernet/tt_eth_ss_regs.h | 7 +++++++ tt_metal/hw/inc/ethernet/tunneling.h | 17 +++++++++++++---- tt_metal/jit_build/build.cpp | 3 ++- .../ccl/kernels/edm/edm_handshake.hpp | 4 ++-- 7 files changed, 38 insertions(+), 9 deletions(-) diff --git a/tt_metal/hw/firmware/src/tt_eth_api.cpp b/tt_metal/hw/firmware/src/tt_eth_api.cpp index 2835915e4eb..1814a5732a7 100644 --- a/tt_metal/hw/firmware/src/tt_eth_api.cpp +++ b/tt_metal/hw/firmware/src/tt_eth_api.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "tt_eth_api.h" +#include "ethernet/dataflow_api.h" void eth_txq_reg_write(uint32_t qnum, uint32_t 
offset, uint32_t val) { ETH_WRITE_REG(ETH_TXQ0_REGS_START + (qnum * ETH_TXQ_REGS_SIZE) + offset, val); @@ -13,7 +14,7 @@ uint32_t eth_txq_reg_read(uint32_t qnum, uint32_t offset) { } void eth_send_packet(uint32_t q_num, uint32_t src_word_addr, uint32_t dest_word_addr, uint32_t num_words) { - while (eth_txq_reg_read(q_num, ETH_TXQ_CMD) != 0) { + while (internal_::eth_txq_is_busy(q_num)) { } eth_txq_reg_write(q_num, ETH_TXQ_TRANSFER_START_ADDR, src_word_addr << 4); eth_txq_reg_write(q_num, ETH_TXQ_DEST_ADDR, dest_word_addr << 4); @@ -22,7 +23,7 @@ void eth_send_packet(uint32_t q_num, uint32_t src_word_addr, uint32_t dest_word_ } void eth_write_remote_reg(uint32_t q_num, uint32_t reg_addr, uint32_t val) { - while (eth_txq_reg_read(q_num, ETH_TXQ_CMD) != 0) { + while (internal_::eth_txq_is_busy(q_num)) { } eth_txq_reg_write(q_num, ETH_TXQ_DEST_ADDR, reg_addr); eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val); diff --git a/tt_metal/hw/inc/ethernet/dataflow_api.h b/tt_metal/hw/inc/ethernet/dataflow_api.h index 2ee188b911b..8f949e86dcd 100644 --- a/tt_metal/hw/inc/ethernet/dataflow_api.h +++ b/tt_metal/hw/inc/ethernet/dataflow_api.h @@ -67,6 +67,7 @@ FORCE_INLINE void eth_noc_semaphore_wait(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val, uint32_t wait_min = 0) { uint32_t count = 0; while ((*sem_addr) != val) { + invalidate_l1_cache(); if (count == wait_min) { run_routing(); count = 0; @@ -95,6 +96,7 @@ FORCE_INLINE void eth_noc_semaphore_wait_min(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val, uint32_t wait_min = 0) { uint32_t count = 0; while ((*sem_addr) < val) { + invalidate_l1_cache(); if (count == wait_min) { run_routing(); count = 0; @@ -116,6 +118,7 @@ void eth_noc_async_read_barrier() { while (!ncrisc_noc_reads_flushed(noc_index)) { run_routing(); } + invalidate_l1_cache(); } /** @@ -290,6 +293,7 @@ void eth_wait_for_receiver_done(uint32_t wait_min = 0) { 1); uint32_t count = 0; while (erisc_info->channels[0].bytes_sent != 0) { + invalidate_l1_cache(); if (count == wait_min) { count = 0; run_routing(); @@ -352,6 +356,7 @@ void eth_wait_for_receiver_channel_done(uint32_t channel) { uint32_t max = 100000; while (!eth_is_receiver_channel_send_done(channel)) { + invalidate_l1_cache(); count++; if (count > max) { count = 0; @@ -378,6 +383,7 @@ FORCE_INLINE void eth_wait_receiver_done(uint32_t wait_min = 0) { uint32_t count = 0; while (erisc_info->channels[0].bytes_sent != 0) { + invalidate_l1_cache(); if (count == wait_min) { count = 0; run_routing(); @@ -406,6 +412,7 @@ FORCE_INLINE void eth_wait_for_bytes(uint32_t num_bytes, uint32_t wait_min = 0) { uint32_t count = 0; while (erisc_info->channels[0].bytes_sent != num_bytes) { + invalidate_l1_cache(); if (count == wait_min) { count = 0; run_routing(); @@ -454,6 +461,7 @@ void eth_wait_for_bytes_on_channel_sync_addr( uint32_t count = 0; uint32_t num_bytes_sent = eth_channel_syncs->bytes_sent; while (num_bytes_sent != num_bytes) { + invalidate_l1_cache(); uint32_t received_this_iter = eth_channel_syncs->bytes_sent; if (received_this_iter != num_bytes_sent) { // We are currently in the process of receiving data on this channel, so we just just wait a @@ -594,6 +602,7 @@ void eth_receiver_acknowledge(uint8_t channel = 0) { FORCE_INLINE void eth_wait_receiver_acknowledge(uint8_t channel = 0) { while (erisc_info->channels[channel].bytes_sent != 1) { + invalidate_l1_cache(); run_routing(); } } diff --git a/tt_metal/hw/inc/ethernet/erisc.h b/tt_metal/hw/inc/ethernet/erisc.h index 132433aa8e6..0a476f6b733 100644 --- 
a/tt_metal/hw/inc/ethernet/erisc.h +++ b/tt_metal/hw/inc/ethernet/erisc.h @@ -11,9 +11,11 @@ volatile inline uint32_t* flag_disable = (uint32_t*)(eth_l1_mem::address_map::LA namespace internal_ { inline __attribute__((always_inline)) void risc_context_switch() { +#ifdef COOPERATIVE_ERISC ncrisc_noc_full_sync(); rtos_context_switch_ptr(); ncrisc_noc_counters_init(); +#endif } inline __attribute__((always_inline)) void disable_erisc_app() { flag_disable[0] = 0; } diff --git a/tt_metal/hw/inc/ethernet/tt_eth_ss_regs.h b/tt_metal/hw/inc/ethernet/tt_eth_ss_regs.h index dbeff0ae738..82b4b5a913d 100644 --- a/tt_metal/hw/inc/ethernet/tt_eth_ss_regs.h +++ b/tt_metal/hw/inc/ethernet/tt_eth_ss_regs.h @@ -9,7 +9,11 @@ // ETH Params #define NUM_ECC_SOURCES (5 + 4 * 3 + 2) +#ifdef ARCH_BLACKHOLE +#define NUM_ETH_QUEUES 3 +#else #define NUM_ETH_QUEUES 2 +#endif ////////////////// // RISC debug regs @@ -48,6 +52,9 @@ #define ETH_TXQ_CMD_FLUSH (0x1 << 3) #define ETH_TXQ_STATUS 0x8 // IMPROVE: document (misc. internal bits for debug) +#define ETH_TXQ_STATUS_CMD_ONGOING_BIT \ + 0x10 // On Blackhole bit 16 of the ETH_TXQ_STATUS register indicates whether a packer transfer (raw/data/reg write) + // is ongoing #define ETH_TXQ_MAX_PKT_SIZE_BYTES 0xC // Max ethernet payload size (default = 1500 bytes) #define ETH_TXQ_BURST_LEN 0x10 // Value to drive on ati_q#_pbl output (default = 8) #define ETH_TXQ_TRANSFER_START_ADDR \ diff --git a/tt_metal/hw/inc/ethernet/tunneling.h b/tt_metal/hw/inc/ethernet/tunneling.h index a4070cbb24b..92eef061c2d 100644 --- a/tt_metal/hw/inc/ethernet/tunneling.h +++ b/tt_metal/hw/inc/ethernet/tunneling.h @@ -55,11 +55,20 @@ volatile uint32_t* RtosTable = namespace internal_ { -FORCE_INLINE bool eth_txq_is_busy(uint32_t q_num) { return eth_txq_reg_read(q_num, ETH_TXQ_CMD) != 0; } +FORCE_INLINE bool eth_txq_is_busy(uint32_t q_num) { +#ifdef ARCH_WORMHOLE + return eth_txq_reg_read(q_num, ETH_TXQ_CMD) != 0; +#else + // Due to https://tenstorrent.atlassian.net/browse/BH-55 we don't want to poll STATUS.cmd_ongoing bit too soon after + // a previous TX. Workaround is to perform any register operation on the same TX queue to slow down successive polls + eth_txq_reg_read(q_num, ETH_TXQ_CMD); + return ((eth_txq_reg_read(q_num, ETH_TXQ_STATUS) >> ETH_TXQ_STATUS_CMD_ONGOING_BIT) & 0x1) != 0; +#endif +} FORCE_INLINE void eth_send_packet(uint32_t q_num, uint32_t src_word_addr, uint32_t dest_word_addr, uint32_t num_words) { - while (eth_txq_reg_read(q_num, ETH_TXQ_CMD) != 0) { + while (eth_txq_is_busy(q_num)) { // Note, this is overly eager... 
Kills perf on allgather risc_context_switch(); } @@ -71,7 +80,7 @@ void eth_send_packet(uint32_t q_num, uint32_t src_word_addr, uint32_t dest_word_ FORCE_INLINE void eth_send_packet_unsafe(uint32_t q_num, uint32_t src_word_addr, uint32_t dest_word_addr, uint32_t num_words) { - ASSERT(eth_txq_reg_read(q_num, ETH_TXQ_CMD) == 0); + ASSERT(!eth_txq_is_busy(q_num)); eth_txq_reg_write(q_num, ETH_TXQ_TRANSFER_START_ADDR, src_word_addr << 4); eth_txq_reg_write(q_num, ETH_TXQ_DEST_ADDR, dest_word_addr << 4); eth_txq_reg_write(q_num, ETH_TXQ_TRANSFER_SIZE_BYTES, num_words << 4); @@ -89,7 +98,7 @@ void eth_send_packet_bytes_unsafe(uint32_t q_num, uint32_t src_addr, uint32_t de FORCE_INLINE void eth_write_remote_reg(uint32_t q_num, uint32_t reg_addr, uint32_t val) { - while (eth_txq_reg_read(q_num, ETH_TXQ_CMD) != 0) { + while (eth_txq_is_busy(q_num)) { risc_context_switch(); } eth_txq_reg_write(q_num, ETH_TXQ_DEST_ADDR, reg_addr); diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 8876c9a6915..f6c8f991d05 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -471,7 +471,8 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit this->defines_ += "-DCOMPILE_FOR_ERISC " "-DERISC " - "-DRISC_B0_HW "; + "-DRISC_B0_HW " + "-DCOOPERATIVE_ERISC "; this->includes_ += "-I " + env_.root_ + "tt_metal/hw/inc/ethernet "; diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm/edm_handshake.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm/edm_handshake.hpp index e2dad353ecc..072bac1276c 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm/edm_handshake.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm/edm_handshake.hpp @@ -69,7 +69,7 @@ FORCE_INLINE void sender_side_start( std::uint32_t handshake_register_address, size_t HS_CONTEXT_SWITCH_TIMEOUT = A_LONG_TIMEOUT_BEFORE_CONTEXT_SWITCH) { initialize_edm_common_datastructures(handshake_register_address); eth_wait_receiver_done(HS_CONTEXT_SWITCH_TIMEOUT); - while (eth_txq_reg_read(0, ETH_TXQ_CMD) != 0) { + while (eth_txq_is_busy()) { asm volatile("nop"); } eth_send_bytes(handshake_register_address, handshake_register_address, 16); @@ -101,7 +101,7 @@ FORCE_INLINE bool receiver_side_can_finish() { return eth_bytes_are_available_on FORCE_INLINE void receiver_side_finish( std::uint32_t handshake_register_address, size_t HS_CONTEXT_SWITCH_TIMEOUT = A_LONG_TIMEOUT_BEFORE_CONTEXT_SWITCH) { eth_wait_for_bytes(16, HS_CONTEXT_SWITCH_TIMEOUT); - while (eth_txq_reg_read(0, ETH_TXQ_CMD) != 0) { + while (eth_txq_is_busy()) { asm volatile("nop"); } eth_receiver_channel_done(0); From 63760bd2977cb53c90c358d0dc7a908eeaac1dad Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Wed, 19 Feb 2025 22:53:04 +0000 Subject: [PATCH 190/316] #0: Add basic fabric sanity tests to CI for N300 --- .../workflows/all-post-commit-workflows.yaml | 15 +++ .../fabric-build-and-unit-tests-wrapper.yaml | 23 +++++ .../fabric-build-and-unit-tests.yaml | 93 +++++++++++++++++++ tests/scripts/run_cpp_fabric_tests.sh | 41 ++++++++ tt_metal/llrt/tt_cluster.cpp | 6 +- 5 files changed, 174 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/fabric-build-and-unit-tests-wrapper.yaml create mode 100644 .github/workflows/fabric-build-and-unit-tests.yaml create mode 100755 tests/scripts/run_cpp_fabric_tests.sh diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml index e873132cdb1..06cbc2652ec 100644 --- a/.github/workflows/all-post-commit-workflows.yaml +++ 
b/.github/workflows/all-post-commit-workflows.yaml @@ -89,6 +89,21 @@ jobs: os: ubuntu-20.04 arch: ${{ matrix.test-group.arch }} runner-label: ${{ matrix.test-group.runner-label }} + # Fabric Unit Tests + fabric-unit-tests: + needs: build-artifact + secrets: inherit + strategy: + fail-fast: false + matrix: + test-group: [ + { arch: wormhole_b0, runner-label: N300 }, + ] + uses: ./.github/workflows/fabric-build-and-unit-tests.yaml + with: + os: ubuntu-20.04 + arch: ${{ matrix.test-group.arch }} + runner-label: ${{ matrix.test-group.runner-label }} # TTNN FD Unit tests ttnn-unit-tests: needs: build-artifact diff --git a/.github/workflows/fabric-build-and-unit-tests-wrapper.yaml b/.github/workflows/fabric-build-and-unit-tests-wrapper.yaml new file mode 100644 index 00000000000..b08c53a0c7d --- /dev/null +++ b/.github/workflows/fabric-build-and-unit-tests-wrapper.yaml @@ -0,0 +1,23 @@ +name: "[post-commit] Fabric unit tests" + +on: + workflow_dispatch: + +jobs: + build-artifact: + uses: ./.github/workflows/build-artifact.yaml + secrets: inherit + + fabric-unit-tests: + needs: build-artifact + secrets: inherit + strategy: + fail-fast: false + matrix: + test-group: [ + { arch: wormhole_b0, runner-label: N300 }, + ] + uses: ./.github/workflows/fabric-build-and-unit-tests.yaml + with: + arch: ${{ matrix.test-group.arch}} + runner-label: ${{ matrix.test-group.runner-label}} diff --git a/.github/workflows/fabric-build-and-unit-tests.yaml b/.github/workflows/fabric-build-and-unit-tests.yaml new file mode 100644 index 00000000000..0f0265939e8 --- /dev/null +++ b/.github/workflows/fabric-build-and-unit-tests.yaml @@ -0,0 +1,93 @@ +name: "[internal] Fabric unit tests impl" + +on: + workflow_call: + inputs: + arch: + required: true + type: string + runner-label: + required: true + type: string + timeout: + required: false + type: number + default: 10 + os: + required: false + type: string + default: "ubuntu-20.04" + workflow_dispatch: + inputs: + arch: + required: true + type: choice + options: + - wormhole_b0 + runner-label: + required: true + type: choice + options: + - N300 + timeout: + required: false + type: number + default: 10 + os: + required: false + type: string + default: "ubuntu-20.04" + +jobs: + fabric-tests: + strategy: + # Do not fail-fast because we need to ensure all tests go to completion + # so we try not to get hanging machines + fail-fast: false + matrix: + test-group: [ + {name: fabric unit tests, cmd: ./tests/scripts/run_cpp_fabric_tests.sh }, + ] + name: ${{ inputs.arch }} ${{ inputs.runner-label }} ${{ matrix.test-group.name }} + runs-on: + - ${{ inputs.runner-label }} + - cloud-virtual-machine + - in-service + env: + ARCH_NAME: ${{ inputs.arch }} + LOGURU_LEVEL: INFO + steps: + - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main + - uses: ./.github/actions/prepare-metal-run + - name: ${{ matrix.test-group.name }} tests + timeout-minutes: ${{ inputs.timeout }} + uses: ./.github/actions/docker-run + with: + docker_os_arch: tt-metalium/${{ inputs.os }}-amd64 + docker_password: ${{ secrets.GITHUB_TOKEN }} + docker_opts: | + -e ARCH_NAME=${{ inputs.arch }} + -e TT_METAL_HOME=${{ github.workspace }} + -e TT_METAL_SLOW_DISPATCH_MODE=1 + -e LD_LIBRARY_PATH=${{ github.workspace }}/build/lib + -e GTEST_OUTPUT=xml:generated/test_reports/ + run_args: | + pip install --force-reinstall pip==21.2.4 + pip install -r tt_metal/python_env/requirements-dev.txt + pip install -e . 
+ mkdir -p generated/test_reports + ${{ matrix.test-group.cmd }} + - uses: ./.github/actions/slack-report + if: ${{ failure() }} + with: + slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }} + owner: U06CXU895AP # Michael Chiou + - uses: ./.github/actions/upload-artifact-with-job-uuid + if: ${{ !cancelled() }} + with: + path: | + generated/test_reports/ + prefix: "test_reports_" + - name: Generate system logs on failure + uses: ./.github/actions/generate-system-logs + if: ${{ failure() }} diff --git a/tests/scripts/run_cpp_fabric_tests.sh b/tests/scripts/run_cpp_fabric_tests.sh new file mode 100755 index 00000000000..d16e10963c4 --- /dev/null +++ b/tests/scripts/run_cpp_fabric_tests.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +set -eo pipefail + +if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 +fi + +if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 +fi + +export TT_METAL_CLEAR_L1=1 + +############################################# +# FABRIC SANITY TESTS # +############################################# +echo "Running fabric sanity tests now..."; + +cd $TT_METAL_HOME + +TEST_FOLDER="./build/test/tt_metal/perf_microbenchmark/routing" + +# Async Write +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 1 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 1 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 8 --num_dest_endpoints 8 --num_links 16 --benchmark +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 1 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --metal_fabric_init_level 1 +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 1 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 8 --num_dest_endpoints 8 --num_links 16 --benchmark --metal_fabric_init_level 1 +# Async Write Mcast +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 1 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --e_depth 1 +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 1 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --w_depth 1 +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 1 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --e_depth 1 --metal_fabric_init_level 1 +# TODO: Enable benchmark functionality for mcast +# Atomic Inc +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 64 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 64 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --metal_fabric_init_level 1 +# Async Write Atomic Inc +TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 65 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 
+TT_METAL_SLOW_DISPATCH_MODE=1 ${TEST_FOLDER}/test_tt_fabric_sanity_${ARCH_NAME} --fabric_command 65 --board_type n300 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 --metal_fabric_init_level 1 diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index e35d4a2a4b4..afa0a600254 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -156,12 +156,10 @@ void Cluster::generate_cluster_descriptor() { } bool all_n300 = true; for (const auto& chip_id : this->cluster_desc_->get_all_chips()) { - if (this->cluster_desc_->get_board_type(chip_id) == BoardType::N300) { - all_n300 &= (this->cluster_desc_->get_board_type(chip_id) == BoardType::N300); - } + all_n300 &= (this->cluster_desc_->get_board_type(chip_id) == BoardType::N300); } if (all_n300) { - if (this->cluster_desc_->get_all_chips().size() == 1) { + if (this->cluster_desc_->get_all_chips().size() == 2) { this->cluster_type_ = ClusterType::N300; } else if (this->cluster_desc_->get_all_chips().size() == 8) { this->cluster_type_ = ClusterType::T3K; From e27c83a76fa8741eed9280a7b4df8eac08449327 Mon Sep 17 00:00:00 2001 From: Miguel Tairum <150826086+mtairum@users.noreply.github.com> Date: Thu, 20 Feb 2025 17:55:27 +0000 Subject: [PATCH 191/316] Refactor llama3 demo to the new generator API (#16753) ### What's changed - New Llama3 demo now uses the generator API - Improved prefill performance. E.g. Llama3-70B now at 182ms prefill time - Improved profiling in the demo - Removed old text demo and updated CI accordingly - Cleaned up the prompt input files and added missing ones. - New benchmark profiling for superset: now includes TTFT, and full decode perf for 4096 iteration (for plotting). - Add llama3 demo custom input support: you can now override any settings for easier testing. - Updated PERF.md with the latest numbers. 
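For reference, here is a minimal, illustrative sketch of how the reported metrics (TTFT, i.e. time-to-first-token, and sustained decode throughput over the 4096-iteration decode run) can be derived from per-token timestamps. This is not the demo's actual code; the function and variable names below are assumptions made only for this example.

```python
import time

def measure_generation(generate_fn, prompt, max_new_tokens=4096):
    # generate_fn is assumed to yield one decoded token per step.
    token_times = []
    start = time.perf_counter()
    for _ in generate_fn(prompt, max_new_tokens):
        token_times.append(time.perf_counter())

    if not token_times:
        return None, 0.0

    ttft = token_times[0] - start  # prefill plus the first decode step
    decode_elapsed = token_times[-1] - token_times[0]
    decode_tokens_per_s = (len(token_times) - 1) / decode_elapsed if decode_elapsed > 0 else 0.0
    return ttft, decode_tokens_per_s
```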
--- models/demos/llama3/PERF.md | 147 ++- models/demos/llama3/README.md | 27 +- models/demos/llama3/demo/conftest.py | 23 + models/demos/llama3/demo/demo.py | 1010 ----------------- .../llama3/demo/input_data_questions.json | 97 -- .../input_data_long_128k.json | 0 .../sample_prompts/input_data_long_16k.json | 7 + .../sample_prompts/input_data_long_1k.json | 7 + .../sample_prompts/input_data_long_2k.json | 7 + .../input_data_long_32k.json | 0 .../sample_prompts/input_data_long_4k.json | 7 + .../input_data_long_64k.json | 0 .../sample_prompts/input_data_long_8k.json | 7 + .../input_data_prefill_128.json | 0 .../input_data_questions_prefill_128.json | 98 ++ .../input_data_questions_prefill_256.json} | 0 models/demos/llama3/demo/simple_text_demo.py | 761 +++++++++++++ .../demos/llama3/demo/simple_vision_demo.py | 2 +- models/demos/llama3/lt | 108 +- ..._llama_cross_attention_transformer_text.py | 1 - .../demos/llama3/tests/test_llama_accuracy.py | 3 +- .../tests/test_llama_attention_prefill.py | 1 - .../tests/test_llama_chunked_generation.py | 3 +- .../tests/test_llama_decoder_prefill.py | 1 - models/demos/llama3/tests/test_llama_model.py | 4 +- .../llama3/tests/test_llama_model_prefill.py | 1 - models/demos/llama3/tt/generator.py | 24 +- models/demos/llama3/tt/llama_common.py | 103 +- models/demos/llama3/tt/llama_model.py | 71 +- models/demos/llama3/tt/llama_rope.py | 4 +- models/demos/llama3/tt/model_config.py | 21 +- ...lama_cross_attention_transformer_vision.py | 1 + .../llama3/tt/multimodal/llama_image_mlp.py | 1 + .../tt/multimodal/llama_vision_model.py | 3 +- .../single_card/run_single_card_demo_tests.sh | 14 +- tests/scripts/t3000/run_t3000_demo_tests.sh | 7 +- .../scripts/t3000/run_t3000_frequent_tests.sh | 9 - tests/scripts/t3000/run_t3000_unit_tests.sh | 6 - tests/scripts/tg/run_tg_demo_tests.sh | 2 +- .../misc/test_rotary_embedding_llama.py | 2 +- .../test_rotary_embedding_llama_fused_qk.py | 2 +- tt_metal/python_env/requirements-dev.txt | 3 + 42 files changed, 1295 insertions(+), 1300 deletions(-) create mode 100644 models/demos/llama3/demo/conftest.py delete mode 100644 models/demos/llama3/demo/demo.py delete mode 100644 models/demos/llama3/demo/input_data_questions.json rename models/demos/llama3/demo/{ => sample_prompts}/input_data_long_128k.json (100%) create mode 100644 models/demos/llama3/demo/sample_prompts/input_data_long_16k.json create mode 100644 models/demos/llama3/demo/sample_prompts/input_data_long_1k.json create mode 100644 models/demos/llama3/demo/sample_prompts/input_data_long_2k.json rename models/demos/llama3/demo/{ => sample_prompts}/input_data_long_32k.json (100%) create mode 100644 models/demos/llama3/demo/sample_prompts/input_data_long_4k.json rename models/demos/llama3/demo/{ => sample_prompts}/input_data_long_64k.json (100%) create mode 100644 models/demos/llama3/demo/sample_prompts/input_data_long_8k.json rename models/demos/llama3/demo/{ => sample_prompts}/input_data_prefill_128.json (100%) create mode 100644 models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_128.json rename models/demos/llama3/demo/{input_data_questions_prefill_128.json => sample_prompts/input_data_questions_prefill_256.json} (100%) create mode 100644 models/demos/llama3/demo/simple_text_demo.py diff --git a/models/demos/llama3/PERF.md b/models/demos/llama3/PERF.md index 8fb3be2baf7..2209cbcec87 100644 --- a/models/demos/llama3/PERF.md +++ b/models/demos/llama3/PERF.md @@ -4,54 +4,109 @@ Performance collected from [demo/demo.py](demo/demo.py) and accuracy collected f 
Note that `test_llama_accuracy.py` parses the below to determine expected values +- 0.5. +Also note that all the performance metrics below were taken for a maximum generation of 200 tokens, i.e., 200 decode iterations. + ## Performance -This configuration uses bfp4 MLP FF1+FF3 for all models. - -| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | -|----------------|--------|-----------|-----------|---------------| -| Llama3.2-1B | N150 | 89 | 98 | 86.9 | -| Llama3.2-1B | N300 | 90 | 98 | 104.3 | -| Llama3.2-1B | T3K | 87 | 98 | 118.5 | -| Llama3.2-1B | TG | | | 72.3 | -| Llama3.2-3B | N150 | 91 | 96 | 53.3 | -| Llama3.2-3B | N300 | 91 | 96 | 66.1 | -| Llama3.2-3B | T3K | 91 | 96 | 66.9 | -| Llama3.2-3B | TG | | | 48.5 | -| Llama3.1-8B | N150 | 87 | 99 | 27.9 | -| Llama3.1-8B | N300 | 88 | 99 | 43.7 | -| Llama3.1-8B | T3K | 88 | 99 | 64.2 | -| Llama3.1-8B | TG | | | 41.0 | -| Llama3.2-11B | N300 | 89 | 99 | 43.5 | -| Llama3.2-11B | T3K | 88 | 99 | 63.4 | -| Llama3.2-11B | TG | | | 40.9 | -| Llama3.1-70B | T3K | 96 | 100 | 16.1 | -| Llama3.1-70B | TG | | | | -| Qwen2.5-7B | N300 | 80 | 96 | 37.9 | -| Qwen2.5-72B | T3K | 98 | 100 | 12.8 | +This configuration uses bfp4 MLP FF1+FF3 for all models. **Batch_size=1 and prefill_length is 128 tokens.** + +| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | TTFT (ms) | +|----------------|--------|-----------|-----------|---------------|-----------| +| Llama3.2-1B | N150 | 88 | 98 | 84.5 | 58 | +| Llama3.2-1B | N300 | 91 | 98 | 100.5 | 54 | +| Llama3.2-1B | T3K | 89 | 98 | 113.8 | 41 | +| Llama3.2-1B | TG | 88 | 99 | 51.0 | | +| Llama3.2-3B | N150 | 92 | 95 | 52.4 | 76 | +| Llama3.2-3B | N300 | 92 | 97 | 65.3 | 56 | +| Llama3.2-3B | T3K | 91 | 97 | 65.4 | 64 | +| Llama3.2-3B | TG | 90 | 97 | 33.5 | | +| Llama3.1-8B | N150 | 88 | 100 | 27.8 | 121 | +| Llama3.1-8B | N300 | 88 | 100 | 43.3 | 85 | +| Llama3.1-8B | T3K | 88 | 100 | 62.3 | 69 | +| Llama3.1-8B | TG | 86 | 98 | 29.5 | | +| Llama3.2-11B | N300 | 90 | 99 | 42.8 | 84 | +| Llama3.2-11B | T3K | 87 | 99 | 61.2 | 75 | +| Llama3.2-11B | TG | 86 | 98 | 29.5 | | +| Llama3.1-70B | T3K | 97 | 100 | 16.3 | 182 | +| Llama3.1-70B | TG | 95 | 100 | 12.7 | | +| Qwen2.5-7B | N300 | 80 | 96 | 37.9 | | +| Qwen2.5-72B | T3K | 98 | 100 | 12.8 | | + ## Accuracy -This configuration uses bfp4 MLP FF1+FF3 only for the Llama-3.1-70B model and the Qwen-2.5-72B model. - -| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | -|----------------|--------|-----------|-----------|---------------| -| Llama3.2-1B | N150 | 88 | 98 | 86.8 | -| Llama3.2-1B | N300 | 88 | 98 | 98.1 | -| Llama3.2-1B | T3K | 89 | 99 | 97.5 | -| Llama3.2-1B | TG | 87 | 98 | 51.3 | -| Llama3.2-3B | N150 | 92 | 99 | 44.2 | -| Llama3.2-3B | N300 | 92 | 98 | 54.2 | -| Llama3.2-3B | T3K | 91 | 100 | 55.6 | -| Llama3.2-3B | TG | 91 | 98 | 33.6 | -| Llama3.1-8B | N150 | 93 | 100 | 23.6 | -| Llama3.1-8B | N300 | 93 | 100 | 34.5 | -| Llama3.1-8B | T3K | 92 | 100 | 49.8 | -| Llama3.1-8B | TG | 88 | 100 | 29.5 | -| Llama3.2-11B | N300 | 93 | 100 | 33.8 | -| Llama3.2-11B | T3K | 94 | 100 | 52.6 | -| Llama3.2-11B | TG | 88 | 100 | 29.5 | -| Llama3.1-70B | T3K | 97 | 100 | 14.7 | -| Llama3.1-70B | TG | 95 | 100 | 12.7 | -| Qwen2.5-7B | N300 | 80 | 96 | 33.4 | -| Qwen2.5-72B | T3K | 99 | 100 | 12.8 | +This configuration uses bfp4 MLP FF1+FF3 only for the 3.1-70B model and the Qwen-2.5-72B model. 
**Batch_size=1 and prefill_length is 128 tokens.** + +| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | TTFT (ms) | +|----------------|--------|-----------|-----------|---------------|-----------| +| Llama3.2-1B | N150 | 91 | 98 | 82.0 | 55 | +| Llama3.2-1B | N300 | 91 | 98 | 98.6 | 59 | +| Llama3.2-1B | T3K | 88 | 98 | 114.1 | 42 | +| Llama3.2-1B | TG | 87 | 98 | 51.3 | | +| Llama3.2-3B | N150 | 94 | 99 | 47.0 | 83 | +| Llama3.2-3B | N300 | 90 | 98 | 61.1 | 64 | +| Llama3.2-3B | T3K | 92 | 98 | 65.2 | 63 | +| Llama3.2-3B | TG | 91 | 98 | 33.6 | | +| Llama3.1-8B | N150 | 93 | 100 | 24.8 | 160 | +| Llama3.1-8B | N300 | 94 | 100 | 37.8 | 100 | +| Llama3.1-8B | T3K | 94 | 100 | 59.8 | 79 | +| Llama3.1-8B | TG | 88 | 100 | 29.5 | | +| Llama3.2-11B | N300 | 92 | 100 | 37.5 | 97 | +| Llama3.2-11B | T3K | 95 | 100 | 59.2 | 64 | +| Llama3.2-11B | TG | 88 | 100 | 29.5 | | +| Llama3.1-70B | T3K | 98 | 100 | 14.1 | 210 | +| Llama3.1-70B | TG | 95 | 100 | 12.7 | | +| Qwen2.5-7B | N300 | 80 | 96 | 33.4 | | +| Qwen2.5-72B | T3K | 99 | 100 | 12.8 | | + +## Long-context (64K Tokens) + +This configuration uses bfp4 MLP FF1+FF3 for all models. **Batch_size=1 and prefill_length is 64k tokens.** + +| Model | Device | Speed (t/s/u) | TTFT (ms) | +|----------------|--------|---------------|-----------| +| Llama3.2-1B | N150 | 53.0 | 20191 | +| Llama3.2-1B | N300 | 65.2 | 10973 | +| Llama3.2-1B | T3K | 73.7 | 5271 | +| Llama3.2-1B | TG | | | +| Llama3.2-3B | N150 | 25.3 | 46936 | +| Llama3.2-3B | N300 | 34.8 | 23115 | +| Llama3.2-3B | T3K | 41.0 | 10727 | +| Llama3.2-3B | TG | | | +| Llama3.1-8B | N150 | 16.9 | 65083 | +| Llama3.1-8B | N300 | 26.1 | 36422 | +| Llama3.1-8B | T3K | 38.1 | 16287 | +| Llama3.1-8B | TG | | | +| Llama3.2-11B | N300 | 26.1 | 36422 | +| Llama3.2-11B | T3K | 38.4 | 16288 | +| Llama3.2-11B | TG | | | +| Llama3.1-70B | T3K | 11.9 | 74363 | +| Llama3.1-70B | TG | | | +| Qwen2.5-7B | N300 | | | +| Qwen2.5-72B | T3K | | | + +## Short-Context, Batch-32 + +This configuration uses bfp4 MLP FF1+FF3 for all models. **Batch_size=32 and prefill_length is 128 tokens.** + +| Model | Device | Speed (t/s/u) | avg TTFT (ms) | +|----------------|--------|---------------|---------------| +| Llama3.2-1B | N150 | 54.7 | 55 | +| Llama3.2-1B | N300 | 64.2 | 48 | +| Llama3.2-1B | T3K | 69.9 | 57 | +| Llama3.2-1B | TG | | | +| Llama3.2-3B | N150 | 36.5 | 84 | +| Llama3.2-3B | N300 | 45.8 | 68 | +| Llama3.2-3B | T3K | 47.8 | 71 | +| Llama3.2-3B | TG | | | +| Llama3.1-8B | N150 | 22.3 | 134 | +| Llama3.1-8B | N300 | 33.5 | 93 | +| Llama3.1-8B | T3K | 45.6 | 79 | +| Llama3.1-8B | TG | | | +| Llama3.2-11B | N300 | 33.4 | 100 | +| Llama3.2-11B | T3K | 45.1 | 76 | +| Llama3.2-11B | TG | | | +| Llama3.1-70B | T3K | 14.8 | 192 | +| Llama3.1-70B | TG | | | +| Qwen2.5-7B | N300 | | | +| Qwen2.5-72B | T3K | | | diff --git a/models/demos/llama3/README.md b/models/demos/llama3/README.md index 5e8bd6f44de..61672a87660 100644 --- a/models/demos/llama3/README.md +++ b/models/demos/llama3/README.md @@ -60,8 +60,8 @@ python models/demos/llama3/scripts/repack_weights_70b.py 1` or using `top-p` sampling with any batch size, these ops will be run on host. This is because those ops are not yet fully supported on device. A decrease in performance is expected when these configurations are enabled. 
@@ -150,18 +151,26 @@ Example: `export FAKE_DEVICE=N150`, will enable running a single-chip demo on a # Examples of how to run the demo for any supported Llama3 models # Batch-1 -pytest models/demos/llama3/demo/demo.py -k "performance and batch-1" +pytest models/demos/llama3/demo/simple_text_demo.py -k "performance and batch-1" # Batch-32 -pytest models/demos/llama3/demo/demo.py -k "performance and batch-32" +pytest models/demos/llama3/demo/simple_text_demo.py -k "performance and batch-32" # Long-context -pytest models/demos/llama3/demo/demo.py -k "performance and long" +pytest models/demos/llama3/demo/simple_text_demo.py -k "performance and long" ``` The above examples are run in `LlamaOptimizations.performance` mode. You can override this by setting the `optimizations` argument in the demo. To use instead the accuracy mode you can call the above tests with `-k "accuracy and ..."` instead of performance. +#### Custom input arguments +To facilitate testing different configurations, `simple_text_demo.py` supports argument overrides. The full list of overrides is included in `models/demos/llama3/demo/conftest.py`. + +An example usage where the `batch-1` test is modified to run with 16 users and keep generating tokens until 1024 are generated: + +``` +pytest models/demos/llama3/demo/simple_text_demo.py -k "performance and batch-1" --batch_size 16 --max_generated_tokens 1024 --stop_at_eos 0 +``` ### Expected performance and accuracy diff --git a/models/demos/llama3/demo/conftest.py b/models/demos/llama3/demo/conftest.py new file mode 100644 index 00000000000..79c1c029477 --- /dev/null +++ b/models/demos/llama3/demo/conftest.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + + +# These inputs override the default inputs used by simple_text_demo.py. Check the main demo to see the default values. +def pytest_addoption(parser): + parser.addoption("--input_prompts", action="store", help="input prompts json file") + parser.addoption("--instruct", action="store", type=int, help="Use instruct weights") + parser.addoption("--repeat_batches", action="store", type=int, help="Number of consecutive batches of users to run") + parser.addoption("--max_seq_len", action="store", type=int, help="Maximum context length supported by the model") + parser.addoption("--batch_size", action="store", type=int, help="Number of users in a batch ") + parser.addoption( + "--max_generated_tokens", action="store", type=int, help="Maximum number of tokens to generate for each user" + ) + parser.addoption( + "--paged_attention", action="store", type=bool, help="Whether to use paged attention or default attention" + ) + parser.addoption("--page_params", action="store", type=dict, help="Page parameters for paged attention") + parser.addoption("--sampling_params", action="store", type=dict, help="Sampling parameters for decoding") + parser.addoption( + "--stop_at_eos", action="store", type=int, help="Whether to stop decoding when the model generates an EoS token" + ) diff --git a/models/demos/llama3/demo/demo.py b/models/demos/llama3/demo/demo.py deleted file mode 100644 index 21aea65fb6b..00000000000 --- a/models/demos/llama3/demo/demo.py +++ /dev/null @@ -1,1010 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
- -# SPDX-License-Identifier: Apache-2.0 - -import torch -import json -from time import time -from datetime import datetime -from loguru import logger -import os -import ttnn -import math -import pytest -import requests -from pathlib import Path -import hashlib - -from models.demos.llama3.tt.llama_common import ( - get_prefill_rot_mat, - PagedAttentionConfig, - sample_host, -) -from models.demos.llama3.tt.llama_model import TtTransformer -from models.demos.llama3.tt.llama_embedding import TtLlamaEmbedding -from models.demos.llama3.tt.model_config import TtModelArgs - -from models.perf.benchmarking_utils import BenchmarkProfiler -from models.demos.utils.llm_demo_utils import create_benchmark_data -from models.demos.llama3.tt.model_config import LlamaOptimizations - - -def load_and_cache_context(context_url, cache_dir, max_length=None): - cache_file = cache_dir / hashlib.md5(context_url.encode()).hexdigest() - - if cache_file.exists(): - with open(cache_file, "r") as f: - context_text = f.read() - logger.info(f"Loaded context from cache: {context_url}") - else: - try: - response = requests.get(context_url) - if response.status_code == 200: - context_text = response.text - with open(cache_file, "w") as f: - f.write(context_text) - logger.info(f"Downloaded and cached context: {context_url}") - else: - logger.warning(f"Failed to fetch context from URL: {context_url}. Status code: {response.status_code}") - context_text = "" - except Exception as e: - logger.error(f"Error fetching context from URL: {context_url}. Error: {str(e)}") - context_text = "" - - # Clip the context to the max length provided - if max_length: - context_text = context_text[:max_length] - logger.info(f"Clipped the context text to {max_length} characters") - - return context_text - - -# load from json, return as a list -def load_inputs(user_input, batch, instruct_mode): - if isinstance(user_input, str): - with open(user_input, "r") as f: - user_input = json.load(f) - assert len(user_input) >= batch, f"Number of users (batch) must be {batch}!" - in_prompt = [] - cache_dir = Path("models/demos/llama3/demo/context_cache") - cache_dir.mkdir(parents=True, exist_ok=True) - - for i in range(batch): - prompt = user_input[i]["prompt"] - if "context" in user_input[i]: - if "max_length" in user_input[i]: # Clip the context to the max length provided - context_text = load_and_cache_context( - user_input[i]["context"], cache_dir, max_length=user_input[i]["max_length"] - ) - else: - context_text = load_and_cache_context(user_input[i]["context"], cache_dir) - if instruct_mode: - prompt = ( - "```" + context_text + "```\n\n" + prompt - ) # Add the markdown block to the context to comply with the prompt - else: - prompt = context_text - in_prompt.append(prompt) - return in_prompt - - -def preprocess_inputs_prefill( - input_prompts, - tokenizer, - model_args, - instruct, - max_generated_tokens, - max_prefill_len=128 * 1024, -): - """ - Run tokenizer on inputs, and create embeddings for the first token of each input - """ - # The maximum KV-cache len supported is 32k. 
To avoid going out of memory, clip the max prefill length by the maximum number of tokens that will be generated - if max_prefill_len == 128 * 1024: - max_prefill_len = 128 * 1024 - max_generated_tokens - - encoded_prompts = [model_args.encode_prompt(prompt, instruct=instruct) for prompt in input_prompts] - - # Print the length of encoded prompts - logger.info("Encoded prompt lengths:" + ", ".join(str(len(prompt)) for prompt in encoded_prompts)) - - prompt_lens = [len(x) for x in encoded_prompts] - min_prompt_len = min(prompt_lens) - max_prompt_len = max(prompt_lens) - - # The large input demo we provide contains more tokens than the maximum (32k tokens) - # To avoid running out of memory, clip to max_prefill_len - - if min_prompt_len > max_prefill_len: - logger.info(f"Left-clipping prompts to {max_prefill_len}") - if instruct: - # We need to allow a few tokens for the system prompt and the special turn tokens for assistant and user; - # to find out how big those will be, we will: - # 1. Tokenize the entire prompt with non-instruct tokenization - # 2. Calculate overhead = length of instruct tokenization - length of non-instruct tokenization - # 3. Shorten the tokenized clipped prompt by the overhead and convert back to text - # 4. Tokenize the result with instruct tokenization - # 5. Assert that the length of this is equal to the max_prefill_len - raw_prompts = [model_args.encode_prompt(prompt, instruct=False) for prompt in input_prompts] - overhead = [len(e) - len(r) for e, r in zip(encoded_prompts, raw_prompts)] - shortened = [tokenizer.decode(e[-(max_prefill_len - o) :]) for e, o in zip(raw_prompts, overhead)] - encoded_prompts = [model_args.encode_prompt(prompt, instruct=instruct) for prompt in shortened] - assert all( - len(e) == max_prefill_len for e in encoded_prompts - ), f"Clipped prompts are not of the correct length, expected {max_prefill_len} but got {[len(e) for e in encoded_prompts]}" - else: - encoded_prompts = [encod[-max_prefill_len:] for encod in encoded_prompts] - - # Update prompt lengths - prompt_lens = [len(x) for x in encoded_prompts] - min_prompt_len = min(prompt_lens) - max_prompt_len = max(prompt_lens) - - assert ( - max_prompt_len <= model_args.max_seq_len - ), f"Max prompt length {max_prompt_len} exceeds model max seq len {model_args.max_seq_len}" - assert min_prompt_len > 0, "Minimum prompt length must be greater than 0" - assert min_prompt_len <= max_prompt_len, f"Minimum prompt length {min_prompt_len} exceeds max len {max_prompt_len}" - - logger.info(f"# of users: {len(encoded_prompts)}") - input_tokens_prefill = [] - decoding_pos = [] - prefill_lens = [] - - # Always prefill the nearest power of 2 for each user. This means that the majority of cases we will prefill more tokens than needed. 
- # To avoid issues, we keep track of the decoding position to decode correctly the user's prompt - for i, encoded in enumerate(encoded_prompts): - # Prefill size is nearest power of 2 - prefill_seq_len = max(2 ** math.ceil(math.log(len(encoded), 2)), 128) - - # Initial prefill tensors full of pad tokens - input_tokens_prefill_i = torch.full((1, prefill_seq_len), 0, dtype=torch.int32) - input_tokens_prefill_i[0, : len(encoded[:])] = torch.tensor(encoded[:]).to(input_tokens_prefill_i) - input_tokens_prefill.append(input_tokens_prefill_i) - - # Keep the correct decoding position of each user - decoding_pos.append(len(encoded)) - prefill_lens.append(prefill_seq_len) - - return ( - input_tokens_prefill, - encoded_prompts, - decoding_pos, - prefill_lens, - ) - - -def run_llama3_demo( - user_input, - mesh_device, - max_seq_len, - batch_size, - num_batches, - paged_attention, - paged_attention_config, - max_generated_tokens, - optimizations, - sampling_params, - instruct_mode, - is_ci_env, - print_to_file, -): - # Creat batch output file - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - output_directory = "models/demos/llama3/demo/output" - os.makedirs(output_directory, exist_ok=True) - os.chmod(output_directory, 0o755) - output_filename = f"{output_directory}/demo_user_output_{timestamp}.txt" - - dtype = ttnn.bfloat8_b - assert batch_size <= 32, "Max batch size currently supported is 32" - assert max_seq_len <= 128 * 1024, "Max sequence length must be less than 128k tokens" - - # We disregard any warmup iteration for profiling, in favour of just measuring compile time on the first iteration - N_warmup_iter = {"inference_prefill": 0, "inference_decode": 0} - - # Start profiler - logger.info(f"Start profiler") - profiler = BenchmarkProfiler() - profiler.start("run") - - logger.info(f"Reading inputs...") - profiler.start("loading_inputs") - if len(user_input) == 1: - input_prompts = user_input * batch_size - else: - input_prompts = load_inputs(user_input, batch_size, instruct_mode) - profiler.end("loading_inputs") - - # Generate the batched prompts (rotate the inputs between the users, for each batch) - # If batch_size == 1, the same prompt is repeated for each batch - batch_prompts = [] - for i in range(num_batches): - batch_prompts.append([input_prompts[(j + i) % len(input_prompts)] for j in range(len(input_prompts))]) - - # Load model args, weights, and tokenizer - model_args = TtModelArgs( - mesh_device, - instruct=instruct_mode, - max_batch_size=batch_size, - optimizations=optimizations, - max_seq_len=max_seq_len, - ) - - tokenizer = model_args.tokenizer - - # Check max sequence length compatibility with model and architecture. 
Refer to README for more information - llama_model_name = model_args.base_model_name # ["3.2-1B", "3.2-3B", "3.1-8B", "3.2-11B", "3.1-70B"] - tt_device_name = model_args.device_name # ["N150", "N300", "T3K", "TG"] - - if llama_model_name in ["Llama3.1-8B", "Llama3.2-11B"] and tt_device_name == "N150": - assert ( - max_seq_len <= 64 * 1024 - ), "N150 only supports a max context length of 64k tokens for Llama3.1-8B and Llama3.2-11B" - else: - assert max_seq_len <= 128 * 1024, f"{llama_model_name} supports a max context length of 128k tokens" - - if llama_model_name == "Llama3.1-70B": - assert tt_device_name in ["T3K", "TG"], "Llama3.1-70B is only supported on T3K or TG" - - logger.info("Loading weights...") - profiler.start("weight_loading") - state_dict = model_args.load_state_dict() - profiler.end("weight_loading") - - page_table_tt = None - - if paged_attention: - # Implied shuffling of blocks - permutation = torch.randperm(paged_attention_config.max_num_blocks) - # Page table which maps virtual blocks to physical - reverse_permutation = torch.argsort(permutation) - page_table = reverse_permutation.reshape( - model_args.max_batch_size, paged_attention_config.max_num_blocks // model_args.max_batch_size - ) - page_table_tt = ttnn.from_torch( - page_table, - device=mesh_device, - dtype=ttnn.int32, - layout=ttnn.ROW_MAJOR_LAYOUT, - mesh_mapper=ttnn.ShardTensor2dMesh(mesh_device, dims=(None, None), mesh_shape=model_args.cluster_shape), - ) - - # Load TTNN Llama3.1 model - logger.info("Loading weights to device...") - profiler.start("loading_weights_to_device") - tt_model = TtTransformer( - args=model_args, - mesh_device=mesh_device, - dtype=dtype, - state_dict=state_dict, - weight_cache_path=model_args.weight_cache_path(dtype), - paged_attention_config=paged_attention_config, - ) - tt_embd = TtLlamaEmbedding( - mesh_device=mesh_device, - args=model_args, - weight_cache_path=model_args.weight_cache_path(dtype), - state_dict=state_dict, - dtype=ttnn.bfloat16, # Row major layout requires bfloat16 - ) - embd = model_args.reference_embedding() - state_dict_prefix = model_args.get_state_dict_prefix("", None) - embd.load_state_dict({"emb.weight": state_dict[f"{state_dict_prefix}tok_embeddings.weight"]}) - profiler.end("loading_weights_to_device") - logger.info("Finished loading weights to device.") - - num_tokens_generated_decode = [] - - logger.info("Starting inference...") - for batch_idx, input_prompts in enumerate(batch_prompts): - logger.info(f"Processing batch {batch_idx}") - profiler.start(f"preprocess_prefill_inputs", iteration=batch_idx) - # Preprocess initial prompt inputs - ( - input_tokens_prefill_pt, - encoded_prompts, - decoding_pos, - prefill_lens, - ) = preprocess_inputs_prefill( - input_prompts, - tokenizer, - model_args, - instruct_mode, - max_generated_tokens, - ) - - max_encoded_prompt_len = max(len(p) for p in encoded_prompts) - assert ( - max_generated_tokens + max_encoded_prompt_len <= max_seq_len - ), f"Prompt prefill tokens ({max_encoded_prompt_len}) + maximum number of decoded iterations ({max_generated_tokens}) needs to be <= than max_seq_len ({max_seq_len})" - - # Prefill embeddings are on host since we need to mask out the tokens after the prefill length after embeddings are computed - pt_prefill_input = [embd(input_tokens_prefill_pt[b]).view(1, prefill_lens[b], -1) for b in range(batch_size)] - profiler.end(f"preprocess_prefill_inputs", iteration=batch_idx) - - # set kv cache to zeros if not first batch, to avoid context leaking when doing multiple batches - if batch_idx 
!= 0: - for layer in tt_model.layers: - k_cache, v_cache = layer.attention.layer_past - k_cache = ttnn.mul(k_cache, 0, output_tensor=k_cache) - v_cache = ttnn.mul(v_cache, 0, output_tensor=v_cache) - - logger.info(f"Starting prefill...") - - # Do not count the first user for prefill time and instead log it as compile time - num_users_generated_prefill = batch_size - 1 if batch_size > 1 else 1 - - pt_out = [] - - profiler.start(f"inference_prefill", iteration=batch_idx) - for batch_id in range(batch_size): - prefill_seq_len = prefill_lens[batch_id] - rot_mats_prefill = get_prefill_rot_mat( - model_args.head_dim, - model_args.max_seq_len, - mesh_device, - prefill_seq_len, - model_args.rope_theta, - model_args.rope_scaling_factor, - model_args.orig_context_len, - ) - if decoding_pos[batch_id] < prefill_seq_len: - pt_prefill_input[batch_id][ - :, decoding_pos[batch_id] :, : - ] = 0 # Zero out the tokens after the prefill length - - prefill_input = model_args.prepare_residual_tensor_prefill( - pt_prefill_input[batch_id], - ) - - if batch_id == 0: # First user prefill accounts for compile time - profiler.start(f"compile_prefill", iteration=batch_idx) - - tt_out = tt_model( - prefill_input, - current_pos=None, - rot_mats=rot_mats_prefill, - user_id=batch_id, - mode="prefill", - page_table=page_table_tt, - get_last_token=((decoding_pos[batch_id] - 1) // 32) * 32, - ) - - if ( - batch_id == 0 - ): # First user prefill accounts for compile time (which will be removed from the full prefill inference time) - profiler.end(f"compile_prefill", iteration=batch_idx) - - # [PROFILER-ONLY] In runs where there is only one user, run the prefill twice to measure compile and inference prefill times - if batch_size == 1: - ttnn.deallocate(tt_out) - prefill_input = model_args.prepare_residual_tensor_prefill( - pt_prefill_input[batch_id], - ) - tt_out = tt_model( - prefill_input, - current_pos=None, - rot_mats=rot_mats_prefill, - user_id=batch_id, - mode="prefill", - page_table=page_table_tt, - get_last_token=((decoding_pos[batch_id] - 1) // 32) * 32, - ) - - pt_out.append( - ttnn.to_torch( - tt_out, - mesh_composer=ttnn.ConcatMesh2dToTensor( - mesh_device, - dims=(3, 1) if model_args.is_galaxy else (1, -1), - mesh_shape=model_args.cluster_shape, - ), - )[0, 0, (decoding_pos[batch_id] - 1) % 32, : model_args.vocab_size] - ) - ttnn.deallocate(tt_out) - - # Synchronize devices to ensure the profile captures the correct timing of all devices - for i in range(model_args.num_devices): - ttnn.synchronize_device(mesh_device.get_devices()[i]) - profiler.end(f"inference_prefill", iteration=batch_idx) - logger.info(f"Prefill finished") - - # Preparing first decode token - profiler.start(f"prepare_first_decode_token_{batch_idx}") - pt_out_batched = torch.stack(pt_out, dim=-2) - pt_out_batched = torch.argmax(pt_out_batched, dim=-1) - # Pad the output tensor to be tile sized - tt_out_tok = ttnn.from_torch( - torch.nn.functional.pad( - pt_out_batched.unsqueeze(0).unsqueeze(0).unsqueeze(0), (0, 32 - len(pt_out_batched)), "constant", 0 - ), - device=mesh_device, - mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), - dtype=ttnn.uint32, - ) - profiler.end(f"prepare_first_decode_token_{batch_idx}") - - # Keep track of generated outputs to print out every iteration - all_outputs = [encoded_prompts[b][:prefill_seq_len] for b in range(batch_size)] - for user in range(batch_size): - user_tok = int(pt_out_batched[user].item()) - all_outputs[user].append(user_tok) - - user_done = [False] * batch_size # Keeps track when a user reaches EoD 
token - - logger.info("Starting decode...") - - # Shard the page table for TG decode - if paged_attention and model_args.is_galaxy and batch_size > 1: - page_table_tt = ttnn.from_torch( - page_table, - device=mesh_device, - dtype=ttnn.int32, - layout=ttnn.ROW_MAJOR_LAYOUT, - mesh_mapper=ttnn.ShardTensor2dMesh( - mesh_device, - dims=(None, -2) if batch_size > 1 else (None, None), - mesh_shape=model_args.cluster_shape, - ), - ) - # Set sampling mode - argmax_on_device = False if (batch_size > 1 or sampling_params["temperature"] != 0) else True - - # Create events - profiler.start(f"compile_trace_{batch_idx}") - op_event = ttnn.create_event(mesh_device) - write_event = ttnn.create_event(mesh_device) - - # Initial positions - current_pos = torch.tensor([decoding_pos[b] for b in range(batch_size)]) - - current_pos_tensor = ttnn.from_torch( - current_pos, - device=mesh_device, - dtype=ttnn.int32, - mesh_mapper=ttnn.ShardTensor2dMesh( - mesh_device, - dims=(None, 0) if (model_args.is_galaxy and batch_size > 1) else (None, None), - mesh_shape=model_args.cluster_shape, - ), - ) - - # Get cos/sin matrices for the current position of each user - rot_mats, rot_mat_idxs = tt_model.rope_setup.get_rot_mats(current_pos, return_rot_idxs=True) - # Compile - logger.info(f"Compiling model trace...") - decode_input = ttnn.unsqueeze_to_4D(tt_embd(tt_out_tok)) - decode_input = ttnn.to_memory_config( - decode_input, - ttnn.L1_MEMORY_CONFIG if model_args.is_galaxy else tt_model.args.model_config["DECODE_RESIDUAL_MEMCFG"], - ) - tt_out = tt_model( - decode_input, - current_pos_tensor, - rot_mats=rot_mats, - mode="decode", - page_table=page_table_tt, - ) - if tt_model.args.num_devices > 1: - if tt_model.args.is_galaxy: - tt_out_gathered = ttnn.all_gather( - tt_out, - dim=3, - num_links=2, - cluster_axis=0, - mesh_device=mesh_device, - topology=model_args.ccl_topology(), - ) - else: - tt_out_gathered = ttnn.all_gather(tt_out, dim=3, num_links=1, topology=model_args.ccl_topology()) - ttnn.deallocate(tt_out) - else: - tt_out_gathered = tt_out - tt_out_rm = ttnn.untilize(tt_out_gathered, use_multicore=True) - ttnn.deallocate(tt_out_gathered) - if argmax_on_device: - tt_out_tok = ttnn.argmax( # FIXME When ttnn.argmax supports multicore, avoid falling back to host - tt_out_rm, dim=3, use_multicore=False if batch_size > 1 else True, output_tensor=tt_out_tok - ) - ttnn.deallocate(tt_out_rm) - else: - tt_out_tok_reset, _ = sample_host( - tt_out_rm, - mesh_device, - temperature=sampling_params["temperature"], - top_p=sampling_params["top_p"], - on_host=True, - ) - ttnn.copy_host_to_device_tensor(tt_out_tok_reset, tt_out_tok) - ttnn.plus_one(current_pos_tensor) - profiler.end(f"compile_trace_{batch_idx}") - - # Capture Trace - logger.info(f"Capturing model trace...") - profiler.start(f"capture_trace_{batch_idx}") - trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0) - - decode_input = ttnn.unsqueeze_to_4D(tt_embd(tt_out_tok)) - decode_input = ttnn.to_memory_config(decode_input, tt_model.args.model_config["DECODE_RESIDUAL_MEMCFG"]) - rot_mats = tt_model.rope_setup.get_rot_mats(rot_mat_idxs) - tt_out = tt_model( - decode_input, - current_pos_tensor, - rot_mats=rot_mats, - mode="decode", - page_table=page_table_tt, - ) - if tt_model.args.num_devices > 1: - if tt_model.args.is_galaxy: - tt_out_gathered = ttnn.all_gather( - tt_out, - dim=3, - num_links=2, - cluster_axis=0, - mesh_device=mesh_device, - topology=model_args.ccl_topology(), - ) - else: - tt_out_gathered = ttnn.all_gather(tt_out, dim=3, num_links=1, 
topology=model_args.ccl_topology()) - ttnn.deallocate(tt_out) - else: - tt_out_gathered = tt_out - tt_out_rm = ttnn.untilize(tt_out_gathered, use_multicore=True) - ttnn.deallocate(tt_out_gathered) - if argmax_on_device: - tt_out_tok = ttnn.argmax( - tt_out_rm, dim=3, use_multicore=False if batch_size > 1 else True, output_tensor=tt_out_tok - ) # FIXME Multicore is not compatible with batch > 1 - ttnn.deallocate(tt_out_rm) - ttnn.plus_one(current_pos_tensor) - # ttnn.plus_one(rot_mat_idxs) # FIXME <- This won't work since embedding requires uint32 and plus_one only works for int32 - - ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0) - - # Reset the decoding position for the proper run of the model - current_pos_reset = ttnn.from_torch( - current_pos, - dtype=ttnn.int32, - mesh_mapper=( - ttnn.ShardTensor2dMesh( - mesh_device, - dims=(None, 0) if (model_args.is_galaxy and batch_size > 1) else (None, None), - mesh_shape=model_args.cluster_shape, - ) - if tt_model.args.num_devices > 1 - else None - ), - ) - tt_out_tok_reset = ttnn.from_torch( - torch.nn.functional.pad( - pt_out_batched.unsqueeze(0).unsqueeze(0).unsqueeze(0), (0, 32 - len(pt_out_batched)), "constant", 0 - ), - # torch.nn.functional.pad(pt_out_batched.unsqueeze(0).unsqueeze(0).unsqueeze(0), (0, 30), "constant", 0), - dtype=ttnn.uint32, - mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device) if tt_model.args.num_devices > 1 else None, - ) - - # Reset the current position and output token tensors for the real decode run - ttnn.copy_host_to_device_tensor(current_pos_reset, current_pos_tensor) - ttnn.copy_host_to_device_tensor(tt_out_tok_reset, tt_out_tok) - rot_mat_idxs_reset = tt_model.rope_setup.get_rot_idxs(current_pos, on_host=True) - ttnn.copy_host_to_device_tensor(rot_mat_idxs_reset, rot_mat_idxs) - - profiler.end(f"capture_trace_{batch_idx}") - - # Start decoding - iteration = 0 - users_decoding = True # reset to handle next batch - total_decoding_time = 0 # Track total decoding time - total_tokens_generated = 0 # Track total tokens generated - - logger.info(f"Starting decode loop...") - profiler.start(f"inference_decode", iteration=batch_idx) - - ttnn.record_event(1, write_event) - while users_decoding: - if iteration == 0: # First iteration also accounts for compile time - profiler.start(f"compile_decode", iteration=batch_idx) - iteration_time_start = time() - - # Execute trace - ttnn.wait_for_event(0, write_event) - ttnn.execute_trace(mesh_device, trace_id, cq_id=0, blocking=True) - ttnn.record_event(0, op_event) - - # Update current pos and mat idxs on host and send to device - # TODO This is required for now since we cannot ttnn.plus_one(rot_mat_idxs) while it being uint32. 
- # If this tensor is int32, it won't be supported by ttnn.embedding - current_pos += 1 - rot_mat_idxs_updated = tt_model.rope_setup.get_rot_idxs(current_pos, on_host=True) - ttnn.copy_host_to_device_tensor(rot_mat_idxs_updated, rot_mat_idxs) - - # Write to host - ttnn.wait_for_event(1, op_event) - if argmax_on_device: - tt_output_torch = ttnn.to_torch( - tt_out_tok.cpu(blocking=True, cq_id=1), - mesh_composer=ttnn.ConcatMesh2dToTensor( - mesh_device, - dims=(3, 1) if tt_model.args.is_galaxy else (1, -1), - mesh_shape=model_args.cluster_shape, - ), - )[0, 0, 0, :batch_size] - else: - tt_out_tok_reset, tt_output_torch = sample_host( - tt_out_rm, - mesh_device, - temperature=sampling_params["temperature"], - top_p=sampling_params["top_p"], - on_host=True, - ) - tt_output_torch = tt_output_torch[0, 0, 0, :batch_size] - ttnn.copy_host_to_device_tensor(tt_out_tok_reset, tt_out_tok) - ttnn.record_event(1, write_event) - - # Save output token to print out later - for user in range(batch_size): - user_tok = tt_output_torch[user].tolist() - if ( - user_tok not in tokenizer.stop_tokens and user_done[user] == False - ): # Read until an eos token (e.g. <|eot_id|>); create_tokenizer adds stop_tokens to HF tokenizers - all_outputs[user].append(user_tok) - else: - user_done[user] = True - logger.trace(f"[User {user}] Finished decoding at iteration {iteration}") - if all(user_done): - users_decoding = False - - # Print out generated outputs for each user at the end of every iteration - iteration_time = time() - iteration_time_start - - # Ignore the first iteration for average speed calculation - if iteration > 0: - total_decoding_time += iteration_time - total_tokens_generated += 1 - - tokens_per_second_per_user = 1 / iteration_time - - profiler.start(f"log_printing_iter_{iteration}", iteration=batch_idx) - # Print out generated outputs for each user at the end of every iteration - if not is_ci_env: - if len(user_input) == 1: - logger.info("[User 0] {}".format("".join(tokenizer.decode(all_outputs[0])))) - else: - for user in range(batch_size): - text = "".join(tokenizer.decode(all_outputs[user])) - if len(text) > 100: - text = "..." 
+ text[-97:] - text = text.replace("\n", " ") - logger.info("[User {}] {}".format(user, text)) - - # Always print perf at every iteration - logger.info( - f"Iteration {iteration}: {1000*iteration_time:.0f}ms @ {tokens_per_second_per_user:.1f} tok/s/user ({batch_size*tokens_per_second_per_user:.1f} tok/s throughput)" - ) - profiler.end(f"log_printing_iter_{iteration}", iteration=batch_idx) - - if iteration == 0: # First iteration also accounts for compile time - profiler.end(f"compile_decode", iteration=batch_idx) - - iteration += 1 - - # Upper limit of generated tokens for each user (to avoid infinite generation in case eos is not seen) - if iteration >= max_generated_tokens: - users_decoding = False - - if not users_decoding: - profiler.start(f"log_saving_file", iteration=batch_idx) - for i, (output, prompt) in enumerate(zip(all_outputs, input_prompts)): - text = tokenizer.decode(output) - prompt_including_assistant_tags = tokenizer.decode( - model_args.encode_prompt(prompt, instruct=instruct_mode) - ) - text_after_prompt = text.replace(prompt_including_assistant_tags, "", 1) - if print_to_file: - with open(output_filename, "a") as f: - f.write( - f"\nbatch: {batch_idx} user: {i}\nprompt: {prompt} \noutput:\n{text_after_prompt}\n" - ) - else: - # Strip leading newlines from output when sent to terminal - short_prompt = ( - (prompt[:100] + "\n\n" + prompt[-100:]) - if len(prompt) > 200 - else prompt - ) - logger.info( - f"\nbatch: {batch_idx} user: {i}\nprompt: {short_prompt} \noutput:\n{text_after_prompt.strip()}\n" - ) - profiler.end(f"log_saving_file", iteration=batch_idx) - - num_tokens_generated_decode.append( - total_tokens_generated - ) # Save the number of tokens generated for each batch (excluding the first token) - - # Release trace - ttnn.release_trace(mesh_device, trace_id) - - profiler.end(f"inference_decode", iteration=batch_idx) - - # Finish profiling at the end of all batches inference - profiler.end("run") - - # Prepare profile benchmark metrics for batch 0 - compile_prefill_time = profiler.get_duration("compile_prefill") - compile_decode_time = profiler.get_duration("compile_decode") - inference_prefill_time = profiler.get_duration("inference_prefill") - inference_decode_time = profiler.get_duration("inference_decode") - log_printing_time = sum(profiler.get_duration(f"log_printing_iter_{i}") for i in range(total_tokens_generated)) - log_saving_file_time = profiler.get_duration(f"log_saving_file") - - # Correct the inference decode time to remove the time spent on compile (1st iteration) and log_printing (at the end of every iteration) - inference_decode_time = inference_decode_time - compile_decode_time - log_printing_time - log_saving_file_time - # Correct the inference prefill time to remove the time spent on compile (1st iteration) - inference_prefill_time = inference_prefill_time - compile_prefill_time - # Average prefill time for each user - prefill_time_to_first = inference_prefill_time / num_users_generated_prefill - - measurements = { - # Required measurements - "compile_prefill": compile_prefill_time, - "compile_decode": compile_decode_time, - "inference_prefill": inference_prefill_time, - "inference_decode": inference_decode_time, - "prefill_time_to_token": prefill_time_to_first, - "prefill_t/s": num_users_generated_prefill / inference_prefill_time * prefill_seq_len, # tokens/s - "decode_t/s/u": num_tokens_generated_decode[0] / inference_decode_time, # tokens/s/u - "decode_t/s": num_tokens_generated_decode[0] / inference_decode_time * batch_size, # tokens/s - # 
Optional measurements - "loading_inputs": profiler.get_duration("loading_inputs"), - "weight_loading": profiler.get_duration("weight_loading"), - "prepare_first_decode_token": profiler.get_duration("prepare_first_decode_token_0"), - "preprocess_prefill_inputs": profiler.get_duration("preprocess_prefill_inputs"), - "loading_weights_to_device": profiler.get_duration("loading_weights_to_device"), - "compile_trace": profiler.get_duration("compile_trace_0"), # Only for batch 0 - "capture_trace": profiler.get_duration("capture_trace_0"), # Only for batch 0 - "Total compile time": compile_prefill_time + compile_decode_time, - "Full demo runtime": profiler.get_duration("run"), - } - - # Print some of the perf metrics - logger.info("") - logger.info(f"Performance metrics for batch 0") - logger.info(f"Prefill compile time: {round(measurements['compile_prefill'], 4)}s") - logger.info(f"Decode compile time: {round(measurements['compile_decode'], 4)}s") - logger.info(f"Prefill inference time per user: {round(inference_prefill_time/num_users_generated_prefill, 4)}s") - logger.info( - f"Total Decode inference time ({total_tokens_generated-1} iterations): {round(measurements['inference_decode'], 4)}s" - ) - logger.info("") - logger.info(f"Time to first token: {round(measurements['prefill_time_to_token']* 1000, 2)}ms") - logger.info( - f"Average speed: {round(inference_decode_time / num_tokens_generated_decode[0] * 1000, 2)}ms @ {round(measurements['decode_t/s/u'], 2)} tok/s/user ({round(measurements['decode_t/s'], 2)} tok/s throughput)" - ) - logger.info("") - - supported_models = ["Llama3.2-1B", "Llama3.2-3B", "Llama3.1-8B", "Llama3.2-11B", "Llama3.1-70B"] - supported_devices = ["N150", "N300", "T3K", "TG"] - - # TODO update targets based on the llama3 model and the target device - tt_device_name = model_args.device_name - - if model_args.base_model_name in supported_models: - assert tt_device_name in supported_devices, f"Device {tt_device_name} not supported" - - # Set the target times to first token for every combination of device and model - target_prefill_tok_s = { - "N150_Llama3.2-1B": 1050, # TODO Update target - "N300_Llama3.2-1B": 1050, # TODO Update target - "T3K_Llama3.2-1B": 1050, # TODO Update target - "TG_Llama3.2-1B": 1050, # TODO Update target - # - "N150_Llama3.2-3B": 1050, # TODO Update target - "N300_Llama3.2-3B": 1050, # TODO Update target - "T3K_Llama3.2-3B": 1050, # TODO Update target - "TG_Llama3.2-3B": 1050, # TODO Update target - # - "N150_Llama3.1-8B": 1050, - "N300_Llama3.1-8B": 1050, - "T3K_Llama3.1-8B": 1050, - "TG_Llama3.1-8B": 1050, - # - "N150_Llama3.2-11B": 1050, # TODO Update target - "N300_Llama3.2-11B": 1050, # TODO Update target - "T3K_Llama3.2-11B": 1050, # TODO Update target - "TG_Llama3.2-11B": 1050, # TODO Update target - # - "N150_Llama3.1-70B": 1050, # TODO Update target - "N300_Llama3.1-70B": 1050, # TODO Update target - "T3K_Llama3.1-70B": 1050, # TODO Update target - "TG_Llama3.1-70B": 1050, # TODO Update target - }[f"{tt_device_name}_{model_args.base_model_name}"] - - # Set the target decode timesfor every combination of device and model - target_decode_tok_s_u = { - "N150_Llama3.2-1B": 160, # TODO Update target - "N300_Llama3.2-1B": 250, # TODO Update target - "T3K_Llama3.2-1B": 300, # TODO Update target - "TG_Llama3.2-1B": 300, # TODO Update target - # - "N150_Llama3.2-3B": 60, # TODO Update target - "N300_Llama3.2-3B": 100, # TODO Update target - "T3K_Llama3.2-3B": 150, # TODO Update target - "TG_Llama3.2-3B": 150, # TODO Update target - # - 
"N150_Llama3.1-8B": 23, # TODO Update target - "N300_Llama3.1-8B": 38, - "T3K_Llama3.1-8B": 45, - "TG_Llama3.1-8B": 45, # TODO Update target - # - "N150_Llama3.2-11B": 23, - "N300_Llama3.2-11B": 38, # TODO Update target - "T3K_Llama3.2-11B": 45, # TODO Update target - "TG_Llama3.2-11B": 45, # TODO Update target - # - "T3K_Llama3.1-70B": 20, # TODO Update target - "TG_Llama3.1-70B": 20, # TODO Update target - }[f"{tt_device_name}_{model_args.base_model_name}"] - - target_decode_tok_s = target_decode_tok_s_u * batch_size - targets = { - "prefill_t/s": target_prefill_tok_s, - "decode_t/s": target_decode_tok_s, - "decode_t/s/u": target_decode_tok_s_u, - } - else: - logger.warning(f"Model {model_args.base_model_name} not does not have performance targets set") - targets = {} - - # Save benchmark data for CI dashboard - if is_ci_env: - benchmark_data = create_benchmark_data(profiler, measurements, N_warmup_iter, targets) - benchmark_data.save_partial_run_json( - profiler, - run_type=f"{tt_device_name}-demo", - ml_model_name=model_args.base_model_name, - ml_model_type="llm", - num_layers=model_args.n_layers, - batch_size=batch_size, - input_sequence_length=prefill_seq_len, - output_sequence_length=1, - ) - - -# List of supported Parameters for demo.py -# -# input_prompts (string): input json file with prompts to process. See models/demos/llama3/demo/*.json for list of input files -# instruct (bool): Whether to use instruct weights or general weights -# repeat_batches (int): Number of consecutive batches of users to run (default: 1) -# max_seq_len (int): Maximum context length supported by the model (Llama3.1 and Llama3.2 models have a maximum context length of 128k, i.e., 128 * 1024) -# batch_size (int): Number of users in a batch (Supports 1/2/4/8/16/32 batches) -# max_generated_tokens (int): Maximum number of tokens to generate for each user (Note that the users will stop generation before this limit if they reach a EoS token) -# paged_attention (bool): Whether to use paged attention or default attention (vLLM requires paged attention) -# page_params (dict): Page parameters for paged attention (block_size, max_num_blocks) For smaller context lengths use block_size=32 and max_num_blocks=1024, for larger context use block_size=64 and max_num_blocks=2048 -# sampling_params (dict): Sampling parameters for decoding (temperature, top_p). If temperature is set to 0, argmax (greedy decode) is used. -# -# optimization (LlamaOptimizations): Optimization level to use for the model (performance or accuracy) -# FAKE_DEVICE (str): Fake device to use for testing (N150, N300, T3K, TG). Usage: `export FAKE_DEVICE=N150`, will enable running a single-chip demo on a multi-chip system. 
-@pytest.mark.parametrize( - "input_prompts, instruct, repeat_batches, max_seq_len, batch_size, max_generated_tokens, paged_attention, page_params, sampling_params", - [ - ( # Batch-1 run (Reasoning) - single user, small prompt, long thinking time - "models/demos/llama3/demo/input_data_questions_reasoning.json", # input_prompts - True, # instruct mode - 1, # repeat_batches - 16384, # max_seq_len - 1, # batch_size - 15000, # max_generated_tokens - True, # paged_attention - {"page_block_size": 32, "page_max_num_blocks": 1024}, # page_params # TODO This will be serviced by vLLM - {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) - ), - ( # Batch-1 run (Latency) - single user, small prompt - "models/demos/llama3/demo/input_data_questions_prefill_128.json", # input_prompts - True, # instruct mode - 1, # repeat_batches - 1024, # max_seq_len - 1, # batch_size - 200, # max_generated_tokens - True, # paged_attention - {"page_block_size": 32, "page_max_num_blocks": 1024}, # page_params # TODO This will be serviced by vLLM - {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) - ), - ( # Batch-32 run (Throughput) - 32 users, small prompt - "models/demos/llama3/demo/input_data_questions_prefill_128.json", # input_prompts - True, # instruct mode - 1, # repeat_batches - 1024, # max_seq_len - 32, # batch_size - 200, # max_generated_tokens - True, # paged_attention - {"page_block_size": 32, "page_max_num_blocks": 1024}, # page_params # TODO This will be serviced by vLLM - {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) - ), - ( # Long-context run - Single user, long prompt (adapted to the model being used and architecture) - "models/demos/llama3/demo/input_data_long_64k.json", # input_prompts - True, # instruct mode - 1, # repeat_batches - 64 * 1024, # max_seq_len - 1, # batch_size - 200, # max_generated_tokens - False, # paged_attention - {"page_block_size": 64, "page_max_num_blocks": 2048}, # page_params # TODO This will be serviced by vLLM - {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) - ), - ], - ids=[ - "reasoning-1", # reasoning - "batch-1", # latency - "batch-32", # throughput - "long-context", # max-length - ], -) -@pytest.mark.parametrize( - "optimizations", - [ - LlamaOptimizations.performance, - LlamaOptimizations.accuracy, - ], -) -@pytest.mark.parametrize("device_params", [{"trace_region_size": 23887872, "num_command_queues": 2}], indirect=True) -@pytest.mark.parametrize( - "mesh_device", - [ - {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get( - os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids()) - ) - ], - indirect=True, -) -def test_llama_demo( - input_prompts, - instruct, - repeat_batches, - max_seq_len, - batch_size, - max_generated_tokens, - paged_attention, - page_params, - sampling_params, - optimizations, - mesh_device, - use_program_cache, - is_ci_env, - reset_seeds, -): - if is_ci_env and ( - "long" in input_prompts or "reasoning" in input_prompts or optimizations == LlamaOptimizations.accuracy - ): - pytest.skip("Do not run the 'long-context' or accuracy tests on CI to reduce load") - - # TODO: Remove this once all batch sizes are supported on TG - if os.environ.get("FAKE_DEVICE") == "TG" and batch_size not in [1, 32]: - pytest.skip("TG only supports batch 1 and 32") - - mesh_device.enable_async(True) - - if paged_attention: - paged_attention_config = PagedAttentionConfig( - block_size=page_params["page_block_size"], - max_num_blocks=page_params["page_max_num_blocks"], - ) - else: - paged_attention_config = 
None - - return run_llama3_demo( - user_input=input_prompts, - mesh_device=mesh_device, - max_seq_len=max_seq_len, - batch_size=batch_size, - num_batches=repeat_batches, - paged_attention=paged_attention, - paged_attention_config=paged_attention_config, - max_generated_tokens=max_generated_tokens, - optimizations=optimizations, - sampling_params=sampling_params, - instruct_mode=instruct, - is_ci_env=is_ci_env, - print_to_file=False, - ) diff --git a/models/demos/llama3/demo/input_data_questions.json b/models/demos/llama3/demo/input_data_questions.json deleted file mode 100644 index e8aa3ee0eaa..00000000000 --- a/models/demos/llama3/demo/input_data_questions.json +++ /dev/null @@ -1,97 +0,0 @@ -[ - { -"prompt": "What is your favourite condiment?" - }, - { -"prompt": "Hello, how are you?" - }, - { -"prompt": "Do you have mayonnaise recipes?" - }, - { -"prompt": "Which color do you get if you mix yellow and blue?" - }, - { -"prompt": "What is the ideal room temperature?" - }, - { -"prompt": "Can you tell me a joke?" - }, - { -"prompt": "What are you good at?" - }, - { -"prompt": "What is 2+2?" - }, - { -"prompt": "what is the capital of USA?" - }, - { -"prompt": "what is the capital of Canada?" - }, - { -"prompt": "what is the capital of UK?" - }, - { -"prompt": "what is the capital of Germany?" - }, - { -"prompt": "what is the capital of France?" - }, - { -"prompt": "what is the capital of Japan?" - }, - { -"prompt": "what is the capital of Portugal?" - }, - { -"prompt": "what is the capital of China?" - }, - { -"prompt": "what is the currency of Cuba?" - }, - { -"prompt": "what is the currency of Lebanon?" - }, - { -"prompt": "what is the currency of Brazil?" - }, - { -"prompt": "what is the currency of Australia?" - }, - { -"prompt": "what is the currency of Jamaica?" - }, - { -"prompt": "what is the currency of Egypt?" - }, - { -"prompt": "what is the currency of Uzbekistan?" - }, - { -"prompt": "what is the currency of Argentina?" - }, - { -"prompt": "Are birds mammals?" - }, - { -"prompt": "How do you play tennis?" - }, - { -"prompt": "Suggest cities to visit in Japan" - }, - { -"prompt": "How far away is the moon from the earth?" - }, - { -"prompt": "What is a black hole?" - }, - { -"prompt": "How do you play golf?" - }, - { -"prompt": "Recommend me a movie" - }, - { -"prompt": "what is the capital of Spain?"} - ] diff --git a/models/demos/llama3/demo/input_data_long_128k.json b/models/demos/llama3/demo/sample_prompts/input_data_long_128k.json similarity index 100% rename from models/demos/llama3/demo/input_data_long_128k.json rename to models/demos/llama3/demo/sample_prompts/input_data_long_128k.json diff --git a/models/demos/llama3/demo/sample_prompts/input_data_long_16k.json b/models/demos/llama3/demo/sample_prompts/input_data_long_16k.json new file mode 100644 index 00000000000..1cba84254c8 --- /dev/null +++ b/models/demos/llama3/demo/sample_prompts/input_data_long_16k.json @@ -0,0 +1,7 @@ +[ + { + "prompt": "Explicitly state the quotes directly taken from the book inside double quotes like this: \n A. < add quote> \n Metaphor: \n B. < add quote> \n Metaphor: \n C. < add quote> \n Metaphor: \n with the metaphors after each quote. Double-check that the quotes are from the text specified above and that the metaphors relate to AI. 
End your answer after the 3 quotes / metaphors are finished.", + "context": "https://www.gutenberg.org/cache/epub/84/pg84.txt", + "max_length": 70000 + } +] diff --git a/models/demos/llama3/demo/sample_prompts/input_data_long_1k.json b/models/demos/llama3/demo/sample_prompts/input_data_long_1k.json new file mode 100644 index 00000000000..2df81b4d095 --- /dev/null +++ b/models/demos/llama3/demo/sample_prompts/input_data_long_1k.json @@ -0,0 +1,7 @@ +[ + { + "prompt": "Explicitly state the quotes directly taken from the book inside double quotes like this: \n A. < add quote> \n Metaphor: \n B. < add quote> \n Metaphor: \n C. < add quote> \n Metaphor: \n with the metaphors after each quote. Double-check that the quotes are from the text specified above and that the metaphors relate to AI. End your answer after the 3 quotes / metaphors are finished.", + "context": "https://www.gutenberg.org/cache/epub/84/pg84.txt", + "max_length": 3500 + } +] diff --git a/models/demos/llama3/demo/sample_prompts/input_data_long_2k.json b/models/demos/llama3/demo/sample_prompts/input_data_long_2k.json new file mode 100644 index 00000000000..84cbc0ce5cc --- /dev/null +++ b/models/demos/llama3/demo/sample_prompts/input_data_long_2k.json @@ -0,0 +1,7 @@ +[ + { + "prompt": "Explicitly state the quotes directly taken from the book inside double quotes like this: \n A. < add quote> \n Metaphor: \n B. < add quote> \n Metaphor: \n C. < add quote> \n Metaphor: \n with the metaphors after each quote. Double-check that the quotes are from the text specified above and that the metaphors relate to AI. End your answer after the 3 quotes / metaphors are finished.", + "context": "https://www.gutenberg.org/cache/epub/84/pg84.txt", + "max_length": 7000 + } +] diff --git a/models/demos/llama3/demo/input_data_long_32k.json b/models/demos/llama3/demo/sample_prompts/input_data_long_32k.json similarity index 100% rename from models/demos/llama3/demo/input_data_long_32k.json rename to models/demos/llama3/demo/sample_prompts/input_data_long_32k.json diff --git a/models/demos/llama3/demo/sample_prompts/input_data_long_4k.json b/models/demos/llama3/demo/sample_prompts/input_data_long_4k.json new file mode 100644 index 00000000000..df4b3e99b8e --- /dev/null +++ b/models/demos/llama3/demo/sample_prompts/input_data_long_4k.json @@ -0,0 +1,7 @@ +[ + { + "prompt": "Explicitly state the quotes directly taken from the book inside double quotes like this: \n A. < add quote> \n Metaphor: \n B. < add quote> \n Metaphor: \n C. < add quote> \n Metaphor: \n with the metaphors after each quote. Double-check that the quotes are from the text specified above and that the metaphors relate to AI. End your answer after the 3 quotes / metaphors are finished.", + "context": "https://www.gutenberg.org/cache/epub/84/pg84.txt", + "max_length": 16000 + } +] diff --git a/models/demos/llama3/demo/input_data_long_64k.json b/models/demos/llama3/demo/sample_prompts/input_data_long_64k.json similarity index 100% rename from models/demos/llama3/demo/input_data_long_64k.json rename to models/demos/llama3/demo/sample_prompts/input_data_long_64k.json diff --git a/models/demos/llama3/demo/sample_prompts/input_data_long_8k.json b/models/demos/llama3/demo/sample_prompts/input_data_long_8k.json new file mode 100644 index 00000000000..2708b666228 --- /dev/null +++ b/models/demos/llama3/demo/sample_prompts/input_data_long_8k.json @@ -0,0 +1,7 @@ +[ + { + "prompt": "Explicitly state the quotes directly taken from the book inside double quotes like this: \n A. 
< add quote> \n Metaphor: \n B. < add quote> \n Metaphor: \n C. < add quote> \n Metaphor: \n with the metaphors after each quote. Double-check that the quotes are from the text specified above and that the metaphors relate to AI. End your answer after the 3 quotes / metaphors are finished.", + "context": "https://www.gutenberg.org/cache/epub/84/pg84.txt", + "max_length": 32000 + } +] diff --git a/models/demos/llama3/demo/input_data_prefill_128.json b/models/demos/llama3/demo/sample_prompts/input_data_prefill_128.json similarity index 100% rename from models/demos/llama3/demo/input_data_prefill_128.json rename to models/demos/llama3/demo/sample_prompts/input_data_prefill_128.json diff --git a/models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_128.json b/models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_128.json new file mode 100644 index 00000000000..0e361c55dcd --- /dev/null +++ b/models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_128.json @@ -0,0 +1,98 @@ +[ + { + "prompt": "What is your favorite condiment? There are so many condiments to choose from, each bringing its unique flavor and texture to enhance different dishes. Do you prefer the classic taste of ketchup, the creamy richness of mayonnaise, the spicy kick of mustard, or perhaps something more exotic like sriracha or hoisin sauce? Maybe you enjoy the tangy zest of salsa or the smooth and savory taste of aioli. Share what your favorite condiment is and why you love it. Does it remind you of a specific dish or meal?" + }, + { + "prompt": "Hello, how are you? This simple question can open up a conversation in many different ways. When someone asks how you are, they are inviting you to share a bit about your current state, whether it's your mood, your health, or what's been happening in your life recently. How do you usually respond to this question? Do you give a brief and polite answer, or do you take the opportunity to share more details? How does your response change depending on who is asking? Think about how you feel today and take a moment to check in with yourself." + }, + { + "prompt": "Do you have mayonnaise recipes? Mayonnaise is a versatile ingredient that can be used in countless recipes beyond just a sandwich spread. What are some of your favorite ways to use mayonnaise in cooking or baking? Do you have a special recipe for a creamy potato salad, a tangy coleslaw, or perhaps a savory dip for vegetables and chips? Mayonnaise can also be used as a base for homemade dressings and sauces, adding richness and flavor to your dishes. Share any recipes, tips, or creative uses you have for mayonnaise." + }, + { + "prompt": "Which color do you get if you mix yellow and blue? Color mixing is a fundamental concept in both art and science. When you combine the primary colors yellow and blue, you create green. This is an example of subtractive color mixing, which is used in painting and printing. Have you ever experimented with mixing colors in art class or while working on a creative project? What other color combinations have you tried, and what results did you get? Think about how colors interact with each other and how you can use this knowledge in your artwork, home decor, or even fashion choices." + }, + { + "prompt": "What is the ideal room temperature? The ideal room temperature can vary based on personal preference, the climate you live in, and the activity you're doing. 
Generally, a comfortable room temperature for most people is around 68-72 degrees Fahrenheit (20-22 degrees Celsius). Do you prefer a warmer or cooler environment?" + }, + { + "prompt": "Can you tell me a joke? Jokes are a great way to bring a smile to someone's face and lighten the mood. They can be short and simple, like puns or one-liners, or longer and more elaborate. Do you have a favorite joke that never fails to make people laugh? Perhaps you enjoy clever wordplay, situational humor, or jokes that tell a funny story. How do you choose the right moment to share a joke? Have you ever used humor to break the ice in a social setting or to cheer someone up?" + }, + { + "prompt": "What are you good at? Everyone has unique skills and talents that they excel in. What are some things that you are particularly good at, whether they are professional skills, hobbies, or personal strengths? Do you have a talent for playing a musical instrument, painting, or writing? Maybe you are great at sports, cooking, or problem-solving. How did you discover these abilities, and how have you developed them over time? Think about how your skills have influenced your life and the satisfaction you get from using them. Are there any new skills you would like to learn or improve upon?" + }, + { + "prompt": "What is 2+2? This basic arithmetic question is one of the first math problems we learn as children. The answer is 4, but the concept of addition is much more than just numbers. Think about how you use addition in everyday life, from counting items in your shopping cart to calculating the total cost of your purchases. How has your understanding of math evolved since you first learned to add? Do you enjoy working with numbers, or do you find it challenging? Consider how basic math skills lay the foundation for more complex problem-solving in fields like science, engineering, and finance." + }, + { + "prompt": "What is the capital of the USA? The capital city of a country is often the center of its government and an important cultural hub. The capital of the United States is Washington, D.C. How much do you know about this city and its significance? Have you ever visited Washington, D.C., or do you have any plans to go there? The city is home to many historical landmarks, museums, and monuments. Think about what makes a capital city important and how it represents the nation. What are some other famous capital cities around the world, and what do you find interesting about them?" + }, + { + "prompt": "What is the capital of Canada? Knowing the capital cities of different countries is an important part of understanding global geography. The capital of Canada is Ottawa, a city known for its political significance and cultural landmarks. Have you ever been to Ottawa, or do you know someone who has? What are some key attractions or historical sites in the city? How does Ottawa compare to other major cities in Canada like Toronto, Vancouver, or Montreal? Think about how the location and characteristics of a capital city can influence its role in the country." + }, + { + "prompt": "What is the capital of the UK? Knowing the capital cities of different countries can help broaden your understanding of global geography and culture. The capital of the United Kingdom is London. This city is not only the political hub of the UK but also a major center for finance, culture, and history. What do you know about London? Have you ever visited or would you like to visit one day? 
What aspects of London intrigue you the most, whether it's the history, the architecture, or the vibrant cultural scene?" + }, + { + "prompt": "What is the capital of Germany? Understanding capital cities and their roles in their respective countries can provide insights into a nation's culture and governance. The capital of Germany is Berlin, a city rich in history and cultural diversity. Have you ever visited Berlin or learned about its significance in world history? Consider its famous landmarks like the Brandenburg Gate, the Berlin Wall, and the Reichstag building. How does Berlin's history influence its current status as a cultural and political center in Europe? Reflect on how the city's past has shaped its present and what makes it a unique and fascinating capital." + }, + { + "prompt": "What is the capital of France? Knowing the capitals of countries can help you understand more about global geography and culture. The capital of France is Paris, often referred to as the 'City of Light.' Paris is renowned for its art, fashion, and history. Have you ever visited Paris, or do you dream of going there someday? Think about iconic landmarks such as the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral. What aspects of Parisian culture do you find most appealing? Reflect on the city's influence on art, literature, and cuisine." + }, + { + "prompt": "What is the capital of Japan? Learning about the capitals of different countries can enhance your understanding of global cultures and histories. The capital of Japan is Tokyo, a bustling metropolis known for its blend of traditional and modern influences. Have you ever been to Tokyo or do you know someone who has? Think about what makes Tokyo unique, from its towering skyscrapers and advanced technology to its historic temples and gardens. What cultural elements of Tokyo fascinate you the most? Reflect on how the city represents Japan's rich heritage and rapid modernization." + }, + { + "prompt": "What is the capital of Portugal? Knowing the capitals of different countries can give you a deeper understanding of global geography and culture. Have you ever visited Lisbon or read about its history? Think about landmarks such as the Belem Tower, Jeronimos Monastery, and the scenic Alfama district. What aspects of Lisbon's culture, such as its music, cuisine, or festivals, do you find most interesting? Reflect on the city's significance in maritime history and its influence on global exploration." + }, + { + "prompt": "What is the capital of China? Learning about the capitals of different countries helps you understand their cultural and political significance. Have you ever visited Beijing or learned about its key landmarks like the Forbidden City, Tiananmen Square, and the Great Wall? Think about how Beijing's history as an imperial capital has shaped its development. What aspects of Beijing's culture, such as its cuisine, festivals, or architecture, do you find most intriguing? Reflect on the city's role in China's history and its position as a global political and cultural center." + }, + { + "prompt": "What is the currency of Cuba? Understanding the currencies used in different countries can enhance your knowledge of global economics and trade. The official currency of Cuba is the Cuban peso (CUP). Are you curious about how the currency system works in Cuba, especially given its unique economic situation? Think about how currency reflects the economic policies and conditions of a country. 
Have you ever traveled to a country with a different currency, and how did you find the experience of exchanging money and making transactions?" + }, + { + "prompt": "What is the currency of Lebanon? Knowing about the currencies of different countries can help you understand their economic systems and cultural exchange. The official currency of Lebanon is the Lebanese pound (LBP). Have you ever wondered how the currency system operates in Lebanon, especially in light of its recent economic challenges? Think about how the value of a currency affects the cost of living, inflation, and international trade. Have you ever traveled to a country with a different currency, and what was your experience like with exchanging money and making purchases? Reflect on the role of currency in everyday transactions and the global economy." + }, + { + "prompt": "What is the currency of Brazil? Learning about the currencies of different countries helps you understand their economic landscapes and cultural interactions. Are you interested in how Brazil's economy and currency have evolved over time? Think about how the exchange rate of the real impacts international trade, tourism, and the daily lives of Brazilians. Have you ever traveled to a country with a different currency, and how did you handle the experience of exchanging money and making transactions? Reflect on the significance of currency in global markets and personal finance." + }, + { + "prompt": "What is the currency of Australia? Understanding the currencies used in different countries can provide insight into their economic systems and cultural exchanges. Are you curious about how the Australian dollar compares to other major currencies and its role in the global economy? Think about how currency values influence international trade, tourism, and the cost of living. Have you ever traveled to a country with a different currency, and what was your experience like with exchanging money and making transactions? Reflect on the importance of currency in daily life and the global marketplace." + }, + { + "prompt": "What is the currency of Jamaica? Learning about the currencies of different countries helps you understand their economic contexts and cultural exchanges. The official currency of Jamaica is the Jamaican dollar (JMD). Are you interested in how the Jamaican dollar functions within the country's economy and its impact on tourism and trade? Think about how currency values affect the cost of living, inflation, and international commerce. Have you ever traveled to a country with a different currency, and how did you handle the experience of exchanging money and making purchases? Reflect on the role of currency in daily transactions and the global economy." + }, + { + "prompt": "What is the currency of Egypt? Knowing about the currencies of different countries can enhance your understanding of their economic systems and cultural interactions.. Are you curious about how the currency system operates in Egypt, especially considering its rich history and current economic conditions? Think about how the value of the Egyptian pound affects tourism, international trade, and the cost of living. Have you ever traveled to a country with a different currency, and what was your experience like with exchanging money and making transactions?" + }, + { + "prompt": "What is the currency of Uzbekistan? Learning about the currencies of different countries helps you understand their economic systems and cultural exchanges. 
Are you interested in how the currency system works in Uzbekistan, particularly in the context of its historical Silk Road heritage and modern economic development? Think about how the value of the som impacts the cost of living, inflation, and international trade. Have you ever traveled to a country with a different currency, and how did you handle the experience of exchanging money and making purchases?" + }, + { + "prompt": "What is the currency of Argentina? Understanding the currencies used in different countries can provide insight into their economic landscapes and cultural exchanges. Are you curious about how the currency system operates in Argentina, especially considering its recent economic challenges and fluctuations? Think about how the value of the Argentine peso affects the cost of living, inflation, and international trade. Have you ever traveled to a country with a different currency, and what was your experience like with exchanging money and making transactions?" + }, + { + "prompt": "Are birds mammals? This question touches on basic biological classification and the differences between various classes of animals. Birds are not mammals; they belong to the class Aves. What characteristics distinguish birds from mammals, and why is this classification important in biology? Think about the unique features of birds, such as feathers, beaks, and their ability to fly. How do these characteristics compare to mammals, which typically have fur or hair and produce milk for their young? Understanding these differences can help you appreciate the diversity of the animal kingdom." + }, + { + "prompt": "How do you play tennis? Tennis is a popular sport enjoyed by millions around the world. Are you familiar with the basic rules and techniques of tennis? Have you ever played tennis, or do you plan to learn? Reflect on the skills and physical fitness required to play tennis, such as agility, coordination, and endurance. Share any experiences you have with the sport, whether it's watching professional matches, playing recreationally, or taking lessons to improve your game." + }, + { + "prompt": "Suggest cities to visit in Japan. Japan is a country with a rich cultural heritage and modern attractions, making it a popular travel destination. What cities in Japan do you recommend visiting, and why? Think about famous cities like Tokyo, with its bustling metropolis and cutting-edge technology; Kyoto, known for its historic temples and traditional tea houses; and Osaka, famous for its vibrant food scene and entertainment districts. Are there lesser-known cities that offer unique experiences?" + }, + { + "prompt": "How far away is the moon from the earth? Understanding the distance between the Earth and the moon can give you a sense of the vastness of space. Have you ever wondered how scientists measure this distance, or how it varies slightly due to the moon's elliptical orbit? Think about the significance of this distance in terms of space travel and exploration. How long does it take for light or a spacecraft to travel between the Earth and the moon?" + }, + { + "prompt": "What is the capital of the UK? Knowing the capital cities of different countries can help broaden your understanding of global geography and culture. This city is not only the political hub of the UK but also a major center for finance, culture, and history. What do you know about London? Have you ever visited or would you like to visit one day? 
Think about famous landmarks such as the Tower of London, Buckingham Palace, and the British Museum. What aspects of London intrigue you the most, whether it's the history, the architecture, or the vibrant cultural scene?" + }, + { + "prompt": "What is the capital of Germany? Understanding capital cities and their roles in their respective countries can provide insights into a nation's culture and governance. The capital of Germany is Berlin, a city rich in history and cultural diversity. Have you ever visited Berlin or learned about its significance in world history? Consider its famous landmarks like the Brandenburg Gate, the Berlin Wall, and the Reichstag building. How does Berlin's history influence its current status as a cultural and political center in Europe? Reflect on how the city's past has shaped its present and what makes it a unique and fascinating capital." + }, + { + "prompt": "What is the capital of France? Knowing the capitals of countries can help you understand more about global geography and culture. The capital of France is Paris, often referred to as the 'City of Light.' Paris is renowned for its art, fashion, and history. Have you ever visited Paris, or do you dream of going there someday? Think about iconic landmarks such as the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral. What aspects of Parisian culture do you find most appealing? Reflect on the city's influence on art, literature, and cuisine." + }, + { + "prompt": "What is the capital of Japan? Learning about the capitals of different countries can enhance your understanding of global cultures and histories. The capital of Japan is Tokyo, a bustling metropolis known for its blend of traditional and modern influences. Have you ever been to Tokyo or do you know someone who has? Think about what makes Tokyo unique, from its towering skyscrapers and advanced technology to its historic temples and gardens. What cultural elements of Tokyo fascinate you the most? Reflect on how the city represents Japan's rich heritage and rapid modernization." + } +] diff --git a/models/demos/llama3/demo/input_data_questions_prefill_128.json b/models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_256.json similarity index 100% rename from models/demos/llama3/demo/input_data_questions_prefill_128.json rename to models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_256.json diff --git a/models/demos/llama3/demo/simple_text_demo.py b/models/demos/llama3/demo/simple_text_demo.py new file mode 100644 index 00000000000..50f507170b3 --- /dev/null +++ b/models/demos/llama3/demo/simple_text_demo.py @@ -0,0 +1,761 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +from typing import Optional +from loguru import logger +from time import time +from datetime import datetime +import hashlib +import requests +import json +from pkg_resources import resource_filename +import math +from termcolor import cprint + +import torch +import pytest +import os +import ttnn + +from llama_models.llama3.api.tokenizer import Tokenizer + +from models.demos.llama3.tt.generator import LlamaGenerator +from models.demos.llama3.tt.model_config import LlamaOptimizations +from models.demos.llama3.tt.llama_common import ( + preprocess_inputs_prefill, + get_rot_transformation_mat, + encode_prompt_llama_instruct, + PagedAttentionConfig, + sample_host, +) +from models.perf.benchmarking_utils import BenchmarkProfiler +from models.demos.utils.llm_demo_utils import create_benchmark_data + + +def load_and_cache_context(context_url, cache_dir, max_length=None): + cache_file = cache_dir / hashlib.md5(context_url.encode()).hexdigest() + + if cache_file.exists(): + with open(cache_file, "r") as f: + context_text = f.read() + logger.info(f"Loaded context from cache: {context_url}") + else: + try: + response = requests.get(context_url) + if response.status_code == 200: + context_text = response.text + with open(cache_file, "w") as f: + f.write(context_text) + logger.info(f"Downloaded and cached context: {context_url}") + else: + logger.warning(f"Failed to fetch context from URL: {context_url}. Status code: {response.status_code}") + context_text = "" + except Exception as e: + logger.error(f"Error fetching context from URL: {context_url}. Error: {str(e)}") + context_text = "" + + # Clip the context to the max length provided + if max_length: + context_text = context_text[:max_length] + logger.info(f"Clipped the context text to {max_length} characters") + + return context_text + + +# load input prompts from json, return as a list +def load_inputs(user_input, batch, instruct): + if isinstance(user_input, str): + with open(user_input, "r") as f: + user_input = json.load(f) + + if len(user_input) < batch: + logger.warning( + f"Number of users in the file is less than the provided batch={batch}. Repeating the prompts to match the batch size." 
+ ) + user_input = user_input * batch + + in_prompt = [] + cache_dir = Path("models/demos/llama3/demo/context_cache") + cache_dir.mkdir(parents=True, exist_ok=True) + + # The demo supports a custom prompt file, where the context is provided by a link to a book from the Gutenberg Project + # It clips the excerpt to the max length provided to allow testing different long context lengths + for i in range(batch): + prompt = user_input[i]["prompt"] + if "context" in user_input[i]: + if "max_length" in user_input[i]: # Clip the context to the max length provided + context_text = load_and_cache_context( + user_input[i]["context"], cache_dir, max_length=user_input[i]["max_length"] + ) + else: + context_text = load_and_cache_context(user_input[i]["context"], cache_dir) + if instruct: + prompt = ( + "```" + context_text + "```\n\n" + prompt + ) # Add the markdown block to the context to comply with the prompt + else: + prompt = context_text + in_prompt.append(prompt) + return in_prompt + + +def create_tt_model( + mesh_device, + instruct, + max_batch_size, + optimizations, + max_seq_len, + page_params, + dtype=ttnn.bfloat8_b, + use_paged_kv_cache=False, +): + from models.demos.llama3.tt.llama_model import TtTransformer + from models.demos.llama3.tt.model_config import TtModelArgs + + tt_model_args = TtModelArgs( + mesh_device, + instruct=instruct, + max_batch_size=max_batch_size, + optimizations=optimizations, + max_seq_len=max_seq_len, + ) + state_dict = tt_model_args.load_state_dict() + + page_table = None + paged_attention_config = None + tt_kv_cache = None + + if use_paged_kv_cache: + paged_attention_config = PagedAttentionConfig( + block_size=page_params["page_block_size"], + max_num_blocks=page_params["page_max_num_blocks"], + ) + # Implied shuffling of blocks + permutation = torch.randperm(paged_attention_config.max_num_blocks) + # Page table which maps virtual blocks to physical blocks + reverse_permutation = torch.argsort(permutation) + page_table = reverse_permutation.reshape( + tt_model_args.max_batch_size, paged_attention_config.max_num_blocks // tt_model_args.max_batch_size + ) + + model = TtTransformer( + args=tt_model_args, + mesh_device=mesh_device, + dtype=dtype, + state_dict=state_dict, + weight_cache_path=tt_model_args.weight_cache_path(dtype), + paged_attention_config=paged_attention_config, + ) + + if use_paged_kv_cache: + tt_kv_cache = [l.attention.layer_past for l in model.layers] + + return tt_model_args, model, page_table, tt_kv_cache + + +# List of supported parameters for simple_text_demo.py +# +# input_prompts (string): input json file with prompts to process.
See models/demos/llama3/demo/*.json for list of input files +# instruct (bool): Whether to use instruct weights or general weights +# repeat_batches (int): Number of consecutive batches of users to run (default: 1) +# max_seq_len (int): Maximum context length supported by the model (Llama3.1 and Llama3.2 models have a maximum context length of 128k, i.e., 128 * 1024) +# batch_size (int): Number of users in a batch (supports batch sizes of 1/2/4/8/16/32) +# max_generated_tokens (int): Maximum number of tokens to generate for each user (Note that the users will stop generation before this limit if they reach an EoS token) +# paged_attention (bool): Whether to use paged attention or default attention (vLLM requires paged attention) +# page_params (dict): Page parameters for paged attention (block_size, max_num_blocks). For smaller context lengths use block_size=32 and max_num_blocks=1024, for larger context use block_size=64 and max_num_blocks=2048 +# sampling_params (dict): Sampling parameters for decoding (temperature, top_p). If temperature is set to 0, argmax (greedy decode) is used. +# stop_at_eos (bool): Whether to stop decoding when the model generates an EoS token +# +# optimizations (LlamaOptimizations): Optimization level to use for the model (performance or accuracy) +# FAKE_DEVICE (str): Fake device to use for testing (N150, N300, T3K, TG). Usage: `export FAKE_DEVICE=N150` enables running a single-chip demo on a multi-chip system. +@pytest.mark.parametrize( + "input_prompts, instruct, repeat_batches, max_seq_len, batch_size, max_generated_tokens, paged_attention, page_params, sampling_params, stop_at_eos, ci_only", + [ + ( # Batch-1 run (Latency) - single user, small prompt + "models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_128.json", # input_prompts + True, # instruct mode + 1, # repeat_batches + 1024, # max_seq_len + 1, # batch_size + 200, # max_generated_tokens + True, # paged_attention + {"page_block_size": 32, "page_max_num_blocks": 1024}, # page_params + {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) + True, # stop_at_eos + False, # ci_only + ), + ( # Batch-32 run (Throughput) - 32 users, small prompt + "models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_128.json", # input_prompts + True, # instruct mode + 1, # repeat_batches + 1024, # max_seq_len + 32, # batch_size + 200, # max_generated_tokens + True, # paged_attention + {"page_block_size": 32, "page_max_num_blocks": 1024}, # page_params + {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) + True, # stop_at_eos + False, # ci_only + ), + ( # Long-context run - Single user, long prompt (adapted to the model being used and architecture) + "models/demos/llama3/demo/sample_prompts/input_data_long_64k.json", # input_prompts + True, # instruct mode + 1, # repeat_batches + 128 * 1024, # max_seq_len + 1, # batch_size + 200, # max_generated_tokens + True, # paged_attention + {"page_block_size": 32, "page_max_num_blocks": 2048}, # page_params + {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) + True, # stop_at_eos + False, # ci_only + ), + ( # Batch-1 run (Reasoning) - single user, small prompt, long thinking time + "models/demos/llama3/demo/input_data_questions_reasoning.json", # input_prompts + True, # instruct mode + 1, # repeat_batches + 16 * 1024, # max_seq_len + 1, # batch_size + 15000, # max_generated_tokens + True, # paged_attention + {"page_block_size": 32, "page_max_num_blocks": 1024}, # page_params # TODO This will be serviced by vLLM +
{"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) + False, # stop_at_eos + False, # ci_only + ), + ( # CI Batch-1 run - Measures the performance of a single user over 4096 iterations + "models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_128.json", # input_prompts + True, # instruct mode + 1, # repeat_batches + 8192, # max_seq_len + 1, # batch_size + 4096, # max_generated_tokens + True, # paged_attention + {"page_block_size": 32, "page_max_num_blocks": 1024}, # page_params + {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) + False, # stop_at_eos + True, # ci_only + ), + ( # CI Batch-32 run - Measures the performance of a 32 users over 4096 iterations + "models/demos/llama3/demo/sample_prompts/input_data_questions_prefill_128.json", # input_prompts + True, # instruct mode + 1, # repeat_batches + 2000, # max_seq_len + 32, # batch_size + 1024, # max_generated_tokens # TODO Update this to 4096, and make sure it fits in DRAM with correct page_params + True, # paged_attention # TODO Find the correct paged_attn params to avoid hangs in this config with long context generation + {"page_block_size": 64, "page_max_num_blocks": 1024}, # page_params + {"temperature": 0, "top_p": 0.08}, # sampling_params (argmax) + False, # stop_at_eos + True, # ci_only + ), + ], + ids=[ + "batch-1", # latency + "batch-32", # throughput + "long-context", # max-length + "reasoning-1", # reasoning + "ci-1", # CI batch 1 + "ci-32", # CI batch 32 + ], +) +@pytest.mark.parametrize( + "optimizations", + [ + LlamaOptimizations.performance, + LlamaOptimizations.accuracy, + ], +) +@pytest.mark.parametrize("device_params", [{"trace_region_size": 23887872, "num_command_queues": 2}], indirect=True) +@pytest.mark.parametrize( + "mesh_device", + [ + {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get( + os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids()) + ) + ], + indirect=True, +) +def test_llama_demo_text( + input_prompts, + instruct, + repeat_batches, + max_seq_len, + batch_size, + max_generated_tokens, + paged_attention, + page_params, + sampling_params, + optimizations, + stop_at_eos, + mesh_device, + use_program_cache, + is_ci_env, + ci_only, + reset_seeds, + request, +): + """ + Simple Llama demo with limited dependence on reference code. + """ + + if is_ci_env and (optimizations == LlamaOptimizations.accuracy or not ci_only): + pytest.skip("CI only runs the CI-only tests") + + # TODO: Remove this once all batch sizes are supported on TG + if os.environ.get("FAKE_DEVICE") == "TG" and batch_size not in [1, 32]: + pytest.skip("TG only supports batch 1 and 32") + + mesh_device.enable_async(True) + enable_trace = True # Use tracing for better perf + print_to_file = False # Enable this flag to print the output of all users to a file + + # Override parameters from command line if they are provided + input_prompts = request.config.getoption("--input_prompts") or input_prompts + if request.config.getoption("--instruct") in [ + 0, + 1, + ]: # If the flag is provided, use it. 
Take an int instead of bool due to parser limitations + instruct = request.config.getoption("--instruct") + repeat_batches = request.config.getoption("--repeat_batches") or repeat_batches + max_seq_len = request.config.getoption("--max_seq_len") or max_seq_len + batch_size = request.config.getoption("--batch_size") or batch_size + max_generated_tokens = request.config.getoption("--max_generated_tokens") or max_generated_tokens + paged_attention = request.config.getoption("--paged_attention") or paged_attention + page_params = request.config.getoption("--page_params") or page_params + sampling_params = request.config.getoption("--sampling_params") or sampling_params + if request.config.getoption("--stop_at_eos") in [ + 0, + 1, + ]: # If the flag is provided, use it. Take an int instead of bool due to parser limitations + stop_at_eos = request.config.getoption("--stop_at_eos") + + if not stop_at_eos: + logger.info(f"The decode generation will only stop at the max_generated_tokens limit == {max_generated_tokens}") + + if print_to_file: + # Create batch output file + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + output_directory = "models/demos/llama3/demo/output" + os.makedirs(output_directory, exist_ok=True) + os.chmod(output_directory, 0o755) + output_filename = f"{output_directory}/llama_text_demo_output_{timestamp}.txt" + + # Start profiler + logger.info(f"Start profiler") + profiler = BenchmarkProfiler() + profiler.start("run") + + logger.info(f"Reading inputs...") + profiler.start("loading_inputs") + if len(input_prompts) == 1: # Manual input + input_prompts = input_prompts * batch_size + else: # Inputs from file + input_prompts = load_inputs(input_prompts, batch_size, instruct) + profiler.end("loading_inputs") + + # To simulate a deployment environment, the demo supports repeating batched prompts.
+ # This loop will rotate the prompts between the users for each batch, to simulate users sending different requests + # If batch_size=1, the same prompt is repeated for each batch + repeat_batch_prompts = [] + for i in range(repeat_batches): + repeat_batch_prompts.append([input_prompts[(j + i) % len(input_prompts)] for j in range(len(input_prompts))]) + + model_args, model, page_table, tt_kv_cache = create_tt_model( + mesh_device, + instruct=instruct, + max_batch_size=batch_size, + optimizations=optimizations, + max_seq_len=max_seq_len, + page_params=page_params, + dtype=ttnn.bfloat8_b, + use_paged_kv_cache=paged_attention, + ) + + tokenizer = model_args.tokenizer + generator = LlamaGenerator(model, model_args, mesh_device, tokenizer=tokenizer) + + num_tokens_generated_decode = [] + + logger.info("Starting inference...") + for batch_idx, input_prompts in enumerate(repeat_batch_prompts): + logger.info(f"Processing batch {batch_idx}") + profiler.start(f"preprocess_prefill_inputs", iteration=batch_idx) + # Preprocess initial prompt inputs + ( + input_tokens_prefill_pt, + encoded_prompts, + decoding_pos, + prefill_lens, + ) = preprocess_inputs_prefill( + input_prompts, + tokenizer, + model_args, + instruct, + max_generated_tokens, + ) + + max_encoded_prompt_len = max(len(p) for p in encoded_prompts) + assert ( + max_generated_tokens + max_encoded_prompt_len <= max_seq_len + ), f"Prompt prefill tokens ({max_encoded_prompt_len}) + maximum number of decoded iterations ({max_generated_tokens}) needs to be <= than max_seq_len ({max_seq_len})" + + profiler.end(f"preprocess_prefill_inputs", iteration=batch_idx) + + # when doing repeating batches, set kv-caches to zero, to avoid context leaking + if batch_idx != 0: + for layer in model.layers: + k_cache, v_cache = layer.attention.layer_past + k_cache = ttnn.mul(k_cache, 0, output_tensor=k_cache) + v_cache = ttnn.mul(v_cache, 0, output_tensor=v_cache) + + input_tokens_prefill_pt = torch.stack(input_tokens_prefill_pt).view(batch_size, -1) + + logger.info("Starting prefill warmup...") + profiler.start(f"compile_prefill", iteration=batch_idx) + logits = generator.prefill_forward_text( + input_tokens_prefill_pt[0].unsqueeze(0), # Just warmup prefill for 1 user + page_table=page_table, + kv_cache=tt_kv_cache, + prompt_lens=decoding_pos, + ) + profiler.end(f"compile_prefill", iteration=batch_idx) + logger.info("Finished prefill warmup") + + logger.info(f"Starting prefill...") + profiler.start(f"inference_prefill", iteration=batch_idx) + logits = generator.prefill_forward_text( + input_tokens_prefill_pt, + page_table=page_table, + kv_cache=tt_kv_cache, + prompt_lens=decoding_pos, + ) + prefilled_token = torch.argmax(logits, dim=-1) + profiler.end(f"inference_prefill", iteration=batch_idx) + logger.info(f"Prefill finished") + + # Keep track of generated outputs to print out every iteration + all_outputs = [encoded_prompts[b][: prefill_lens[b]] for b in range(batch_size)] + for user in range(batch_size): + user_tok = int(prefilled_token[user].item()) + all_outputs[user].append(user_tok) + + user_done = [False] * batch_size # Keeps track when a user reaches EoD token + + # TODO Argmax on device is only supported for batch_size=1 + argmax_on_device = False if (batch_size > 1 or sampling_params["temperature"] != 0) else True + + # Initial positions + current_pos = torch.tensor([decoding_pos[b] for b in range(batch_size)]) + + # Start decoding + iteration = 0 + users_decoding = True + + out_tok = prefilled_token + + logger.info(f"Starting decode loop...") + + # 
Log total inference (accounting for compile_decode as well) + profiler.start(f"inference_decode", iteration=batch_idx) + while users_decoding: + if iteration == 0: # First iteration also accounts for compile time + profiler.start(f"compile_decode", iteration=batch_idx) + else: + profiler.start(f"inference_decode_time_{iteration}", iteration=batch_idx) + + # Run decode forward + logits = generator.decode_forward_text( + out_tok, + current_pos, + enable_trace=enable_trace, + page_table=page_table, + kv_cache=tt_kv_cache, + argmax_on_device=argmax_on_device, + ) + + # Get the next token + if argmax_on_device: + out_tok = logits.unsqueeze(1) + else: + # TODO Fix use case with temperature > 0 + _, out_tok = sample_host( + logits, + None, + temperature=sampling_params["temperature"], + top_p=sampling_params["top_p"], + on_host=True, + ) + + if iteration == 0: # First iteration will account the compile time + profiler.end(f"compile_decode", iteration=batch_idx) + decode_iteration_time = profiler.get_duration("compile_decode", iteration=batch_idx) + else: + profiler.end(f"inference_decode_time_{iteration}", iteration=batch_idx) + decode_iteration_time = profiler.get_duration(f"inference_decode_time_{iteration}", iteration=batch_idx) + + # Always print perf after every iteration + tokens_per_second_per_user = 1 / decode_iteration_time + logger.info( + f"Iteration {iteration}: {1000*decode_iteration_time:.0f}ms @ {tokens_per_second_per_user:.1f} tok/s/user ({batch_size*tokens_per_second_per_user:.1f} tok/s throughput)" + ) + + current_pos += 1 + + # Save output token to print out later + for user in range(batch_size): + user_tok = out_tok[user].item() + if ( + user_tok not in tokenizer.stop_tokens and user_done[user] == False + ): # Read until an eos token (e.g. <|eot_id|>); create_tokenizer adds stop_tokens to HF tokenizers + all_outputs[user].append(user_tok) + else: + if ( + stop_at_eos + ): # For performance gathering in CI, we want to sometimes force decoding for a fixed number of iterations + user_done[user] = True + logger.trace(f"[User {user}] Finished decoding at iteration {iteration}") + if all(user_done): + users_decoding = False + + # Print out generated outputs for each user at the end of every iteration + if not is_ci_env: + for user in range(batch_size): + text = "".join(tokenizer.decode(all_outputs[user])) + if len(text) > 100: + text = "..." 
+ text[-97:] + text = text.replace("\n", " ") + logger.info("[User {}] {}".format(user, text)) + + iteration += 1 + + # Upper limit of generated tokens for each user + if iteration >= max_generated_tokens: + users_decoding = False + + # Final print + if not users_decoding: + profiler.start(f"log_saving_file", iteration=batch_idx) + logger.info("Finished decoding, printing the final outputs...\n") + for i, (output, prompt) in enumerate(zip(all_outputs, input_prompts)): + text = tokenizer.decode(output) + prompt_including_assistant_tags = tokenizer.decode( + model_args.encode_prompt(prompt, instruct=instruct) + ) + text_after_prompt = text.replace(prompt_including_assistant_tags, "", 1) + if print_to_file: + with open(output_filename, "a") as f: + f.write( + f"\nbatch: {batch_idx} user: {i}\nprompt: {prompt} \noutput:\n{text_after_prompt}\n" + ) + else: + # Strip leading newlines from output when sent to terminal + short_prompt = ( + (prompt[:100] + "\n\n" + prompt[-100:]) + if len(prompt) > 200 + else prompt + ) + logger.info( + f"\n==REPEAT BATCH {batch_idx}\n==USER {i} - PROMPT\n{short_prompt} \n==USER {i} - OUTPUT\n{text_after_prompt.strip()}\n" + ) + profiler.end(f"log_saving_file", iteration=batch_idx) + + num_tokens_generated_decode.append(iteration) # Save the number of tokens generated for each repeat batch + + profiler.end(f"inference_decode", iteration=batch_idx) + + # Finish profiling at the end of inference for all repeated batches + profiler.end("run") + + # Prepare profile benchmark metrics for the first repeat batch only + compile_prefill_time = profiler.get_duration("compile_prefill") + compile_decode_time = profiler.get_duration("compile_decode") + + total_inference_prefill_time = profiler.get_duration("inference_prefill") + total_inference_decode_time = 0 + for i in range(1, iteration): # Iteration 0 is the compile time + total_inference_decode_time += profiler.get_duration(f"inference_decode_time_{i}") + + # Average prefill time for each user + avg_time_to_first_token = total_inference_prefill_time / batch_size + # Average decode time per batch iteration + avg_decode_iteration_time = total_inference_decode_time / (iteration - 1) + + prefill_tok_s = prefill_lens[0] / total_inference_prefill_time / batch_size + decode_tok_s_user = (num_tokens_generated_decode[0] - 1) / total_inference_decode_time # Remove the compile time + decode_tok_s = ( + (num_tokens_generated_decode[0] - 1) / total_inference_decode_time * batch_size + ) # Remove the compile time + + measurements = { + # Required measurements + "compile_prefill": compile_prefill_time, + "compile_decode": compile_decode_time, + "inference_prefill": total_inference_prefill_time, + "inference_decode": total_inference_decode_time, + "prefill_time_to_token": avg_time_to_first_token, + "prefill_t/s": prefill_tok_s, # tokens/s + "decode_t/s/u": decode_tok_s_user, # tokens/s/u + "decode_t/s": decode_tok_s, # tokens/s + # Optional measurements + "Total compile time": compile_prefill_time + compile_decode_time, + "Full demo runtime": profiler.get_duration("run"), + } + + # Decode performance for some specific tokens + tok_1_perf = profiler.get_duration(f"inference_decode_time_{1}") # Iteration 0 is compile time + tok_128_perf = profiler.get_duration(f"inference_decode_time_{127}") if 127 < iteration else 0 + tok_1024_perf = profiler.get_duration(f"inference_decode_time_{1023}") if 1023 < iteration else 0 + tok_4096_perf = profiler.get_duration(f"inference_decode_time_{4095}") if 4095 < iteration else 0 + + if not stop_at_eos: + 
logger.info(f"Please note that 'stop_at_eos' is disabled. Output repetition is expected.") + + logger.info("") + logger.info(f"=== Performance metrics ===") + logger.info( + f"1st token decode time: {tok_1_perf*1000:.2f}ms [{round(1/tok_1_perf, 2)} t/s/u, {round((1/tok_1_perf)*batch_size, 2)} t/s]" + ) + if tok_128_perf > 0: + logger.info( + f"128th token decode time: {tok_128_perf*1000:.2f}ms [{round(1/tok_128_perf, 2)} t/s/u, {round((1/tok_128_perf)*batch_size, 2)} t/s]" + ) + if tok_1024_perf > 0: + logger.info( + f"1024th token decode time: {tok_1024_perf*1000:.2f}ms [{round(1/tok_1024_perf, 2)} t/s/u, {round((1/tok_1024_perf)*batch_size, 2)} t/s]" + ) + if tok_4096_perf > 0: + logger.info( + f"4096th token decode time: {tok_4096_perf*1000:.2f}ms [{round(1/tok_4096_perf, 2)} t/s/u, {round((1/tok_4096_perf)*batch_size, 2)} t/s]" + ) + + # Print some of the perf metrics + logger.info("==") + logger.info(f"Prefill compile time: {round(compile_prefill_time, 2)}s") + logger.info(f"Decode compile time: {round(compile_decode_time, 2)}s") + logger.info("") + logger.info(f"Average Time to First Token (TTFT): {round(avg_time_to_first_token*1000, 2)}ms") + logger.info( + f"Average speed: {round(avg_decode_iteration_time * 1000, 2)}ms @ {round(decode_tok_s_user, 2)} tok/s/user ({round(decode_tok_s, 2)} tok/s throughput)" + ) + + # Benchmark targets + supported_models = ["Llama3.2-1B", "Llama3.2-3B", "Llama3.1-8B", "Llama3.2-11B", "Llama3.1-70B"] + supported_devices = ["N150", "N300", "T3K", "TG"] + + tt_device_name = model_args.device_name + + if model_args.base_model_name in supported_models: + assert tt_device_name in supported_devices, f"Device {tt_device_name} not supported" + + # Set the target times to first token for every combination of device and model + target_prefill_tok_s = { + "N150_Llama3.2-1B": 1050, # TODO Update target + "N300_Llama3.2-1B": 1050, # TODO Update target + "T3K_Llama3.2-1B": 1050, # TODO Update target + "TG_Llama3.2-1B": 1050, # TODO Update target + # + "N150_Llama3.2-3B": 1050, # TODO Update target + "N300_Llama3.2-3B": 1050, # TODO Update target + "T3K_Llama3.2-3B": 1050, # TODO Update target + "TG_Llama3.2-3B": 1050, # TODO Update target + # + "N150_Llama3.1-8B": 1050, + "N300_Llama3.1-8B": 1050, + "T3K_Llama3.1-8B": 1050, + "TG_Llama3.1-8B": 1050, + # + "N150_Llama3.2-11B": 1050, # TODO Update target + "N300_Llama3.2-11B": 1050, # TODO Update target + "T3K_Llama3.2-11B": 1050, # TODO Update target + "TG_Llama3.2-11B": 1050, # TODO Update target + # + "N150_Llama3.1-70B": 1050, # TODO Update target + "N300_Llama3.1-70B": 1050, # TODO Update target + "T3K_Llama3.1-70B": 1050, # TODO Update target + "TG_Llama3.1-70B": 1050, # TODO Update target + }[f"{tt_device_name}_{model_args.base_model_name}"] + + # Set the target decode timesfor every combination of device and model + target_decode_tok_s_u = { + "N150_Llama3.2-1B": 160, # TODO Update target + "N300_Llama3.2-1B": 250, # TODO Update target + "T3K_Llama3.2-1B": 300, # TODO Update target + "TG_Llama3.2-1B": 300, # TODO Update target + # + "N150_Llama3.2-3B": 60, # TODO Update target + "N300_Llama3.2-3B": 100, # TODO Update target + "T3K_Llama3.2-3B": 150, # TODO Update target + "TG_Llama3.2-3B": 150, # TODO Update target + # + "N150_Llama3.1-8B": 23, # TODO Update target + "N300_Llama3.1-8B": 38, + "T3K_Llama3.1-8B": 45, + "TG_Llama3.1-8B": 45, # TODO Update target + # + "N150_Llama3.2-11B": 23, + "N300_Llama3.2-11B": 38, # TODO Update target + "T3K_Llama3.2-11B": 45, # TODO Update target + "TG_Llama3.2-11B": 45, # 
TODO Update target + # + "T3K_Llama3.1-70B": 20, # TODO Update target + "TG_Llama3.1-70B": 20, # TODO Update target + }[f"{tt_device_name}_{model_args.base_model_name}"] + + target_decode_tok_s = target_decode_tok_s_u * batch_size + targets = { + "prefill_t/s": target_prefill_tok_s, + "decode_t/s": target_decode_tok_s, + "decode_t/s/u": target_decode_tok_s_u, + } + else: + logger.warning(f"Model {model_args.base_model_name} does not have performance targets set") + targets = {} + + # Save benchmark data for CI dashboard + if is_ci_env: + # Instead of running warmup iterations, the demo profiles the initial compile iteration + bench_n_warmup_iter = {"inference_prefill": 0, "inference_decode": 1} + benchmark_data = create_benchmark_data(profiler, measurements, bench_n_warmup_iter, targets) + + # Save the decode performance of every iteration for plotting in superset + for i in range(1, iteration): + benchmark_data.add_measurement( + profiler, + 0, + "inference_decode", + f"time_to_token_{i}", + profiler.get_duration(f"inference_decode_time_{i}") * 1000, + step_warm_up_num_iterations=None, + target=None, + ) + + # Also save the avg decode performance for the first 128 iterations (excluding the compile time) + inference_decode_time_first_128 = sum( + profiler.get_duration(f"inference_decode_time_{i}") for i in range(1, 128) + ) + benchmark_data.add_measurement( + profiler, + 0, + "inference_decode", + "avg_decode_time_first_128", + inference_decode_time_first_128 * 1000 / 127, + step_warm_up_num_iterations=None, + target=None, + ) + + benchmark_data.save_partial_run_json( + profiler, + run_type=f"{tt_device_name}-demo", + ml_model_name=model_args.base_model_name, + ml_model_type="llm", + num_layers=model_args.n_layers, + batch_size=batch_size, + input_sequence_length=max(prefill_lens), + output_sequence_length=num_tokens_generated_decode[0], + ) diff --git a/models/demos/llama3/demo/simple_vision_demo.py b/models/demos/llama3/demo/simple_vision_demo.py index 7eaed8091a7..47719f91462 100644 --- a/models/demos/llama3/demo/simple_vision_demo.py +++ b/models/demos/llama3/demo/simple_vision_demo.py @@ -108,7 +108,7 @@ def test_llama_multimodal_demo_text( mesh_device.enable_async(True) model_args, model = create_multimodal_model(mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len) generator = LlamaGenerator(model, model_args, mesh_device) - tokenizer = model_args.tokenizer + tokenizer = Tokenizer(model_path=tokenizer_path) formatter = ChatFormat(tokenizer) xattn_caches = generator.model.setup_cache(model_args.max_batch_size) diff --git a/models/demos/llama3/lt b/models/demos/llama3/lt index c088bb586d8..c1254a0fc74 100755 --- a/models/demos/llama3/lt +++ b/models/demos/llama3/lt @@ -163,6 +163,12 @@ class OutputEntryList: entry.output = entry_data["output"] entry.log_id = entry_data["log_id"] entry.speed = entry_data["speed"] + if ( + "ttft" not in entry_data.keys() + ): # Verify if the new TTFT attribute is present to avoid errors with old lt versions + os.remove("logs/state.json") + return + entry.ttft = entry_data["ttft"] entry.pcc = entry_data["pcc"] self._entries.append(entry) except (FileNotFoundError, json.JSONDecodeError): @@ -182,6 +188,8 @@ class OutputEntryList: } if hasattr(entry, "speed"): entry_data["speed"] = entry.speed + if hasattr(entry, "ttft"): + entry_data["ttft"] = entry.ttft if hasattr(entry, "pcc"): entry_data["pcc"] = entry.pcc state.append(entry_data) @@ -256,6 +264,7 @@ class Entry: self.lock = threading.Lock() self.log_id = None # Will be set by
OutputEntryList self.speed = None + self.ttft = None self.pcc = None self.thread = None self.changed = True # Initialize as changed to ensure first draw @@ -570,6 +579,7 @@ def main(stdscr): entry["status"] = "Waiting" entry["output"] = "" entry["speed"] = None + entry["ttft"] = None entry["pcc"] = None entry["process"] = None entry["log_file"] = None @@ -728,6 +738,7 @@ def draw_output_entry(stdscr, entry, y, is_selected, max_x): entry.device, entry.status, entry.speed if entry.speed else "", + entry.ttft if entry.ttft else "", entry.pcc if entry.pcc else "", entry.output, ] @@ -756,7 +767,9 @@ def draw_output_entry(stdscr, entry, y, is_selected, max_x): color = COLOR_PAIR_WAITING elif i == 4: # Speed column color = COLOR_PAIR_SPEED - elif i == 5: # PCC column + elif i == 5: # TTFT column + color = COLOR_PAIR_SPEED + elif i == 6: # PCC column if col: try: pcc_value = float(col) @@ -776,7 +789,7 @@ def draw_output_entry(stdscr, entry, y, is_selected, max_x): def format_header(max_x): - cols = ["Command", "Model", "Device", "Status", "Speed", "PCC", "Output"] + cols = ["Command", "Model", "Device", "Status", "Speed", "TTFT(ms)", "PCC", "Output"] col_widths = [20, 10, 10, 20, 10, 10, max_x - 85] # Adjusted widths to accommodate the PCC column formatted_cols = [] for col, width in zip(cols, col_widths): @@ -839,10 +852,12 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update): command_shortcuts = { "accuracy": "pytest models/demos/llama3/tests/test_llama_accuracy.py -k 'attention-performance and file'", "accuracy-acc": "pytest models/demos/llama3/tests/test_llama_accuracy.py -k 'attention-acc and file'", - "demo": "pytest models/demos/llama3/demo/demo.py -k performance-batch-1", - "demo-acc": "pytest models/demos/llama3/demo/demo.py -k accuracy-batch-1", - "demo-32": "pytest models/demos/llama3/demo/demo.py -k performance-batch-32", - "demo-long": "pytest models/demos/llama3/demo/demo.py -k performance-long", + "demo": "pytest models/demos/llama3/demo/simple_text_demo.py -k performance-batch-1", + "demo-acc": "pytest models/demos/llama3/demo/simple_text_demo.py -k accuracy-batch-1", + "demo-32": "pytest models/demos/llama3/demo/simple_text_demo.py -k performance-batch-32", + "demo-long": "pytest models/demos/llama3/demo/simple_text_demo.py -k performance-long", + "demo-ci-1": "pytest models/demos/llama3/demo/simple_text_demo.py -k performance-ci-1", + "demo-ci-32": "pytest models/demos/llama3/demo/simple_text_demo.py -k performance-ci-32", "attention": "pytest models/demos/llama3/tests/test_llama_attention.py", "attention-prefill": "pytest models/demos/llama3/tests/test_llama_attention_prefill.py", "mlp": "pytest models/demos/llama3/tests/test_llama_mlp.py", @@ -922,16 +937,18 @@ def process_output(entry, screen_lock, output_entries, screen_needs_update): log_file.flush() # Update status and output based on output - status, output, speed, pcc = parse_output_line(line, previous_line, entry.status) + status, output, speed, ttft, pcc = parse_output_line(line, previous_line, entry.status) previous_line = line.strip() with entry.lock: - if status != entry.status or output or speed is not None or pcc is not None: + if status != entry.status or output or speed is not None or ttft is not None or pcc is not None: entry.status = status # This will mark entry as changed via __setattr__ if output: entry.output = output if speed is not None: entry.speed = f"{speed:.1f}" + if ttft is not None: + entry.ttft = f"{ttft:.0f}" if pcc is not None: try: pcc_value = float(pcc) @@ -987,6 
+1004,12 @@ def parse_output_line(line, previous_line, current_status): if latency_match: speed = 1000 * float(latency_match.group(1)) # convert to ms + # Check for TTFT information + ttft = None + ttft_match = re.search(r"\(TTFT\)\: (\d+\.\d+)ms", line) + if ttft_match: + ttft = float(ttft_match.group(1)) + # Check for PCC information pcc = None pcc_match = re.search(r"PCC: (\d+\.\d+)", line) @@ -1000,42 +1023,42 @@ def parse_output_line(line, previous_line, current_status): pcc = f"{top1.strip():<3s}|{top5.strip():>3s}" if "Initializing device" in line: - return "Initializing device", None, speed, pcc + return "Initializing device", None, speed, ttft, pcc elif "Loading weights" in line: - return "Loading weights", None, speed, pcc + return "Loading weights", None, speed, ttft, pcc elif re.search(r"layers\.\d+\.", line): match = re.search(r"layers\.(\d+)\.", line) if match: layer_number = match.group(1) - return f"Loading layer {layer_number}", None, speed, pcc + return f"Loading layer {layer_number}", None, speed, ttft, pcc elif "Starting inference..." in line: - return "Starting", None, speed, pcc + return "Starting", None, speed, ttft, pcc elif "Starting prefill..." in line: - return "Prefill", None, speed, pcc + return "Prefill", None, speed, ttft, pcc elif "Starting decode..." in line: - return "Decode", None, speed, pcc - elif line == "output:": - return "Waiting for output", None, speed, pcc - elif current_status == "Waiting for output" and previous_line == "output:": + return "Decode", None, speed, ttft, pcc + elif "- OUTPUT" in line: + return "Waiting for output", None, speed, ttft, pcc + elif current_status == "Waiting for output" and "- OUTPUT" in previous_line: if "<|start_header_id|>assistant<|end_header_id|>" in line: output = line.split("<|start_header_id|>assistant<|end_header_id|>", 1)[1].strip() if output: - return "Running", output, speed, pcc + return "Running", output, speed, ttft, pcc else: - return "Assistant output", None, speed, pcc # wait for a non-blank line + return "Assistant output", None, speed, ttft, pcc # wait for a non-blank line else: - return "Running", line, speed, pcc + return "Running", line, speed, ttft, pcc elif current_status == "Assistant output" and line: # skip blank lines - return "Running", line, speed, pcc + return "Running", line, speed, ttft, pcc # Check for test output test_match = re.search(r"\| models\.demos\.llama3\.tests\..+ - (.+)", line) if test_match: - if current_status.startswith("Loading") and (pcc is not None or speed is not None): + if current_status.startswith("Loading") and (pcc is not None or speed is not None or ttft is not None): current_status = "Running" - return current_status, test_match.group(1), speed, pcc + return current_status, test_match.group(1), speed, ttft, pcc - return current_status, None, speed, pcc + return current_status, None, speed, ttft, pcc def get_llama_dir(model): @@ -1221,8 +1244,9 @@ def export_results_to_markdown(output_entries, stdscr): key = (entry.model, entry.device) if entry.command_name == "demo" or entry.command_name == "accuracy": - # Get speed from demo entry + # Get speed and ttft from demo entry speed = entry.speed if entry.command_name == "demo" else None + ttft = entry.ttft if entry.command_name == "demo" else None # Get accuracy from accuracy entry top1, top5 = "N/A", "N/A" if entry.command_name == "accuracy" and entry.pcc: @@ -1237,12 +1261,15 @@ def export_results_to_markdown(output_entries, stdscr): existing_entry[3] = speed if top1 != "N/A": existing_entry[1:3] = [top1, top5] + 
if ttft: + existing_entry[4] = ttft else: - perf_entries.append([key, top1, top5, speed or "N/A"]) + perf_entries.append([key, top1, top5, speed or "N/A", ttft]) elif entry.command_name == "demo-acc" or entry.command_name == "accuracy-acc": # Same logic for accuracy configuration speed = entry.speed if entry.command_name == "demo-acc" else None + ttft = entry.ttft if entry.command_name == "demo-acc" else None top1, top5 = "N/A", "N/A" if entry.command_name == "accuracy-acc" and entry.pcc: match = re.match(r"(\d+)\s*\|\s*(\d+)", entry.pcc) @@ -1255,8 +1282,10 @@ def export_results_to_markdown(output_entries, stdscr): existing_entry[3] = speed if top1 != "N/A": existing_entry[1:3] = [top1, top5] + if ttft: + existing_entry[4] = ttft else: - acc_entries.append([key, top1, top5, speed or "N/A"]) + acc_entries.append([key, top1, top5, speed or "N/A", ttft]) # Create markdown content markdown_lines = [ @@ -1264,8 +1293,8 @@ def export_results_to_markdown(output_entries, stdscr): "", "This configuration uses bfp4 MLP FF1+FF3 for all models.", "", - "| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) |", - "|-------|--------|-----------|-----------|---------------|", + "| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | TTFT (ms) |", + "|-------|--------|-----------|-----------|---------------|-----------|", ] fullname = { @@ -1281,8 +1310,8 @@ def export_results_to_markdown(output_entries, stdscr): # Add rows for performance table in original order for entry in perf_entries: - (model, device), top1, top5, speed = entry - markdown_lines.append(f"| {model} | {device} | {top1} | {top5} | {speed} |") + (model, device), top1, top5, speed, ttft = entry + markdown_lines.append(f"| {model} | {device} | {top1} | {top5} | {speed} | {ttft} |") # Add accuracy table markdown_lines.extend( @@ -1292,15 +1321,15 @@ def export_results_to_markdown(output_entries, stdscr): "", "This configuration uses bfp4 MLP FF1+FF3 only for the 3.1-70B model.", "", - "| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) |", - "|-------|--------|-----------|-----------|---------------|", + "| Model | Device | Top-1 (%) | Top-5 (%) | Speed (t/s/u) | TTFT (ms) |", + "|-------|--------|-----------|-----------|---------------|-----------|", ] ) # Add rows for accuracy table in original order for entry in acc_entries: - (model, device), top1, top5, speed = entry - markdown_lines.append(f"| {fullname[model]} | {device} | {top1} | {top5} | {speed} |") + (model, device), top1, top5, speed, ttft = entry + markdown_lines.append(f"| {fullname[model]} | {device} | {top1} | {top5} | {speed} | {ttft} |") # Write to PERF.md with open("PERF.md", "w") as f: @@ -1325,22 +1354,25 @@ def export_results_to_markdown(output_entries, stdscr): def reparse_log_file(entry, screen_needs_update): - """Reparse an entry's log file to update speed and pcc values.""" + """Reparse an entry's log file to update speed, ttft and pcc values.""" try: with open(entry.get_log_filename(), "r") as f: previous_line = "" status = entry.status # Preserve the current status - # Reset speed and pcc before reparsing + # Reset speed, ttft and pcc before reparsing entry.speed = None + entry.ttft = None entry.pcc = None for line in f: - new_status, output, speed, pcc = parse_output_line(line, previous_line, status) + new_status, output, speed, ttft, pcc = parse_output_line(line, previous_line, status) previous_line = line.strip() if speed is not None: entry.speed = f"{speed:.1f}" + if ttft is not None: + entry.ttft = f"{ttft:.0f}" if pcc is not None: try: 
pcc_value = float(pcc) diff --git a/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py b/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py index e23ea6e62bd..7c59a9630de 100644 --- a/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py +++ b/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py @@ -214,7 +214,6 @@ def test_llama_cross_attention_transformer_text_inference( rot_mats = get_prefill_rot_mat( model_args.head_dim, - model_args.max_seq_len, mesh_device, seq_len, model_args.rope_theta, diff --git a/models/demos/llama3/tests/test_llama_accuracy.py b/models/demos/llama3/tests/test_llama_accuracy.py index 5a40dec57ac..54fb306b299 100644 --- a/models/demos/llama3/tests/test_llama_accuracy.py +++ b/models/demos/llama3/tests/test_llama_accuracy.py @@ -10,10 +10,10 @@ from models.demos.llama3.tt.llama_common import ( get_prefill_rot_mat, PagedAttentionConfig, + preprocess_inputs_prefill, ) from models.demos.llama3.tt.llama_model import TtTransformer from models.demos.llama3.tt.model_config import TtModelArgs, LlamaOptimizations -from models.demos.llama3.demo.demo import preprocess_inputs_prefill from pathlib import Path @@ -229,7 +229,6 @@ def test_tt_model_acc( # Pre-compute the rotational embedding matrix and send to device rot_mats_prefill = get_prefill_rot_mat( model_args.head_dim, - model_args.max_seq_len, mesh_device, prefill_lens[0], model_args.rope_theta, diff --git a/models/demos/llama3/tests/test_llama_attention_prefill.py b/models/demos/llama3/tests/test_llama_attention_prefill.py index bf1db31f622..52d6e2cc19a 100644 --- a/models/demos/llama3/tests/test_llama_attention_prefill.py +++ b/models/demos/llama3/tests/test_llama_attention_prefill.py @@ -86,7 +86,6 @@ def test_llama_attention_inference( # pre-compute the rotational embedding matrix and send to device rot_mats = get_prefill_rot_mat( model_args.head_dim, - model_args.max_seq_len, mesh_device, max_seq_len, model_args.rope_theta, diff --git a/models/demos/llama3/tests/test_llama_chunked_generation.py b/models/demos/llama3/tests/test_llama_chunked_generation.py index 7d91921e732..b2ddb7296b1 100644 --- a/models/demos/llama3/tests/test_llama_chunked_generation.py +++ b/models/demos/llama3/tests/test_llama_chunked_generation.py @@ -11,7 +11,6 @@ PagedAttentionConfig, get_block_size, num_blocks_in_seq, - HostEmbedding, ) from models.demos.llama3.tt.llama_model import TtTransformer from models.demos.llama3.tt.model_config import TtModelArgs, LlamaOptimizations @@ -102,7 +101,7 @@ def test_chunked_prefill_single_user( reference_model = Transformer(model_args) reference_model.load_state_dict(reference_state_dict) - embd = HostEmbedding(model_args) + embd = model_args.reference_embedding() embd.load_state_dict({"emb.weight": state_dict[f"{state_dict_prefix}tok_embeddings.weight"]}) # Setup page table diff --git a/models/demos/llama3/tests/test_llama_decoder_prefill.py b/models/demos/llama3/tests/test_llama_decoder_prefill.py index 53cbf81cb03..a370011383d 100644 --- a/models/demos/llama3/tests/test_llama_decoder_prefill.py +++ b/models/demos/llama3/tests/test_llama_decoder_prefill.py @@ -89,7 +89,6 @@ def test_llama_decoder_inference( # pre-compute the rotational embedding matrix and send to device rot_mats = get_prefill_rot_mat( model_args.head_dim, - model_args.max_seq_len, mesh_device, max_seq_len, model_args.rope_theta, diff --git a/models/demos/llama3/tests/test_llama_model.py 
b/models/demos/llama3/tests/test_llama_model.py index fefda03034f..a131dfd7836 100644 --- a/models/demos/llama3/tests/test_llama_model.py +++ b/models/demos/llama3/tests/test_llama_model.py @@ -321,7 +321,7 @@ def test_llama_model_inference( # Greedy decode (temperature = 0) the generated token and save it to print out later if run_ref_pt: # Sample from reference model first - pt_out_tok = sample_host(ref_output, None, temperature=0, top_p=0.8) + _, pt_out_tok = sample_host(ref_output, None, temperature=0, top_p=0.8) pt_decode_input = embd(pt_out_tok) all_outputs_ref.append(pt_out_tok.squeeze(1).tolist()[0]) @@ -330,7 +330,7 @@ def test_llama_model_inference( all_outputs.append(pt_out_tok.squeeze(1).tolist()[0]) else: # If not running reference model, sample from TT model directly - tt_out_tok = sample_host(tt_output_torch, None, temperature=0, top_p=0.8) + _, tt_out_tok = sample_host(tt_output_torch, None, temperature=0, top_p=0.8) tt_decode_input = embd(tt_out_tok) all_outputs.append(tt_out_tok.squeeze(1).tolist()[0]) diff --git a/models/demos/llama3/tests/test_llama_model_prefill.py b/models/demos/llama3/tests/test_llama_model_prefill.py index fb16414e979..667764a2304 100644 --- a/models/demos/llama3/tests/test_llama_model_prefill.py +++ b/models/demos/llama3/tests/test_llama_model_prefill.py @@ -133,7 +133,6 @@ def test_llama_model_inference( # pre-compute the rotational embedding matrix and send to device rot_mats = get_prefill_rot_mat( model_args.head_dim, - model_args.max_seq_len, mesh_device, seq_len, model_args.rope_theta, diff --git a/models/demos/llama3/tt/generator.py b/models/demos/llama3/tt/generator.py index 0ca2a544b7d..858ada4f3c1 100644 --- a/models/demos/llama3/tt/generator.py +++ b/models/demos/llama3/tt/generator.py @@ -174,12 +174,14 @@ def decode_forward_text( kv_cache=None, enable_trace=True, read_from_device=True, + argmax_on_device=False, ): decode_kwargs = { "current_pos": start_pos, "tokens": tokens, "page_table": page_table, "kv_cache": kv_cache, + "argmax_on_device": argmax_on_device, } if enable_trace: tt_logits = self._easy_trace_text(**decode_kwargs) @@ -187,7 +189,7 @@ def decode_forward_text( tt_logits = self._decode_forward_no_trace_text(**decode_kwargs) if read_from_device: - return self.read_decode_output(tt_logits, tokens.shape[0]) + return self.read_decode_output(tt_logits, tokens.shape[0], argmax_on_device) else: return tt_logits @@ -197,6 +199,7 @@ def _decode_forward_no_trace_text( current_pos, page_table=None, kv_cache=None, + argmax_on_device=False, ): """ Performs text decode step. @@ -205,13 +208,13 @@ def _decode_forward_no_trace_text( tt_tokens, tt_current_pos, tt_rot_mats, tt_page_table = self.model.prepare_inputs_decode( tokens, current_pos, page_table ) - tt_logits = self.model.ttnn_decode_forward( tt_tokens, tt_current_pos, rot_mats=tt_rot_mats, page_table=tt_page_table, kv_cache=kv_cache, + argmax_on_device=argmax_on_device, ) return tt_logits @@ -222,13 +225,16 @@ def _capture_trace_text( current_pos, page_table=None, kv_cache=None, + argmax_on_device=False, ): """ Captures a trace for the decode_forward method. 
""" # Compile run - self._decode_forward_no_trace_text(tokens, current_pos, page_table=page_table, kv_cache=kv_cache) + self._decode_forward_no_trace_text( + tokens, current_pos, page_table=page_table, kv_cache=kv_cache, argmax_on_device=argmax_on_device + ) logger.info("Done Compiling Model") # Get inputs ready for trace run @@ -238,11 +244,12 @@ def _capture_trace_text( trace_id = ttnn.begin_trace_capture(self.mesh_device, cq_id=0) transformed_inputs = self.model.transform_decode_inputs_device(*device_inputs) - tt_out_trace = self.model.ttnn_decode_forward(*transformed_inputs, kv_cache=kv_cache) + tt_out_trace = self.model.ttnn_decode_forward( + *transformed_inputs, kv_cache=kv_cache, argmax_on_device=argmax_on_device + ) ttnn.end_trace_capture(self.mesh_device, trace_id, cq_id=0) logger.info("Done Capturing Decode Trace") - return trace_id, tt_out_trace, *device_inputs def _decode_forward_trace_text( @@ -274,13 +281,14 @@ def _easy_trace_text( current_pos, page_table=None, kv_cache=None, + argmax_on_device=False, ): """ Tracing is easy! Just call this method and we'll handle tracing for you. """ if not hasattr(self, "trace_id_text"): trace_id, tt_out_trace, *device_inputs = self._capture_trace_text( - tokens, current_pos, page_table=page_table, kv_cache=kv_cache + tokens, current_pos, page_table=page_table, kv_cache=kv_cache, argmax_on_device=argmax_on_device ) self.trace_id_text = trace_id self.trace_inputs_text = device_inputs @@ -460,8 +468,8 @@ def decode_forward( else: return tt_logits - def read_decode_output(self, tt_logits, unpadded_batch): - logits = self.model.process_output_decode(tt_logits, B=unpadded_batch, S=1) + def read_decode_output(self, tt_logits, unpadded_batch, argmax_on_device=False): + logits = self.model.process_output_decode(tt_logits, B=unpadded_batch, S=1, argmax_on_device=argmax_on_device) return logits def _decode_forward_no_trace( diff --git a/models/demos/llama3/tt/llama_common.py b/models/demos/llama3/tt/llama_common.py index d1de6bce149..dd6873ed8b3 100644 --- a/models/demos/llama3/tt/llama_common.py +++ b/models/demos/llama3/tt/llama_common.py @@ -5,6 +5,7 @@ import math import torch import ttnn +from loguru import logger class HostEmbedding(torch.nn.Module): @@ -44,14 +45,88 @@ def encode_prompt_llama_instruct(tokenizer, prompt_text, system_prompt_text=None return begin_of_text + system_prompt + user_prompt + assistant_reply -def encode_prompt_hf(tokenizer, prompt_text, system_prompt_text=None): - """See https://huggingface.co/docs/transformers/main/en/chat_templating""" - chat = [] - if system_prompt_text: - chat.append({"role": "system", "content": system_prompt_text}) - if prompt_text: - chat.append({"role": "user", "content": prompt_text}) - return tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True) +def preprocess_inputs_prefill( + input_prompts, + tokenizer, + model_args, + instruct, + max_generated_tokens, + max_prefill_len=128 * 1024, +): + """ + Run tokenizer on inputs, and create embeddings for the first token of each input + """ + # To avoid going out of memory, clip the max prefill length by the maximum number of tokens that will be generated + if max_prefill_len == 128 * 1024: + max_prefill_len = 128 * 1024 - max_generated_tokens + + encoded_prompts = [model_args.encode_prompt(prompt, instruct=instruct) for prompt in input_prompts] + + # Print the length of encoded prompts + logger.info("Encoded prompt lengths:" + ", ".join(str(len(prompt)) for prompt in encoded_prompts)) + + prompt_lens = [len(x) for x in 
encoded_prompts] + min_prompt_len = min(prompt_lens) + max_prompt_len = max(prompt_lens) + + # To avoid running out of memory when giving prompts larger than the maximum, clip to max_prefill_len + if min_prompt_len > max_prefill_len: + logger.info(f"Left-clipping prompts to {max_prefill_len}") + if instruct: + # We need to allow a few tokens for the system prompt and the special turn tokens for assistant and user; + # to find out how big those will be, we will: + # 1. Tokenize the entire prompt with non-instruct tokenization + # 2. Calculate overhead = length of instruct tokenization - length of non-instruct tokenization + # 3. Shorten the tokenized clipped prompt by the overhead and convert back to text + # 4. Tokenize the result with instruct tokenization + # 5. Assert that the length of this is equal to the max_prefill_len + raw_prompts = [model_args.encode_prompt(prompt, instruct=False) for prompt in input_prompts] + overhead = [len(e) - len(r) for e, r in zip(encoded_prompts, raw_prompts)] + shortened = [tokenizer.decode(e[-(max_prefill_len - o) :]) for e, o in zip(raw_prompts, overhead)] + encoded_prompts = [model_args.encode_prompt(prompt, instruct=instruct) for prompt in shortened] + assert all( + len(e) == max_prefill_len for e in encoded_prompts + ), f"Clipped prompts are not of the correct length, expected {max_prefill_len} but got {[len(e) for e in encoded_prompts]}" + else: + encoded_prompts = [encod[-max_prefill_len:] for encod in encoded_prompts] + + # Update prompt lengths + prompt_lens = [len(x) for x in encoded_prompts] + min_prompt_len = min(prompt_lens) + max_prompt_len = max(prompt_lens) + + assert ( + max_prompt_len <= model_args.max_seq_len + ), f"Max prompt length {max_prompt_len} exceeds model max seq len {model_args.max_seq_len}" + assert min_prompt_len > 0, "Minimum prompt length must be greater than 0" + assert min_prompt_len <= max_prompt_len, f"Minimum prompt length {min_prompt_len} exceeds max len {max_prompt_len}" + + logger.info(f"# of users: {len(encoded_prompts)}") + input_tokens_prefill = [] + decoding_pos = [] + prefill_lens = [] + + # Always prefill the nearest power of 2 for each user. This means that the majority of cases we will prefill more tokens than needed. 
+ # To avoid issues, we keep track of the decoding position to decode correctly the user's prompt + for i, encoded in enumerate(encoded_prompts): + # Prefill size is nearest power of 2 + prefill_seq_len = max(2 ** math.ceil(math.log(len(encoded), 2)), 128) + + # Initial prefill tensors full of pad tokens + input_tokens_prefill_i = torch.full((1, prefill_seq_len), 0, dtype=torch.int32) + input_tokens_prefill_i[0, : len(encoded[:])] = torch.tensor(encoded[:]).to(input_tokens_prefill_i) + input_tokens_prefill.append(input_tokens_prefill_i) + + # Keep the correct decoding position of each user + decoding_pos.append(len(encoded)) + prefill_lens.append(prefill_seq_len) + + return ( + input_tokens_prefill, + encoded_prompts, + decoding_pos, + prefill_lens, + ) def encode_prompt_hf(tokenizer, prompt_text, system_prompt_text=None): @@ -131,10 +206,10 @@ def gather_cos_sin(position_ids, cos, sin): return cos, sin -def get_prefill_rot_mat( - head_dim, max_seq_len, mesh_device, seq_len, theta, scale_factor, orig_context_len, start_pos=0 -): - cos, sin = precompute_freqs(head_dim, max_seq_len * 2, theta, scale_factor, orig_context_len) +def get_prefill_rot_mat(head_dim, mesh_device, seq_len, theta, scale_factor, orig_context_len, start_pos=0): + cos, sin = precompute_freqs( + head_dim, seq_len * 2, theta=theta, scale_factor=scale_factor, orig_context_len=orig_context_len + ) cos_gathered, sin_gathered = gather_cos_sin(torch.arange(start_pos, start_pos + seq_len), cos, sin) assert cos_gathered.size() == (1, 1, seq_len, head_dim) assert sin_gathered.size() == (1, 1, seq_len, head_dim) @@ -317,7 +392,9 @@ def sample_host(tt_input, mesh_device, temperature=0.6, top_p=0.08, on_host=True pt_out = torch.argmax(pt_input, dim=-1) if mesh_device is None: - return pt_out + if pt_out.dim() == 1: # if sampling a single token re-add the batch dim to the tensor + pt_out = pt_out.unsqueeze(0) + return None, pt_out if on_host: return ( ttnn.as_tensor( diff --git a/models/demos/llama3/tt/llama_model.py b/models/demos/llama3/tt/llama_model.py index 8a909981efb..8f49cd04299 100644 --- a/models/demos/llama3/tt/llama_model.py +++ b/models/demos/llama3/tt/llama_model.py @@ -12,7 +12,7 @@ from models.common.lightweightmodule import LightweightModule from models.demos.llama3.tt.distributed_norm import DistributedNorm from models.demos.llama3.tt.lm_head import LMHead -from models.demos.llama3.tt.llama_common import copy_host_to_device, get_prefill_rot_mat +from models.demos.llama3.tt.llama_common import copy_host_to_device from models.demos.llama3.tt.llama_rope import TtLlamaRotarySetup from models.demos.llama3.tt.llama_embedding import TtLlamaEmbedding @@ -118,16 +118,8 @@ def prepare_inputs_prefill(self, tokens, start_pos=0, page_table=None, chunk_pag tokens_embd = self.embd(tokens) tokens_embd = ttnn.unsqueeze_to_4D(tokens_embd) - tt_rot_mats_prefill = get_prefill_rot_mat( - self.args.head_dim, - self.args.max_seq_len, - self.mesh_device, - S, - self.args.rope_theta, - self.args.rope_scaling_factor, - self.args.orig_context_len, - start_pos=start_pos, - ) + # Slice the rot mats to the prefill seqlen + tt_rot_mats_prefill = [self.rope_setup.cos_matrix[:, :, :S, :], self.rope_setup.sin_matrix[:, :, :S, :]] if page_table is not None: tt_page_table = ttnn.from_torch( @@ -244,23 +236,21 @@ def process_output_prefill(self, tt_out, last_token_idx): )[0, 0, last_token_idx, : self.vocab_size] return logits - def process_output_decode(self, tt_out, B, S=1): + def process_output_decode(self, tt_out, B, S=1, argmax_on_device=False): """ 
- Input is ttnn device tensor of logits. Output is torch logits tensor + Input is ttnn device tensor of logits. Output is torch logits tensor or the generated token if argmax on device """ - if self.args.num_devices > 1: - if self.args.is_galaxy: - tt_out = ttnn.all_gather( - tt_out, - dim=3, - num_links=2, - cluster_axis=0, - mesh_device=self.mesh_device, - topology=self.args.ccl_topology(), - ) - else: - tt_out = ttnn.all_gather(tt_out, dim=3, num_links=1, topology=self.args.ccl_topology()) - tt_out = ttnn.untilize(tt_out, use_multicore=True) + if argmax_on_device: + tt_out = ttnn.to_torch( + tt_out, # tt_out.cpu(blocking=True, cq_id=1), + mesh_composer=ttnn.ConcatMesh2dToTensor( + self.mesh_device, + dims=(3, 1) if self.args.is_galaxy else (1, -1), + mesh_shape=self.args.cluster_shape, + ), + )[0, 0, 0, :B] + return tt_out + if self.args.num_devices > 1: tt_out = ttnn.to_torch(ttnn.get_device_tensors(tt_out)[0]).float() else: @@ -303,6 +293,7 @@ def ttnn_decode_forward( rot_mats, page_table=None, kv_cache=None, + argmax_on_device=False, ): """ This method will take device tensors and any other args to run forward. @@ -316,9 +307,31 @@ def ttnn_decode_forward( page_table=page_table, kv_cache=kv_cache, ) - # Send output logits to DRAM so L1 is not reserved for ttnn tracing and can be used by subsequent operations - if not self.args.is_galaxy: - tt_logits = ttnn.to_memory_config(tt_logits, ttnn.DRAM_MEMORY_CONFIG) + + # Gather the output across all devices and untilize the tensor (for argmax) + if self.args.num_devices > 1: + if self.args.is_galaxy: + tt_logits = ttnn.all_gather( + tt_logits, + dim=3, + num_links=2, + cluster_axis=0, + mesh_device=self.mesh_device, + topology=self.args.ccl_topology(), + ) + else: + tt_logits = ttnn.all_gather(tt_logits, dim=3, num_links=1, topology=self.args.ccl_topology()) + tt_logits = ttnn.untilize(tt_logits, use_multicore=True) + + if argmax_on_device: + tt_logits = ttnn.argmax( # TODO Add multicore support to batch > 1 + tt_logits, dim=3, use_multicore=False if self.args.max_batch_size > 1 else True # ,output_tensor=tokens + ) + else: + # Send output logits to DRAM so L1 is not reserved for ttnn tracing and can be used by subsequent operations + if not self.args.is_galaxy: + tt_logits = ttnn.to_memory_config(tt_logits, ttnn.DRAM_MEMORY_CONFIG) + return tt_logits def forward( diff --git a/models/demos/llama3/tt/llama_rope.py b/models/demos/llama3/tt/llama_rope.py index 4b395c3eec5..533768df5b5 100644 --- a/models/demos/llama3/tt/llama_rope.py +++ b/models/demos/llama3/tt/llama_rope.py @@ -54,14 +54,14 @@ def __init__( self.cos_matrix = ttnn.from_torch( cos_matrix, device=device, - layout=ttnn.ROW_MAJOR_LAYOUT, + layout=ttnn.TILE_LAYOUT, dtype=datatype, mesh_mapper=ReplicateTensorToMesh(device) if self.is_mesh_device else None, ) self.sin_matrix = ttnn.from_torch( sin_matrix, device=device, - layout=ttnn.ROW_MAJOR_LAYOUT, + layout=ttnn.TILE_LAYOUT, dtype=datatype, mesh_mapper=ReplicateTensorToMesh(device) if self.is_mesh_device else None, ) diff --git a/models/demos/llama3/tt/model_config.py b/models/demos/llama3/tt/model_config.py index d93dd3949c1..f278e9d755f 100644 --- a/models/demos/llama3/tt/model_config.py +++ b/models/demos/llama3/tt/model_config.py @@ -376,16 +376,18 @@ def __init__( else: self.model_config["ATTN_ALL_GATHER_MATMUL_PROGCFG"] = None - prefill_rows = lambda seq_len: min(seq_len, 1024) // self.tile_size + # For maximum performance, set the prefill grid row to 8, even if it can fit in a smaller grid + # prefill_rows = lambda 
seq_len: min(seq_len, 1024) // self.tile_size + prefill_rows = 8 mlp1_3_grid = lambda seq_len: ( (8, min(min(seq_len, 1024) // 32, 4)) if self.is_galaxy - else self.find_prefill_grid(prefill_rows(seq_len), self.dim // self.tile_size) + else self.find_prefill_grid(prefill_rows, self.dim // self.tile_size) ) mlp2_grid = lambda seq_len: ( (8, min(min(seq_len, 1024) // 32, 4)) if self.is_galaxy - else self.find_prefill_grid(prefill_rows(seq_len), self.hidden_dim // self.tile_size) + else self.find_prefill_grid(prefill_rows, self.hidden_dim // self.tile_size) ) self.model_config["PREFILL_MLP_W1_W3_PRG_CONFIG"] = lambda seq_len: self.matmul_config( @@ -402,14 +404,23 @@ def __init__( ) k_dim = self.dim // self.cluster_shape[0] if self.is_galaxy else self.dim - n_dim = self.dim // self.cluster_shape[1] if self.is_galaxy else self.dim + # n_dim = self.dim // self.cluster_shape[1] if self.is_galaxy else self.dim + n_dim = ( + self.dim // self.cluster_shape[1] + if self.is_galaxy + else ( + 1024 + if self.ccl_topology() == ttnn.Topology.Ring and 1024 % (self.dim / self.num_devices) == 0 + else self.dim + ) + ) num_rows = lambda seq_len: min(seq_len, 1024 if self.is_galaxy else 2048) self.model_config["WO_PREFILL_PROGCFG"] = lambda seq_len: self.matmul_config( m=num_rows(seq_len), k=k_dim, n=n_dim, grid_size=self.find_prefill_grid(num_rows(seq_len), n_dim // self.tile_size), - in0_block_w=1, + in0_block_w=1 if self.is_galaxy else self.dim // 1024, fuse_batch=seq_len <= 1024, # if self.is_galaxy else 2048), ) diff --git a/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_vision.py b/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_vision.py index 06e5095d4ca..7e0fa7dbf4c 100644 --- a/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_vision.py +++ b/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_vision.py @@ -90,6 +90,7 @@ def shuffle_weight(weight): # Sharded weights self.vision_projection_weight = as_interleaved_tensor("vision_projection", "weight", dtype, dim=-1) self.vision_projection_bias = as_interleaved_tensor("vision_projection", "bias", ttnn.bfloat16, dim=-1) + self.vision_projection_bias = ttnn.reshape(self.vision_projection_bias, [1, -1]) def forward(self, images, ar): vision_tokens = self.vision_encoder(images, ar) diff --git a/models/demos/llama3/tt/multimodal/llama_image_mlp.py b/models/demos/llama3/tt/multimodal/llama_image_mlp.py index 45755f88f30..0d56f310eaf 100644 --- a/models/demos/llama3/tt/multimodal/llama_image_mlp.py +++ b/models/demos/llama3/tt/multimodal/llama_image_mlp.py @@ -53,6 +53,7 @@ def __init__( # Sharded weights self.c_fc_weight = as_interleaved_tensor("c_fc", "weight", dtype, dim=-1) self.c_fc_bias = as_interleaved_tensor("c_fc", "bias", ttnn.bfloat16, dim=-1) + self.c_fc_bias = ttnn.reshape(self.c_fc_bias, [1, -1]) self.c_proj_weight = as_interleaved_tensor("c_proj", "weight", dtype, dim=-2) self.c_proj_bias = as_interleaved_tensor("c_proj", "bias", ttnn.bfloat16, dim=None) diff --git a/models/demos/llama3/tt/multimodal/llama_vision_model.py b/models/demos/llama3/tt/multimodal/llama_vision_model.py index 7a4918c96c1..7fc9d630102 100644 --- a/models/demos/llama3/tt/multimodal/llama_vision_model.py +++ b/models/demos/llama3/tt/multimodal/llama_vision_model.py @@ -370,7 +370,6 @@ def prepare_inputs_prefill( ) rot_mats = get_prefill_rot_mat( self.configuration.head_dim, - self.configuration.max_seq_len, self.mesh_device, seq_len=S, theta=self.configuration.rope_theta, @@ -638,7 +637,7 @@ def 
process_output_prefill(self, tt_out, B, last_token_idx): tt_out = tt_out[0, 0, last_token_idx, :] return tt_out - def process_output_decode(self, tt_out, B, S): + def process_output_decode(self, tt_out, B, S, argmax_on_device=False): tt_out = ttnn.to_torch(ttnn.get_device_tensors(tt_out)[0]).float() tt_out = tt_out[:, :, :B, :].reshape(B, S, -1) return tt_out diff --git a/tests/scripts/single_card/run_single_card_demo_tests.sh b/tests/scripts/single_card/run_single_card_demo_tests.sh index dfff74560e9..e7a8e492122 100755 --- a/tests/scripts/single_card/run_single_card_demo_tests.sh +++ b/tests/scripts/single_card/run_single_card_demo_tests.sh @@ -105,24 +105,24 @@ run_n300_perf_tests(){ run_common_perf_tests; fail+=$? - # Llama3.1-8B - llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/ # Llama3.2-1B llama1b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-1B-Instruct/ # Llama3.2-3B llama3b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-3B-Instruct/ - # Llama3.2-11B + # Llama3.1-8B + llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/ + # Llama3.2-11B (same tet weights as 8B) llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/ - # Run all Llama3 tests for 1B, 3B, 8B and 11B weights for N150 + # Run all Llama3 tests for 1B, 3B, 8B weights for N150 # To ensure a proper perf measurement and dashboard upload of the Llama3 models on a N150, we have to run them on the N300 perf pipeline for now - for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b"; do - FAKE_DEVICE=N150 LLAMA_DIR=$llama_dir WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/llama3/demo/demo.py --timeout 600; fail+=$? + for llama_dir in "$llama1b" "$llama3b" "$llama8b"; do + FAKE_DEVICE=N150 LLAMA_DIR=$llama_dir WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/llama3/demo/simple_text_demo.py --timeout 600; fail+=$? echo "LOG_METAL: Llama3 tests for $llama_dir completed on N150" done # Run all Llama3 tests for 1B, 3B, 8B and 11B weights for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b"; do - LLAMA_DIR=$llama_dir WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/llama3/demo/demo.py --timeout 600; fail+=$? + LLAMA_DIR=$llama_dir WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/llama3/demo/simple_text_demo.py --timeout 600; fail+=$? echo "LOG_METAL: Llama3 tests for $llama_dir completed" done diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh index 6de0b8883fd..0b5e9d45ef4 100755 --- a/tests/scripts/t3000/run_t3000_demo_tests.sh +++ b/tests/scripts/t3000/run_t3000_demo_tests.sh @@ -30,7 +30,7 @@ run_t3000_llama3_70b_tests() { echo "LOG_METAL: Running run_t3000_llama3_70b_tests" - LLAMA_DIR=/mnt/MLPerf/tt_dnn-models/llama/Llama3.1-70B-Instruct/ WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/llama3/demo/demo.py --timeout 600; fail+=$? + LLAMA_DIR=/mnt/MLPerf/tt_dnn-models/llama/Llama3.1-70B-Instruct/ WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/llama3/demo/simple_text_demo.py --timeout 600; fail+=$? 
# Output verification demo for old llama3-70b codebase, to be removed once old codebase is deleted env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama3_70b/demo/demo.py::test_LlamaModel_demo[wormhole_b0-True-device_params0-short_context-check_enabled-greedy-tt-70b-T3000-80L-decode_only-trace_mode_off-text_completion-llama3] --timeout=900 ; fail+=$? @@ -66,7 +66,7 @@ run_t3000_llama3_tests() { # Run all Llama3 tests for 8B, 1B, and 3B weights for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b"; do - LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/demo/demo.py --timeout 600; fail+=$? + LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/demo/simple_text_demo.py --timeout 600; fail+=$? echo "LOG_METAL: Llama3 tests for $llama_dir completed" done @@ -92,9 +92,6 @@ run_t3000_llama3_vision_tests() { n300=N300 t3k=T3K - # Install Vision-specific packages - pip install -r models/demos/llama3/requirements.txt - for fake_device in "$n300" "$t3k"; do FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/demo/simple_vision_demo.py -k "batch1-trace or batch4-trace-with-text-prompts" --timeout 600; fail+=$? echo "LOG_METAL: Llama3 vision tests for $fake_device completed" diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh index 81a5f1b9d42..790df1a4a2c 100755 --- a/tests/scripts/t3000/run_t3000_frequent_tests.sh +++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh @@ -60,9 +60,6 @@ run_t3000_llama3_tests() { # Llama3.2-11B llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/ - # Install Vision-specific packages - pip install -r models/demos/llama3/requirements.txt - # Run test model for llama3 - 1B, 3B, 8B and 11B weights for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b"; do LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/test_llama_model.py -k full ; fail+=$? @@ -147,9 +144,6 @@ run_t3000_llama3.2-11b-vision_freq_tests() { # Llama3.2-11B llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/ - # Install Vision-specific packages - pip install -r models/demos/llama3/requirements.txt - LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_transformer.py ; fail+=$? LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_vision_encoder.py ; fail+=$? LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py ; fail+=$? @@ -177,9 +171,6 @@ run_t3000_spoof_n300_llama3.2-11b-vision_freq_tests() { # Use FAKE_DEVICE env variable to run on an N300 mesh fake_device=N300 - # Install Vision-specific packages - pip install -r models/demos/llama3/requirements.txt - FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_transformer.py ; fail+=$? FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_vision_encoder.py ; fail+=$? FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py ; fail+=$? 
diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index 0f849e9ec7f..e4e54a510b1 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -230,9 +230,6 @@ run_t3000_llama3.2-11b-vision_unit_tests() { # Llama3.2-11B llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/ - # Install Vision-specific packages - pip install -r models/demos/llama3/requirements.txt - LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_mlp.py ; fail+=$? LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_attention.py ; fail+=$? LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_block.py ; fail+=$? @@ -265,9 +262,6 @@ run_t3000_spoof_n300_llama3.2-11b-vision_unit_tests() { # Use FAKE_DEVICE env variable to run on an N300 mesh fake_device=N300 - # Install Vision-specific packages - pip install -r models/demos/llama3/requirements.txt - FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_mlp.py ; fail+=$? FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_attention.py ; fail+=$? FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_block.py ; fail+=$? diff --git a/tests/scripts/tg/run_tg_demo_tests.sh b/tests/scripts/tg/run_tg_demo_tests.sh index c8fa2f7b6a9..5d741ce924a 100755 --- a/tests/scripts/tg/run_tg_demo_tests.sh +++ b/tests/scripts/tg/run_tg_demo_tests.sh @@ -21,7 +21,7 @@ run_tg_llama3_tests() { # Run all Llama3 tests for 1B, 3B, 8B, 11B and 70B weights # for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b" "$llama70b"; do for llama_dir in "$llama1b" "$llama8b" "$llama70b"; do - LLAMA_DIR=$llama_dir FAKE_DEVICE=TG pytest -n auto models/demos/llama3/demo/demo.py --timeout 5000; fail+=$? + LLAMA_DIR=$llama_dir FAKE_DEVICE=TG pytest -n auto models/demos/llama3/demo/simple_text_demo.py --timeout 5000; fail+=$? 
echo "LOG_METAL: Llama3 tests for $llama_dir completed" done diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py index 6d4db95ccb7..01ea4b5858a 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py @@ -454,7 +454,7 @@ def test_rotary_embedding_llama_with_program_cache( num_ops = 2 # 2 * rope if mode == "decode": - num_ops += 3 # embedding + transpose + interleaved_to_sharded + num_ops += 4 # untilize cos/sin + embedding + transpose + interleaved_to_sharded if batch % ttnn.TILE_SIZE != 0: num_ops += 1 # slice diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py index 893fe74baa5..1f4aaca24a8 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py @@ -132,7 +132,7 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache( cache_tensors.append(test_tensor) - num_ops = 4 # embedding + fused_qk_rope + transpose + interleaved_to_sharded + num_ops = 5 # untilize cos/sin + embedding + fused_qk_rope + transpose + interleaved_to_sharded if (batch * 2) % ttnn.TILE_SIZE != 0: num_ops += 1 # slice diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt index 18a5c84dbc5..a9ed3355d47 100644 --- a/tt_metal/python_env/requirements-dev.txt +++ b/tt_metal/python_env/requirements-dev.txt @@ -22,6 +22,9 @@ mypy==1.9.0 # For sweep testing -r ../../tests/sweep_framework/requirements-sweeps.txt +# For all Llama3 demo tests +git+https://github.com/tenstorrent/llama-models.git@tt_metal_tag + # testing pytest==7.2.2 pytest-timeout==2.2.0 From 033573f2b78c97454b45609817d274434741b78c Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Thu, 20 Feb 2025 09:55:41 -0800 Subject: [PATCH 192/316] [skip ci] Remove all references to TT_METAL_ENV (#18090) --- .github/workflows/bisect-dispatch.yaml | 1 - .github/workflows/docs-latest-public.yaml | 1 - .github/workflows/full-regressions-and-models.yaml | 1 - .github/workflows/metal-run-microbenchmarks.yaml | 1 - .github/workflows/perf-models-impl.yaml | 1 - .github/workflows/single-card-demo-tests-impl.yaml | 1 - .../workflows/stress-fast-dispatch-build-and-unit-tests.yaml | 1 - .../workflows/stress-slow-dispatch-build-and-unit-tests.yaml | 1 - .github/workflows/t3000-demo-tests-impl.yaml | 1 - .github/workflows/t3000-frequent-tests-impl.yaml | 1 - .github/workflows/t3000-model-perf-tests-impl.yaml | 1 - .github/workflows/t3000-nightly-tests-impl.yaml | 1 - .github/workflows/t3000-perplexity-tests-impl.yaml | 1 - .github/workflows/t3000-profiler-tests-impl.yaml | 1 - .github/workflows/t3000-unit-tests-impl.yaml | 1 - .github/workflows/test-dispatch.yaml | 1 - .github/workflows/tg-demo-tests-impl.yaml | 1 - .github/workflows/tg-frequent-tests-impl.yaml | 1 - .github/workflows/tg-model-perf-tests-impl.yaml | 1 - .github/workflows/tg-nightly-tests.yaml | 1 - .github/workflows/tg-unit-tests-impl.yaml | 2 -- .github/workflows/tgg-demo-tests.yaml | 1 - .github/workflows/tgg-frequent-tests-impl.yaml | 1 - .github/workflows/tgg-model-perf-tests-impl.yaml | 1 - 
.github/workflows/tgg-unit-tests-impl.yaml | 1 - .github/workflows/ttnn-run-sweeps.yaml | 2 -- .github/workflows/umd-unit-tests.yaml | 1 - 27 files changed, 29 deletions(-) diff --git a/.github/workflows/bisect-dispatch.yaml b/.github/workflows/bisect-dispatch.yaml index 12bda76c1fc..72e2054d66c 100644 --- a/.github/workflows/bisect-dispatch.yaml +++ b/.github/workflows/bisect-dispatch.yaml @@ -36,7 +36,6 @@ jobs: needs: build-artifact timeout-minutes: 1440 env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ inputs.arch }} runs-on: - ${{ inputs.runner-label }} diff --git a/.github/workflows/docs-latest-public.yaml b/.github/workflows/docs-latest-public.yaml index c092a50ffc8..d3e918a6dcc 100644 --- a/.github/workflows/docs-latest-public.yaml +++ b/.github/workflows/docs-latest-public.yaml @@ -20,7 +20,6 @@ jobs: matrix: arch: [grayskull] env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} DOCS_VERSION: latest ARCH_NAME: ${{ matrix.arch }} LOGURU_LEVEL: INFO diff --git a/.github/workflows/full-regressions-and-models.yaml b/.github/workflows/full-regressions-and-models.yaml index 0c424f5e4f5..6f6784136df 100644 --- a/.github/workflows/full-regressions-and-models.yaml +++ b/.github/workflows/full-regressions-and-models.yaml @@ -20,7 +20,6 @@ jobs: arch: [grayskull, wormhole_b0] frequent-type: [api] env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.arch }} LOGURU_LEVEL: INFO TT_METAL_SLOW_DISPATCH_MODE: 1 diff --git a/.github/workflows/metal-run-microbenchmarks.yaml b/.github/workflows/metal-run-microbenchmarks.yaml index 680ab152523..7df326ba8d4 100644 --- a/.github/workflows/metal-run-microbenchmarks.yaml +++ b/.github/workflows/metal-run-microbenchmarks.yaml @@ -22,7 +22,6 @@ jobs: {arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"], ccl: true}, ] env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} # Use BM for microbenchmarks ARCH_NAME: ${{ matrix.runner-info.arch }} LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/perf-models-impl.yaml b/.github/workflows/perf-models-impl.yaml index 153e303001e..dab1338b772 100644 --- a/.github/workflows/perf-models-impl.yaml +++ b/.github/workflows/perf-models-impl.yaml @@ -17,7 +17,6 @@ jobs: model-type: [llm_javelin, cnn_javelin, other] name: "${{ matrix.model-type }} ${{ matrix.test-info.name }}" env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-info.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/single-card-demo-tests-impl.yaml b/.github/workflows/single-card-demo-tests-impl.yaml index 40502033cfb..6d68f5bbe94 100644 --- a/.github/workflows/single-card-demo-tests-impl.yaml +++ b/.github/workflows/single-card-demo-tests-impl.yaml @@ -30,7 +30,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml index 205e86cceb9..2a3e5717d0b 100644 --- a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml +++ b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml @@ -27,7 +27,6 @@ jobs: {arch: wormhole_b0, runs-on: ["cloud-virtual-machine", "N300", "in-service"], machine-type: "virtual_machine", name: "N300"}, ] env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ 
matrix.runner-info.arch }} TT_METAL_WATCHER: 60 TT_METAL_WATCHER_NOINLINE: 1 diff --git a/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml index ce01df49a5c..f75e6ea6aae 100644 --- a/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml +++ b/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml @@ -27,7 +27,6 @@ jobs: {arch: wormhole_b0, runs-on: ["cloud-virtual-machine", "N300", "in-service"], machine-type: "virtual_machine", name: "N300"}, ] env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.runner-info.arch }} TT_METAL_SLOW_DISPATCH_MODE: 1 TT_METAL_WATCHER: 60 diff --git a/.github/workflows/t3000-demo-tests-impl.yaml b/.github/workflows/t3000-demo-tests-impl.yaml index 8b75690aed3..deacc762f4a 100644 --- a/.github/workflows/t3000-demo-tests-impl.yaml +++ b/.github/workflows/t3000-demo-tests-impl.yaml @@ -24,7 +24,6 @@ jobs: name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/t3000-frequent-tests-impl.yaml b/.github/workflows/t3000-frequent-tests-impl.yaml index f538f9ba3cf..ad1fcff1f73 100644 --- a/.github/workflows/t3000-frequent-tests-impl.yaml +++ b/.github/workflows/t3000-frequent-tests-impl.yaml @@ -29,7 +29,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml index d63b96dd421..b20cbf0a40f 100644 --- a/.github/workflows/t3000-model-perf-tests-impl.yaml +++ b/.github/workflows/t3000-model-perf-tests-impl.yaml @@ -25,7 +25,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/t3000-nightly-tests-impl.yaml b/.github/workflows/t3000-nightly-tests-impl.yaml index b09dfcc6318..7f2469b2ac8 100644 --- a/.github/workflows/t3000-nightly-tests-impl.yaml +++ b/.github/workflows/t3000-nightly-tests-impl.yaml @@ -19,7 +19,6 @@ jobs: name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/t3000-perplexity-tests-impl.yaml b/.github/workflows/t3000-perplexity-tests-impl.yaml index 9b6384bb491..af98982db79 100644 --- a/.github/workflows/t3000-perplexity-tests-impl.yaml +++ b/.github/workflows/t3000-perplexity-tests-impl.yaml @@ -20,7 +20,6 @@ jobs: name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/t3000-profiler-tests-impl.yaml b/.github/workflows/t3000-profiler-tests-impl.yaml index d9847249087..0e2bcd10db4 100644 --- a/.github/workflows/t3000-profiler-tests-impl.yaml +++ b/.github/workflows/t3000-profiler-tests-impl.yaml @@ -23,7 +23,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace 
}}/build/lib diff --git a/.github/workflows/t3000-unit-tests-impl.yaml b/.github/workflows/t3000-unit-tests-impl.yaml index ea077571775..3d761f5b530 100644 --- a/.github/workflows/t3000-unit-tests-impl.yaml +++ b/.github/workflows/t3000-unit-tests-impl.yaml @@ -30,7 +30,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/test-dispatch.yaml b/.github/workflows/test-dispatch.yaml index d14ec14f6df..416970b809c 100644 --- a/.github/workflows/test-dispatch.yaml +++ b/.github/workflows/test-dispatch.yaml @@ -60,7 +60,6 @@ jobs: needs: build-artifact timeout-minutes: 1440 env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ inputs.arch }} LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib runs-on: ${{ fromJSON(inputs.runner-label) }} diff --git a/.github/workflows/tg-demo-tests-impl.yaml b/.github/workflows/tg-demo-tests-impl.yaml index 492ad10f199..b7a75882e0c 100644 --- a/.github/workflows/tg-demo-tests-impl.yaml +++ b/.github/workflows/tg-demo-tests-impl.yaml @@ -14,7 +14,6 @@ jobs: { name: "TG Falcon7b demo tests", arch: wormhole_b0, model: falcon7b, timeout: 120, owner_id: U05RWH3QUPM}, # Salar Hosseini ] env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/tg-frequent-tests-impl.yaml b/.github/workflows/tg-frequent-tests-impl.yaml index 576d6626626..717b6d6baee 100644 --- a/.github/workflows/tg-frequent-tests-impl.yaml +++ b/.github/workflows/tg-frequent-tests-impl.yaml @@ -14,7 +14,6 @@ jobs: { name: "TG unit/distributed frequent tests", arch: wormhole_b0, model: unit, timeout: 90, owner_id: XXXXX}, # Add owner ] env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/tg-model-perf-tests-impl.yaml b/.github/workflows/tg-model-perf-tests-impl.yaml index 5ce68339f04..251cdbcf317 100644 --- a/.github/workflows/tg-model-perf-tests-impl.yaml +++ b/.github/workflows/tg-model-perf-tests-impl.yaml @@ -33,7 +33,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/tg-nightly-tests.yaml b/.github/workflows/tg-nightly-tests.yaml index ce8f9897ffb..4e67f799a6b 100644 --- a/.github/workflows/tg-nightly-tests.yaml +++ b/.github/workflows/tg-nightly-tests.yaml @@ -19,7 +19,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/tg-unit-tests-impl.yaml b/.github/workflows/tg-unit-tests-impl.yaml index 1d594b69403..f8049b38976 100644 --- a/.github/workflows/tg-unit-tests-impl.yaml +++ b/.github/workflows/tg-unit-tests-impl.yaml @@ -17,7 +17,6 @@ jobs: }, ] env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO runs-on: ${{ matrix.test-group.runs-on }} @@ -49,7 +48,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ 
github.workspace }}/build/lib diff --git a/.github/workflows/tgg-demo-tests.yaml b/.github/workflows/tgg-demo-tests.yaml index 0cab3fdd13d..908fd1e0588 100644 --- a/.github/workflows/tgg-demo-tests.yaml +++ b/.github/workflows/tgg-demo-tests.yaml @@ -24,7 +24,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/tgg-frequent-tests-impl.yaml b/.github/workflows/tgg-frequent-tests-impl.yaml index b042635fece..c374035b286 100644 --- a/.github/workflows/tgg-frequent-tests-impl.yaml +++ b/.github/workflows/tgg-frequent-tests-impl.yaml @@ -18,7 +18,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/tgg-model-perf-tests-impl.yaml b/.github/workflows/tgg-model-perf-tests-impl.yaml index c487d43d7e3..b47afc3ac98 100644 --- a/.github/workflows/tgg-model-perf-tests-impl.yaml +++ b/.github/workflows/tgg-model-perf-tests-impl.yaml @@ -26,7 +26,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/tgg-unit-tests-impl.yaml b/.github/workflows/tgg-unit-tests-impl.yaml index 5313e0610c4..140230c82b2 100644 --- a/.github/workflows/tgg-unit-tests-impl.yaml +++ b/.github/workflows/tgg-unit-tests-impl.yaml @@ -18,7 +18,6 @@ jobs: ] name: ${{ matrix.test-group.name }} env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml index 4b1e17557d9..1b7ab7f1bbf 100644 --- a/.github/workflows/ttnn-run-sweeps.yaml +++ b/.github/workflows/ttnn-run-sweeps.yaml @@ -550,7 +550,6 @@ jobs: ttnn-generate-sweeps: needs: build-artifact env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: wormhole_b0 ELASTIC_USERNAME: ${{ secrets.SWEEPS_ELASTIC_USERNAME }} ELASTIC_PASSWORD: ${{ secrets.SWEEPS_ELASTIC_PASSWORD }} @@ -607,7 +606,6 @@ jobs: } ] env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} ELASTIC_USERNAME: ${{ secrets.SWEEPS_ELASTIC_USERNAME }} ELASTIC_PASSWORD: ${{ secrets.SWEEPS_ELASTIC_PASSWORD }} diff --git a/.github/workflows/umd-unit-tests.yaml b/.github/workflows/umd-unit-tests.yaml index 02eb95b79c3..460ec079503 100644 --- a/.github/workflows/umd-unit-tests.yaml +++ b/.github/workflows/umd-unit-tests.yaml @@ -43,7 +43,6 @@ jobs: - cloud-virtual-machine - in-service env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ inputs.arch }} LOGURU_LEVEL: INFO steps: From 4900e9b873140a489bd0d8c9c326ba6196b18460 Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Thu, 20 Feb 2025 14:06:00 -0500 Subject: [PATCH 193/316] cleanup packet header validation in EDM fabric (#18001) Some device watcher asserts were made stale due to recent changes. This PR corrects those assertions to be valid again. 
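A minimal sketch of the sentinel pattern applied here (hypothetical enum names, not the fabric code itself): pinning a `*_LAST` enumerator to the final valid value lets the validation bound follow the enum automatically when new send types are added, instead of a hard-coded literal that goes stale.

```cpp
// Illustration only: the bound tracks the enum rather than a magic number.
enum ExampleSendType : uint8_t {
    TYPE_A = 0,
    TYPE_B = 1,
    TYPE_C = 2,                       // newly added value
    EXAMPLE_SEND_TYPE_LAST = TYPE_C,  // sentinel pinned to the last valid value
};

inline bool is_valid(ExampleSendType t) { return t <= EXAMPLE_SEND_TYPE_LAST; }
```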
--- .../ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp | 4 +++- .../edm_fabric/fabric_edm_packet_header_validate.hpp | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index 9a5cfcb40f9..af3c53f27b5 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -26,13 +26,15 @@ enum NocSendType : uint8_t { NOC_UNICAST_INLINE_WRITE = 1, NOC_MULTICAST_WRITE = 2, NOC_UNICAST_ATOMIC_INC = 3, - NOC_MULTICAST_ATOMIC_INC = 4 + NOC_MULTICAST_ATOMIC_INC = 4, + NOC_SEND_TYPE_LAST = NOC_MULTICAST_ATOMIC_INC }; // How to send the payload across the cluster // 1 bit enum ChipSendType : uint8_t { CHIP_UNICAST = 0, CHIP_MULTICAST = 1, + CHIP_SEND_TYPE_LAST = CHIP_MULTICAST }; struct RoutingFields { diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp index bb6b6603e11..2589c8f526a 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp @@ -9,9 +9,11 @@ namespace tt::fabric { -FORCE_INLINE void validate(const PacketHeader& packet_header) { ASSERT(packet_header.chip_send_type < 2); } +FORCE_INLINE void validate(const PacketHeader& packet_header) { + ASSERT(packet_header.chip_send_type <= CHIP_SEND_TYPE_LAST); +} FORCE_INLINE bool is_valid(PacketHeader const& packet_header) { - return (packet_header.chip_send_type < 2) && (packet_header.noc_send_type < 2); + return (packet_header.chip_send_type <= CHIP_SEND_TYPE_LAST) && (packet_header.noc_send_type <= NOC_SEND_TYPE_LAST); } } // namespace tt::fabric From bdb0bfcbea05f9ae541e8b65893334bfe90682fc Mon Sep 17 00:00:00 2001 From: Yu Gao <145494740+yugaoTT@users.noreply.github.com> Date: Thu, 20 Feb 2025 14:17:36 -0500 Subject: [PATCH 194/316] Use stateful NoC API in EDM and dedicated cmd buffer for EDM-EDM NoC path (#18014) Previously the EDM uses the single cmd buffer for writing both to worker and EDM, this caused to re-program all the fields each time. Using dedicated cmd buf can allow use use stateful apis. 
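For context, the usage pattern this enables looks roughly like the sketch below. It reuses the `noc_async_write_one_packet_with_trid_set_state` / `_with_state` helpers from this diff, but the dedicated command-buffer index, loop structure, and variable names are illustrative assumptions, not the actual EDM kernel code:

```cpp
// Sketch only: names below are placeholders, not the real EDM sender.
constexpr uint8_t edm_to_edm_cmd_buf = 2;  // assumption: NoC cmd-buffer index reserved for the EDM->EDM path
const uint64_t downstream_edm_noc_addr = get_noc_addr(edm_noc_x, edm_noc_y, remote_buffer_base);

// Program the static fields (destination coordinates, command word) once on the dedicated cmd buffer.
noc_async_write_one_packet_with_trid_set_state(downstream_edm_noc_addr, edm_to_edm_cmd_buf);

for (uint32_t slot = 0; slot < num_slots; ++slot) {
    // Per-packet fields only: source L1 address, destination offset, size, trid.
    // The static fields programmed above are reused instead of being rebuilt each send.
    noc_async_write_one_packet_with_trid_with_state(
        local_buffer_base + slot * packet_size_bytes,
        remote_buffer_base + slot * packet_size_bytes,
        packet_size_bytes,
        trid,
        edm_to_edm_cmd_buf);
}
```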
perf increase (B/c): mcast on 4 devices: 5.05 -> 5.65 unicast on 2 devices: 6.68->7.13 ### Checklist - [x] [All post commit] https://github.com/tenstorrent/tt-metal/actions/runs/13420435703 - [x] [Blackhole Post commit] https://github.com/tenstorrent/tt-metal/actions/runs/13420449997 - [x] T3K frequent https://github.com/tenstorrent/tt-metal/actions/runs/13420464130 - [x] T3K unit https://github.com/tenstorrent/tt-metal/actions/runs/13420458982/job/37491768972 - [x] T3K nightly https://github.com/tenstorrent/tt-metal/actions/runs/13439719604 --- ...net_write_worker_latency_ubench_common.hpp | 2 +- .../hw/inc/blackhole/noc_nonblocking_api.h | 7 ++++- tt_metal/hw/inc/dataflow_api.h | 30 +++++++++---------- tt_metal/hw/inc/ethernet/erisc.h | 2 ++ tt_metal/hw/inc/ethernet/tunneling.h | 3 ++ .../hw/inc/wormhole/noc_nonblocking_api.h | 7 ++++- .../ccl/kernel_common/worker_edm_utils.hpp | 5 ++-- .../edm_fabric/edm_fabric_worker_adapters.hpp | 16 +++++++--- .../edm_fabric/fabric_erisc_datamover.cpp | 30 ++++++++++++++----- 9 files changed, 70 insertions(+), 32 deletions(-) diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp index 0e1b83b8b94..d634bc5a619 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp @@ -172,7 +172,7 @@ FORCE_INLINE bool has_incoming_packet(volatile eth_buffer_slot_sync_t* buffer_sl } FORCE_INLINE bool write_worker_done(uint32_t trid) { - return ncrisc_noc_nonposted_write_with_transaction_id_flushed(noc_index, trid); + return ncrisc_noc_nonposted_write_with_transaction_id_sent(noc_index, trid); } FORCE_INLINE void ack_complete( diff --git a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h index fb9cd78cb14..9c57bf31dc6 100644 --- a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h @@ -292,11 +292,16 @@ inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_writes_flushed(u return (NOC_STATUS_READ_REG(noc, NIU_MST_WR_ACK_RECEIVED) == noc_nonposted_writes_acked[noc]); } -inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_transaction_id_flushed( +inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_transaction_id_sent( uint32_t noc, uint32_t transcation_id) { return (NOC_STATUS_READ_REG(noc, NIU_MST_WRITE_REQS_OUTGOING_ID(transcation_id)) == 0); } +inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_transaction_id_flushed( + uint32_t noc, uint32_t transcation_id) { + return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transcation_id)) == 0); +} + inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_atomics_flushed(uint32_t noc) { return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == noc_nonposted_atomics_acked[noc]); } diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 7f16650e680..4800b0dc42b 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -2046,26 +2046,24 @@ void noc_async_read_barrier_with_trid(uint32_t trid, uint8_t noc = noc_index) { WAYPOINT("NBTD"); } -inline void noc_async_write_one_packet_with_trid_set_state(std::uint64_t 
dst_noc_addr, uint8_t noc = noc_index) { +FORCE_INLINE void noc_async_write_one_packet_with_trid_set_state( + std::uint64_t dst_noc_addr, uint8_t cmd_buf = write_cmd_buf, uint8_t noc = noc_index) { #ifndef ARCH_GRAYSKULL WAYPOINT("NAWW"); - while (!noc_cmd_buf_ready(noc, write_cmd_buf)); + while (!noc_cmd_buf_ready(noc, cmd_buf)); WAYPOINT("NAWD"); uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(NOC_UNICAST_WRITE_VC) | 0x0 | // (linked ? NOC_CMD_VC_LINKED : 0x0) 0x0 | // (mcast ? (NOC_CMD_PATH_RESERVE | NOC_CMD_BRCST_PACKET) : 0x0) NOC_CMD_RESP_MARKED; - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CTRL, noc_cmd_field); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CTRL, noc_cmd_field); #ifdef ARCH_BLACKHOLE // Handles writing to PCIe - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dst_noc_addr >> 32) & 0x1000000F); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dst_noc_addr >> 32) & 0x1000000F); #endif NOC_CMD_BUF_WRITE_REG( - noc, - write_cmd_buf, - NOC_RET_ADDR_COORDINATE, - (uint32_t)(dst_noc_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); + noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dst_noc_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); #endif } @@ -2074,24 +2072,24 @@ FORCE_INLINE void noc_async_write_one_packet_with_trid_with_state( std::uint32_t dst_noc_addr, std::uint32_t size, std::uint32_t trid, + uint8_t cmd_buf = write_cmd_buf, uint8_t noc = noc_index) { #ifndef ARCH_GRAYSKULL WAYPOINT("NWPW"); - while (!noc_cmd_buf_ready(noc, write_cmd_buf)); + while (!noc_cmd_buf_ready(noc, cmd_buf)); WAYPOINT("NWPD"); // In order to sanitize, need to grab full noc addr + xfer size from state. DEBUG_SANITIZE_NOC_WRITE_TRANSACTION_WITH_ADDR_AND_SIZE_STATE(noc, dst_noc_addr, src_local_l1_addr); - - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(trid)); - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_TARG_ADDR_LO, src_local_l1_addr); - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_LO, dst_noc_addr); - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_AT_LEN_BE, size); - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(trid)); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, src_local_l1_addr); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dst_noc_addr); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, size); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); #endif } -inline void noc_async_write_one_packet_with_trid( +FORCE_INLINE void noc_async_write_one_packet_with_trid( std::uint32_t src_local_l1_addr, std::uint64_t dst_noc_addr, std::uint32_t size, diff --git a/tt_metal/hw/inc/ethernet/erisc.h b/tt_metal/hw/inc/ethernet/erisc.h index 0a476f6b733..1bb4e1ed8b9 100644 --- a/tt_metal/hw/inc/ethernet/erisc.h +++ b/tt_metal/hw/inc/ethernet/erisc.h @@ -18,5 +18,7 @@ inline __attribute__((always_inline)) void risc_context_switch() { #endif } +inline __attribute__((always_inline)) void risc_context_switch_without_noc_sync() { rtos_context_switch_ptr(); } + inline __attribute__((always_inline)) void disable_erisc_app() { flag_disable[0] = 0; } } // namespace internal_ diff --git a/tt_metal/hw/inc/ethernet/tunneling.h b/tt_metal/hw/inc/ethernet/tunneling.h index 92eef061c2d..a2a7e7a7a2d 100644 --- a/tt_metal/hw/inc/ethernet/tunneling.h +++ b/tt_metal/hw/inc/ethernet/tunneling.h @@ -152,3 +152,6 
@@ void run_routing() { // receive of fd packets internal_::risc_context_switch(); } + +FORCE_INLINE +void run_routing_without_noc_sync() { internal_::risc_context_switch_without_noc_sync(); } diff --git a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h index 9b763f44fcf..9bc12dbfff3 100644 --- a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h @@ -248,11 +248,16 @@ inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_writes_flushed(u return (NOC_STATUS_READ_REG(noc, NIU_MST_WR_ACK_RECEIVED) == noc_nonposted_writes_acked[noc]); } -inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_transaction_id_flushed( +inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_transaction_id_sent( uint32_t noc, uint32_t transcation_id) { return (NOC_STATUS_READ_REG(noc, NIU_MST_WRITE_REQS_OUTGOING_ID(transcation_id)) == 0); } +inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_transaction_id_flushed( + uint32_t noc, uint32_t transcation_id) { + return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transcation_id)) == 0); +} + inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_atomics_flushed(uint32_t noc) { return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == noc_nonposted_atomics_acked[noc]); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_edm_utils.hpp b/ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_edm_utils.hpp index b374000953a..3207c24a47c 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_edm_utils.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_edm_utils.hpp @@ -52,8 +52,9 @@ FORCE_INLINE void fetch_chunk( template FORCE_INLINE void send_chunk_from_address_with_trid( - const uint32_t& local_l1_address, const uint32_t& num_pages, const uint32_t& page_size, uint64_t remote_l1_write_addr, uint8_t trid) { - noc_async_write_one_packet_with_trid(local_l1_address, remote_l1_write_addr, page_size * num_pages, trid); + const uint32_t& local_l1_address, const uint32_t& num_pages, const uint32_t& page_size, uint32_t remote_l1_write_addr, uint8_t trid, uint8_t cmd_buf) { + noc_async_write_one_packet_with_trid_with_state(local_l1_address, remote_l1_write_addr, page_size * num_pages, trid, cmd_buf); + // TODO: this barrier will no longer be functional since we are not incrementing noc counters, remove if constexpr (blocking_mode == ttnn::ccl::EDM_IO_BLOCKING_MODE::FLUSH_BLOCKING) { noc_async_writes_flushed(); } else if constexpr (blocking_mode == ttnn::ccl::EDM_IO_BLOCKING_MODE::BLOCKING) { diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp index 4864cea0b29..564ed163999 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp @@ -121,13 +121,20 @@ struct WorkerToFabricEdmSenderImpl { num_buffers_per_channel(num_buffers_per_channel), last_buffer_index(num_buffers_per_channel - 1), edm_noc_x(edm_worker_x), - edm_noc_y(edm_worker_y) { + edm_noc_y(edm_worker_y), + edm_noc_cmd_buf(write_reg_cmd_buf) { + setup_edm_noc_cmd_buf(write_reg_cmd_buf); ASSERT(buffer_size_bytes > 0); if constexpr (USER_DEFINED_NUM_BUFFER_SLOTS) { ASSERT(num_buffers_per_channel == EDM_NUM_BUFFER_SLOTS); } } + FORCE_INLINE void 
setup_edm_noc_cmd_buf(uint8_t cmd_buf) const { + uint64_t edm_noc_addr = get_noc_addr(this->edm_noc_x, this->edm_noc_y, 0); + noc_async_write_one_packet_with_trid_set_state(edm_noc_addr, cmd_buf); + } + FORCE_INLINE bool edm_has_space_for_packet() const { using namespace tt::fabric; if constexpr (USER_DEFINED_NUM_BUFFER_SLOTS) { @@ -278,6 +285,9 @@ struct WorkerToFabricEdmSenderImpl { uint8_t edm_noc_x; uint8_t edm_noc_y; + // the cmd buffer is used for edm-edm path + uint8_t edm_noc_cmd_buf; + private: FORCE_INLINE void update_edm_buffer_slot_wrptr() { @@ -339,12 +349,10 @@ struct WorkerToFabricEdmSenderImpl { } template FORCE_INLINE void send_payload_from_address_with_trid_impl(uint32_t source_address, size_t size_bytes, uint8_t trid) { - uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); - ASSERT(size_bytes <= this->buffer_size_bytes); ASSERT(tt::fabric::is_valid(*const_cast( reinterpret_cast(source_address)))); - send_chunk_from_address_with_trid(source_address, 1, size_bytes, buffer_address, trid); + send_chunk_from_address_with_trid(source_address, 1, size_bytes, this->edm_buffer_addr, trid, this->edm_noc_cmd_buf); post_send_payload_increment_pointers(); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index 4f7b82b5ce7..be1ec45d50d 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -311,11 +311,18 @@ struct WriteTransactionIdTracker { FORCE_INLINE bool transaction_flushed(tt::fabric::BufferIndex buffer_index) const { if constexpr (BOTH_PARAMS_ARE_POW2) { auto trid = this->get_buffer_slot_trid(buffer_index); - return ncrisc_noc_nonposted_write_with_transaction_id_flushed(noc_index, trid); + return ncrisc_noc_nonposted_write_with_transaction_id_sent(noc_index, trid); } else { // TODO: should be able to remove compare against INVALID_TRID auto trid = this->get_buffer_slot_trid(buffer_index); - return trid == INVALID_TRID || ncrisc_noc_nonposted_write_with_transaction_id_flushed(noc_index, trid); + return trid == INVALID_TRID || ncrisc_noc_nonposted_write_with_transaction_id_sent(noc_index, trid); + } + } + FORCE_INLINE void all_buffer_slot_transactions_acked() const { + for (uint8_t i = 0; i < NUM_CHANNELS; ++i) { + tt::fabric::BufferIndex buffer_index(i); + auto trid = this->get_buffer_slot_trid(buffer_index); + noc_async_write_barrier_with_trid(trid, noc_index); } } private: @@ -886,7 +893,8 @@ void run_fabric_edm_main_loop( volatile tt::fabric::EdmFabricReceiverChannelCounters *receiver_channel_counters_ptr, std::array sender_channel_counters_ptrs, PacketHeaderRecorder &receiver_channel_packet_recorder, - std::array &sender_channel_packet_recorders) { + std::array &sender_channel_packet_recorders, + WriteTransactionIdTracker &receiver_channel_trid_tracker) { std::array sender_states = { SenderState::SENDER_WAIT_WORKER_HANDSHAKE, SenderState::SENDER_WAIT_WORKER_HANDSHAKE}; size_t sender_channel_index = 0; @@ -905,8 +913,6 @@ void run_fabric_edm_main_loop( ReceiverChannelPointers receiver_channel_pointers; std::array channel_connection_established = {false, false}; - WriteTransactionIdTracker receiver_channel_trid_tracker; - // This value defines the number of loop iterations we perform of the main control sequence before exiting // to check for termination and context switch. 
Removing the these checks from the inner loop can drastically // improve performance. The value of 32 was chosen somewhat empirically and then raised up slightly. @@ -964,7 +970,8 @@ void run_fabric_edm_main_loop( } else { if (did_nothing_count++ > SWITCH_INTERVAL) { did_nothing_count = 0; - run_routing(); + // shouldn't do noc counter sync since we are not incrementing them + run_routing_without_noc_sync(); } } } @@ -1212,6 +1219,9 @@ void kernel_main() { } + WriteTransactionIdTracker receiver_channel_trid_tracker; + + if (has_downstream_edm_buffer_connection) { downstream_edm_noc_interface.open(); *downstream_edm_noc_interface.from_remote_buffer_slot_rdptr_ptr = 0; @@ -1240,7 +1250,8 @@ void kernel_main() { receiver_channel_counters_ptr, {sender_channel_0_counters_ptr, sender_channel_1_counters_ptr}, receiver_channel_packet_recorder, - sender_channel_packet_recorders); + sender_channel_packet_recorders, + receiver_channel_trid_tracker); if constexpr (persistent_mode) { @@ -1251,6 +1262,11 @@ void kernel_main() { *sender0_worker_semaphore_ptr = 99; } + // make sure all the noc transactions are acked before re-init the noc counters + receiver_channel_trid_tracker.all_buffer_slot_transactions_acked(); + // re-init the noc counters as the noc api used is not incrementing them + ncrisc_noc_counters_init(); + DPRINT << "EDM DONE\n"; WAYPOINT("DONE"); } From 42cf08b2235aa5f7fddf2689dea9479bfb594ee1 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Thu, 20 Feb 2025 14:19:59 -0500 Subject: [PATCH 195/316] #17754: Lower Indestructible to Metal, add guidance on using static vars with non-trivial destructors (#17899) ### Ticket #17754, #17607 ### Problem description Variables with static storage duration should have trivial destructors. Add guidance on why this so, and lower `Indestructible` utility to Metal, as the suggested alternative. ### What's changed * Lower `Indestructible` from tt-train to Metal. * Add guidance to best practices doc. * Add comments and a test for `Indestructible`. ### Checklist - [X] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13350667483) - [X] New/Existing tests provide coverage for changes - [X] Checked that standalone tt-train compiles. 
--- CODEOWNERS | 2 +- contributing/BestPractices.md | 55 ++++++++++++++++++- tests/tt_metal/tt_metal/stl/CMakeLists.txt | 1 + .../tt_metal/stl/test_indestructible.cpp | 25 +++++++++ .../sources/ttml/autograd/auto_context.cpp | 2 +- .../sources/ttml/autograd/auto_context.hpp | 4 +- tt-train/sources/ttml/core/indestructible.hpp | 40 -------------- tt_metal/tt_stl/indestructible.hpp | 51 +++++++++++++++++ 8 files changed, 134 insertions(+), 46 deletions(-) create mode 100644 tests/tt_metal/tt_metal/stl/test_indestructible.cpp delete mode 100644 tt-train/sources/ttml/core/indestructible.hpp create mode 100644 tt_metal/tt_stl/indestructible.hpp diff --git a/CODEOWNERS b/CODEOWNERS index 62994bfe05c..4acdc090cef 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -80,7 +80,7 @@ tt_metal/hw/firmware/src/*erisc* @aliuTT @ubcheema tt_metal/hw/inc/ethernet/ @aliuTT @ubcheema tt_metal/hw/inc/wormhole/eth_l1_address_map.h @aliuTT @ubcheema tt_metal/third_party/tt_llk_* @rtawfik01 @ttmtrajkovic @rdjogoTT -tt_metal/tt_stl/ @patrickroberts @ayerofieiev-tt @dmakoviichuk-tt @sminakov-tt +tt_metal/tt_stl/ @patrickroberts @ayerofieiev-tt @dmakoviichuk-tt @sminakov-tt @omilyutin-tt sfpi/ @pgkeller diff --git a/contributing/BestPractices.md b/contributing/BestPractices.md index 13a8efcaba2..c805c87ac3e 100644 --- a/contributing/BestPractices.md +++ b/contributing/BestPractices.md @@ -1,4 +1,4 @@ -# Best Practices for C++20 Repository +# Best Practices for Contributing to TT Metal ## 1. Pass Complex Types by Const References @@ -319,7 +319,7 @@ struct PadDimension { ``` Motivation - **Bug Prevention:** Reduces the risk of bugs due to uninitialized variables. -- **Code Safety:** Ensures that all variables have a known value, leading to safer and more predictable code. +- **Safety:** Ensures that all variables have a known value, leading to safer and more predictable code. - **Ease of Review:** Simplifies code reviews by making initialization explicit. ## 16. Use Early Exit for Contract Checks @@ -354,3 +354,54 @@ void doSomething(...) { - **Code Clarity:** Improves code clarity by reducing unnecessary nesting. - **Maintainability:** Makes the code easier to maintain by focusing on the main logic once preconditions are validated. - **Efficiency:** Potentially improves performance by avoiding unnecessary processing when contract conditions aren't met. + +## 17. Avoid `static` variables with non-trivial destructors +### Practice +Avoid using `static` variables with non-trivial destructors. When applicable, use `tt::stl::Indestructible` to create static objects with disabled destructor. + +### Explanation +Objects with static storage duration (globals, static class members, or function-local statics) live from initialization until program termination. + +A non-trivial destructor (i.e., one that is user-defined or virtual) may depend on the state of other objects, which might have already been destroyed by the time it is invoked. This can lead to undefined behavior or subtle bugs, especially in the multi-threaded environments. + +An object is considered trivially destructible if it has no custom or virtual destructor and all its bases and non-static members are also trivially destructible. Examples include: fundamental types (pointers, int, float, etc.), arrays of trivially destructible types, variables marked with `constexpr`. + +To ensure safe and predictable program termination, static objects should meet these criteria. 
If dynamic initialization is required, consider using function-local statics with `tt::stl::Indestructible` that disables destruction. + +### Motivation +- **Safety:** Prevents accessing objects after they have been destroyed. +- **Maintainability:** Simplifies tracking the lifetime of objects and helps avoid errors related to destruction ordering. + +### Example +**Avoid:** +```cpp +// Bad: Using a static object with a non-trivial destructor. +static const std::map kDeviceConfigFiles = { + {1, "n150.yaml"}, + {2, "n300.yaml"}, + {8, "t3000.yaml"} +}; +``` + +**Prefer:** +```cpp +// Option 1: Use a trivial type for static data when possible. +constexpr std::string_view kData = "Trivial destructor! Good!"; + +constexpr uint32_t kMaxNumberOfCommandQueues = 2; + +// Using array of trivially destructible types is OK. +constexpr std::array kDeviceIds = {1, 2, 8}; + +// Option 2: If dynamic initialization is required, use function-local statics with `Indestructible`. +const auto& get_device_configs() { + static tt::stl::Indestructible> configs{ + std::map{ + {1, "n150.yaml"}, + {2, "n300.yaml"}, + {8, "t3000.yaml"} + } + }; + return configs.get(); +} +``` diff --git a/tests/tt_metal/tt_metal/stl/CMakeLists.txt b/tests/tt_metal/tt_metal/stl/CMakeLists.txt index 0a5de5f45b0..061650d4105 100644 --- a/tests/tt_metal/tt_metal/stl/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/stl/CMakeLists.txt @@ -1,5 +1,6 @@ set(UNIT_TESTS_STL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/test_any_range.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_indestructible.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_slotmap.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_strong_type.cpp ) diff --git a/tests/tt_metal/tt_metal/stl/test_indestructible.cpp b/tests/tt_metal/tt_metal/stl/test_indestructible.cpp new file mode 100644 index 00000000000..3006c9e252a --- /dev/null +++ b/tests/tt_metal/tt_metal/stl/test_indestructible.cpp @@ -0,0 +1,25 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "tt_metal/tt_stl/indestructible.hpp" + +namespace tt::stl { +namespace { + +TEST(IndestructibleTest, Basic) { + struct DangerouslyDestructible { + ~DangerouslyDestructible() { + // Wrapping in a lambda, as `FAIL()` returns `void`. 
+ []() { FAIL(); }(); + } + }; + + Indestructible obj; +} + +} // namespace +} // namespace tt::stl diff --git a/tt-train/sources/ttml/autograd/auto_context.cpp b/tt-train/sources/ttml/autograd/auto_context.cpp index dff1ac0d5ff..ebe1afc0726 100644 --- a/tt-train/sources/ttml/autograd/auto_context.cpp +++ b/tt-train/sources/ttml/autograd/auto_context.cpp @@ -26,7 +26,7 @@ uint32_t AutoContext::get_seed() const { } AutoContext& AutoContext::get_instance() { - static core::Indestructible instance{}; + static tt::stl::Indestructible instance{}; return instance.get(); } std::optional AutoContext::add_backward_node(GradFunction&& grad_function, std::span links) { diff --git a/tt-train/sources/ttml/autograd/auto_context.hpp b/tt-train/sources/ttml/autograd/auto_context.hpp index cd62b151137..8d335836ca4 100644 --- a/tt-train/sources/ttml/autograd/auto_context.hpp +++ b/tt-train/sources/ttml/autograd/auto_context.hpp @@ -4,10 +4,10 @@ #pragma once +#include #include #include -#include "core/indestructible.hpp" #include "core/mesh_device.hpp" #include "graph.hpp" @@ -62,7 +62,7 @@ class AutoContext { tt::tt_metal::distributed::MeshShape m_mesh_shape = {1, 1}; std::unique_ptr m_device; - friend class core::Indestructible; + friend class tt::stl::Indestructible; }; inline auto& ctx() { diff --git a/tt-train/sources/ttml/core/indestructible.hpp b/tt-train/sources/ttml/core/indestructible.hpp deleted file mode 100644 index eb30d101bd2..00000000000 --- a/tt-train/sources/ttml/core/indestructible.hpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once -#include - -namespace ttml::core { - -template -class Indestructible { -public: - template - explicit Indestructible(Args&&... args) { - // Construct T in our aligned storage - new (&storage) T(std::forward(args)...); - } - - T& get() { - return *reinterpret_cast(&storage); - } - - const T& get() const { - return *reinterpret_cast(&storage); - } - - // Disable copy and assignment - Indestructible(const Indestructible&) = delete; - Indestructible& operator=(const Indestructible&) = delete; - - // Destructor does NOT call T's destructor. - // This leaves the object "indestructible." - ~Indestructible() = default; - -private: - // A buffer of unsigned char with alignment of T and size of T - alignas(T) unsigned char storage[sizeof(T)]; -}; - -} // namespace ttml::core diff --git a/tt_metal/tt_stl/indestructible.hpp b/tt_metal/tt_stl/indestructible.hpp new file mode 100644 index 00000000000..7b13aae32db --- /dev/null +++ b/tt_metal/tt_stl/indestructible.hpp @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +namespace tt::stl { + +// `Indestructible` is a wrapper around `T` that behaves like `T` but does not call the destructor of `T`. +// This is useful for creating objects with static storage duration: `Indestructible` avoids heap allocation, provides +// thread-safe construction, and ensures the destructor is no-op, so does not depend on any other objects. +// +// +// Example usage: +// +// const auto& get_object() { +// static Indestructible object; +// return object.get(); +// } +// +template +class Indestructible { +public: + template + explicit Indestructible(Args&&... 
args) { + // Construct T in our aligned storage + new (&storage_) T(std::forward(args)...); + } + + T& get() { return *std::launder(reinterpret_cast(&storage_)); } + + const T& get() const { return *std::launder(reinterpret_cast(&storage_)); } + + // Disable copy and assignment + Indestructible(const Indestructible&) = delete; + Indestructible& operator=(const Indestructible&) = delete; + + // Destructor does NOT call T's destructor. + // This leaves the object "indestructible." + ~Indestructible() = default; + +private: + // A buffer of std::byte with alignment of T and size of T + alignas(T) std::byte storage_[sizeof(T)]; +}; + +} // namespace tt::stl From ed29888fcba9f83e387d7af6aed3b4d0134d0eef Mon Sep 17 00:00:00 2001 From: Juan Camilo Vega Date: Thu, 20 Feb 2025 14:33:58 -0500 Subject: [PATCH 196/316] #17999: Fixing invalid barrier test (#18103) ### Ticket #17999 ### Problem description New asserts for sharding made the test illegal ### What's changed Changed the sharding configuration in the pytest so the input tensor is legal ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../unit_tests/operations/ccl/test_barrier_t3000_frequent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ttnn/unit_tests/operations/ccl/test_barrier_t3000_frequent.py b/tests/ttnn/unit_tests/operations/ccl/test_barrier_t3000_frequent.py index 96e57bfef96..731da554aac 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_barrier_t3000_frequent.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_barrier_t3000_frequent.py @@ -347,7 +347,7 @@ def test_run_barrier_impl_pcie( # LLama ( (1, 1, 32, 1024), - (32, 32), + (32, 256), ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), ), ), From fd3ed75e96eb5b555f2f39cdefd37d8698ff8418 Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Thu, 20 Feb 2025 19:53:06 +0000 Subject: [PATCH 197/316] Update reshape_view C++ API (#18080) ### Ticket https://github.com/tenstorrent/tt-metal/issues/17720 ### Problem description Currently reshape view C++ API is inconsistent with other operations, not allowing to call it specifying memory_config and not specifying queue id, which creates some issues for tt-mlir ### What's changed Changed reshape_view invoke calls, making QueueId the first argument. In this case decorators automatically allow the calls both with and without QueueId specified. 
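For example, both of the following calls compile against the new signatures (a hedged sketch, not code from this PR: it assumes `ttnn::reshape` is the registered entry point for `ReshapeViewOperation`, and the header path, tensor, shape, and memory config values are placeholders):

```cpp
#include "ttnn/operations/data_movement/reshape_view/reshape.hpp"

ttnn::Tensor reshape_both_ways(const ttnn::Tensor& input, const ttnn::MemoryConfig& mem_config) {
    const ttnn::Shape new_shape({1, 1, 32, 64});  // placeholder target shape

    // No queue id: the operation decorators fill in DefaultQueueId, while
    // memory_config can still be supplied.
    ttnn::Tensor out_default_queue = ttnn::reshape(input, new_shape, mem_config);

    // Explicit queue id as the first argument, matching the new invoke order.
    ttnn::Tensor out_queue0 = ttnn::reshape(ttnn::QueueId(0), input, new_shape, mem_config);
    return out_queue0;
}
```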
### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13429855903) - [x] New/Existing tests provide coverage for changes --- .../data_movement/reshape_view/reshape.cpp | 26 +++++-------------- .../data_movement/reshape_view/reshape.hpp | 22 +++++++--------- 2 files changed, 15 insertions(+), 33 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index 6bb2d3f1398..982271baf61 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -47,7 +47,7 @@ ttnn::Tensor convert_tile_to_rm( auto new_tensor = (tensor.get_dtype() == DataType::BFLOAT8_B) ? ttnn::typecast(tensor, DataType::BFLOAT16) : tensor; new_tensor = ttnn::to_layout(tensor, ttnn::ROW_MAJOR_LAYOUT, std::nullopt, std::nullopt, (IDevice*)nullptr); new_tensor = - ReshapeViewOperation::invoke(new_tensor, logical_shape, padded_shape, memory_config, queue_id, pad_value); + ReshapeViewOperation::invoke(queue_id, new_tensor, logical_shape, padded_shape, memory_config, pad_value); new_tensor = ttnn::to_layout(new_tensor, ttnn::TILE_LAYOUT, new_tensor.get_dtype(), memory_config, (IDevice*)nullptr); new_tensor = @@ -344,11 +344,11 @@ std::pair shape_corrector( } ttnn::Tensor ReshapeViewOperation::invoke( + const QueueId queue_id, const ttnn::Tensor& tensor, const ttnn::Shape& logical_input_shape, const ttnn::Shape& padded_input_shape, const std::optional& memory_config, - const QueueId queue_id, const std::optional& pad_value) { MemoryConfig mem_config = memory_config.value_or(tensor.memory_config()); auto layout = tensor.get_layout(); @@ -431,36 +431,22 @@ ttnn::Tensor ReshapeViewOperation::invoke( } ttnn::Tensor ReshapeViewOperation::invoke( + const QueueId queue_id, const ttnn::Tensor& tensor, const ttnn::Shape& shape, const std::optional& memory_config, - const QueueId queue_id, const std::optional& pad_value) { - return invoke(tensor, shape, shape, memory_config, queue_id, pad_value); -} - -ttnn::Tensor ReshapeViewOperation::invoke(const ttnn::Tensor& tensor, const ttnn::Shape& shape) { - return invoke(tensor, shape, shape, std::nullopt, DefaultQueueId, std::nullopt); -} - -ttnn::Tensor ReshapeViewOperation::invoke( - const ttnn::Tensor& tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape) { - return invoke(tensor, logical_shape, padded_shape, std::nullopt, DefaultQueueId, std::nullopt); + return invoke(queue_id, tensor, shape, shape, memory_config, pad_value); } ttnn::Tensor ReshapeViewOperation::invoke( + const QueueId queue_id, const ttnn::Tensor& tensor, tt::stl::Span shape_vector, const std::optional& memory_config, - const QueueId queue_id, const std::optional& pad_value) { return invoke( - tensor, tt::tt_metal::infer_dims_for_reshape(tensor, shape_vector), memory_config, queue_id, pad_value); -} - -ttnn::Tensor ReshapeViewOperation::invoke(const ttnn::Tensor& tensor, tt::stl::Span shape_vector) { - return invoke( - tensor, tt::tt_metal::infer_dims_for_reshape(tensor, shape_vector), std::nullopt, DefaultQueueId, std::nullopt); + queue_id, tensor, tt::tt_metal::infer_dims_for_reshape(tensor, shape_vector), memory_config, pad_value); } } // ttnn::operations::data_movement namespace diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp index 587657e34ce..963387ebc1b 100644 --- 
a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp @@ -69,28 +69,24 @@ ttnn::Tensor PerformView( struct ReshapeViewOperation { static ttnn::Tensor invoke( + const QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, - const std::optional& memory_config, - const QueueId queue_id, - const std::optional& pad_value); + const std::optional& memory_config = std::nullopt, + const std::optional& pad_value = std::nullopt); static ttnn::Tensor invoke( + const QueueId queue_id, const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape, - const std::optional& memory_config, - const QueueId queue_id, - const std::optional& pad_value); + const std::optional& memory_config = std::nullopt, + const std::optional& pad_value = std::nullopt); static ttnn::Tensor invoke( + const QueueId queue_id, const ttnn::Tensor& input_tensor, tt::stl::Span shape_vector, - const std::optional& memory_config, - const QueueId queue_id, - const std::optional& pad_value); - static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape); - static ttnn::Tensor invoke( - const ttnn::Tensor& input_tensor, const ttnn::Shape& logical_shape, const ttnn::Shape& padded_shape); - static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, tt::stl::Span shape_vector); + const std::optional& memory_config = std::nullopt, + const std::optional& pad_value = std::nullopt); }; } // namespace operations::data_movement From 96ebc7a9a2160f6bc8c0396d3df363b56e636e97 Mon Sep 17 00:00:00 2001 From: Saad Jameel <163029024+sjameelTT@users.noreply.github.com> Date: Thu, 20 Feb 2025 15:45:26 -0500 Subject: [PATCH 198/316] #17966 add RM support for eltwise (#18075) ### Ticket #17966 #17356 ### Problem description Eltwise currently has 0 row major support at all. Also need a test confirming that fused dtype works. ### What's changed As a first step I'm supporting it via untilize/tilize support to unblock any models going forward. Next step will be adding native kernel support. 
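The fallback boils down to the pattern sketched below (illustration only, not the exact code in the diff; it assumes the same namespaces and using-declarations as binary_ng.cpp, and `ttnn::add` stands in for any element-wise binary op):

```cpp
// Sketch of the row-major fallback: tilize inputs, run the tile kernel, untilize the result.
Tensor eltwise_with_row_major_fallback(const Tensor& a, const Tensor& b) {
    const bool a_rm = a.get_layout() == Layout::ROW_MAJOR;
    const bool b_rm = b.get_layout() == Layout::ROW_MAJOR;

    // Tilize row-major inputs so the existing tile-based kernels can run.
    Tensor a_tiled = a_rm ? ttnn::to_layout(a, Layout::TILE, std::nullopt, std::nullopt, (IDevice*)nullptr) : a;
    Tensor b_tiled = b_rm ? ttnn::to_layout(b, Layout::TILE, std::nullopt, std::nullopt, (IDevice*)nullptr) : b;

    Tensor result = ttnn::add(a_tiled, b_tiled);

    // Only when both inputs were row-major is the result converted back,
    // otherwise the tiled result is returned to avoid an extra conversion.
    if (a_rm && b_rm) {
        result = ttnn::to_layout(result, Layout::ROW_MAJOR, std::nullopt, std::nullopt, (IDevice*)nullptr);
    }
    return result;
}
```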
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13421572742 - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/13399500242 - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../eltwise/test_binary_ng_typecast.py | 287 +++++++++++++++++- .../eltwise/binary_ng/binary_ng.cpp | 71 ++++- 2 files changed, 340 insertions(+), 18 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py index df8b8db740a..948775866a7 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_ng_typecast.py @@ -9,6 +9,7 @@ from models.utility_functions import skip_for_grayskull, torch_random from functools import partial from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt +from tests.ttnn.utils_for_testing import assert_with_pcc binary_fns = { @@ -51,8 +52,12 @@ "dtype", ([ttnn.bfloat16]), ) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT]), +) # No typecast on inputs and optional output -def test_opt_output_no_typecast(input_shapes, dtype, ttnn_fn, device): +def test_opt_output_no_typecast(input_shapes, dtype, layout, ttnn_fn, device): torch.manual_seed(0) a_shape, b_shape, out_shape = input_shapes ttnn_op = getattr(ttnn.experimental, ttnn_fn) @@ -66,14 +71,12 @@ def test_opt_output_no_typecast(input_shapes, dtype, ttnn_fn, device): out = gen_func_with_cast_tt(partial(torch_random, low=0, high=1, dtype=torch.bfloat16), dtype)(out_shape) input_tensor_a = ttnn.from_torch( - torch_input_tensor_a, dtype=dtype, device=device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG + torch_input_tensor_a, dtype=dtype, device=device, layout=layout, memory_config=ttnn.DRAM_MEMORY_CONFIG ) input_tensor_b = ttnn.from_torch( - torch_input_tensor_b, dtype=dtype, device=device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG - ) - out_tt = ttnn.from_torch( - out, dtype=dtype, device=device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG + torch_input_tensor_b, dtype=dtype, device=device, layout=layout, memory_config=ttnn.DRAM_MEMORY_CONFIG ) + out_tt = ttnn.from_torch(out, dtype=dtype, device=device, layout=layout, memory_config=ttnn.DRAM_MEMORY_CONFIG) cq_id = 0 ttnn_op(input_tensor_a, input_tensor_b, queue_id=cq_id, output_tensor=out_tt) output_tensor = ttnn.to_torch(out_tt) @@ -660,3 +663,275 @@ def test_opt_output_scalar(input_shapes, ttnn_fn, scalar, device): status = ttnn.pearson_correlation_coefficient(torch_output_tensor, output_tensor) assert status >= 0.999 + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize("input_shape", [(1, 1, 1, 1), (3, 3, 15, 15), (3, 3, 17, 17), (3, 3, 33, 
33)]) +@pytest.mark.parametrize( + "memory_config", + ([ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]), +) +@pytest.mark.parametrize("scalar", [-0.25, -16.5, 0.0, 0.05, 1.7, 19.0]) +@pytest.mark.parametrize( + "ttnn_fn", + [ + "add", + "sub", + "mul", + "div", + "rsub", + "squared_difference", + ], +) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), +) +def test_edgecase_dims_eltwise_scalar_matrix_math(input_shape, scalar, ttnn_fn, memory_config, layout, device): + torch.manual_seed(0) + a_shape = input_shape + + ttnn_op = getattr(ttnn.experimental, ttnn_fn) + torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + output = ttnn_op(input_tensor_a, scalar) + tt_output_tensor = ttnn.to_torch(output) + + golden_fn = ttnn.get_golden_function(ttnn_op) + torch_output_tensor = golden_fn(torch_input_tensor_a, scalar) + + assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize("input_shape", [(1, 1, 1, 1), (3, 3, 15, 15), (3, 3, 17, 17), (3, 3, 33, 33)]) +@pytest.mark.parametrize( + "memory_config", + ([ttnn.DRAM_MEMORY_CONFIG]), +) +@pytest.mark.parametrize("scalar", [-1.0, -2.0, 0.0, 1.0, 2.0, 19.0]) +@pytest.mark.parametrize( + "ttnn_fn", + [ + "gt", + "lt", + "lte", + "gte", + "eq", + "ne", + ], +) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), +) +def test_edgecase_dims_eltwise_scalar_logical(input_shape, scalar, ttnn_fn, memory_config, layout, device): + torch.manual_seed(0) + a_shape = input_shape + + ttnn_op = getattr(ttnn.experimental, ttnn_fn) + torch_input_tensor_a = torch.randint(low=-50, high=50, size=a_shape, dtype=torch.bfloat16) + # guarantee a few equal values + if (ttnn_fn == "eq" or ttnn_fn == "ne" or ttnn_fn == "gte" or ttnn_fn == "lte") and input_shape != (1, 1, 1, 1): + torch_input_tensor_a[0, 0, 0, 0] = scalar + torch_input_tensor_a[-1, -1, -1, -1] = scalar + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + output = ttnn_op(input_tensor_a, scalar, dtype=ttnn.uint32) + tt_output_tensor = ttnn.to_torch(output) + + golden_fn = ttnn.get_golden_function(ttnn_op) + torch_output_tensor = golden_fn(torch_input_tensor_a, scalar) + + assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize( + "input_shapes", + [ + ((1, 7, 1, 1), (7, 7, 33, 33)), + ((7, 1, 1, 1), (7, 7, 49, 49)), + ((7, 7, 65, 65), (7, 7, 65, 65)), + ((2, 2, 10, 1), (2, 2, 10, 2)), + ], +) +@pytest.mark.parametrize( + "memory_config", + ([ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]), +) +@pytest.mark.parametrize( + "ttnn_fn", + [ + "add", + "sub", + "mul", + "div", + "rsub", + "squared_difference", + ], +) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), +) +def test_edgecase_dims_eltwise_broadcast_matrix_math(input_shapes, ttnn_fn, memory_config, layout, device): + torch.manual_seed(0) + a_shape, b_shape = input_shapes + + ttnn_op = getattr(ttnn.experimental, ttnn_fn) + torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) + torch_input_tensor_b = torch.randn(b_shape, dtype=torch.bfloat16) + + if ttnn_fn == "div": + 
torch_input_tensor_b[torch_input_tensor_b.abs() < 0.001] = 0.001 + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + input_tensor_b = ttnn.from_torch( + torch_input_tensor_b, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + output = ttnn_op(input_tensor_a, input_tensor_b, dtype=ttnn.float32) + tt_output_tensor = ttnn.to_torch(output) + + golden_fn = ttnn.get_golden_function(ttnn_op) + torch_output_tensor = golden_fn(torch_input_tensor_a, torch_input_tensor_b) + + assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize( + "input_shapes", + [ + ((1, 7, 1, 1), (7, 7, 33, 33)), + ((7, 1, 1, 1), (7, 7, 49, 49)), + ((7, 7, 65, 65), (7, 7, 65, 65)), + ], +) +@pytest.mark.parametrize( + "memory_config", + ([ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]), +) +@pytest.mark.parametrize( + "ttnn_fn", + [ + "gt", + "lt", + "lte", + "gte", + "eq", + "ne", + ], +) +@pytest.mark.parametrize( + "layout", + ([ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]), +) +def test_edgecase_dims_eltwise_broadcast_logical(input_shapes, ttnn_fn, memory_config, layout, device): + torch.manual_seed(0) + a_shape, b_shape = input_shapes + + ttnn_op = getattr(ttnn.experimental, ttnn_fn) + torch_input_tensor_a = torch.randn(a_shape, dtype=torch.bfloat16) + torch_input_tensor_b = torch.randn(b_shape, dtype=torch.bfloat16) + # guarantee at least one equal value + if ttnn_fn == "eq" or ttnn_fn == "ne" or ttnn_fn == "gte" or ttnn_fn == "lte": + torch_input_tensor_a[0, 0, 0, 0] = torch_input_tensor_b[0, 0, 0, 0] + + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + input_tensor_b = ttnn.from_torch( + torch_input_tensor_b, + dtype=ttnn.bfloat16, + device=device, + layout=layout, + memory_config=memory_config, + ) + + output = ttnn_op(input_tensor_a, input_tensor_b, dtype=ttnn.float32) + tt_output_tensor = ttnn.to_torch(output) + + golden_fn = ttnn.get_golden_function(ttnn_op) + torch_output_tensor = golden_fn(torch_input_tensor_a, torch_input_tensor_b) + + assert_with_pcc(torch_output_tensor, tt_output_tensor, 0.999) + + +@skip_for_grayskull("Requires wormhole_b0 to run") +@pytest.mark.parametrize( + "input_shape, input_layout, input_shard_grid, input_shard_orientation, input_sharding_scheme", + [ + ( + [1, 1, 64, 64], + ttnn.TILE_LAYOUT, + ttnn.CoreGrid(y=1, x=2), + ttnn.ShardOrientation.ROW_MAJOR, + ttnn.ShardStrategy.WIDTH, + ), + ], +) +@pytest.mark.parametrize("input_dtype", [ttnn.bfloat16, ttnn.float32]) +@pytest.mark.parametrize("output_dtype", [ttnn.float32, ttnn.bfloat16]) +def test_binary_div( + device, + input_shape, + input_layout, + input_shard_grid, + input_shard_orientation, + input_sharding_scheme, + input_dtype, + output_dtype, +): + memory_config = ttnn.create_sharded_memory_config( + input_shape, + core_grid=input_shard_grid, + strategy=input_sharding_scheme, + orientation=input_shard_orientation, + use_height_and_width_as_shard_shape=False, + ) + + torch_input_a = torch.rand(input_shape, dtype=torch.bfloat16) + 1 + torch_input_b = torch.rand(input_shape, dtype=torch.bfloat16) + 1 + torch_output = torch_input_a / torch_input_b + + input_tensor_a = ttnn.from_torch( + torch_input_a, layout=input_layout, memory_config=memory_config, dtype=input_dtype, device=device + ) + 
input_tensor_b = ttnn.from_torch( + torch_input_b, layout=input_layout, memory_config=memory_config, dtype=input_dtype, device=device + ) + output_tensor = ttnn.experimental.div(input_tensor_a, input_tensor_b, dtype=output_dtype) + assert_with_pcc(torch_output, ttnn.to_torch(output_tensor), 0.999) diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp index 99c1a77dab0..efa19f1962b 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp @@ -33,6 +33,9 @@ Tensor BinaryNg::invoke( const ttnn::DataType out_dtype = output_preallocated ? optional_output_tensor->get_dtype() : output_dtype.value_or(a_dtype); + const auto mem_config = output_preallocated ? optional_output_tensor->memory_config() + : memory_config.value_or(input_tensor_a.memory_config()); + if (output_dtype.has_value() && output_preallocated) { TT_FATAL( *output_dtype == out_dtype, @@ -43,19 +46,44 @@ Tensor BinaryNg::invoke( bool typecast_b = needs_typecast_to_bfloat16(b_dtype); bool typecast_out = needs_typecast_to_bfloat16(out_dtype); + // RM is never BFLOAT8 or BFLOAT4 so we can assume it goes in here. if (!typecast_a && !typecast_b) { - return ttnn::prim::binary_ng( + bool input_a_rm = input_tensor_a.get_layout() == Layout::ROW_MAJOR; + bool input_b_rm = input_tensor_b.get_layout() == Layout::ROW_MAJOR; + Tensor input_a = + input_a_rm ? ttnn::to_layout(input_tensor_a, Layout::TILE, std::nullopt, std::nullopt, (IDevice*)nullptr) + : input_tensor_a; + Tensor input_b = + input_b_rm ? ttnn::to_layout(input_tensor_b, Layout::TILE, std::nullopt, std::nullopt, (IDevice*)nullptr) + : input_tensor_b; + + if (input_a_rm && input_b_rm) { + // we don't support to_layout with optional output tensor + TT_FATAL( + !output_preallocated, + "Optional output tensor with Row Major input is not supported right now for Elementwise operations"); + } + + Tensor result = ttnn::prim::binary_ng( queue_id, - input_tensor_a, - input_tensor_b, + input_a, + input_b, binary_op_type, out_dtype, - output_preallocated ? optional_output_tensor->memory_config() - : memory_config.value_or(input_tensor_a.memory_config()), + mem_config, optional_output_tensor, lhs_activations, rhs_activations, post_activations); + + // if both inputs are in row major, convert the output to row major + // since there's no consensus here, avoiding the conversion if we have an excuse to is likely the best option + // since it leads to better perf + if (input_a_rm && input_b_rm) { + result = ttnn::to_layout(result, Layout::ROW_MAJOR, std::nullopt, mem_config, (IDevice*)nullptr); + } + + return result; } else { Tensor input_a = typecast_to(DataType::BFLOAT16, input_tensor_a); Tensor input_b = typecast_to(DataType::BFLOAT16, input_tensor_b); @@ -69,13 +97,13 @@ Tensor BinaryNg::invoke( input_b, binary_op_type, input_a.get_dtype(), - input_a.memory_config(), + mem_config, output_tensor, lhs_activations, rhs_activations, post_activations); - return typecast_out ? ttnn::typecast(result, out_dtype, std::nullopt, optional_output_tensor) : result; + return typecast_out ? ttnn::typecast(result, out_dtype, mem_config, optional_output_tensor) : result; } } @@ -116,6 +144,8 @@ Tensor BinaryNg::invoke( const bool output_preallocated = optional_output_tensor.has_value(); const ttnn::DataType out_dtype = output_preallocated ? optional_output_tensor->get_dtype() : output_dtype.value_or(a_dtype); + const auto mem_config = output_preallocated ? 
optional_output_tensor->memory_config() + : memory_config.value_or(input_tensor_a.memory_config()); if (output_dtype.has_value() && output_preallocated) { TT_FATAL( @@ -127,18 +157,35 @@ Tensor BinaryNg::invoke( bool typecast_out = needs_typecast_to_bfloat16(out_dtype); if (!typecast_a) { - return ttnn::prim::binary_ng( + bool input_a_rm = input_tensor_a.get_layout() == Layout::ROW_MAJOR; + if (input_a_rm) { + // we don't support to_layout with optional output tensor + TT_FATAL( + !output_preallocated, + "Optional output tensor with Row Major input is not supported right now for Elementwise operations"); + } + Tensor input_a = + input_a_rm + ? ttnn::to_layout( + input_tensor_a, Layout::TILE, std::nullopt, input_tensor_a.memory_config(), (IDevice*)nullptr) + : input_tensor_a; + Tensor result = ttnn::prim::binary_ng( queue_id, - input_tensor_a, + input_a, scalar, binary_op_type, out_dtype, - output_preallocated ? optional_output_tensor->memory_config() - : memory_config.value_or(input_tensor_a.memory_config()), + mem_config, optional_output_tensor, lhs_activations, rhs_activations, post_activations); + + // if input is in row major, convert the output to row major + if (input_a_rm) { + result = ttnn::to_layout(result, Layout::ROW_MAJOR, std::nullopt, mem_config, (IDevice*)nullptr); + } + return result; } else { Tensor input_a = typecast_to(DataType::BFLOAT16, input_tensor_a); const auto output_tensor = output_preallocated and typecast_out @@ -151,7 +198,7 @@ Tensor BinaryNg::invoke( scalar, binary_op_type, input_a.get_dtype(), - input_a.memory_config(), + mem_config, output_tensor, lhs_activations, rhs_activations, From cb84d2eb6ab96b94f2e82a1e429ef84859b3528c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Friedrich=20Sch=C3=B6ller?= Date: Thu, 20 Feb 2025 21:50:40 +0100 Subject: [PATCH 199/316] #18082: Fix creation of mesh devices (#18083) ### Ticket https://github.com/tenstorrent/tt-metal/issues/18082 ### Problem description Due to a missing include, automatic pybind11 conversions were not possible, so the creation of mesh devices failed. ### What's changed Added the missing include to fix automatic pybind11 conversion. 
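For context, a minimal standalone pybind11 module (not tt-metal code; the module and function names are made up) showing the failure mode this include fixes: without `pybind11/stl.h` there is no type caster for STL containers, so passing a Python list to a bound function taking `std::vector` raises a `TypeError`.

```cpp
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>  // enables automatic list/dict <-> std::vector/std::map conversion

#include <cstddef>
#include <vector>

namespace py = pybind11;

// Stand-in for an API such as opening a mesh device from a list of device ids.
std::size_t count_devices(const std::vector<int>& device_ids) { return device_ids.size(); }

PYBIND11_MODULE(example, m) {
    // With the stl.h include in place, Python can call example.count_devices([0, 1, 2, 3]).
    m.def("count_devices", &count_devices, py::arg("device_ids"));
}
```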
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- ttnn/cpp/ttnn/distributed/distributed_pybind.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp index 83cb636335f..50ee1506df5 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp +++ b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp @@ -10,6 +10,10 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/types.hpp" +// This is required for automatic conversions, as in the creation of mesh devices +// https://github.com/tenstorrent/tt-metal/issues/18082 +#include "pybind11/stl.h" + using namespace tt::tt_metal; namespace ttnn::distributed { From 8dd749ee60de28f901cad666bab021059fc3c95e Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Thu, 20 Feb 2025 21:19:40 +0000 Subject: [PATCH 200/316] Simplify repeat device operation (#18102) ### Ticket ### Problem description There is a redundant `create_output_tensors` in repeat_device_operation, which duplicates the logic of `compute_output_specs` ### What's changed Remove `create_output_tensors`, it will be automatically generated using `compute_output_specs`. Use regular TensorLayout constructor instead of `TensorLayout::fromPaddedShape` ### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13442465835) - [x] New/Existing tests provide coverage for changes --- .../repeat/device/repeat_device_operation.cpp | 24 +------------------ .../repeat/device/repeat_device_operation.hpp | 1 - 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.cpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.cpp index 5e38b7aa6b0..621b42fd58d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.cpp @@ -37,29 +37,7 @@ std::vector RepeatDeviceOperation::compute_output_specs(const std::v mem_config.shard_spec = shard_spec; } return {TensorSpec( - output_shape, - TensorLayout::fromPaddedShape( - input_tensor_a.get_dtype(), - PageConfig(input_tensor_a.get_layout()), - mem_config, - output_shape, - output_shape))}; // no padding requried because we are RM only right now -} - -std::vector RepeatDeviceOperation::create_output_tensors(const std::vector& input_tensors) const { - // Create the output tensor - const auto& input_tensor_a = input_tensors.at(0); - const auto output_shape = this->compute_output_specs(input_tensors).at(0).logical_shape(); - - // is this relevant? 
- auto mem_config = this->m_output_mem_config; - if (input_tensor_a.memory_config().is_sharded()) { - auto shard_spec = input_tensor_a.shard_spec().value(); - shard_spec.shape[0] = output_shape[0]; - mem_config.shard_spec = shard_spec; - } - return {create_device_tensor( - output_shape, input_tensor_a.get_dtype(), input_tensor_a.get_layout(), input_tensor_a.device(), mem_config)}; + output_shape, TensorLayout(input_tensor_a.get_dtype(), PageConfig(input_tensor_a.get_layout()), mem_config))}; } operation::ProgramWithCallbacks RepeatDeviceOperation::create_program( diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.hpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.hpp index 7ae7d881b80..d8bec905880 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.hpp @@ -15,7 +15,6 @@ struct RepeatDeviceOperation { // Required functions to all tensor op functions void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - std::vector create_output_tensors(const std::vector& input_tensors) const; operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; From 82b7b05f923d9d582fb473611a469b0c597f391b Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Thu, 20 Feb 2025 13:40:10 -0800 Subject: [PATCH 201/316] [skip ci] Update bisect-dispatch.yaml (#18077) --- .github/workflows/bisect-dispatch.yaml | 31 ++++++++++++++++++++++++-- tests/scripts/tt_bisect.sh | 2 +- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/.github/workflows/bisect-dispatch.yaml b/.github/workflows/bisect-dispatch.yaml index 72e2054d66c..61f373958a1 100644 --- a/.github/workflows/bisect-dispatch.yaml +++ b/.github/workflows/bisect-dispatch.yaml @@ -10,9 +10,33 @@ on: - grayskull - wormhole_b0 - blackhole + tracy: + required: true + type: boolean + default: false + description: "Build with tracy enabled" + build-wheel: + required: true + type: boolean + default: false + description: "Build Python Wheel" runner-label: + required: true + type: choice + options: + - E150 + - N150 + - N300 + - P150 + - config-t3000 + - config-tg + - config-tgg + description: "Runner Type Label" + extra-label: required: true type: string + default: "in-service" + description: "Secondary tag to filter runners" good-commit: required: true type: string @@ -32,6 +56,9 @@ jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml secrets: inherit + with: + tracy: ${{ inputs.tracy }} + build-wheel: ${{ inputs.build-wheel }} test-dispatch: needs: build-artifact timeout-minutes: 1440 @@ -39,7 +66,7 @@ jobs: ARCH_NAME: ${{ inputs.arch }} runs-on: - ${{ inputs.runner-label }} - - "in-service" + - ${{ inputs.extra-label }} steps: - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - name: Set up dyanmic env vars for build @@ -47,7 +74,7 @@ jobs: echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_any + name: ${{ needs.build-artifact.outputs.build-artifact-name }} - name: Extract files run: tar -xvf ttm_any.tar - uses: ./.github/actions/install-python-deps diff --git a/tests/scripts/tt_bisect.sh b/tests/scripts/tt_bisect.sh index 85915d1d2bd..28becf7a83f 100755 --- a/tests/scripts/tt_bisect.sh +++ b/tests/scripts/tt_bisect.sh @@ -64,7 
+64,7 @@ while [[ "$found" = "false" ]]; do continue fi - timeout $timeout_duration $test + timeout $timeout_duration bash -c "$test" timeout_code=${PIPESTATUS[0]} echo $timeout_code From 48d0ece2ad4602cf1a86ccc811d58d5cf9923bc9 Mon Sep 17 00:00:00 2001 From: John Bauman Date: Wed, 19 Feb 2025 22:24:04 +0000 Subject: [PATCH 202/316] Add benchmark to capture go message latency This benchmark has small amounts of data sent by the dispatcher and minimal work needed to load a kernel (no CBs or NCRISC binaries), so if the kernel itself takes long enough, most of the time between kernels will be spent waiting for a go message. --- .../dispatch/pgm_dispatch_golden.json | 1109 +++++++++-------- .../dispatch/test_pgm_dispatch.cpp | 25 +- 2 files changed, 628 insertions(+), 506 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json index 99404547dc7..2ef238726e9 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json @@ -1,7 +1,7 @@ { "context": { - "date": "2025-02-17T16:09:05+00:00", - "host_name": "tt-metal-ci-vm-190", + "date": "2025-02-20T00:45:37+00:00", + "host_name": "tt-metal-ci-vm-163", "executable": "./build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_wormhole_b0", "num_cpus": 14, "mhz_per_cpu": 2300, @@ -32,7 +32,7 @@ "num_sharing": 1 } ], - "load_avg": [8.73,8.27,8.15], + "load_avg": [10.85,14.93,17.13], "library_version": "v1.9.1", "library_build_type": "debug", "json_schema_version": 1 @@ -48,10 +48,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.6730076923076924e+07, - "cpu_time": 2.3336153846153637e+04, + "real_time": 2.6723500000000004e+07, + "cpu_time": 2.7281923076922314e+04, "time_unit": "ns", - "IterationTime": 2.6730076923076924e-06 + "IterationTime": 2.6723500000000008e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/512/manual_time", @@ -63,10 +63,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.6894346153846148e+07, - "cpu_time": 2.4738846153846353e+04, + "real_time": 2.6898615384615384e+07, + "cpu_time": 2.3647692307690377e+04, "time_unit": "ns", - "IterationTime": 2.6894346153846151e-06 + "IterationTime": 2.6898615384615384e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/1024/manual_time", @@ -78,10 +78,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.7130807692307692e+07, - "cpu_time": 2.3016923076922227e+04, + "real_time": 2.7135038461538460e+07, + "cpu_time": 2.6173846153846840e+04, "time_unit": "ns", - "IterationTime": 2.7130807692307694e-06 + "IterationTime": 2.7135038461538459e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/2048/manual_time", @@ -93,10 +93,10 @@ "repetition_index": 0, "threads": 1, "iterations": 25, - "real_time": 2.7683120000000004e+07, - "cpu_time": 2.3659639999999981e+04, + "real_time": 2.7682240000000000e+07, + "cpu_time": 2.7707479999996562e+04, "time_unit": "ns", - "IterationTime": 2.7683120000000002e-06 + "IterationTime": 2.7682240000000003e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/4096/manual_time", @@ -108,10 +108,10 @@ "repetition_index": 0, "threads": 1, "iterations": 24, - "real_time": 2.9706791666666672e+07, - "cpu_time": 2.2529416666666744e+04, + "real_time": 2.9701666666666657e+07, + "cpu_time": 2.6383541666665420e+04, "time_unit": "ns", - "IterationTime": 
2.9706791666666672e-06 + "IterationTime": 2.9701666666666657e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/8192/manual_time", @@ -123,10 +123,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.2475590909090903e+07, - "cpu_time": 2.4634954545455952e+04, + "real_time": 3.2618636363636363e+07, + "cpu_time": 3.5915136363635029e+04, "time_unit": "ns", - "IterationTime": 3.2475590909090901e-06 + "IterationTime": 3.2618636363636362e-06 }, { "name": "BM_pgm_dispatch/brisc_only_trace/12288/manual_time", @@ -138,10 +138,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.5464200000000007e+07, - "cpu_time": 2.2655500000001717e+04, + "real_time": 3.5456300000000000e+07, + "cpu_time": 2.9020000000001823e+04, "time_unit": "ns", - "IterationTime": 3.5464200000000010e-06 + "IterationTime": 3.5456300000000000e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/256/manual_time", @@ -153,10 +153,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.6713653846153848e+07, - "cpu_time": 2.2773076923076318e+04, + "real_time": 2.6716115384615384e+07, + "cpu_time": 2.8298846153847095e+04, "time_unit": "ns", - "IterationTime": 2.6713653846153849e-06 + "IterationTime": 2.6716115384615382e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/512/manual_time", @@ -168,10 +168,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.6892884615384616e+07, - "cpu_time": 2.3196538461534874e+04, + "real_time": 2.6904692307692308e+07, + "cpu_time": 2.9525000000000331e+04, "time_unit": "ns", - "IterationTime": 2.6892884615384616e-06 + "IterationTime": 2.6904692307692307e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/1024/manual_time", @@ -183,10 +183,10 @@ "repetition_index": 0, "threads": 1, "iterations": 26, - "real_time": 2.7130423076923076e+07, - "cpu_time": 2.1398461538454285e+04, + "real_time": 2.7138307692307688e+07, + "cpu_time": 3.2016230769230744e+04, "time_unit": "ns", - "IterationTime": 2.7130423076923079e-06 + "IterationTime": 2.7138307692307689e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/2048/manual_time", @@ -198,10 +198,10 @@ "repetition_index": 0, "threads": 1, "iterations": 25, - "real_time": 2.7683520000000000e+07, - "cpu_time": 2.2990679999992382e+04, + "real_time": 2.7686920000000000e+07, + "cpu_time": 2.9539840000003533e+04, "time_unit": "ns", - "IterationTime": 2.7683520000000004e-06 + "IterationTime": 2.7686920000000001e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/4096/manual_time", @@ -213,10 +213,10 @@ "repetition_index": 0, "threads": 1, "iterations": 24, - "real_time": 2.9707708333333340e+07, - "cpu_time": 2.4864708333331248e+04, + "real_time": 2.9711166666666657e+07, + "cpu_time": 2.9315125000003070e+04, "time_unit": "ns", - "IterationTime": 2.9707708333333341e-06 + "IterationTime": 2.9711166666666665e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/8192/manual_time", @@ -228,10 +228,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.2475227272727262e+07, - "cpu_time": 2.3398636363641304e+04, + "real_time": 3.2541499999999996e+07, + "cpu_time": 2.9300954545452561e+04, "time_unit": "ns", - "IterationTime": 3.2475227272727262e-06 + "IterationTime": 3.2541499999999996e-06 }, { "name": "BM_pgm_dispatch/ncrisc_only_trace/12288/manual_time", @@ -243,10 +243,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.5465350000000000e+07, - "cpu_time": 2.4466999999994689e+04, + "real_time": 3.5457350000000007e+07, + "cpu_time": 
2.8714999999990272e+04, "time_unit": "ns", - "IterationTime": 3.5465349999999997e-06 + "IterationTime": 3.5457350000000005e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/256/manual_time", @@ -258,10 +258,10 @@ "repetition_index": 0, "threads": 1, "iterations": 24, - "real_time": 2.9075708333333332e+07, - "cpu_time": 2.3487499999993073e+04, + "real_time": 2.9076750000000000e+07, + "cpu_time": 2.8871249999997312e+04, "time_unit": "ns", - "IterationTime": 2.9075708333333332e-06 + "IterationTime": 2.9076749999999997e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/512/manual_time", @@ -273,10 +273,10 @@ "repetition_index": 0, "threads": 1, "iterations": 24, - "real_time": 2.9075458333333340e+07, - "cpu_time": 2.5067874999988122e+04, + "real_time": 2.9078583333333328e+07, + "cpu_time": 3.2055000000012307e+04, "time_unit": "ns", - "IterationTime": 2.9075458333333340e-06 + "IterationTime": 2.9078583333333325e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/1024/manual_time", @@ -288,10 +288,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 2.9828217391304348e+07, - "cpu_time": 2.2127217391293176e+04, + "real_time": 2.9838043478260875e+07, + "cpu_time": 2.7489000000008553e+04, "time_unit": "ns", - "IterationTime": 2.9828217391304348e-06 + "IterationTime": 2.9838043478260870e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/2048/manual_time", @@ -303,10 +303,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.3546238095238108e+07, - "cpu_time": 2.2843809523807682e+04, + "real_time": 3.3531809523809519e+07, + "cpu_time": 2.8204238095241864e+04, "time_unit": "ns", - "IterationTime": 3.3546238095238102e-06 + "IterationTime": 3.3531809523809517e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/4096/manual_time", @@ -318,10 +318,10 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.8659222222222216e+07, - "cpu_time": 2.3362222222224183e+04, + "real_time": 3.8661722222222231e+07, + "cpu_time": 2.8678888888874117e+04, "time_unit": "ns", - "IterationTime": 3.8659222222222217e-06 + "IterationTime": 3.8661722222222239e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/8192/manual_time", @@ -333,10 +333,10 @@ "repetition_index": 0, "threads": 1, "iterations": 15, - "real_time": 4.6317666666666664e+07, - "cpu_time": 2.5929333333341019e+04, + "real_time": 4.6449466666666664e+07, + "cpu_time": 3.4942666666667086e+04, "time_unit": "ns", - "IterationTime": 4.6317666666666669e-06 + "IterationTime": 4.6449466666666656e-06 }, { "name": "BM_pgm_dispatch/trisc_only_trace/12288/manual_time", @@ -348,10 +348,10 @@ "repetition_index": 0, "threads": 1, "iterations": 13, - "real_time": 5.4694230769230768e+07, - "cpu_time": 2.7805461538474508e+04, + "real_time": 5.4678076923076920e+07, + "cpu_time": 3.1526230769245263e+04, "time_unit": "ns", - "IterationTime": 5.4694230769230770e-06 + "IterationTime": 5.4678076923076923e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/256/manual_time", @@ -363,10 +363,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 2.9950565217391301e+07, - "cpu_time": 2.1679434782619621e+04, + "real_time": 2.9953652173913039e+07, + "cpu_time": 2.7541652173924052e+04, "time_unit": "ns", - "IterationTime": 2.9950565217391299e-06 + "IterationTime": 2.9953652173913039e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/512/manual_time", @@ -378,10 +378,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0197434782608695e+07, - "cpu_time": 
2.2568478260875934e+04, + "real_time": 3.0217695652173914e+07, + "cpu_time": 2.8143086956519077e+04, "time_unit": "ns", - "IterationTime": 3.0197434782608692e-06 + "IterationTime": 3.0217695652173911e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/1024/manual_time", @@ -393,10 +393,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.1887909090909086e+07, - "cpu_time": 2.3819681818183399e+04, + "real_time": 3.1854545454545461e+07, + "cpu_time": 2.8280409090914400e+04, "time_unit": "ns", - "IterationTime": 3.1887909090909085e-06 + "IterationTime": 3.1854545454545458e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/2048/manual_time", @@ -408,10 +408,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.5937210526315793e+07, - "cpu_time": 2.1740000000004005e+04, + "real_time": 3.5941894736842103e+07, + "cpu_time": 2.7991578947383328e+04, "time_unit": "ns", - "IterationTime": 3.5937210526315797e-06 + "IterationTime": 3.5941894736842104e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/4096/manual_time", @@ -423,10 +423,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.1428294117647067e+07, - "cpu_time": 2.6309411764709432e+04, + "real_time": 4.1464941176470578e+07, + "cpu_time": 2.8563529411757321e+04, "time_unit": "ns", - "IterationTime": 4.1428294117647069e-06 + "IterationTime": 4.1464941176470575e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/8192/manual_time", @@ -438,10 +438,10 @@ "repetition_index": 0, "threads": 1, "iterations": 13, - "real_time": 5.2825692307692304e+07, - "cpu_time": 2.5559999999988584e+04, + "real_time": 5.2712923076923065e+07, + "cpu_time": 2.9768461538459691e+04, "time_unit": "ns", - "IterationTime": 5.2825692307692300e-06 + "IterationTime": 5.2712923076923071e-06 }, { "name": "BM_pgm_dispatch/brisc_trisc_only_trace/12288/manual_time", @@ -453,10 +453,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.4249545454545468e+07, - "cpu_time": 2.4714545454566789e+04, + "real_time": 6.4287909090909094e+07, + "cpu_time": 3.0248181818161931e+04, "time_unit": "ns", - "IterationTime": 6.4249545454545459e-06 + "IterationTime": 6.4287909090909088e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/256/manual_time", @@ -468,10 +468,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.1338136363636352e+07, - "cpu_time": 2.3316954545463374e+04, + "real_time": 3.1347363636363629e+07, + "cpu_time": 2.7595363636369959e+04, "time_unit": "ns", - "IterationTime": 3.1338136363636358e-06 + "IterationTime": 3.1347363636363633e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/512/manual_time", @@ -483,10 +483,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.1957136363636363e+07, - "cpu_time": 2.4401090909075374e+04, + "real_time": 3.1968454545454536e+07, + "cpu_time": 2.8872136363650661e+04, "time_unit": "ns", - "IterationTime": 3.1957136363636368e-06 + "IterationTime": 3.1968454545454536e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/1024/manual_time", @@ -498,10 +498,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.3438000000000007e+07, - "cpu_time": 2.2249333333332477e+04, + "real_time": 3.3430047619047619e+07, + "cpu_time": 2.7584142857142589e+04, "time_unit": "ns", - "IterationTime": 3.3438000000000005e-06 + "IterationTime": 3.3430047619047614e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/2048/manual_time", @@ -513,10 +513,10 @@ 
"repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.8705333333333336e+07, - "cpu_time": 2.1913888888885285e+04, + "real_time": 3.8706222222222224e+07, + "cpu_time": 2.9503444444461336e+04, "time_unit": "ns", - "IterationTime": 3.8705333333333330e-06 + "IterationTime": 3.8706222222222218e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/4096/manual_time", @@ -528,10 +528,10 @@ "repetition_index": 0, "threads": 1, "iterations": 15, - "real_time": 4.5641533333333343e+07, - "cpu_time": 2.3505999999991665e+04, + "real_time": 4.5603533333333328e+07, + "cpu_time": 3.3940666666657882e+04, "time_unit": "ns", - "IterationTime": 4.5641533333333340e-06 + "IterationTime": 4.5603533333333325e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/8192/manual_time", @@ -543,10 +543,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.9665083333333321e+07, - "cpu_time": 2.5379166666672503e+04, + "real_time": 5.9665333333333343e+07, + "cpu_time": 3.1440833333358973e+04, "time_unit": "ns", - "IterationTime": 5.9665083333333329e-06 + "IterationTime": 5.9665333333333355e-06 }, { "name": "BM_pgm_dispatch/all_processors_trace/12288/manual_time", @@ -557,11 +557,11 @@ "repetitions": 1, "repetition_index": 0, "threads": 1, - "iterations": 9, - "real_time": 7.3753111111111119e+07, - "cpu_time": 2.4642222222216584e+04, + "iterations": 10, + "real_time": 7.3627100000000015e+07, + "cpu_time": 3.2366999999977608e+04, "time_unit": "ns", - "IterationTime": 7.3753111111111126e-06 + "IterationTime": 7.3627100000000014e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/256/manual_time", @@ -573,10 +573,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.1155954545454539e+07, - "cpu_time": 2.2925454545448658e+04, + "real_time": 3.1157590909090914e+07, + "cpu_time": 2.8746818181798779e+04, "time_unit": "ns", - "IterationTime": 3.1155954545454542e-06 + "IterationTime": 3.1157590909090916e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/512/manual_time", @@ -588,10 +588,10 @@ "repetition_index": 0, "threads": 1, "iterations": 22, - "real_time": 3.1700909090909079e+07, - "cpu_time": 2.3464227272729233e+04, + "real_time": 3.1726454545454547e+07, + "cpu_time": 2.8230272727267838e+04, "time_unit": "ns", - "IterationTime": 3.1700909090909077e-06 + "IterationTime": 3.1726454545454548e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/1024/manual_time", @@ -603,10 +603,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.3428095238095231e+07, - "cpu_time": 2.2474714285730934e+04, + "real_time": 3.3420380952380959e+07, + "cpu_time": 2.9070761904773484e+04, "time_unit": "ns", - "IterationTime": 3.3428095238095233e-06 + "IterationTime": 3.3420380952380956e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/2048/manual_time", @@ -618,10 +618,10 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.8703722222222224e+07, - "cpu_time": 2.3273944444469744e+04, + "real_time": 3.8704111111111104e+07, + "cpu_time": 3.3141833333299393e+04, "time_unit": "ns", - "IterationTime": 3.8703722222222221e-06 + "IterationTime": 3.8704111111111108e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/4096/manual_time", @@ -633,10 +633,10 @@ "repetition_index": 0, "threads": 1, "iterations": 15, - "real_time": 4.5644800000000000e+07, - "cpu_time": 3.3046666666673256e+04, + "real_time": 4.5595866666666672e+07, + "cpu_time": 2.5466666666673631e+04, "time_unit": "ns", - 
"IterationTime": 4.5644800000000004e-06 + "IterationTime": 4.5595866666666675e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/8192/manual_time", @@ -648,10 +648,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.9704833333333321e+07, - "cpu_time": 2.4242500000030512e+04, + "real_time": 5.9758166666666657e+07, + "cpu_time": 2.7325833333350736e+04, "time_unit": "ns", - "IterationTime": 5.9704833333333331e-06 + "IterationTime": 5.9758166666666655e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_trace/12288/manual_time", @@ -663,10 +663,10 @@ "repetition_index": 0, "threads": 1, "iterations": 9, - "real_time": 7.3861777777777776e+07, - "cpu_time": 2.5335555555629064e+04, + "real_time": 7.3744555555555537e+07, + "cpu_time": 2.8037777777810566e+04, "time_unit": "ns", - "IterationTime": 7.3861777777777777e-06 + "IterationTime": 7.3744555555555538e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/256/manual_time", @@ -678,10 +678,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.4477300000000000e+07, - "cpu_time": 2.3501999999986368e+04, + "real_time": 3.4546250000000007e+07, + "cpu_time": 6.9045999999994834e+04, "time_unit": "ns", - "IterationTime": 3.4477299999999996e-06 + "IterationTime": 3.4546250000000008e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/512/manual_time", @@ -693,10 +693,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.4912649999999993e+07, - "cpu_time": 2.4015000000021661e+04, + "real_time": 3.5552100000000000e+07, + "cpu_time": 1.0344854999999597e+05, "time_unit": "ns", - "IterationTime": 3.4912649999999992e-06 + "IterationTime": 3.5552099999999997e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/1024/manual_time", @@ -708,10 +708,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.6714894736842096e+07, - "cpu_time": 2.4035315789486402e+04, + "real_time": 3.6692157894736841e+07, + "cpu_time": 3.3235578947351925e+04, "time_unit": "ns", - "IterationTime": 3.6714894736842097e-06 + "IterationTime": 3.6692157894736836e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/2048/manual_time", @@ -723,10 +723,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.1945941176470585e+07, - "cpu_time": 2.5924117647079052e+04, + "real_time": 4.1962058823529422e+07, + "cpu_time": 4.1426470588239579e+04, "time_unit": "ns", - "IterationTime": 4.1945941176470588e-06 + "IterationTime": 4.1962058823529422e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/4096/manual_time", @@ -738,10 +738,10 @@ "repetition_index": 0, "threads": 1, "iterations": 14, - "real_time": 4.8923285714285716e+07, - "cpu_time": 2.6736428571475353e+04, + "real_time": 4.8866571428571425e+07, + "cpu_time": 4.9834999999934633e+04, "time_unit": "ns", - "IterationTime": 4.8923285714285717e-06 + "IterationTime": 4.8866571428571430e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1cb/8192/manual_time", @@ -753,10 +753,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.3098818181818180e+07, - "cpu_time": 2.2529999999934800e+04, + "real_time": 6.3119272727272704e+07, + "cpu_time": 2.9314545454588042e+04, "time_unit": "ns", - "IterationTime": 6.3098818181818184e-06 + "IterationTime": 6.3119272727272713e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/256/manual_time", @@ -768,10 +768,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 
3.4805099999999993e+07, - "cpu_time": 2.4124099999989212e+04, + "real_time": 3.4796650000000000e+07, + "cpu_time": 2.7215999999974370e+04, "time_unit": "ns", - "IterationTime": 3.4805099999999994e-06 + "IterationTime": 3.4796650000000004e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/512/manual_time", @@ -783,10 +783,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.5100100000000007e+07, - "cpu_time": 2.5931549999969051e+04, + "real_time": 3.5107099999999993e+07, + "cpu_time": 3.0068449999998138e+04, "time_unit": "ns", - "IterationTime": 3.5100100000000006e-06 + "IterationTime": 3.5107100000000000e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/1024/manual_time", @@ -798,10 +798,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.7149842105263159e+07, - "cpu_time": 3.0253684210560106e+04, + "real_time": 3.7121473684210517e+07, + "cpu_time": 2.5832631578970897e+04, "time_unit": "ns", - "IterationTime": 3.7149842105263159e-06 + "IterationTime": 3.7121473684210523e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/2048/manual_time", @@ -813,10 +813,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.2246647058823526e+07, - "cpu_time": 2.9003529411721647e+04, + "real_time": 4.2177882352941185e+07, + "cpu_time": 1.7096470588196622e+04, "time_unit": "ns", - "IterationTime": 4.2246647058823523e-06 + "IterationTime": 4.2177882352941189e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/4096/manual_time", @@ -828,10 +828,10 @@ "repetition_index": 0, "threads": 1, "iterations": 14, - "real_time": 4.9113000000000000e+07, - "cpu_time": 3.1937142857112784e+04, + "real_time": 4.8997500000000007e+07, + "cpu_time": 3.3407857142834073e+04, "time_unit": "ns", - "IterationTime": 4.9112999999999999e-06 + "IterationTime": 4.8997500000000008e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32cb/8192/manual_time", @@ -843,10 +843,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.6463000000000007e+07, - "cpu_time": 3.2335727272761716e+04, + "real_time": 6.6456181818181820e+07, + "cpu_time": 3.2069909090831123e+04, "time_unit": "ns", - "IterationTime": 6.6463000000000011e-06 + "IterationTime": 6.6456181818181815e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/256/manual_time", @@ -858,10 +858,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.4480349999999993e+07, - "cpu_time": 2.8031049999999166e+04, + "real_time": 3.4481950000000007e+07, + "cpu_time": 3.1215449999999477e+04, "time_unit": "ns", - "IterationTime": 3.4480349999999989e-06 + "IterationTime": 3.4481950000000008e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/512/manual_time", @@ -873,10 +873,10 @@ "repetition_index": 0, "threads": 1, "iterations": 20, - "real_time": 3.4916699999999993e+07, - "cpu_time": 2.8380200000022171e+04, + "real_time": 3.4921199999999993e+07, + "cpu_time": 3.2242249999958614e+04, "time_unit": "ns", - "IterationTime": 3.4916699999999991e-06 + "IterationTime": 3.4921200000000001e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/1024/manual_time", @@ -888,10 +888,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.6713736842105277e+07, - "cpu_time": 3.5802631578961627e+04, + "real_time": 3.6684263157894753e+07, + "cpu_time": 2.7482105263160487e+04, "time_unit": "ns", - "IterationTime": 3.6713736842105279e-06 + "IterationTime": 3.6684263157894747e-06 }, { "name": 
"BM_pgm_dispatch/all_processors_1_core_1_rta/2048/manual_time", @@ -903,10 +903,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.1953000000000007e+07, - "cpu_time": 3.1220588235308609e+04, + "real_time": 4.1952058823529415e+07, + "cpu_time": 2.8309411764701639e+04, "time_unit": "ns", - "IterationTime": 4.1953000000000003e-06 + "IterationTime": 4.1952058823529409e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/4096/manual_time", @@ -918,10 +918,10 @@ "repetition_index": 0, "threads": 1, "iterations": 14, - "real_time": 4.8927500000000000e+07, - "cpu_time": 3.0061428571442102e+04, + "real_time": 4.8853357142857142e+07, + "cpu_time": 3.0190714285703380e+04, "time_unit": "ns", - "IterationTime": 4.8927499999999990e-06 + "IterationTime": 4.8853357142857137e-06 }, { "name": "BM_pgm_dispatch/all_processors_1_core_1_rta/8192/manual_time", @@ -933,10 +933,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.2969909090909101e+07, - "cpu_time": 3.1834636363631769e+04, + "real_time": 6.2988090909090906e+07, + "cpu_time": 3.2970000000031134e+04, "time_unit": "ns", - "IterationTime": 6.2969909090909095e-06 + "IterationTime": 6.2988090909090911e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/256/manual_time", @@ -948,10 +948,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.7600500000000000e+07, - "cpu_time": 3.8343500000056119e+04, + "real_time": 5.7582083333333336e+07, + "cpu_time": 2.9232166666689114e+04, "time_unit": "ns", - "IterationTime": 5.7600500000000000e-06 + "IterationTime": 5.7582083333333335e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/512/manual_time", @@ -963,10 +963,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.7762833333333336e+07, - "cpu_time": 3.0340916666649064e+04, + "real_time": 5.7757333333333343e+07, + "cpu_time": 2.6393666666605732e+04, "time_unit": "ns", - "IterationTime": 5.7762833333333342e-06 + "IterationTime": 5.7757333333333356e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/1024/manual_time", @@ -978,10 +978,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.8090666666666664e+07, - "cpu_time": 2.9895833333348779e+04, + "real_time": 5.8078833333333336e+07, + "cpu_time": 2.9148333333376781e+04, "time_unit": "ns", - "IterationTime": 5.8090666666666666e-06 + "IterationTime": 5.8078833333333324e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/2048/manual_time", @@ -993,10 +993,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.8695666666666664e+07, - "cpu_time": 3.0913333333308183e+04, + "real_time": 5.8718750000000000e+07, + "cpu_time": 3.2037500000026139e+04, "time_unit": "ns", - "IterationTime": 5.8695666666666663e-06 + "IterationTime": 5.8718750000000004e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/4096/manual_time", @@ -1008,10 +1008,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 6.0850166666666657e+07, - "cpu_time": 3.4490833333252383e+04, + "real_time": 6.0855916666666657e+07, + "cpu_time": 2.9017500000024418e+04, "time_unit": "ns", - "IterationTime": 6.0850166666666669e-06 + "IterationTime": 6.0855916666666656e-06 }, { "name": "BM_pgm_dispatch/one_processor_all_cores_128_rta/8192/manual_time", @@ -1023,10 +1023,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.3639545454545468e+07, - "cpu_time": 2.4531909090958430e+04, + "real_time": 
6.3647909090909101e+07, + "cpu_time": 5.7657272727208336e+04, "time_unit": "ns", - "IterationTime": 6.3639545454545460e-06 + "IterationTime": 6.3647909090909099e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/256/manual_time", @@ -1038,10 +1038,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.7484105263157889e+07, - "cpu_time": 2.1082684210533014e+04, + "real_time": 3.7490263157894738e+07, + "cpu_time": 3.1591578947311547e+04, "time_unit": "ns", - "IterationTime": 3.7484105263157885e-06 + "IterationTime": 3.7490263157894740e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/512/manual_time", @@ -1053,10 +1053,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.7578157894736834e+07, - "cpu_time": 2.0652526315825377e+04, + "real_time": 3.7583947368421055e+07, + "cpu_time": 2.7524947368368958e+04, "time_unit": "ns", - "IterationTime": 3.7578157894736839e-06 + "IterationTime": 3.7583947368421052e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/1024/manual_time", @@ -1068,10 +1068,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.7757578947368421e+07, - "cpu_time": 2.0148947368394791e+04, + "real_time": 3.7766263157894745e+07, + "cpu_time": 2.7608894736817401e+04, "time_unit": "ns", - "IterationTime": 3.7757578947368423e-06 + "IterationTime": 3.7766263157894748e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/2048/manual_time", @@ -1083,10 +1083,10 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.8168833333333336e+07, - "cpu_time": 1.8871666666599020e+04, + "real_time": 3.8182833333333336e+07, + "cpu_time": 3.5652777777765245e+04, "time_unit": "ns", - "IterationTime": 3.8168833333333331e-06 + "IterationTime": 3.8182833333333328e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/4096/manual_time", @@ -1098,10 +1098,10 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9009111111111112e+07, - "cpu_time": 2.0109444444453096e+04, + "real_time": 3.9015388888888888e+07, + "cpu_time": 3.1919444444423243e+04, "time_unit": "ns", - "IterationTime": 3.9009111111111116e-06 + "IterationTime": 3.9015388888888894e-06 }, { "name": "BM_pgm_dispatch/one_processors_all_cores_1_rta/8192/manual_time", @@ -1113,10 +1113,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.1178411764705881e+07, - "cpu_time": 3.0142941176503722e+04, + "real_time": 4.1168529411764704e+07, + "cpu_time": 2.6784117647024759e+04, "time_unit": "ns", - "IterationTime": 4.1178411764705887e-06 + "IterationTime": 4.1168529411764698e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/256/manual_time", @@ -1128,10 +1128,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.0965764705882341e+07, - "cpu_time": 3.2121941176508615e+04, + "real_time": 4.0973294117647059e+07, + "cpu_time": 3.0251941176431654e+04, "time_unit": "ns", - "IterationTime": 4.0965764705882342e-06 + "IterationTime": 4.0973294117647062e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/512/manual_time", @@ -1143,10 +1143,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.1141235294117637e+07, - "cpu_time": 2.9815529411770989e+04, + "real_time": 4.1147823529411763e+07, + "cpu_time": 2.7783117647029550e+04, "time_unit": "ns", - "IterationTime": 4.1141235294117641e-06 + "IterationTime": 4.1147823529411768e-06 }, { "name": 
"BM_pgm_dispatch/all_processors_all_cores_1_rta/1024/manual_time", @@ -1158,10 +1158,10 @@ "repetition_index": 0, "threads": 1, "iterations": 17, - "real_time": 4.1674705882352941e+07, - "cpu_time": 3.0351529411815398e+04, + "real_time": 4.1673176470588244e+07, + "cpu_time": 2.6009411764730197e+04, "time_unit": "ns", - "IterationTime": 4.1674705882352947e-06 + "IterationTime": 4.1673176470588240e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/2048/manual_time", @@ -1173,10 +1173,10 @@ "repetition_index": 0, "threads": 1, "iterations": 16, - "real_time": 4.4369937500000007e+07, - "cpu_time": 3.1336250000069122e+04, + "real_time": 4.4385749999999993e+07, + "cpu_time": 3.1358375000012373e+04, "time_unit": "ns", - "IterationTime": 4.4369937500000004e-06 + "IterationTime": 4.4385749999999993e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/4096/manual_time", @@ -1188,10 +1188,10 @@ "repetition_index": 0, "threads": 1, "iterations": 14, - "real_time": 4.9822928571428575e+07, - "cpu_time": 3.2757142857141120e+04, + "real_time": 4.9728785714285724e+07, + "cpu_time": 3.4272142857132741e+04, "time_unit": "ns", - "IterationTime": 4.9822928571428567e-06 + "IterationTime": 4.9728785714285730e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_1_rta/8192/manual_time", @@ -1203,10 +1203,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.9507500000000015e+07, - "cpu_time": 3.1938000000053533e+04, + "real_time": 6.9515500000000000e+07, + "cpu_time": 2.9503000000019598e+04, "time_unit": "ns", - "IterationTime": 6.9507500000000012e-06 + "IterationTime": 6.9515499999999987e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/256/manual_time", @@ -1218,10 +1218,10 @@ "repetition_index": 0, "threads": 1, "iterations": 13, - "real_time": 5.5500076923076913e+07, - "cpu_time": 3.6943769230810212e+04, + "real_time": 5.5481153846153848e+07, + "cpu_time": 2.8435384615285930e+04, "time_unit": "ns", - "IterationTime": 5.5500076923076912e-06 + "IterationTime": 5.5481153846153849e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/512/manual_time", @@ -1233,10 +1233,10 @@ "repetition_index": 0, "threads": 1, "iterations": 13, - "real_time": 5.5804769230769232e+07, - "cpu_time": 3.2049923076918130e+04, + "real_time": 5.5787769230769232e+07, + "cpu_time": 2.8239153846079451e+04, "time_unit": "ns", - "IterationTime": 5.5804769230769237e-06 + "IterationTime": 5.5787769230769234e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/1024/manual_time", @@ -1248,10 +1248,10 @@ "repetition_index": 0, "threads": 1, "iterations": 12, - "real_time": 5.7422916666666657e+07, - "cpu_time": 3.0158166666627294e+04, + "real_time": 5.7414583333333336e+07, + "cpu_time": 2.7385750000015934e+04, "time_unit": "ns", - "IterationTime": 5.7422916666666659e-06 + "IterationTime": 5.7414583333333341e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/2048/manual_time", @@ -1263,10 +1263,10 @@ "repetition_index": 0, "threads": 1, "iterations": 11, - "real_time": 6.2508999999999993e+07, - "cpu_time": 3.7220090909138227e+04, + "real_time": 6.2470636363636352e+07, + "cpu_time": 2.9027090909168732e+04, "time_unit": "ns", - "IterationTime": 6.2508999999999980e-06 + "IterationTime": 6.2470636363636352e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/4096/manual_time", @@ -1278,10 +1278,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 7.0115900000000015e+07, - "cpu_time": 3.5648000000065847e+04, 
+ "real_time": 6.9968000000000015e+07, + "cpu_time": 3.4655299999997173e+04, "time_unit": "ns", - "IterationTime": 7.0115900000000001e-06 + "IterationTime": 6.9967999999999997e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_32_rta/8192/manual_time", @@ -1293,10 +1293,10 @@ "repetition_index": 0, "threads": 1, "iterations": 8, - "real_time": 8.5774750000000015e+07, - "cpu_time": 3.3160000000087566e+04, + "real_time": 8.5776875000000000e+07, + "cpu_time": 4.4509999999942098e+04, "time_unit": "ns", - "IterationTime": 8.5774750000000021e-06 + "IterationTime": 8.5776875000000000e-06 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/256/manual_time", @@ -1308,10 +1308,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1872416666666667e+08, - "cpu_time": 3.5832500000054781e+04, + "real_time": 1.1872816666666667e+08, + "cpu_time": 3.2293333333655028e+04, "time_unit": "ns", - "IterationTime": 1.1872416666666667e-05 + "IterationTime": 1.1872816666666668e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/512/manual_time", @@ -1323,10 +1323,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1916200000000000e+08, - "cpu_time": 3.4728499999895728e+04, + "real_time": 1.1918916666666669e+08, + "cpu_time": 3.7318166666485318e+04, "time_unit": "ns", - "IterationTime": 1.1916200000000001e-05 + "IterationTime": 1.1918916666666667e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/1024/manual_time", @@ -1338,10 +1338,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2089416666666664e+08, - "cpu_time": 2.3970000000280343e+04, + "real_time": 1.2099716666666667e+08, + "cpu_time": 5.3235000000502921e+04, "time_unit": "ns", - "IterationTime": 1.2089416666666665e-05 + "IterationTime": 1.2099716666666669e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/2048/manual_time", @@ -1353,10 +1353,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2610266666666667e+08, - "cpu_time": 2.4575166666688612e+04, + "real_time": 1.2612233333333333e+08, + "cpu_time": 3.0120000000503449e+04, "time_unit": "ns", - "IterationTime": 1.2610266666666667e-05 + "IterationTime": 1.2612233333333333e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/4096/manual_time", @@ -1368,10 +1368,10 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.3209140000000003e+08, - "cpu_time": 2.9534000000097651e+04, + "real_time": 1.3205980000000000e+08, + "cpu_time": 3.2816000000224223e+04, "time_unit": "ns", - "IterationTime": 1.3209140000000003e-05 + "IterationTime": 1.3205980000000001e-05 }, { "name": "BM_pgm_dispatch/all_processors_all_cores_128_rta/8192/manual_time", @@ -1383,10 +1383,10 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.4751780000000000e+08, - "cpu_time": 2.7633999999920889e+04, + "real_time": 1.4751639999999997e+08, + "cpu_time": 2.9743999999709555e+04, "time_unit": "ns", - "IterationTime": 1.4751780000000000e-05 + "IterationTime": 1.4751639999999999e-05 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/256/manual_time", @@ -1398,10 +1398,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0070826086956523e+07, - "cpu_time": 1.9661304347930236e+04, + "real_time": 3.0079826086956527e+07, + "cpu_time": 2.2574782608740941e+04, "time_unit": "ns", - "IterationTime": 3.0070826086956525e-06 + "IterationTime": 3.0079826086956523e-06 }, { "name": 
"BM_pgm_dispatch/sems_1_core_1_processor_trace/512/manual_time", @@ -1413,10 +1413,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0183391304347832e+07, - "cpu_time": 2.0213999999958043e+04, + "real_time": 3.0202608695652176e+07, + "cpu_time": 4.0060869565186738e+04, "time_unit": "ns", - "IterationTime": 3.0183391304347831e-06 + "IterationTime": 3.0202608695652174e-06 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/1024/manual_time", @@ -1428,10 +1428,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0480260869565219e+07, - "cpu_time": 1.9658826087010082e+04, + "real_time": 3.0492695652173914e+07, + "cpu_time": 2.9276956521792508e+04, "time_unit": "ns", - "IterationTime": 3.0480260869565220e-06 + "IterationTime": 3.0492695652173912e-06 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/2048/manual_time", @@ -1443,10 +1443,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.1034043478260871e+07, - "cpu_time": 1.8955478260807013e+04, + "real_time": 3.1046869565217398e+07, + "cpu_time": 2.7367391304267130e+04, "time_unit": "ns", - "IterationTime": 3.1034043478260867e-06 + "IterationTime": 3.1046869565217395e-06 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/4096/manual_time", @@ -1458,10 +1458,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.2993238095238108e+07, - "cpu_time": 1.9665619047616801e+04, + "real_time": 3.3158952380952388e+07, + "cpu_time": 2.7611333333341227e+04, "time_unit": "ns", - "IterationTime": 3.2993238095238104e-06 + "IterationTime": 3.3158952380952389e-06 }, { "name": "BM_pgm_dispatch/sems_1_core_1_processor_trace/8192/manual_time", @@ -1473,10 +1473,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.5972473684210517e+07, - "cpu_time": 1.8976315789655619e+04, + "real_time": 3.5967578947368428e+07, + "cpu_time": 2.5672157894711901e+04, "time_unit": "ns", - "IterationTime": 3.5972473684210520e-06 + "IterationTime": 3.5967578947368428e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/256/manual_time", @@ -1488,10 +1488,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0070695652173907e+07, - "cpu_time": 2.0065217391309332e+04, + "real_time": 3.0080652173913050e+07, + "cpu_time": 2.3450434782661305e+04, "time_unit": "ns", - "IterationTime": 3.0070695652173906e-06 + "IterationTime": 3.0080652173913051e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/512/manual_time", @@ -1503,10 +1503,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0182782608695649e+07, - "cpu_time": 1.9268260869586622e+04, + "real_time": 3.0202347826086946e+07, + "cpu_time": 2.6328260869543938e+04, "time_unit": "ns", - "IterationTime": 3.0182782608695648e-06 + "IterationTime": 3.0202347826086945e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/1024/manual_time", @@ -1518,10 +1518,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.0480173913043477e+07, - "cpu_time": 2.0814782608624682e+04, + "real_time": 3.0492826086956527e+07, + "cpu_time": 2.5107826086822195e+04, "time_unit": "ns", - "IterationTime": 3.0480173913043482e-06 + "IterationTime": 3.0492826086956527e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/2048/manual_time", @@ -1533,10 +1533,10 @@ "repetition_index": 0, "threads": 1, "iterations": 23, - "real_time": 3.1036086956521735e+07, - "cpu_time": 1.9879521739063006e+04, + 
"real_time": 3.1049826086956531e+07, + "cpu_time": 2.8926956521718483e+04, "time_unit": "ns", - "IterationTime": 3.1036086956521736e-06 + "IterationTime": 3.1049826086956537e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/4096/manual_time", @@ -1548,10 +1548,10 @@ "repetition_index": 0, "threads": 1, "iterations": 21, - "real_time": 3.3019095238095239e+07, - "cpu_time": 2.0720428571406403e+04, + "real_time": 3.3183761904761899e+07, + "cpu_time": 2.5488571428546777e+04, "time_unit": "ns", - "IterationTime": 3.3019095238095238e-06 + "IterationTime": 3.3183761904761902e-06 }, { "name": "BM_pgm_dispatch/sems_all_cores_1_processor_trace/8192/manual_time", @@ -1563,10 +1563,10 @@ "repetition_index": 0, "threads": 1, "iterations": 19, - "real_time": 3.5973947368421055e+07, - "cpu_time": 2.0178684210529689e+04, + "real_time": 3.5970473684210517e+07, + "cpu_time": 2.4643736842253511e+04, "time_unit": "ns", - "IterationTime": 3.5973947368421058e-06 + "IterationTime": 3.5970473684210519e-06 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/256/manual_time", @@ -1578,10 +1578,10 @@ "repetition_index": 0, "threads": 1, "iterations": 7, - "real_time": 1.0377071428571427e+08, - "cpu_time": 2.2170000000138705e+04, + "real_time": 1.0378914285714285e+08, + "cpu_time": 2.4571285714155725e+04, "time_unit": "ns", - "IterationTime": 1.0377071428571427e-05 + "IterationTime": 1.0378914285714286e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/512/manual_time", @@ -1593,10 +1593,10 @@ "repetition_index": 0, "threads": 1, "iterations": 7, - "real_time": 1.0426657142857143e+08, - "cpu_time": 2.3283000000365715e+04, + "real_time": 1.0429685714285715e+08, + "cpu_time": 3.0871428571848421e+04, "time_unit": "ns", - "IterationTime": 1.0426657142857143e-05 + "IterationTime": 1.0429685714285715e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/1024/manual_time", @@ -1608,10 +1608,10 @@ "repetition_index": 0, "threads": 1, "iterations": 7, - "real_time": 1.0614242857142857e+08, - "cpu_time": 2.7466428570781838e+04, + "real_time": 1.0611100000000000e+08, + "cpu_time": 3.9938714285727074e+04, "time_unit": "ns", - "IterationTime": 1.0614242857142859e-05 + "IterationTime": 1.0611100000000001e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/2048/manual_time", @@ -1623,10 +1623,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1098866666666667e+08, - "cpu_time": 2.3233333333649851e+04, + "real_time": 1.1098150000000001e+08, + "cpu_time": 3.3381499999762811e+04, "time_unit": "ns", - "IterationTime": 1.1098866666666666e-05 + "IterationTime": 1.1098150000000000e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/4096/manual_time", @@ -1638,10 +1638,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1733233333333333e+08, - "cpu_time": 2.4433333333462317e+04, + "real_time": 1.1733666666666667e+08, + "cpu_time": 3.0893333332689584e+04, "time_unit": "ns", - "IterationTime": 1.1733233333333333e-05 + "IterationTime": 1.1733666666666667e-05 }, { "name": "BM_pgm_dispatch/maxed_config_params_trace/8192/manual_time", @@ -1653,10 +1653,10 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.3236920000000000e+08, - "cpu_time": 2.6089799999340357e+04, + "real_time": 1.3242140000000000e+08, + "cpu_time": 6.4220000000148044e+04, "time_unit": "ns", - "IterationTime": 1.3236920000000002e-05 + "IterationTime": 1.3242140000000001e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/256/manual_time", @@ -1668,10 
+1668,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2223816666666667e+08, - "cpu_time": 2.6801666667353173e+04, + "real_time": 1.2227466666666669e+08, + "cpu_time": 7.1834833333876231e+04, "time_unit": "ns", - "IterationTime": 1.2223816666666666e-05 + "IterationTime": 1.2227466666666668e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/512/manual_time", @@ -1683,10 +1683,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2258733333333333e+08, - "cpu_time": 2.7776666667496858e+04, + "real_time": 1.2259900000000000e+08, + "cpu_time": 3.6168333333345501e+04, "time_unit": "ns", - "IterationTime": 1.2258733333333330e-05 + "IterationTime": 1.2259899999999999e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/1024/manual_time", @@ -1698,10 +1698,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2489916666666667e+08, - "cpu_time": 2.6563333333247861e+04, + "real_time": 1.2490616666666664e+08, + "cpu_time": 3.7696666667604477e+04, "time_unit": "ns", - "IterationTime": 1.2489916666666665e-05 + "IterationTime": 1.2490616666666665e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/2048/manual_time", @@ -1713,10 +1713,10 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.4246980000000000e+08, - "cpu_time": 2.9727999999806798e+04, + "real_time": 1.4242300000000000e+08, + "cpu_time": 4.3096000000275577e+04, "time_unit": "ns", - "IterationTime": 1.4246980000000001e-05 + "IterationTime": 1.4242299999999999e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/4096/manual_time", @@ -1728,10 +1728,10 @@ "repetition_index": 0, "threads": 1, "iterations": 3, - "real_time": 2.0078166666666666e+08, - "cpu_time": 3.5603333332782466e+04, + "real_time": 2.0092400000000003e+08, + "cpu_time": 1.0000999999941011e+05, "time_unit": "ns", - "IterationTime": 2.0078166666666670e-05 + "IterationTime": 2.0092400000000001e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_trace/8192/manual_time", @@ -1743,10 +1743,10 @@ "repetition_index": 0, "threads": 1, "iterations": 2, - "real_time": 3.1837400000000000e+08, - "cpu_time": 7.5791000000435815e+04, + "real_time": 3.1842150000000000e+08, + "cpu_time": 8.8309999998870131e+04, "time_unit": "ns", - "IterationTime": 3.1837399999999994e-05 + "IterationTime": 3.1842150000000001e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/256/manual_time", @@ -1758,10 +1758,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1883483333333333e+08, - "cpu_time": 3.1042833333809012e+04, + "real_time": 1.1885383333333333e+08, + "cpu_time": 5.3930833333974231e+04, "time_unit": "ns", - "IterationTime": 1.1883483333333336e-05 + "IterationTime": 1.1885383333333331e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/512/manual_time", @@ -1773,10 +1773,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1884550000000000e+08, - "cpu_time": 3.5406666666422854e+04, + "real_time": 1.1886033333333333e+08, + "cpu_time": 5.4605000000170396e+04, "time_unit": "ns", - "IterationTime": 1.1884549999999998e-05 + "IterationTime": 1.1886033333333333e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/1024/manual_time", @@ -1788,10 +1788,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1890100000000000e+08, - "cpu_time": 3.3865000000095810e+04, + "real_time": 1.1891850000000000e+08, + "cpu_time": 4.7281666667231555e+04, "time_unit": 
"ns", - "IterationTime": 1.1890100000000000e-05 + "IterationTime": 1.1891850000000000e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/2048/manual_time", @@ -1803,10 +1803,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1947133333333333e+08, - "cpu_time": 3.3283333332671340e+04, + "real_time": 1.1947416666666669e+08, + "cpu_time": 3.8594999999475018e+04, "time_unit": "ns", - "IterationTime": 1.1947133333333333e-05 + "IterationTime": 1.1947416666666669e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/4096/manual_time", @@ -1818,10 +1818,10 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.2130549999999999e+08, - "cpu_time": 3.2995499999799453e+04, + "real_time": 1.2130733333333333e+08, + "cpu_time": 3.2120333333551796e+04, "time_unit": "ns", - "IterationTime": 1.2130549999999999e-05 + "IterationTime": 1.2130733333333334e-05 }, { "name": "BM_pgm_dispatch/10000_kernel_all_cores_all_processors_32_cbs_trace/8192/manual_time", @@ -1833,10 +1833,10 @@ "repetition_index": 0, "threads": 1, "iterations": 4, - "real_time": 1.6620975000000003e+08, - "cpu_time": 2.9792750000368073e+04, + "real_time": 1.6623825000000000e+08, + "cpu_time": 4.3950500000278225e+04, "time_unit": "ns", - "IterationTime": 1.6620975000000001e-05 + "IterationTime": 1.6623825000000000e-05 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/256/manual_time", @@ -1848,10 +1848,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.8096700000000000e+07, - "cpu_time": 2.6223100000066781e+04, + "real_time": 6.8102999999999985e+07, + "cpu_time": 3.7966700000424680e+04, "time_unit": "ns", - "IterationTime": 6.8096699999999990e-06 + "IterationTime": 6.8102999999999990e-06 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/512/manual_time", @@ -1863,10 +1863,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.8104800000000015e+07, - "cpu_time": 3.1231999999903335e+04, + "real_time": 6.8112900000000000e+07, + "cpu_time": 4.4721000000436106e+04, "time_unit": "ns", - "IterationTime": 6.8104800000000006e-06 + "IterationTime": 6.8112899999999988e-06 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/1024/manual_time", @@ -1878,10 +1878,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.8165500000000000e+07, - "cpu_time": 2.5873999999959098e+04, + "real_time": 6.8179099999999985e+07, + "cpu_time": 3.8537999999732616e+04, "time_unit": "ns", - "IterationTime": 6.8165500000000008e-06 + "IterationTime": 6.8179099999999986e-06 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/2048/manual_time", @@ -1893,10 +1893,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 6.8736599999999985e+07, - "cpu_time": 3.0934999999487900e+04, + "real_time": 6.8735000000000000e+07, + "cpu_time": 3.6897999999752079e+04, "time_unit": "ns", - "IterationTime": 6.8736599999999988e-06 + "IterationTime": 6.8735000000000003e-06 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/4096/manual_time", @@ -1908,10 +1908,10 @@ "repetition_index": 0, "threads": 1, "iterations": 10, - "real_time": 7.0558000000000015e+07, - "cpu_time": 2.3976199999964367e+04, + "real_time": 7.0568000000000015e+07, + "cpu_time": 3.4171799999427327e+04, "time_unit": "ns", - "IterationTime": 7.0558000000000011e-06 + "IterationTime": 
7.0568000000000008e-06 }, { "name": "BM_pgm_dispatch/5000_kernel_all_cores_all_processors_32_cbs_trace/8192/manual_time", @@ -1923,14 +1923,119 @@ "repetition_index": 0, "threads": 1, "iterations": 6, - "real_time": 1.1595766666666667e+08, - "cpu_time": 2.9203333333782666e+04, + "real_time": 1.1600533333333333e+08, + "cpu_time": 6.3993333333437855e+04, "time_unit": "ns", - "IterationTime": 1.1595766666666667e-05 + "IterationTime": 1.1600533333333334e-05 }, { - "name": "BM_pgm_dispatch/kernel_groups_4_shadow/256/manual_time", + "name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/0/manual_time", + "family_index": 20, + "per_family_instance_index": 0, + "run_name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/0/manual_time", + "run_type": "iteration", + "repetitions": 1, + "repetition_index": 0, + "threads": 1, + "iterations": 26, + "real_time": 2.6718038461538460e+07, + "cpu_time": 3.2454615384617333e+04, + "time_unit": "ns", + "IterationTime": 2.6718038461538463e-06 + }, + { + "name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/1000/manual_time", + "family_index": 20, + "per_family_instance_index": 1, + "run_name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/1000/manual_time", + "run_type": "iteration", + "repetitions": 1, + "repetition_index": 0, + "threads": 1, + "iterations": 26, + "real_time": 2.7278346153846152e+07, + "cpu_time": 5.1366461538402917e+04, + "time_unit": "ns", + "IterationTime": 2.7278346153846159e-06 + }, + { + "name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/2000/manual_time", + "family_index": 20, + "per_family_instance_index": 2, + "run_name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/2000/manual_time", + "run_type": "iteration", + "repetitions": 1, + "repetition_index": 0, + "threads": 1, + "iterations": 23, + "real_time": 3.0445391304347821e+07, + "cpu_time": 8.0444347825841367e+04, + "time_unit": "ns", + "IterationTime": 3.0445391304347821e-06 + }, + { + "name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/3000/manual_time", + "family_index": 20, + "per_family_instance_index": 3, + "run_name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/3000/manual_time", + "run_type": "iteration", + "repetitions": 1, + "repetition_index": 0, + "threads": 1, + "iterations": 17, + "real_time": 4.0024176470588237e+07, + "cpu_time": 5.6982764705788424e+04, + "time_unit": "ns", + "IterationTime": 4.0024176470588232e-06 + }, + { + "name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/4000/manual_time", + "family_index": 20, + "per_family_instance_index": 4, + "run_name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/4000/manual_time", + "run_type": "iteration", + "repetitions": 1, + "repetition_index": 0, + "threads": 1, + "iterations": 14, + "real_time": 5.3831642857142858e+07, + "cpu_time": 1.4073500000019328e+05, + "time_unit": "ns", + "IterationTime": 5.3831642857142854e-06 + }, + { + "name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/5000/manual_time", + "family_index": 20, + "per_family_instance_index": 5, + "run_name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/5000/manual_time", + "run_type": "iteration", + "repetitions": 1, + "repetition_index": 0, + "threads": 1, + "iterations": 12, + "real_time": 
6.0328166666666664e+07, + "cpu_time": 1.0642833333326015e+05, + "time_unit": "ns", + "IterationTime": 6.0328166666666668e-06 + }, + { + "name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/10000/manual_time", "family_index": 20, + "per_family_instance_index": 6, + "run_name": "BM_pgm_dispatch_vary_slow_cycles/256_bytes_brisc_only_all_processors_trace/10000/manual_time", + "run_type": "iteration", + "repetitions": 1, + "repetition_index": 0, + "threads": 1, + "iterations": 6, + "real_time": 1.1106266666666667e+08, + "cpu_time": 1.2152666666646420e+05, + "time_unit": "ns", + "IterationTime": 1.1106266666666666e-05 + }, + { + "name": "BM_pgm_dispatch/kernel_groups_4_shadow/256/manual_time", + "family_index": 21, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/256/manual_time", "run_type": "iteration", @@ -1938,14 +2043,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 5.4237800000000000e+08, - "cpu_time": 4.8290000002282337e+04, + "real_time": 5.4244600000000000e+08, + "cpu_time": 9.3160000005809707e+04, "time_unit": "ns", - "IterationTime": 5.4237800000000004e-05 + "IterationTime": 5.4244599999999992e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/512/manual_time", - "family_index": 20, + "family_index": 21, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/512/manual_time", "run_type": "iteration", @@ -1953,14 +2058,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 5.4552000000000000e+08, - "cpu_time": 4.1389999999807973e+04, + "real_time": 5.4561200000000000e+08, + "cpu_time": 8.8479999995172417e+04, "time_unit": "ns", - "IterationTime": 5.4551999999999995e-05 + "IterationTime": 5.4561199999999995e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/1024/manual_time", - "family_index": 20, + "family_index": 21, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/1024/manual_time", "run_type": "iteration", @@ -1968,14 +2073,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 5.5493300000000000e+08, - "cpu_time": 4.2209000000070773e+04, + "real_time": 5.5501000000000000e+08, + "cpu_time": 8.5339999998268468e+04, "time_unit": "ns", - "IterationTime": 5.5493299999999999e-05 + "IterationTime": 5.5501000000000003e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/2048/manual_time", - "family_index": 20, + "family_index": 21, "per_family_instance_index": 3, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/2048/manual_time", "run_type": "iteration", @@ -1983,14 +2088,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 5.9554600000000000e+08, - "cpu_time": 3.8520000003927635e+04, + "real_time": 5.9559500000000000e+08, + "cpu_time": 6.8378000001700915e+04, "time_unit": "ns", - "IterationTime": 5.9554600000000001e-05 + "IterationTime": 5.9559499999999998e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_4_shadow/4096/manual_time", - "family_index": 20, + "family_index": 21, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/4096/manual_time", "run_type": "iteration", @@ -1998,14 +2103,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 8.5543900000000000e+08, - "cpu_time": 4.7340000001838693e+04, + "real_time": 8.5700900000000000e+08, + "cpu_time": 9.7529999997902909e+04, "time_unit": "ns", - "IterationTime": 8.5543899999999999e-05 + "IterationTime": 8.5700900000000005e-05 }, { "name": 
"BM_pgm_dispatch/kernel_groups_4_shadow/8192/manual_time", - "family_index": 20, + "family_index": 21, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/kernel_groups_4_shadow/8192/manual_time", "run_type": "iteration", @@ -2013,14 +2118,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.5866330000000000e+09, - "cpu_time": 6.2331000002302520e+04, + "real_time": 1.5866420000000000e+09, + "cpu_time": 8.7729999997065985e+04, "time_unit": "ns", - "IterationTime": 1.5866329999999999e-04 + "IterationTime": 1.5866420000000002e-04 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/256/manual_time", - "family_index": 21, + "family_index": 22, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/256/manual_time", "run_type": "iteration", @@ -2028,14 +2133,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 6.5096400000000000e+08, - "cpu_time": 4.1160000002093962e+04, + "real_time": 6.5102400000000000e+08, + "cpu_time": 8.2499999997764913e+04, "time_unit": "ns", - "IterationTime": 6.5096400000000002e-05 + "IterationTime": 6.5102400000000002e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/512/manual_time", - "family_index": 21, + "family_index": 22, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/512/manual_time", "run_type": "iteration", @@ -2043,14 +2148,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 6.5486500000000000e+08, - "cpu_time": 3.6379999997393497e+04, + "real_time": 6.5491500000000000e+08, + "cpu_time": 6.0670999999956621e+04, "time_unit": "ns", - "IterationTime": 6.5486499999999997e-05 + "IterationTime": 6.5491499999999995e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/1024/manual_time", - "family_index": 21, + "family_index": 22, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/1024/manual_time", "run_type": "iteration", @@ -2058,14 +2163,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 6.6611600000000000e+08, - "cpu_time": 3.5420000003227869e+04, + "real_time": 6.6614900000000000e+08, + "cpu_time": 6.6809999999861699e+04, "time_unit": "ns", - "IterationTime": 6.6611600000000004e-05 + "IterationTime": 6.6614900000000005e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/2048/manual_time", - "family_index": 21, + "family_index": 22, "per_family_instance_index": 3, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/2048/manual_time", "run_type": "iteration", @@ -2073,14 +2178,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 7.1765000000000000e+08, - "cpu_time": 3.1180000000574637e+04, + "real_time": 7.1777900000000000e+08, + "cpu_time": 6.4689999994982369e+04, "time_unit": "ns", - "IterationTime": 7.1765000000000002e-05 + "IterationTime": 7.1777899999999996e-05 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/4096/manual_time", - "family_index": 21, + "family_index": 22, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/4096/manual_time", "run_type": "iteration", @@ -2088,14 +2193,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.0249530000000000e+09, - "cpu_time": 3.6509000004514295e+04, + "real_time": 1.0252410000000001e+09, + "cpu_time": 6.5110000001311622e+04, "time_unit": "ns", - "IterationTime": 1.0249529999999999e-04 + "IterationTime": 1.0252410000000002e-04 }, { "name": "BM_pgm_dispatch/kernel_groups_5_shadow/8192/manual_time", - "family_index": 21, + 
"family_index": 22, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/kernel_groups_5_shadow/8192/manual_time", "run_type": "iteration", @@ -2103,14 +2208,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.8616840000000000e+09, - "cpu_time": 3.8631000002453671e+04, + "real_time": 1.8617290000000000e+09, + "cpu_time": 5.8751000004519941e+04, "time_unit": "ns", - "IterationTime": 1.8616840000000001e-04 + "IterationTime": 1.8617290000000000e-04 }, { "name": "BM_pgm_dispatch/eth_dispatch/256/manual_time", - "family_index": 22, + "family_index": 23, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/eth_dispatch/256/manual_time", "run_type": "iteration", @@ -2118,14 +2223,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9575555555555552e+07, - "cpu_time": 2.2015555555378163e+04, + "real_time": 3.9584000000000007e+07, + "cpu_time": 2.9282944444178029e+04, "time_unit": "ns", - "IterationTime": 3.9575555555555552e-06 + "IterationTime": 3.9584000000000009e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/512/manual_time", - "family_index": 22, + "family_index": 23, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/eth_dispatch/512/manual_time", "run_type": "iteration", @@ -2133,14 +2238,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9568499999999993e+07, - "cpu_time": 1.8607777777488209e+04, + "real_time": 3.9588388888888896e+07, + "cpu_time": 3.2771111111094971e+04, "time_unit": "ns", - "IterationTime": 3.9568499999999992e-06 + "IterationTime": 3.9588388888888888e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/1024/manual_time", - "family_index": 22, + "family_index": 23, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/eth_dispatch/1024/manual_time", "run_type": "iteration", @@ -2148,14 +2253,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9578277777777784e+07, - "cpu_time": 2.2552444444477893e+04, + "real_time": 3.9585055555555552e+07, + "cpu_time": 3.0058333333471408e+04, "time_unit": "ns", - "IterationTime": 3.9578277777777777e-06 + "IterationTime": 3.9585055555555547e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/2048/manual_time", - "family_index": 22, + "family_index": 23, "per_family_instance_index": 3, "run_name": "BM_pgm_dispatch/eth_dispatch/2048/manual_time", "run_type": "iteration", @@ -2163,14 +2268,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9572277777777784e+07, - "cpu_time": 1.9345055555675117e+04, + "real_time": 3.9588722222222224e+07, + "cpu_time": 3.1872222222043925e+04, "time_unit": "ns", - "IterationTime": 3.9572277777777781e-06 + "IterationTime": 3.9588722222222231e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/4096/manual_time", - "family_index": 22, + "family_index": 23, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/eth_dispatch/4096/manual_time", "run_type": "iteration", @@ -2178,14 +2283,14 @@ "repetition_index": 0, "threads": 1, "iterations": 18, - "real_time": 3.9572444444444448e+07, - "cpu_time": 2.3290999999956184e+04, + "real_time": 3.9596333333333343e+07, + "cpu_time": 4.4238777777631331e+04, "time_unit": "ns", - "IterationTime": 3.9572444444444448e-06 + "IterationTime": 3.9596333333333344e-06 }, { "name": "BM_pgm_dispatch/eth_dispatch/8192/manual_time", - "family_index": 22, + "family_index": 23, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/eth_dispatch/8192/manual_time", "run_type": "iteration", @@ -2193,14 +2298,14 @@ "repetition_index": 0, 
"threads": 1, "iterations": 18, - "real_time": 3.9588333333333336e+07, - "cpu_time": 2.9833888889009409e+04, + "real_time": 3.9596611111111112e+07, + "cpu_time": 4.4103444444548106e+04, "time_unit": "ns", - "IterationTime": 3.9588333333333335e-06 + "IterationTime": 3.9596611111111109e-06 }, { "name": "BM_pgm_dispatch/tensix_eth_2/256/manual_time", - "family_index": 23, + "family_index": 24, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/tensix_eth_2/256/manual_time", "run_type": "iteration", @@ -2208,14 +2313,14 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.4142620000000000e+08, - "cpu_time": 3.8329999999575652e+04, + "real_time": 1.4140000000000003e+08, + "cpu_time": 4.7829800000442905e+04, "time_unit": "ns", - "IterationTime": 1.4142619999999999e-05 + "IterationTime": 1.4140000000000000e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/512/manual_time", - "family_index": 23, + "family_index": 24, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/tensix_eth_2/512/manual_time", "run_type": "iteration", @@ -2223,14 +2328,14 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.4812320000000000e+08, - "cpu_time": 3.3424000000081833e+04, + "real_time": 1.4768340000000000e+08, + "cpu_time": 3.5056000000110995e+04, "time_unit": "ns", - "IterationTime": 1.4812319999999998e-05 + "IterationTime": 1.4768340000000000e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/1024/manual_time", - "family_index": 23, + "family_index": 24, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/tensix_eth_2/1024/manual_time", "run_type": "iteration", @@ -2238,14 +2343,14 @@ "repetition_index": 0, "threads": 1, "iterations": 5, - "real_time": 1.5148540000000000e+08, - "cpu_time": 2.4277999999355870e+04, + "real_time": 1.5152940000000000e+08, + "cpu_time": 7.9954000000270753e+04, "time_unit": "ns", - "IterationTime": 1.5148539999999998e-05 + "IterationTime": 1.5152939999999998e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/2048/manual_time", - "family_index": 23, + "family_index": 24, "per_family_instance_index": 3, "run_name": "BM_pgm_dispatch/tensix_eth_2/2048/manual_time", "run_type": "iteration", @@ -2253,14 +2358,14 @@ "repetition_index": 0, "threads": 1, "iterations": 4, - "real_time": 1.6367274999999997e+08, - "cpu_time": 3.2517750000238266e+04, + "real_time": 1.6369250000000003e+08, + "cpu_time": 6.2395000000492473e+04, "time_unit": "ns", - "IterationTime": 1.6367274999999997e-05 + "IterationTime": 1.6369250000000003e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/4096/manual_time", - "family_index": 23, + "family_index": 24, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/tensix_eth_2/4096/manual_time", "run_type": "iteration", @@ -2268,14 +2373,14 @@ "repetition_index": 0, "threads": 1, "iterations": 3, - "real_time": 2.1807833333333334e+08, - "cpu_time": 2.7522999999973763e+04, + "real_time": 2.1813066666666666e+08, + "cpu_time": 8.3360000000235843e+04, "time_unit": "ns", - "IterationTime": 2.1807833333333332e-05 + "IterationTime": 2.1813066666666670e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2/8192/manual_time", - "family_index": 23, + "family_index": 24, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/tensix_eth_2/8192/manual_time", "run_type": "iteration", @@ -2283,14 +2388,14 @@ "repetition_index": 0, "threads": 1, "iterations": 2, - "real_time": 3.2477100000000006e+08, - "cpu_time": 3.4020000001078188e+04, + "real_time": 3.2481750000000000e+08, + "cpu_time": 4.3999999999044805e+04, "time_unit": "ns", 
- "IterationTime": 3.2477100000000001e-05 + "IterationTime": 3.2481749999999994e-05 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/256/manual_time", - "family_index": 24, + "family_index": 25, "per_family_instance_index": 0, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/256/manual_time", "run_type": "iteration", @@ -2298,14 +2403,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.0864170000000000e+09, - "cpu_time": 3.6670000000071923e+04, + "real_time": 1.0859430000000000e+09, + "cpu_time": 4.6169000000872984e+04, "time_unit": "ns", - "IterationTime": 1.0864170000000000e-04 + "IterationTime": 1.0859430000000000e-04 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/512/manual_time", - "family_index": 24, + "family_index": 25, "per_family_instance_index": 1, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/512/manual_time", "run_type": "iteration", @@ -2313,14 +2418,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.1051990000000000e+09, - "cpu_time": 3.6348999998381260e+04, + "real_time": 5.4643700000000000e+09, + "cpu_time": 6.0580000003085384e+04, "time_unit": "ns", - "IterationTime": 1.1051990000000001e-04 + "IterationTime": 5.4643699999999999e-04 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/1024/manual_time", - "family_index": 24, + "family_index": 25, "per_family_instance_index": 2, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/1024/manual_time", "run_type": "iteration", @@ -2328,14 +2433,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.1301090000000000e+09, - "cpu_time": 3.0899999998723615e+04, + "real_time": 1.1302310000000000e+09, + "cpu_time": 5.5519999996533894e+04, "time_unit": "ns", - "IterationTime": 1.1301090000000001e-04 + "IterationTime": 1.1302310000000000e-04 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/2048/manual_time", - "family_index": 24, + "family_index": 25, "per_family_instance_index": 3, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/2048/manual_time", "run_type": "iteration", @@ -2343,14 +2448,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.1301990000000000e+09, - "cpu_time": 3.8449999998135812e+04, + "real_time": 1.1301150000000000e+09, + "cpu_time": 8.7759999999548192e+04, "time_unit": "ns", - "IterationTime": 1.1301989999999999e-04 + "IterationTime": 1.1301150000000000e-04 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/4096/manual_time", - "family_index": 24, + "family_index": 25, "per_family_instance_index": 4, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/4096/manual_time", "run_type": "iteration", @@ -2358,14 +2463,14 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.2371950000000000e+09, - "cpu_time": 3.1809999995857652e+04, + "real_time": 1.2373150000000000e+09, + "cpu_time": 6.0830000002454195e+04, "time_unit": "ns", - "IterationTime": 1.2371950000000001e-04 + "IterationTime": 1.2373150000000001e-04 }, { "name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/8192/manual_time", - "family_index": 24, + "family_index": 25, "per_family_instance_index": 5, "run_name": "BM_pgm_dispatch/tensix_eth_2_4_shadow/8192/manual_time", "run_type": "iteration", @@ -2373,10 +2478,10 @@ "repetition_index": 0, "threads": 1, "iterations": 1, - "real_time": 1.8342070000000000e+09, - "cpu_time": 3.7970999997583021e+04, + "real_time": 1.8335490000000000e+09, + "cpu_time": 7.1139999995750710e+04, "time_unit": "ns", - "IterationTime": 1.8342070000000000e-04 + "IterationTime": 1.8335490000000000e-04 } ] } diff 
--git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index 416566e7655..b9e3aaaf083 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -303,9 +303,6 @@ static int pgm_dispatch(T& state, TestInfo info) { auto core_count = get_core_count(); info.workers = CoreRange({0, 0}, {std::get<0>(core_count), std::get<1>(core_count)}); } - if constexpr (std::is_same_v) { - info.kernel_size = state.range(0); - } if (info.use_trace) { log_info(LogTest, "Running with trace enabled"); @@ -429,7 +426,15 @@ static int pgm_dispatch(T& state, TestInfo info) { } } -static void BM_pgm_dispatch(benchmark::State& state, TestInfo info) { pgm_dispatch(state, info); } +static void BM_pgm_dispatch(benchmark::State& state, TestInfo info) { + info.kernel_size = state.range(0); + pgm_dispatch(state, info); +} + +static void BM_pgm_dispatch_vary_slow_cycles(benchmark::State& state, TestInfo info) { + info.slow_kernel_cycles = state.range(0); + pgm_dispatch(state, info); +} static void Max12288Args(benchmark::internal::Benchmark* b) { b->Arg(256)->Arg(512)->Arg(1024)->Arg(2048)->Arg(4096)->Arg(8192)->Arg(12288); @@ -439,6 +444,11 @@ static void Max8192Args(benchmark::internal::Benchmark* b) { b->Arg(256)->Arg(512)->Arg(1024)->Arg(2048)->Arg(4096)->Arg(8192); } +static void KernelCycleArgs(benchmark::internal::Benchmark* b) { + // Dispatch time for most normal kernels is around 3000-4000 cycles. + b->Arg(0)->Arg(1000)->Arg(2000)->Arg(3000)->Arg(4000)->Arg(5000)->Arg(10000); +} + BENCHMARK_CAPTURE( BM_pgm_dispatch, brisc_only_trace, @@ -575,6 +585,13 @@ BENCHMARK_CAPTURE( TestInfo{.warmup_iterations = 5000, .slow_kernel_cycles = 5000, .n_cbs = 32, .use_trace = true, .use_all_cores = true}) ->Apply(Max8192Args) ->UseManualTime(); +// Intended to be GO-latency-bound +BENCHMARK_CAPTURE( + BM_pgm_dispatch_vary_slow_cycles, + 256_bytes_brisc_only_all_processors_trace, + TestInfo{.warmup_iterations = 5000, .kernel_size = 256, .ncrisc_enabled = false, .trisc_enabled = false, .use_trace = true, .use_all_cores = true}) + ->Apply(KernelCycleArgs) + ->UseManualTime(); int main(int argc, char** argv) { std::vector input_args(argv, argv + argc); if (test_args::has_command_option(input_args, "--custom")) { From 0df1047bbe08fa4f7e113f3f6f231871b0427632 Mon Sep 17 00:00:00 2001 From: Brian Liu Date: Tue, 18 Feb 2025 11:47:46 -0800 Subject: [PATCH 203/316] #0: Switch tensor.to_torch to return logical tensor - Rename/merge to_torch_logical_shape with to_torch - Add to_torch_with_padded_shape to support returning padded tensor * Switch tests that depend on returning padded tensor to use this * Rename legacy_output to padded_output for clarity * TODO: Remove this path after removing usage of to_torch_with_padded_shape --- .../sweep_tests/tt_lib_ops.py | 14 +++-- .../unit_testing/misc/test_padding_test.py | 4 +- .../unit_testing/misc/test_sharded.py | 2 +- .../misc/test_tilize_hpadding_matmul.py | 2 +- .../unit_tests/operations/test_fill_pad.py | 6 +-- .../unit_tests/tensor/test_tensor_creation.py | 4 +- ttnn/cpp/pybind11/pytensor.cpp | 51 ++++++++----------- ttnn/ttnn/operations/core.py | 4 +- 8 files changed, 43 insertions(+), 44 deletions(-) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py index 67563c1c924..5487338c636 100644 --- 
a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py @@ -1718,7 +1718,7 @@ def tilize_with_zero_padding(x, *args, device, dtype, layout, input_mem_config, t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) t1 = ttnn.tilize_with_zero_padding(t0, memory_config=output_mem_config) - return t1.cpu().to_torch() + return t1.cpu().to_torch_with_padded_shape() @setup_host_and_device @@ -1742,7 +1742,7 @@ def tilize_with_val_padding( memory_config=output_mem_config, ) - return t1.cpu().to_torch() + return t1.cpu().to_torch_with_padded_shape() @setup_host_and_device @@ -2224,7 +2224,10 @@ def tensor_pad( t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) t1 = t0.pad(output_tensor_shape, input_tensor_start, pad_value) - return tt2torch_tensor(t1) + tt_output = t1.cpu() + if tt_output.get_layout() != ttnn.ROW_MAJOR_LAYOUT: + tt_output = tt_output.to(ttnn.ROW_MAJOR_LAYOUT) + return tt_output.to_torch_with_padded_shape() @setup_host_and_device @@ -2271,7 +2274,10 @@ def pad_to_tile( t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) t1 = t0.pad_to_tile(pad_value) - return tt2torch_tensor(t1) + tt_output = t1.cpu() + if tt_output.get_layout() != ttnn.ROW_MAJOR_LAYOUT: + tt_output = tt_output.to(ttnn.ROW_MAJOR_LAYOUT) + return tt_output.to_torch_with_padded_shape() @setup_host_and_device diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_padding_test.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_padding_test.py index 726963c8465..c3025e416d6 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_padding_test.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_padding_test.py @@ -29,7 +29,7 @@ def test_run_padding_test(input_tensor_shape, output_tensor_shape, input_tensor_ # Pad inputs on host a_pad = a.pad(output_tensor_shape, input_tensor_start, pad_value) - a_pt = a_pad.to_torch() + a_pt = a_pad.to_torch_with_padded_shape() # Pytorch reference input_tensor_end = tuple(input_tensor_start[i] + input_tensor_shape[i] for i in range(len(input_tensor_shape))) @@ -172,7 +172,7 @@ def test_run_tile_padding_test(input_tensor_shape, pad_value): # Pad inputs on host a_pad = a.pad_to_tile(pad_value) - a_pt = a_pad.to_torch() + a_pt = a_pad.to_torch_with_padded_shape() # Pytorch reference input_tensor_end = tuple(input_tensor_shape[i] for i in range(len(input_tensor_shape))) diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py index d123cec54f9..19ca727f549 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py @@ -1797,7 +1797,7 @@ def test_sharded_tilize_with_val_padding(input_shape, sharding_config, output_dt interleaved_mem_config, ) - tt_got_back = yt.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch() + tt_got_back = yt.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch_with_padded_shape() y = torch.nn.functional.pad(x, [0, 0, 0, roundup32(H) - H], "constant", 1.0) diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_tilize_hpadding_matmul.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_tilize_hpadding_matmul.py index e21c4ac19e6..628f57e7fa4 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_tilize_hpadding_matmul.py +++ 
b/tests/tt_eager/python_api_testing/unit_testing/misc/test_tilize_hpadding_matmul.py @@ -49,7 +49,7 @@ def run_tilize_matmul_test(M, K, N, device): print("Shape of B_t - " + str(b_t.padded_shape)) t2 = ttnn.matmul(a_t, b_t) assert list(t2.padded_shape) == output_shape - tt_host_rm = t2.cpu().to_torch() + tt_host_rm = t2.cpu().to_torch_with_padded_shape() pyt_got_back = tt_host_rm.reshape(output_shape) # TODO: add support to remove padding in untilize pyt_got_back_rm = untilize(pyt_got_back) diff --git a/tests/ttnn/unit_tests/operations/test_fill_pad.py b/tests/ttnn/unit_tests/operations/test_fill_pad.py index 3f1b9289e7f..4b7884503f5 100644 --- a/tests/ttnn/unit_tests/operations/test_fill_pad.py +++ b/tests/ttnn/unit_tests/operations/test_fill_pad.py @@ -93,7 +93,7 @@ def test_fill_pad( ) output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=output_mem_config) - padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() + padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch_with_padded_shape() assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor) @@ -160,7 +160,7 @@ def test_fill_pad_complex_sharding(device, fill_value, shape, shard_scheme, dtyp ) output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=ttnn.DRAM_MEMORY_CONFIG) - padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() + padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch_with_padded_shape() assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor, 0.99) @@ -233,6 +233,6 @@ def test_fill_pad_sharded(device, fill_value, shape, shard_scheme, dtype): ) output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=ttnn.DRAM_MEMORY_CONFIG) - padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch() + padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch_with_padded_shape() assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor, 0.99) diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_creation.py b/tests/ttnn/unit_tests/tensor/test_tensor_creation.py index e3df9a79765..334f13ec362 100644 --- a/tests/ttnn/unit_tests/tensor/test_tensor_creation.py +++ b/tests/ttnn/unit_tests/tensor/test_tensor_creation.py @@ -243,8 +243,8 @@ def test_tensor_creation_with_memory_config(shape, memory_config, tt_dtype, layo tt_tensor_1 = tt_tensor_1.cpu() tt_tensor_2 = tt_tensor_2.cpu() - py_tensor_after_round_trip_1 = tt_tensor_1.to_torch_with_logical_shape() - py_tensor_after_round_trip_2 = tt_tensor_2.to_torch_with_logical_shape() + py_tensor_after_round_trip_1 = tt_tensor_1.to_torch() + py_tensor_after_round_trip_2 = tt_tensor_2.to_torch() py_tensor_after_round_trip_3 = ttnn.to_torch(tt_tensor_1) py_tensor_after_round_trip_4 = ttnn.to_torch(tt_tensor_2) diff --git a/ttnn/cpp/pybind11/pytensor.cpp b/ttnn/cpp/pybind11/pytensor.cpp index f6e55603d8a..51430ff6b2c 100644 --- a/ttnn/cpp/pybind11/pytensor.cpp +++ b/ttnn/cpp/pybind11/pytensor.cpp @@ -387,12 +387,12 @@ Tensor convert_python_tensors_to_tt_tensors( template owned_buffer::Buffer create_row_major_owned_buffer( - owned_buffer::Buffer&& owned_buffer, const ttnn::TensorSpec& tensor_spec, const bool legacy_output) { + owned_buffer::Buffer&& owned_buffer, const ttnn::TensorSpec& tensor_spec, const bool padded_output) { TT_FATAL( !tensor_spec.memory_config().is_sharded() or tensor_spec.memory_config().shard_spec.has_value(), "Sharded tensors must have a shard spec when converting to tt 
tensors!"); - if (legacy_output) { + if (padded_output) { if (tensor_spec.layout() == Layout::TILE) { auto data = tensor_impl::convert_layout_tile_to_row_major( tensor_spec.physical_shape(), tensor_spec.tile(), owned_buffer); @@ -410,39 +410,39 @@ owned_buffer::Buffer create_row_major_owned_buffer( } std::variant get_host_buffer_from_tensor( - const Tensor& tt_tensor, const bool legacy_output) { + const Tensor& tt_tensor, const bool padded_output) { TT_ASSERT(tt_tensor.storage_type() == StorageType::OWNED or tt_tensor.storage_type() == StorageType::BORROWED); using RetType = std::variant; return std::visit( tt::stl::overloaded{ - [&tt_tensor, legacy_output](const OwnedStorage& storage) -> RetType { + [&tt_tensor, padded_output](const OwnedStorage& storage) -> RetType { const auto& tensor_spec = tt_tensor.get_tensor_spec(); const auto tt_dtype = tensor_spec.data_type(); switch (tt_dtype) { case DataType::UINT8: { return create_row_major_owned_buffer( - std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, legacy_output); + std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, padded_output); } case DataType::UINT16: { return create_row_major_owned_buffer( - std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, legacy_output); + std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, padded_output); } case DataType::INT32: { return create_row_major_owned_buffer( - std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, legacy_output); + std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, padded_output); } case DataType::UINT32: { return create_row_major_owned_buffer( - std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, legacy_output); + std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, padded_output); } case DataType::FLOAT32: { return create_row_major_owned_buffer( - std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, legacy_output); + std::move(owned_buffer::get_as(storage.buffer)), tensor_spec, padded_output); } case DataType::BFLOAT16: { return create_row_major_owned_buffer( - std::move(owned_buffer::get_as<::bfloat16>(storage.buffer)), tensor_spec, legacy_output); + std::move(owned_buffer::get_as<::bfloat16>(storage.buffer)), tensor_spec, padded_output); } case DataType::BFLOAT8_B: case DataType::BFLOAT4_B: { @@ -455,7 +455,7 @@ std::variant get_host_buffer_from_tensor( : unpack_bfp4_tiles_into_float_vec( uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile); auto input_float_buffer = owned_buffer::create(std::move(float_unpacked_data)); - return create_row_major_owned_buffer(std::move(input_float_buffer), tensor_spec, legacy_output); + return create_row_major_owned_buffer(std::move(input_float_buffer), tensor_spec, padded_output); } default: { TT_THROW("Unsupported DataType: {}", tt_dtype); @@ -473,20 +473,13 @@ std::variant get_host_buffer_from_tensor( tt_tensor.get_storage()); } -py::object convert_tt_tensor_to_torch_tensor(const Tensor& tt_tensor, const bool legacy_output = false) { +py::object convert_tt_tensor_to_torch_tensor(const Tensor& tt_tensor, const bool padded_output = false) { GraphTracker::instance().track_function_start( - "tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor", tt_tensor, legacy_output); - - // TODO: Remove legacy_output flag which supports old behaviour of returning tensors with padded shape. 
- // These cases need to be fixed: - // ROW_MAJOR tensors with padding (since ROW_MAJOR has no alignment, cannot automatically strip data unless - // padded shape is queried) Physical sharding on padded shape (unlike interleaved tensors, cannot derive an - // equivalent logical shard spec to strip out data) - // One way to clean this up is: - // 1. Update tests to use ttnn.from_torch and ttnn.to_torch - // 2. Fix usage of tensor.to_torch inside ttnn functional APIs - // 3. Deprecate old tensor.to_torch and rename tensor.to_torch_with_logical_shape back to tensor.to_torch - auto buffer = get_host_buffer_from_tensor(tt_tensor, legacy_output); + "tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor", tt_tensor, padded_output); + + // TODO: Remove padded_output flag which supports old behaviour of returning tensors with padded shape. + // Need to update tests to not use tensor.to_torch_with_padded_shape() + auto buffer = get_host_buffer_from_tensor(tt_tensor, padded_output); py::object torch = py::module_::import("torch"); auto frombuffer = torch.attr("frombuffer"); @@ -530,7 +523,7 @@ py::object convert_tt_tensor_to_torch_tensor(const Tensor& tt_tensor, const bool return frombuffer(buffer, py::arg("dtype") = torch_dtype); }(); - if (legacy_output) { + if (padded_output) { auto shape = tt_tensor.get_padded_shape(); torch_shape = std::vector(shape.cbegin(), shape.cend()); } @@ -1474,7 +1467,7 @@ void pytensor_module(py::module& m_tensor) { )doc", py::return_value_policy::reference) .def( - "to_torch", + "to_torch_with_padded_shape", [](const Tensor& self) -> py::object { return detail::convert_tt_tensor_to_torch_tensor(self, true); }, R"doc( Convert tensor to torch tensor using legacy padded shape. @@ -1484,11 +1477,11 @@ void pytensor_module(py::module& m_tensor) { .. code-block:: python - data = tt_tensor.cpu().to_torch() # move TT Tensor to host and convert it to torch tensor + data = tt_tensor.cpu().to_torch_with_padded_shape() # move TT Tensor to host and convert it to torch tensor )doc") .def( - "to_torch_with_logical_shape", + "to_torch", [](const Tensor& self) -> py::object { return detail::convert_tt_tensor_to_torch_tensor(self); }, R"doc( Convert tensor to torch tensor. @@ -1497,7 +1490,7 @@ void pytensor_module(py::module& m_tensor) { .. 
code-block:: python - data = tt_tensor.cpu().to_torch_with_logical_shape() # move TT Tensor to host and convert it to torch tensor + data = tt_tensor.cpu().to_torch() # move TT Tensor to host and convert it to torch tensor )doc") .def( diff --git a/ttnn/ttnn/operations/core.py b/ttnn/ttnn/operations/core.py index 39db661f28e..c47d76b4d3c 100644 --- a/ttnn/ttnn/operations/core.py +++ b/ttnn/ttnn/operations/core.py @@ -312,14 +312,14 @@ def to_torch( raise RuntimeError("ttnn.to_torch: Shard spec must not be None for sharded tensors") if memory_config.is_sharded() and memory_config.shard_spec.mode == ttnn.ShardMode.LOGICAL: - tensor = tensor.to_torch_with_logical_shape() + tensor = tensor.to_torch() else: if (tensor.layout != ttnn.ROW_MAJOR_LAYOUT) and not ( tensor.dtype == ttnn.bfloat8_b or tensor.dtype == ttnn.bfloat4_b ): tensor = tensor.to(ttnn.ROW_MAJOR_LAYOUT, device) - tensor = tensor.to_torch_with_logical_shape() + tensor = tensor.to_torch() if torch_rank is not None: while len(tensor.shape) > torch_rank: From f4719c78f638d8aeb134aa10a67554a402a69531 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Thu, 20 Feb 2025 17:53:05 -0500 Subject: [PATCH 204/316] #17477: Adopt ND coordinate system in `MeshDeviceView` and the related abstractions (#18073) ### Ticket #17477 ### Problem description Continuing plumbing ND coordinate system across Metal / TTNN. ### What's changed * Adopted `SimpleMeshShape` in `MeshDeviceView`. * Removed `Coordinate`. * Simplified `MeshDeviceView` construction (unused `CoordinateMapper`), simplified getting line/ring coordinates. * Support ND rotation when requesting specific mesh shape from `SystemMesh`. * More features in ND `MeshContainer`, `MeshCoordinate`, `MeshCoordinateRange`. ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13441887717) - pending - [Build failures in programming examples fixed and verified](https://github.com/tenstorrent/tt-metal/actions/runs/13444193986) - [X] New/Existing tests provide coverage for changes - [X] Ran the affected T3K distributed tests locally (`unit_tests_ttnn_cc`, `unit_tests_ttnn_tensor`, `test_distributed`, `distributed_unit_tests_wormhole_b0`). 
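For reference, a minimal sketch of how a call site migrates from the removed 2D accessors to the ND `MeshCoordinate` API. This is illustrative only and not part of the change: the `enumerate_devices` helper and the install-style header paths are assumptions; the types and member functions mirror the call-site updates in the tests below.

```cpp
// Illustrative sketch only -- mirrors the call-site updates in the tests in this patch.
#include <tt-metalium/mesh_coord.hpp>        // assumed install path for mesh_coord.hpp
#include <tt-metalium/mesh_device_view.hpp>  // assumed install path for mesh_device_view.hpp

using tt::tt_metal::distributed::MeshCoordinate;
using tt::tt_metal::distributed::MeshCoordinateRange;
using tt::tt_metal::distributed::MeshDeviceView;

void enumerate_devices(const MeshDeviceView& view) {
    // Before: view.get_device(/*row=*/0, /*col=*/0) with the 2D-only Coordinate struct.
    // After: every lookup takes an ND MeshCoordinate.
    auto* first = view.get_device(MeshCoordinate(0, 0));
    (void)first;

    // Row-major traversal of the whole view; MeshCoordinateRange covers 1D/2D/3D shapes alike.
    for (const MeshCoordinate& coord : MeshCoordinateRange(view.shape())) {
        auto* device = view.get_device(coord);
        (void)device;
    }
}
```
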
--- .../tt_metal/distributed/test_mesh_buffer.cpp | 12 +- .../tt_metal/distributed/test_mesh_coord.cpp | 42 ++++ .../tt_metal/distributed/test_mesh_events.cpp | 23 +- .../distributed/test_mesh_sub_device.cpp | 3 +- .../distributed/test_mesh_workload.cpp | 4 +- .../test_ethernet_hop_latencies_no_edm.cpp | 53 ++-- tests/ttnn/distributed/test_distributed.cpp | 22 ++ .../distributed/test_distributed_reshape.cpp | 52 +++- ...erisc_data_mover_loopback_with_workers.cpp | 29 ++- tt_metal/api/tt-metalium/distributed.hpp | 5 +- tt_metal/api/tt-metalium/mesh_buffer.hpp | 1 - .../api/tt-metalium/mesh_command_queue.hpp | 2 +- tt_metal/api/tt-metalium/mesh_config.hpp | 6 +- tt_metal/api/tt-metalium/mesh_coord.hpp | 31 +++ tt_metal/api/tt-metalium/mesh_device_view.hpp | 128 ++++------ tt_metal/common/mesh_coord.cpp | 16 ++ tt_metal/distributed/mesh_buffer.cpp | 4 - tt_metal/distributed/mesh_command_queue.cpp | 14 +- tt_metal/distributed/mesh_device.cpp | 26 +- tt_metal/distributed/mesh_device_view.cpp | 228 +++++++----------- tt_metal/distributed/system_mesh.cpp | 187 +++++++------- .../distributed_program_dispatch.cpp | 3 +- .../distributed_buffer_rw.cpp | 2 +- .../distributed_eltwise_add.cpp | 2 +- ttnn/cpp/ttnn/distributed/api.cpp | 9 +- .../ttnn/distributed/distributed_pybind.cpp | 7 +- ttnn/cpp/ttnn/distributed/types.hpp | 4 + .../ccl/all_gather/device/all_gather_op.cpp | 17 +- .../device/reduce_scatter_op.cpp | 16 +- .../device/all_gather_async_op.cpp | 7 +- .../device/reduce_scatter_async_op.cpp | 7 +- ttnn/cpp/ttnn/tensor/storage.cpp | 2 +- ttnn/cpp/ttnn/tensor/tensor_impl.cpp | 27 +-- 33 files changed, 542 insertions(+), 449 deletions(-) diff --git a/tests/tt_metal/distributed/test_mesh_buffer.cpp b/tests/tt_metal/distributed/test_mesh_buffer.cpp index f85f57a329b..d1834c37595 100644 --- a/tests/tt_metal/distributed/test_mesh_buffer.cpp +++ b/tests/tt_metal/distributed/test_mesh_buffer.cpp @@ -137,9 +137,9 @@ TEST_F(MeshBufferTestT3000, GetDeviceBuffer) { MeshBuffer::create(ReplicatedBufferConfig{.size = 16 << 10}, device_local_config, mesh_device_.get()); // Out of bounds coordinates. 
- EXPECT_ANY_THROW(replicated_buffer->get_device_buffer(Coordinate{2, 4})); + EXPECT_ANY_THROW(replicated_buffer->get_device_buffer(MeshCoordinate{2, 4})); - EXPECT_NO_THROW(replicated_buffer->get_device_buffer(Coordinate{1, 3})); + EXPECT_NO_THROW(replicated_buffer->get_device_buffer(MeshCoordinate{1, 3})); } class DeviceLocalMeshBufferShardingTest @@ -174,14 +174,14 @@ TEST_P(DeviceLocalMeshBufferShardingTest, ShardingTest) { for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { - WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, Coordinate(logical_y, logical_x)); + WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, MeshCoordinate(logical_y, logical_x)); } } for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { std::vector dst_vec = {}; - ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, Coordinate(logical_y, logical_x)); + ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, MeshCoordinate(logical_y, logical_x)); EXPECT_EQ(dst_vec, src_vec); } } @@ -304,14 +304,14 @@ TEST_F(MeshBufferTestSuite, InterleavedShardsReadWrite) { std::iota(src_vec.begin(), src_vec.end(), i); for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { - WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, Coordinate(logical_y, logical_x)); + WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, MeshCoordinate(logical_y, logical_x)); } } for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { std::vector dst_vec = {}; - ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, Coordinate(logical_y, logical_x)); + ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, MeshCoordinate(logical_y, logical_x)); EXPECT_EQ(dst_vec, src_vec); } } diff --git a/tests/tt_metal/distributed/test_mesh_coord.cpp b/tests/tt_metal/distributed/test_mesh_coord.cpp index 9c364c735b4..16eaa7a04bd 100644 --- a/tests/tt_metal/distributed/test_mesh_coord.cpp +++ b/tests/tt_metal/distributed/test_mesh_coord.cpp @@ -13,6 +13,7 @@ namespace { using ::testing::ElementsAre; using ::testing::UnorderedElementsAre; + TEST(SimpleMeshShapeTest, Construction) { SimpleMeshShape shape_1d(3); EXPECT_EQ(shape_1d.dims(), 1); @@ -172,6 +173,31 @@ TEST(MeshCoordinateRangeTest, SubrangeOneElement) { EXPECT_THAT(coords, ElementsAre(MeshCoordinate(1, 1, 1))); } +TEST(MeshCoordinateRangeTest, Contains) { + MeshCoordinateRange range(MeshCoordinate(1, 1, 3), MeshCoordinate(1, 1, 3)); + EXPECT_TRUE(range.contains(MeshCoordinate(1, 1, 3))); + + range = MeshCoordinateRange(MeshCoordinate(0, 2), MeshCoordinate(1, 2)); + EXPECT_TRUE(range.contains(MeshCoordinate(0, 2))); + EXPECT_TRUE(range.contains(MeshCoordinate(1, 2))); + EXPECT_FALSE(range.contains(MeshCoordinate(0, 1))); + EXPECT_FALSE(range.contains(MeshCoordinate(2, 1))); + EXPECT_FALSE(range.contains(MeshCoordinate(2, 2))); +} + +TEST(MeshCoordinateRangeTest, Dimensionality) { + EXPECT_EQ(MeshCoordinateRange(MeshCoordinate(0), MeshCoordinate(5)).dims(), 1); + EXPECT_EQ(MeshCoordinateRange(MeshCoordinate(0, 1), MeshCoordinate(5, 1)).dims(), 2); + EXPECT_EQ(MeshCoordinateRange(MeshCoordinate(0, 
1, 2), MeshCoordinate(5, 1, 2)).dims(), 3); +} + +TEST(MeshCoordinateRangeTest, ContainsMismatchedDimensions) { + MeshCoordinateRange range(MeshCoordinate(1, 1, 3), MeshCoordinate(1, 1, 3)); + + EXPECT_EQ(range.dims(), 3); + EXPECT_ANY_THROW(range.contains(MeshCoordinate(1, 1))); +} + TEST(MeshCoordinateRangeTest, MismatchedDimensions) { MeshCoordinate start(1, 0); MeshCoordinate end(2, 3, 1); @@ -221,6 +247,22 @@ TEST(MeshContainerTest, InitialValues) { EXPECT_THAT(initial_values, ElementsAre(3, 3, 3, 3, 3, 3)); } +TEST(MeshContainerTest, FromVector) { + SimpleMeshShape shape(2, 3); + MeshContainer container(shape, std::vector{0, 1, 2, 3, 4, 5}); + + std::vector initial_values; + for (const auto& [_, value] : container) { + initial_values.push_back(value); + } + EXPECT_THAT(initial_values, ElementsAre(0, 1, 2, 3, 4, 5)); +} + +TEST(MeshContainerTest, FromVectorInvalidSize) { + SimpleMeshShape shape(2, 3); + EXPECT_ANY_THROW(MeshContainer(shape, std::vector{0, 1, 2, 3, 4})); +} + TEST(MeshContainerTest, ElementAccessRowMajor) { SimpleMeshShape shape(2, 3); MeshContainer container(shape, 0); diff --git a/tests/tt_metal/distributed/test_mesh_events.cpp b/tests/tt_metal/distributed/test_mesh_events.cpp index 336c8e8ccf1..4b942f0391d 100644 --- a/tests/tt_metal/distributed/test_mesh_events.cpp +++ b/tests/tt_metal/distributed/test_mesh_events.cpp @@ -50,9 +50,12 @@ TEST_F(MeshEventsTestSuite, ReplicatedAsyncIO) { for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { readback_vecs.push_back({}); - auto shard = buf->get_device_buffer(Coordinate(logical_y, logical_x)); + auto shard = buf->get_device_buffer(MeshCoordinate(logical_y, logical_x)); ReadShard( - mesh_device_->mesh_command_queue(1), readback_vecs.back(), buf, Coordinate(logical_y, logical_x)); + mesh_device_->mesh_command_queue(1), + readback_vecs.back(), + buf, + MeshCoordinate(logical_y, logical_x)); } } @@ -173,7 +176,7 @@ TEST_F(MeshEventsTestSuite, AsyncWorkloadAndIO) { mesh_device_->mesh_command_queue(1), dst_vec, output_bufs[col_idx * worker_grid_size.y + row_idx], - Coordinate(logical_y, logical_x)); + MeshCoordinate(logical_y, logical_x)); if (logical_y == 0) { for (int i = 0; i < dst_vec.size(); i++) { EXPECT_EQ(dst_vec[i].to_float(), (2 * iter + 5)); @@ -224,9 +227,12 @@ TEST_F(MeshEventsTestSuite, CustomDeviceRanges) { for (std::size_t logical_x = devices_0.start_coord.x; logical_x < devices_0.end_coord.x; logical_x++) { for (std::size_t logical_y = devices_0.start_coord.y; logical_y < devices_0.end_coord.y; logical_y++) { readback_vecs.push_back({}); - auto shard = buf->get_device_buffer(Coordinate(logical_y, logical_x)); + auto shard = buf->get_device_buffer(MeshCoordinate(logical_y, logical_x)); ReadShard( - mesh_device_->mesh_command_queue(0), readback_vecs.back(), buf, Coordinate(logical_y, logical_x)); + mesh_device_->mesh_command_queue(0), + readback_vecs.back(), + buf, + MeshCoordinate(logical_y, logical_x)); } } @@ -237,9 +243,12 @@ TEST_F(MeshEventsTestSuite, CustomDeviceRanges) { for (std::size_t logical_x = devices_1.start_coord.x; logical_x < devices_1.end_coord.x; logical_x++) { for (std::size_t logical_y = devices_1.start_coord.y; logical_y < devices_1.end_coord.y; logical_y++) { readback_vecs.push_back({}); - auto shard = buf->get_device_buffer(Coordinate(logical_y, logical_x)); + auto shard = buf->get_device_buffer(MeshCoordinate(logical_y, logical_x)); ReadShard( - 
mesh_device_->mesh_command_queue(0), readback_vecs.back(), buf, Coordinate(logical_y, logical_x)); + mesh_device_->mesh_command_queue(0), + readback_vecs.back(), + buf, + MeshCoordinate(logical_y, logical_x)); } } for (auto& vec : readback_vecs) { diff --git a/tests/tt_metal/distributed/test_mesh_sub_device.cpp b/tests/tt_metal/distributed/test_mesh_sub_device.cpp index d16bfedc48a..b39608a0781 100644 --- a/tests/tt_metal/distributed/test_mesh_sub_device.cpp +++ b/tests/tt_metal/distributed/test_mesh_sub_device.cpp @@ -129,7 +129,8 @@ TEST_F(MeshSubDeviceTestSuite, DataCopyOnSubDevices) { for (std::size_t logical_x = 0; logical_x < output_buf->device()->num_cols(); logical_x++) { for (std::size_t logical_y = 0; logical_y < output_buf->device()->num_rows(); logical_y++) { std::vector dst_vec; - ReadShard(mesh_device_->mesh_command_queue(), dst_vec, output_buf, Coordinate(logical_y, logical_x)); + ReadShard( + mesh_device_->mesh_command_queue(), dst_vec, output_buf, MeshCoordinate(logical_y, logical_x)); EXPECT_EQ(dst_vec, src_vec); } } diff --git a/tests/tt_metal/distributed/test_mesh_workload.cpp b/tests/tt_metal/distributed/test_mesh_workload.cpp index ef19ed2395c..66aa84357a6 100644 --- a/tests/tt_metal/distributed/test_mesh_workload.cpp +++ b/tests/tt_metal/distributed/test_mesh_workload.cpp @@ -570,7 +570,7 @@ TEST_F(MeshWorkloadTestSuite, EltwiseBinaryMeshWorkload) { mesh_device_->mesh_command_queue(), dst_vec, output_bufs[col_idx * worker_grid_size.y + row_idx], - Coordinate(logical_y, logical_x)); + MeshCoordinate(logical_y, logical_x)); if (logical_y == 0) { for (int i = 0; i < dst_vec.size(); i++) { EXPECT_EQ(dst_vec[i].to_float(), 5); @@ -687,7 +687,7 @@ TEST_F(MeshWorkloadTestSuite, MeshWorkloadSanity) { mesh_device_->mesh_command_queue(), dst_vec, output_buffers[col_idx * worker_grid_size.y + row_idx], - Coordinate(logical_y, logical_x)); + MeshCoordinate(logical_y, logical_x)); for (int i = 0; i < dst_vec.size(); i++) { float ref_val = std::pow(2, (iter % 2) + 1); if (i >= 512) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp index 5e8a4b23024..3b9177b6596 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp @@ -33,6 +33,7 @@ #include "eth_l1_address_map.h" using tt::tt_metal::IDevice; +using tt::tt_metal::distributed::MeshCoordinate; using tt::tt_metal::distributed::MeshDevice; using tt::tt_metal::distributed::MeshDeviceConfig; using tt::tt_metal::distributed::MeshDeviceView; @@ -453,44 +454,44 @@ int main(int argc, char** argv) { switch (n_hops) { case 2: return std::vector{ - view.get_device(0, 0), - view.get_device(0, 1), + view.get_device(MeshCoordinate(0, 0)), + view.get_device(MeshCoordinate(0, 1)), }; case 4: return std::vector{ - view.get_device(1, 1), - view.get_device(0, 1), - view.get_device(0, 2), - view.get_device(1, 2), + view.get_device(MeshCoordinate(1, 1)), + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(1, 2)), }; case 8: return std::vector{ - view.get_device(1, 1), - view.get_device(1, 0), - view.get_device(0, 0), - view.get_device(0, 1), - view.get_device(0, 2), - view.get_device(0, 3), - view.get_device(1, 3), - view.get_device(1, 2), + view.get_device(MeshCoordinate(1, 1)), + 
view.get_device(MeshCoordinate(1, 0)), + view.get_device(MeshCoordinate(0, 0)), + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(0, 3)), + view.get_device(MeshCoordinate(1, 3)), + view.get_device(MeshCoordinate(1, 2)), }; case 12: // Does an extra loop through the inner ring return std::vector{ - view.get_device(1, 1), - view.get_device(1, 0), - view.get_device(0, 0), - view.get_device(0, 1), - view.get_device(0, 2), - view.get_device(1, 2), - view.get_device(1, 1), - view.get_device(0, 1), - view.get_device(0, 2), - view.get_device(0, 3), - view.get_device(1, 3), - view.get_device(1, 2), + view.get_device(MeshCoordinate(1, 1)), + view.get_device(MeshCoordinate(1, 0)), + view.get_device(MeshCoordinate(0, 0)), + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(1, 2)), + view.get_device(MeshCoordinate(1, 1)), + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(0, 3)), + view.get_device(MeshCoordinate(1, 3)), + view.get_device(MeshCoordinate(1, 2)), }; default: TT_THROW("Unsupported hop_count"); return std::vector{}; diff --git a/tests/ttnn/distributed/test_distributed.cpp b/tests/ttnn/distributed/test_distributed.cpp index f6e4cf7d5da..c96312176f1 100644 --- a/tests/ttnn/distributed/test_distributed.cpp +++ b/tests/ttnn/distributed/test_distributed.cpp @@ -4,11 +4,15 @@ #include +#include + #include #include namespace ttnn::distributed::test { +using ::tt::tt_metal::distributed::MeshContainer; + class DistributedTest : public ::testing::Test { protected: void SetUp() override {} @@ -46,4 +50,22 @@ TEST_F(DistributedTest, TestNumDramChannels) { EXPECT_EQ(mesh->num_dram_channels(), 96); // 8 devices * 12 channels } +TEST_F(DistributedTest, ViewIs2D) { + auto mesh = ttnn::distributed::open_mesh_device( + {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); + std::vector devices = mesh->get_devices(); + + MeshContainer container_1d(SimpleMeshShape(8), devices); + MeshDeviceView view_1d(container_1d); + EXPECT_FALSE(view_1d.is_mesh_2d()); + + MeshContainer container_2d(SimpleMeshShape(2, 4), devices); + MeshDeviceView view_2d(container_2d); + EXPECT_TRUE(view_2d.is_mesh_2d()); + + MeshContainer container_3d(SimpleMeshShape(2, 2, 2), devices); + MeshDeviceView view_3d(container_3d); + EXPECT_FALSE(view_3d.is_mesh_2d()); +} + } // namespace ttnn::distributed::test diff --git a/tests/ttnn/distributed/test_distributed_reshape.cpp b/tests/ttnn/distributed/test_distributed_reshape.cpp index 9b84cb3fec0..212368f8d7f 100644 --- a/tests/ttnn/distributed/test_distributed_reshape.cpp +++ b/tests/ttnn/distributed/test_distributed_reshape.cpp @@ -3,17 +3,21 @@ // SPDX-License-Identifier: Apache-2.0 #include - +#include #include #include #include #include +#include "mesh_coord.hpp" #include "tests/tt_metal/test_utils/env_vars.hpp" namespace ttnn::distributed::test { +namespace { + +using ::testing::SizeIs; // Helper function to check test environment -void check_test_environment() { +void check_t3k_test_environment() { auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); const auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); @@ -39,10 +43,10 @@ static constexpr std::array kMeshShapes{ class MeshConfigurationTest : public ::testing::TestWithParam { protected: - void SetUp() 
override { check_test_environment(); } + void SetUp() override { check_t3k_test_environment(); } }; -TEST_P(MeshConfigurationTest, TestMeshConfigurations) { +TEST_P(MeshConfigurationTest, MeshConfigurations) { const auto& shape = GetParam(); auto mesh = ttnn::distributed::open_mesh_device( {shape.num_rows, shape.num_cols}, @@ -55,15 +59,24 @@ TEST_P(MeshConfigurationTest, TestMeshConfigurations) { ttnn::distributed::close_mesh_device(mesh); } +TEST_P(MeshConfigurationTest, GetPhysicalDeviceIds) { + const auto& shape = GetParam(); + + auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); + EXPECT_THAT( + system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(shape)}), + SizeIs(shape.num_cols * shape.num_rows)); +} + // Test all possible mesh configurations on T3000 INSTANTIATE_TEST_SUITE_P(MeshShapes, MeshConfigurationTest, ::testing::ValuesIn(kMeshShapes)); class MeshReshapeTest : public ::testing::TestWithParam> { protected: - void SetUp() override { check_test_environment(); } + void SetUp() override { check_t3k_test_environment(); } }; -TEST_P(MeshReshapeTest, TestReshapeBetweenConfigurations) { +TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { const auto& [old_shape, new_shape] = GetParam(); if ((old_shape.num_rows * old_shape.num_cols) != (new_shape.num_rows * new_shape.num_cols)) { @@ -105,9 +118,31 @@ INSTANTIATE_TEST_SUITE_P( // Base class for non-parameterized tests class T3000ReshapeTest : public ::testing::Test { protected: - void SetUp() override { check_test_environment(); } + void SetUp() override { check_t3k_test_environment(); } }; +TEST_F(T3000ReshapeTest, InvalidRequestedShape) { + auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); + + // Shape too big. + EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(9)})); + EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 5)})); + + // Invalid offset. + EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8), .offset = MeshCoordinate(0, 1)})); + EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 3), .offset = MeshCoordinate(1, 1)})); + + // Offset dimensionality mismatch. + EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 3), .offset = MeshCoordinate(1)})); + + // Mismatch system mesh shape. + EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(8), .offset = MeshCoordinate(1)})); +} + TEST_F(T3000ReshapeTest, InvalidReshapeDimensions) { auto mesh = ttnn::distributed::open_mesh_device( {1, 8}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); @@ -201,7 +236,7 @@ TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { // Fetch the device ids for a physically connected 2x2 mesh. auto physical_device_ids = system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{ - .mesh_shape = MeshShape{2, 2}, + .mesh_shape = SimpleMeshShape(2, 2), }); // Supply the physical device ids to the mesh constructor that we know we know is 2x2 physically connected. 
@@ -245,4 +280,5 @@ TEST_F(T3000ReshapeTest, From2x2To1x4) { EXPECT_EQ(mesh_1x4_device_ids, expected_1x4_device_ids); } +} // namespace } // namespace ttnn::distributed::test diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index e45aa9d9395..52662ba9eef 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -56,6 +56,7 @@ struct SubdeviceInfo { std::unordered_map fabric_subdevice_id; }; +using tt::tt_metal::distributed::MeshCoordinate; using tt::tt_metal::distributed::MeshDevice; using tt::tt_metal::distributed::MeshDeviceConfig; using tt::tt_metal::distributed::MeshDeviceView; @@ -1125,7 +1126,10 @@ int TestLineFabricEntrypoint( // build a line of devices std::vector devices = { - view.get_device(0, 0), view.get_device(0, 1), view.get_device(0, 2), view.get_device(0, 3)}; + view.get_device(MeshCoordinate(0, 0)), + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(0, 3))}; std::vector programs(enable_persistent_fabric ? 1 : devices.size()); std::optional subdevice_managers = std::nullopt; std::optional> fabric_programs; @@ -1206,8 +1210,8 @@ int TestLoopbackEntrypoint( T3000TestDevice test_fixture; auto view = test_fixture.mesh_device_->get_view(); - const auto& device_0 = view.get_device(0, 0); - const auto& device_1 = view.get_device(0, 1); + const auto& device_0 = view.get_device(MeshCoordinate(0, 0)); + const auto& device_1 = view.get_device(MeshCoordinate(0, 1)); auto const& active_eth_cores = device_0->get_active_ethernet_cores(true); auto eth_sender_core_iter = active_eth_cores.begin(); @@ -1390,7 +1394,7 @@ bool TestMultiInputReaderKernel( std::vector devices; devices.reserve(fabric_num_devices); for (size_t i = 0; i < fabric_num_devices; i++) { - devices.push_back(view.get_device(0, i)); + devices.push_back(view.get_device(MeshCoordinate(0, i))); } std::vector programs(enable_persistent_fabric ? 
1 : devices.size()); @@ -2201,7 +2205,7 @@ bool RunPipelinedWorkersTest( T3000TestDevice test_fixture; auto view = test_fixture.mesh_device_->get_view(); - IDevice* device = view.get_device(0, 0); + IDevice* device = view.get_device(MeshCoordinate(0, 0)); ; // General setup is as follows: @@ -2741,7 +2745,10 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) { // build a line of devices std::vector devices = { - view.get_device(0, 1), view.get_device(1, 1), view.get_device(1, 2), view.get_device(0, 2)}; + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(1, 1)), + view.get_device(MeshCoordinate(1, 2)), + view.get_device(MeshCoordinate(0, 2))}; const size_t num_devices = devices.size(); TT_FATAL( test_expected_num_devices == num_devices, @@ -2861,7 +2868,10 @@ void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_li // build a line of devices std::vector devices = { - view.get_device(0, 0), view.get_device(0, 1), view.get_device(0, 2), view.get_device(0, 3)}; + view.get_device(MeshCoordinate(0, 0)), + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(0, 3))}; const size_t num_devices = devices.size(); TT_FATAL( test_expected_num_devices == num_devices, @@ -3001,7 +3011,10 @@ void RunWriteThroughputStabilityTestWithPersistentFabric( // Get the inner 4 device ring on a WH T3K device so that we can use both links for all devices std::vector devices_ = { - view.get_device(0, 1), view.get_device(0, 2), view.get_device(1, 2), view.get_device(1, 1)}; + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(1, 2)), + view.get_device(MeshCoordinate(1, 1))}; std::vector devices; devices.reserve(line_size); for (size_t i = 0; i < line_size; i++) { diff --git a/tt_metal/api/tt-metalium/distributed.hpp b/tt_metal/api/tt-metalium/distributed.hpp index 017214b437a..c1a1fa62fe5 100644 --- a/tt_metal/api/tt-metalium/distributed.hpp +++ b/tt_metal/api/tt-metalium/distributed.hpp @@ -6,6 +6,7 @@ #include "mesh_buffer.hpp" #include "mesh_command_queue.hpp" +#include "mesh_coord.hpp" #include "mesh_event.hpp" namespace tt::tt_metal { @@ -29,7 +30,7 @@ void WriteShard( MeshCommandQueue& mesh_cq, std::shared_ptr& mesh_buffer, std::vector& src, - const Coordinate& coord, + const MeshCoordinate& coord, bool blocking = false) { std::vector shard_data_transfers = {{ .shard_coord = coord, @@ -44,7 +45,7 @@ void ReadShard( MeshCommandQueue& mesh_cq, std::vector& dst, std::shared_ptr& mesh_buffer, - const Coordinate& coord, + const MeshCoordinate& coord, bool blocking = true) { auto shard = mesh_buffer->get_device_buffer(coord); dst.resize(shard->page_size() * shard->num_pages() / sizeof(DType)); diff --git a/tt_metal/api/tt-metalium/mesh_buffer.hpp b/tt_metal/api/tt-metalium/mesh_buffer.hpp index 8656fc02e67..6ae394538ef 100644 --- a/tt_metal/api/tt-metalium/mesh_buffer.hpp +++ b/tt_metal/api/tt-metalium/mesh_buffer.hpp @@ -96,7 +96,6 @@ class MeshBuffer { const ShardedBufferConfig& global_shard_spec() const; const DeviceLocalBufferConfig& device_local_config() const { return device_local_config_; } - std::shared_ptr get_device_buffer(const Coordinate& device_coord) const; std::shared_ptr get_device_buffer(const MeshCoordinate& device_coord) const; uint32_t datum_size_bytes() const; Shape2D physical_shard_shape() const; diff --git a/tt_metal/api/tt-metalium/mesh_command_queue.hpp b/tt_metal/api/tt-metalium/mesh_command_queue.hpp index 
11ca2ab65e8..aa3cbf3b414 100644 --- a/tt_metal/api/tt-metalium/mesh_command_queue.hpp +++ b/tt_metal/api/tt-metalium/mesh_command_queue.hpp @@ -66,7 +66,7 @@ class MeshCommandQueue { // Specifies host data to be written to or read from a MeshBuffer shard. struct ShardDataTransfer { - Coordinate shard_coord; + MeshCoordinate shard_coord; void* host_data = nullptr; std::optional region; }; diff --git a/tt_metal/api/tt-metalium/mesh_config.hpp b/tt_metal/api/tt-metalium/mesh_config.hpp index a37111f076e..e14440da1d3 100644 --- a/tt_metal/api/tt-metalium/mesh_config.hpp +++ b/tt_metal/api/tt-metalium/mesh_config.hpp @@ -7,6 +7,8 @@ #include #include +#include "mesh_coord.hpp" + namespace tt::tt_metal::distributed { using DeviceIds = std::vector; @@ -38,8 +40,8 @@ struct MeshShape { */ struct MeshDeviceConfig { - MeshShape mesh_shape{0, 0}; - MeshOffset offset{0, 0}; + SimpleMeshShape mesh_shape{0, 0}; + std::optional offset; std::vector physical_device_ids{}; }; diff --git a/tt_metal/api/tt-metalium/mesh_coord.hpp b/tt_metal/api/tt-metalium/mesh_coord.hpp index 5160bdb745f..9dd3292de1d 100644 --- a/tt_metal/api/tt-metalium/mesh_coord.hpp +++ b/tt_metal/api/tt-metalium/mesh_coord.hpp @@ -8,6 +8,7 @@ #include #include +#include "assert.hpp" #include "shape_base.hpp" #include "utils.hpp" @@ -98,10 +99,16 @@ class MeshCoordinateRange { // Constructs a range that iterates over all coordinates in the mesh. MeshCoordinateRange(const SimpleMeshShape& shape); + // Returns the dimensionality of the range. + size_t dims() const; + // Returns start and (inclusive) end coordinates of the range. const MeshCoordinate& start_coord() const; const MeshCoordinate& end_coord() const; + // Returns true if the range contains the given coordinate. + bool contains(const MeshCoordinate& coord) const; + class Iterator { public: Iterator& operator++(); @@ -186,10 +193,14 @@ template class MeshContainer { public: MeshContainer(const SimpleMeshShape& shape, const T& fill_value); + MeshContainer(const SimpleMeshShape& shape, std::vector values); // Returns a shape of the container. const SimpleMeshShape& shape() const; + // Returns (inclusive) range of coordinates in the container. + const MeshCoordinateRange& coord_range() const; + // Accessor methods. 
T& at(const MeshCoordinate& coord); const T& at(const MeshCoordinate& coord) const; @@ -252,6 +263,11 @@ class MeshContainer { std::vector& values() { return values_; } const std::vector& values() const { return values_; } + friend bool operator==(const MeshContainer& lhs, const MeshContainer& rhs) { + return lhs.shape() == rhs.shape() && lhs.coord_range() == rhs.coord_range() && lhs.values() == rhs.values(); + } + friend bool operator!=(const MeshContainer& lhs, const MeshContainer& rhs) { return !(lhs == rhs); } + private: SimpleMeshShape shape_; MeshCoordinateRange coord_range_; @@ -262,11 +278,26 @@ template MeshContainer::MeshContainer(const SimpleMeshShape& shape, const T& fill_value) : shape_(shape), coord_range_(shape), values_(shape.mesh_size(), fill_value) {} +template +MeshContainer::MeshContainer(const SimpleMeshShape& shape, std::vector values) : + shape_(shape), coord_range_(shape), values_(std::move(values)) { + TT_FATAL( + shape.mesh_size() == values_.size(), + "Shape and values size mismatch; shape: {}, values: {}", + shape, + values.size()); +} + template const SimpleMeshShape& MeshContainer::shape() const { return shape_; } +template +const MeshCoordinateRange& MeshContainer::coord_range() const { + return coord_range_; +} + template T& MeshContainer::at(const MeshCoordinate& coord) { return values_.at(to_linear_index(shape_, coord)); diff --git a/tt_metal/api/tt-metalium/mesh_device_view.hpp b/tt_metal/api/tt-metalium/mesh_device_view.hpp index fbadc8f32c2..99ed59b3607 100644 --- a/tt_metal/api/tt-metalium/mesh_device_view.hpp +++ b/tt_metal/api/tt-metalium/mesh_device_view.hpp @@ -13,32 +13,14 @@ #include "device.hpp" #include "mesh_config.hpp" +#include "mesh_coord.hpp" +#include "shape2d.hpp" namespace tt::tt_metal::distributed { // Forward declaration of MeshDevice class MeshDevice; -struct Coordinate { - size_t row = 0; - size_t col = 0; - auto operator<=>(const Coordinate&) const = default; - - // Add support for structured bindings - template - decltype(auto) get() const { - if constexpr (I == 0) { - return row; - } else if constexpr (I == 1) { - return col; - } else { - static_assert(I < 2, "Index out of bounds for Coordinate"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const Coordinate& coord) { - return os << "Coord(" << coord.row << ", " << coord.col << ")"; - } -}; + // TODO (Issue #17477): MeshWorkload and MeshEvent currently rely on the coordinate systems // exposed below. These must be uplifted to an ND coordinate system (DeviceCoord and DeviceRange), // keeping things more consistent across the stack. @@ -70,45 +52,49 @@ class MeshDeviceView { public: using DeviceView = std::vector; using DeviceViews = std::vector>; - using CoordinateMapper = std::function(int device_id)>; - MeshDeviceView(const std::vector& devices, const MeshShape& shape); - MeshDeviceView(const std::vector& devices, Coordinate top_left, Coordinate bottom_right); - MeshDeviceView(const MeshDevice& mesh_device); - MeshDeviceView(const std::vector& devices, const CoordinateMapper& mapper); + // Create a view of the entire mesh. + // MeshDeviceView(const MeshDevice& mesh_device); - [[nodiscard]] IDevice* get_device(size_t row, size_t col) const; + // // Create a view of a sub-region of the mesh defined by `range`. 
+ // MeshDeviceView(const std::vector& devices, const MeshCoordinateRange& range); + explicit MeshDeviceView(const MeshContainer& devices); + explicit MeshDeviceView(const MeshDevice& mesh_device); - // Get devices spanning the rectangular region defined by the top-left and bottom-right coordinates - // devices are returned in row-major order with start/end coordinates inclusive - [[nodiscard]] DeviceView get_devices(const Coordinate& start, const Coordinate& end) const; - [[nodiscard]] DeviceView get_devices(const MeshShape& submesh_shape) const; + // Get devices spanning the region defined by `range` in row-major order with start/end coordinates inclusive + [[nodiscard]] DeviceView get_devices(const MeshCoordinateRange& range) const; + [[nodiscard]] DeviceView get_devices(const SimpleMeshShape& submesh_shape) const; [[nodiscard]] DeviceView get_devices() const; - - [[nodiscard]] DeviceView get_devices_on_row(size_t row) const; - [[nodiscard]] DeviceView get_devices_on_column(size_t col) const; - - [[nodiscard]] DeviceViews get_row_views() const; - [[nodiscard]] DeviceViews get_column_views() const; + [[nodiscard]] size_t num_devices() const; [[nodiscard]] bool empty() const noexcept; [[nodiscard]] size_t size() const noexcept; - [[nodiscard]] MeshShape shape() const noexcept; - [[nodiscard]] bool contains(const Coordinate& coord) const noexcept; - [[nodiscard]] const IDevice* at(const Coordinate& coord) const noexcept; + [[nodiscard]] SimpleMeshShape shape() const noexcept; + [[nodiscard]] bool contains(const MeshCoordinate& coord) const noexcept; + [[nodiscard]] IDevice* get_device(const MeshCoordinate& coord) const; + [[nodiscard]] const IDevice* at(const MeshCoordinate& coord) const noexcept; bool operator==(const MeshDeviceView& other) const; - auto begin() const { return devices_.begin(); } - auto end() const { return devices_.end(); } - - [[nodiscard]] size_t num_rows() const { return bottom_right_.row - top_left_.row + 1; } - [[nodiscard]] size_t num_cols() const { return bottom_right_.col - top_left_.col + 1; } - [[nodiscard]] size_t num_devices() const { return devices_.size(); } + auto begin() const { return devices_.values().begin(); } + auto end() const { return devices_.values().end(); } [[nodiscard]] bool contains_device(chip_id_t device_id) const; - [[nodiscard]] Coordinate find_device(chip_id_t device_id) const; - [[nodiscard]] chip_id_t find_device_id(const Coordinate& coord) const; + + // Throws if no device corresponds to `device_id`. + [[nodiscard]] MeshCoordinate find_device(chip_id_t device_id) const; + + // Throws if the `coord` is out of bounds of this view. + [[nodiscard]] chip_id_t find_device_id(const MeshCoordinate& coord) const; + + // TODO: Remove the methods that assume 2D mesh. + [[nodiscard]] bool is_mesh_2d() const; + [[nodiscard]] size_t num_rows() const; + [[nodiscard]] size_t num_cols() const; + [[nodiscard]] DeviceView get_devices_on_row(size_t row) const; + [[nodiscard]] DeviceView get_devices_on_column(size_t col) const; + [[nodiscard]] DeviceViews get_row_views() const; + [[nodiscard]] DeviceViews get_column_views() const; // These utility methods linearize the set of devices in a mesh into a line or ring. // Linearizing a mesh into a line asserts the condition that device[i-1] is connected to device[i]. 
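The reworked `MeshDeviceView` surface above can likewise be summarized with a usage sketch. This is not part of the patch: it assumes a `MeshContainer<IDevice*>` populated by the caller and a 2D mesh, with method names taken from the declarations above.

```cpp
using namespace tt::tt_metal::distributed;

// `devices` is a MeshContainer<IDevice*> built elsewhere (e.g. by MeshDevice).
void view_sketch(const MeshContainer<IDevice*>& devices) {
    MeshDeviceView view(devices);

    // Devices over a sub-range, returned in row-major order with inclusive bounds.
    MeshCoordinateRange range(MeshCoordinate(0, 0), MeshCoordinate(1, 1));
    auto corner_devices = view.get_devices(range);

    // Coordinate <-> device-id lookups; both throw on unknown inputs.
    MeshCoordinate coord = view.find_device(corner_devices.front()->id());
    chip_id_t id = view.find_device_id(coord);

    // Row/column helpers remain available, but only when the view is 2D.
    if (view.is_mesh_2d()) {
        auto first_row = view.get_devices_on_row(0);
        (void)first_row;
    }
    (void)id;
}
```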
@@ -117,47 +103,21 @@ class MeshDeviceView { // // Given a starting coordinate, get the coordinates of a line of devices where device[i-1] is connected to device[i] // The current support only provides left-to-right and right-to-left snaking of the line. - [[nodiscard]] static std::vector get_line_coordinates( - size_t length, const Coordinate& offset, size_t num_rows, size_t num_cols); - [[nodiscard]] std::vector get_ring_coordinates( - const MeshShape& ring_shape, const Coordinate& offset, size_t num_rows, size_t num_cols) const; + // + // Important: these utilities currently only support 2D meshes. + [[nodiscard]] static std::vector get_line_coordinates(size_t length, const Shape2D& mesh_shape); + [[nodiscard]] static std::vector get_ring_coordinates( + const Shape2D& ring_shape, const Shape2D& mesh_shape); [[nodiscard]] std::vector get_ring_devices() const; [[nodiscard]] std::vector get_line_devices() const; private: - std::vector devices_; - std::unordered_map device_coordinates_; - Coordinate top_left_; - Coordinate bottom_right_; + MeshContainer devices_; + std::unordered_map device_coordinates_; - void initialize_from_devices(const std::vector& devices, const CoordinateMapper& mapper); - void validate_coordinates() const; + // Set if the view is 2D to enable row/col APIs, otherwise nullopt. + // TODO: remove this? + std::optional shape_2d_; }; -// Helper function to create a MeshDeviceView -inline MeshDeviceView make_mesh_device_view(std::vector devices, MeshDeviceView::CoordinateMapper mapper) { - return MeshDeviceView(std::move(devices), std::move(mapper)); -} - } // namespace tt::tt_metal::distributed - -namespace std { -// Specializations to enable structured bindings -template <> -struct tuple_size : std::integral_constant {}; -template -struct tuple_element { - using type = size_t; -}; - -// Specialization to enable hashing of Coordinate -template <> -struct hash { - size_t operator()(const tt::tt_metal::distributed::Coordinate& coord) const noexcept { - size_t seed = 0; - tt::utils::hash_combine(seed, coord.row); - tt::utils::hash_combine(seed, coord.col); - return seed; - } -}; -} // namespace std diff --git a/tt_metal/common/mesh_coord.cpp b/tt_metal/common/mesh_coord.cpp index 9a98a0ce801..19dab608c35 100644 --- a/tt_metal/common/mesh_coord.cpp +++ b/tt_metal/common/mesh_coord.cpp @@ -105,9 +105,20 @@ MeshCoordinateRange::MeshCoordinateRange(const MeshCoordinate& start, const Mesh MeshCoordinateRange::MeshCoordinateRange(const SimpleMeshShape& shape) : MeshCoordinateRange(zero_coordinate(shape.dims()), shape_back(shape)) {} +size_t MeshCoordinateRange::dims() const { return start_.dims(); } const MeshCoordinate& MeshCoordinateRange::start_coord() const { return start_; } const MeshCoordinate& MeshCoordinateRange::end_coord() const { return end_; } +bool MeshCoordinateRange::contains(const MeshCoordinate& coord) const { + TT_FATAL(coord.dims() == dims(), "Coordinate dimensions do not match: {} != {}", coord.dims(), dims()); + for (int i = 0; i < coord.dims(); ++i) { + if (coord[i] < start_[i] || coord[i] > end_[i]) { + return false; + } + } + return true; +} + MeshCoordinateRange::Iterator::Iterator( const MeshCoordinateRange* range, const MeshCoordinate& current, size_t linear_index) : range_(range), current_coord_(current), linear_index_(linear_index) {} @@ -143,6 +154,11 @@ MeshCoordinateRange::Iterator MeshCoordinateRange::end() const { return Iterator(this, start_, range_size); } +bool operator==(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs) { + 
return lhs.start_coord() == rhs.start_coord() && lhs.end_coord() == rhs.end_coord(); +} +bool operator!=(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs) { return !(lhs == rhs); } + size_t to_linear_index(const SimpleMeshShape& shape, const MeshCoordinate& coord) { TT_FATAL( shape.dims() == coord.dims(), diff --git a/tt_metal/distributed/mesh_buffer.cpp b/tt_metal/distributed/mesh_buffer.cpp index 13d1fc5e6cc..9ed3f95627c 100644 --- a/tt_metal/distributed/mesh_buffer.cpp +++ b/tt_metal/distributed/mesh_buffer.cpp @@ -134,10 +134,6 @@ bool MeshBuffer::is_allocated() const { return not std::holds_alternative MeshBuffer::get_device_buffer(const Coordinate& device_coord) const { - return get_device_buffer(MeshCoordinate(device_coord.row, device_coord.col)); -} - std::shared_ptr MeshBuffer::get_device_buffer(const MeshCoordinate& device_coord) const { return buffers_.at(device_coord); } diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index e60010e150a..415e5418210 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -271,7 +271,7 @@ void MeshCommandQueue::write_sharded_buffer(const MeshBuffer& buffer, const void for (std::size_t replicated_device_y = 0; replicated_device_y < num_devices_y; replicated_device_y++) { auto device_shard_view = - buffer.get_device_buffer(Coordinate(replicated_device_y, replicated_device_x)); + buffer.get_device_buffer(MeshCoordinate(replicated_device_y, replicated_device_x)); const BufferRegion region(0, device_shard_view->size()); this->write_shard_to_device(device_shard_view, shard_data.data(), region); } @@ -279,21 +279,23 @@ void MeshCommandQueue::write_sharded_buffer(const MeshBuffer& buffer, const void } else if (height_replicated or width_replicated) { if (buffer.global_shard_spec().shard_orientation == ShardOrientation::ROW_MAJOR) { for (auto replicated_device_y = 0; replicated_device_y < num_devices_y; replicated_device_y++) { - auto device_shard_view = buffer.get_device_buffer(Coordinate(replicated_device_y, device_x)); + auto device_shard_view = + buffer.get_device_buffer(MeshCoordinate(replicated_device_y, device_x)); const BufferRegion region(0, device_shard_view->size()); this->write_shard_to_device(device_shard_view, shard_data.data(), region); } device_x++; } else { for (auto replicated_device_x = 0; replicated_device_x < num_devices_x; replicated_device_x++) { - auto device_shard_view = buffer.get_device_buffer(Coordinate(device_y, replicated_device_x)); + auto device_shard_view = + buffer.get_device_buffer(MeshCoordinate(device_y, replicated_device_x)); const BufferRegion region(0, device_shard_view->size()); this->write_shard_to_device(device_shard_view, shard_data.data(), region); } device_y++; } } else { - auto device_shard_view = buffer.get_device_buffer(Coordinate(device_y, device_x)); + auto device_shard_view = buffer.get_device_buffer(MeshCoordinate(device_y, device_x)); const BufferRegion region(0, device_shard_view->size()); this->write_shard_to_device(device_shard_view, shard_data.data(), region); if (buffer.global_shard_spec().shard_orientation == ShardOrientation::ROW_MAJOR) { @@ -334,7 +336,7 @@ void MeshCommandQueue::read_sharded_buffer(MeshBuffer& buffer, void* dst) { std::vector shard_data = std::vector(total_write_size_per_shard / sizeof(uint32_t), 0); for (std::size_t shard_y = 0; shard_y < num_shards_y; shard_y++) { for (std::size_t shard_x = 0; shard_x < num_shards_x; shard_x++) { - auto device_shard_view 
= buffer.get_device_buffer(Coordinate(device_y, device_x)); + auto device_shard_view = buffer.get_device_buffer(MeshCoordinate(device_y, device_x)); const BufferRegion region(0, device_shard_view->size()); this->read_shard_from_device(device_shard_view, shard_data.data(), region); @@ -371,7 +373,7 @@ void MeshCommandQueue::enqueue_write_shard_to_sub_grid( logical_x++) { for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y + 1; logical_y++) { - auto device_shard_view = buffer.get_device_buffer(Coordinate(logical_y, logical_x)); + auto device_shard_view = buffer.get_device_buffer(MeshCoordinate(logical_y, logical_x)); const BufferRegion region(0, device_shard_view->size()); this->write_shard_to_device(device_shard_view, host_data, region); } diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 5a693b152ae..7b90778d157 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -80,7 +80,7 @@ MeshDevice::ScopedDevices::ScopedDevices( physical_device_ids.size() == devices_.shape().mesh_size(), "Device size mismatch; expected: {}, actual: {}", devices_.shape().mesh_size(), - opened_devices_.size()); + physical_device_ids.size()); auto it = devices_.begin(); for (auto physical_device_id : physical_device_ids) { @@ -135,10 +135,13 @@ std::shared_ptr MeshDevice::create( size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, tt::stl::Span l1_bank_remap) { + // TODO: #17477 Extend to ND. + TT_FATAL(config.mesh_shape.dims() == 2, "Mesh shape must be 2D"); + auto mesh_shape_2d = MeshShape{config.mesh_shape[0], config.mesh_shape[1]}; auto mesh_device = std::make_shared( std::make_shared( l1_small_size, trace_region_size, num_command_queues, dispatch_core_config, config), - config.mesh_shape); + mesh_shape_2d); mesh_device->initialize(num_command_queues, l1_small_size, trace_region_size, l1_bank_remap); return mesh_device; @@ -169,11 +172,14 @@ std::shared_ptr MeshDevice::create_submesh(const MeshShape& submesh_ } auto submesh = std::make_shared(scoped_devices_, submesh_shape, shared_from_this()); - auto start_coordinate = Coordinate{offset.row, offset.col}; - auto end_coordinate = Coordinate{offset.row + submesh_shape.num_rows - 1, offset.col + submesh_shape.num_cols - 1}; + auto start_coordinate = MeshCoordinate{offset.row, offset.col}; + auto end_coordinate = + MeshCoordinate{offset.row + submesh_shape.num_rows - 1, offset.col + submesh_shape.num_cols - 1}; - auto submesh_devices = view_->get_devices(start_coordinate, end_coordinate); - submesh->view_ = std::make_unique(submesh_devices, submesh_shape); + MeshContainer submesh_devices_container( + submesh_shape, view_->get_devices(MeshCoordinateRange{start_coordinate, end_coordinate})); + + submesh->view_ = std::make_unique(submesh_devices_container); submeshes_.push_back(submesh); log_trace( LogMetal, @@ -311,8 +317,11 @@ void MeshDevice::reshape(const MeshShape& new_shape) { new_shape.num_rows * new_shape.num_cols == this->num_devices(), "New shape must have the same number of devices as current shape"); + MeshContainer devices(new_shape, this->get_row_major_devices(new_shape)); + auto new_view = std::make_unique(devices); + mesh_shape_ = new_shape; - view_ = std::make_unique(this->get_row_major_devices(new_shape), new_shape); + view_ = std::move(new_view); } bool MeshDevice::close() { @@ -601,7 +610,8 @@ bool MeshDevice::initialize( size_t trace_region_size, tt::stl::Span l1_bank_remap, bool minimal) { - view_ = 
std::make_unique(scoped_devices_->get_devices(), mesh_shape_); + MeshContainer devices(mesh_shape_, scoped_devices_->get_devices()); + view_ = std::make_unique(devices); // For MeshDevice, we support uniform sub-devices across all devices and we do not support ethernet subdevices. const auto& compute_grid_size = this->compute_with_storage_grid_size(); diff --git a/tt_metal/distributed/mesh_device_view.cpp b/tt_metal/distributed/mesh_device_view.cpp index 883b9a38ebb..64b80167f31 100644 --- a/tt_metal/distributed/mesh_device_view.cpp +++ b/tt_metal/distributed/mesh_device_view.cpp @@ -7,189 +7,146 @@ #include #include +#include "buffer.hpp" +#include "mesh_coord.hpp" +#include "shape2d.hpp" namespace tt::tt_metal::distributed { +namespace { -static std::vector get_devices_from_coordinates( - const MeshDeviceView& mesh, const std::vector& coords) { +std::vector get_devices_from_coordinates( + const MeshDeviceView& mesh, const std::vector& coords) { std::vector devices; for (const auto& coord : coords) { - if (auto device = mesh.get_device(coord.row, coord.col)) { + if (auto device = mesh.get_device(coord)) { devices.push_back(device); } } return devices; } -MeshDeviceView::MeshDeviceView(const std::vector& devices, Coordinate top_left, Coordinate bottom_right) : - top_left_(0, 0), bottom_right_(Coordinate{bottom_right.row - top_left.row, bottom_right.col - top_left.col}) { - auto num_rows = bottom_right.row - top_left.row + 1; - auto num_cols = bottom_right.col - top_left.col + 1; - - for (size_t row = top_left.row; row <= bottom_right.row; ++row) { - for (size_t col = top_left.col; col <= bottom_right.col; ++col) { - auto device_index = row * num_cols + col; - TT_FATAL(device_index < devices.size(), "Device index out of bounds"); - auto device = devices[device_index]; - devices_.push_back(device); - device_coordinates_[device->id()] = {row - top_left.row, col - top_left.col}; - } - } - validate_coordinates(); -} - -MeshDeviceView::MeshDeviceView(const MeshDevice& mesh_device) : - MeshDeviceView(mesh_device.get_devices(), mesh_device.shape()) {} - -MeshDeviceView::MeshDeviceView(const std::vector& devices, const MeshShape& shape) : - MeshDeviceView(devices, Coordinate{0, 0}, Coordinate{shape.num_rows - 1, shape.num_cols - 1}) {} - -MeshDeviceView::MeshDeviceView(const std::vector& devices, const CoordinateMapper& mapper) : - devices_(std::move(devices)) { - initialize_from_devices(devices_, std::move(mapper)); -} +} // namespace -IDevice* MeshDeviceView::get_device(size_t row, size_t col) const { - for (const auto& device : devices_) { - auto it = device_coordinates_.find(device->id()); - if (it != device_coordinates_.end() && it->second.row == row && it->second.col == col) { - return device; - } +MeshDeviceView::MeshDeviceView(const MeshContainer& devices) : devices_(devices) { + if (devices_.shape().dims() == 2) { + shape_2d_ = Shape2D(devices_.shape()[0], devices_.shape()[1]); + } + for (const auto& [coord, device] : devices_) { + device_coordinates_.emplace(device->id(), coord); } - return nullptr; } -MeshDeviceView::DeviceView MeshDeviceView::get_devices(const Coordinate& start, const Coordinate& end) const { - if (start.row > end.row || start.col > end.col) { - log_fatal("Invalid coordinates: start {} must be less than or equal to end {}", start, end); - } +MeshDeviceView::MeshDeviceView(const MeshDevice& mesh_device) : + MeshDeviceView(MeshContainer(SimpleMeshShape(mesh_device.shape()), mesh_device.get_devices())) {} +MeshDeviceView::DeviceView MeshDeviceView::get_devices(const 
MeshCoordinateRange& range) const { DeviceView devices_in_region; - for (size_t row = start.row; row <= end.row; ++row) { - for (size_t col = start.col; col <= end.col; ++col) { - if (auto device = get_device(row, col)) { - devices_in_region.push_back(device); - } - } + for (const auto& coord : range) { + devices_in_region.push_back(devices_.at(coord)); } return devices_in_region; } -MeshDeviceView::DeviceView MeshDeviceView::get_devices(const MeshShape& submesh_shape) const { - return get_devices({0, 0}, {submesh_shape.num_rows - 1, submesh_shape.num_cols - 1}); +MeshDeviceView::DeviceView MeshDeviceView::get_devices(const SimpleMeshShape& submesh_shape) const { + return get_devices(MeshCoordinateRange(submesh_shape)); } std::vector MeshDeviceView::get_devices_on_row(size_t row) const { + TT_FATAL(shape_2d_.has_value(), "MeshDeviceView is not 2D!"); + TT_FATAL(row < shape_2d_->height(), "Row index out of bounds: {}", row); std::vector row_devices; - for (const auto& device : devices_) { - auto it = device_coordinates_.find(device->id()); - if (it != device_coordinates_.end() && it->second.row == row) { - row_devices.push_back(device); - } + for (int col = 0; col < shape_2d_->width(); ++col) { + row_devices.push_back(devices_.at(MeshCoordinate(row, col))); } return row_devices; } std::vector MeshDeviceView::get_devices_on_column(size_t col) const { + TT_FATAL(shape_2d_.has_value(), "MeshDeviceView is not 2D!"); + TT_FATAL(col < shape_2d_->width(), "Column index out of bounds: {}", col); std::vector col_devices; - for (const auto& device : devices_) { - auto it = device_coordinates_.find(device->id()); - if (it != device_coordinates_.end() && it->second.col == col) { - col_devices.push_back(device); - } + for (int row = 0; row < shape_2d_->height(); ++row) { + col_devices.push_back(devices_.at(MeshCoordinate(row, col))); } return col_devices; } std::vector> MeshDeviceView::get_row_views() const { + TT_FATAL(shape_2d_.has_value(), "MeshDeviceView is not 2D!"); std::vector> row_views; - for (size_t row = top_left_.row; row <= bottom_right_.row; ++row) { + for (size_t row = 0; row < shape_2d_->height(); ++row) { row_views.push_back(get_devices_on_row(row)); } return row_views; } std::vector> MeshDeviceView::get_column_views() const { + TT_FATAL(shape_2d_.has_value(), "MeshDeviceView is not 2D!"); std::vector> column_views; - for (size_t col = top_left_.col; col <= bottom_right_.col; ++col) { + for (size_t col = 0; col < shape_2d_->width(); ++col) { column_views.push_back(get_devices_on_column(col)); } return column_views; } -bool MeshDeviceView::empty() const noexcept { return devices_.empty(); } - -size_t MeshDeviceView::size() const noexcept { return devices_.size(); } +bool MeshDeviceView::empty() const noexcept { return devices_.shape().mesh_size() == 0; } +size_t MeshDeviceView::size() const noexcept { return devices_.shape().mesh_size(); } +SimpleMeshShape MeshDeviceView::shape() const noexcept { return devices_.shape(); } -MeshShape MeshDeviceView::shape() const noexcept { return {num_rows(), num_cols()}; } - -bool MeshDeviceView::contains(const Coordinate& coord) const noexcept { - return coord.row >= top_left_.row && coord.row <= bottom_right_.row && coord.col >= top_left_.col && - coord.col <= bottom_right_.col; +bool MeshDeviceView::contains(const MeshCoordinate& coord) const noexcept { + return devices_.coord_range().contains(coord); } -const IDevice* MeshDeviceView::at(const Coordinate& coord) const noexcept { - if (contains(coord)) { - return get_device(coord.row, coord.col); - } 
- return nullptr; +IDevice* MeshDeviceView::get_device(const MeshCoordinate& coord) const { + return contains(coord) ? devices_.at(coord) : nullptr; +} +const IDevice* MeshDeviceView::at(const MeshCoordinate& coord) const noexcept { + return contains(coord) ? devices_.at(coord) : nullptr; } bool MeshDeviceView::operator==(const MeshDeviceView& other) const { return devices_ == other.devices_ && device_coordinates_ == other.device_coordinates_ && - top_left_ == other.top_left_ && bottom_right_ == other.bottom_right_; + shape_2d_ == other.shape_2d_; +} + +size_t MeshDeviceView::num_rows() const { + TT_FATAL(shape_2d_.has_value(), "MeshDeviceView is not 2D!"); + return shape_2d_->height(); +} +size_t MeshDeviceView::num_cols() const { + TT_FATAL(shape_2d_.has_value(), "MeshDeviceView is not 2D!"); + return shape_2d_->width(); } +size_t MeshDeviceView::num_devices() const { return devices_.shape().mesh_size(); } bool MeshDeviceView::contains_device(chip_id_t device_id) const { return device_coordinates_.find(device_id) != device_coordinates_.end(); } -Coordinate MeshDeviceView::find_device(chip_id_t device_id) const { +MeshCoordinate MeshDeviceView::find_device(chip_id_t device_id) const { auto it = device_coordinates_.find(device_id); - if (it != device_coordinates_.end()) { - return it->second; - } - TT_THROW("Device not found in mesh: {}", device_id); + TT_FATAL(it != device_coordinates_.end(), "Device not found in mesh: {}", device_id); + return it->second; } -chip_id_t MeshDeviceView::find_device_id(const Coordinate& coord) const { - TT_FATAL( - coord.row >= 0 and coord.row < num_rows() and coord.col >= 0 and coord.col < num_cols(), - "Invalid coordinate: ({}, {})", - coord.row, - coord.col); - return this->devices_.at(coord.row * num_cols() + coord.col)->id(); +chip_id_t MeshDeviceView::find_device_id(const MeshCoordinate& coord) const { + TT_FATAL(contains(coord), "Coordinate {} not found in mesh {}", coord, devices_.shape()); + return devices_.at(coord)->id(); } -void MeshDeviceView::initialize_from_devices(const std::vector& devices, const CoordinateMapper& mapper) { - size_t min_row = std::numeric_limits::max(), min_col = std::numeric_limits::max(); - size_t max_row = std::numeric_limits::min(), max_col = std::numeric_limits::min(); - - for (const auto& device : devices) { - auto coord = mapper(device->id()); - if (!coord) { - throw std::runtime_error("Failed to map device ID to coordinate"); - } - - device_coordinates_[device->id()] = *coord; - min_row = std::min(min_row, coord->row); - min_col = std::min(min_col, coord->col); - max_row = std::max(max_row, coord->row); - max_col = std::max(max_col, coord->col); - } - - top_left_ = {min_row, min_col}; - bottom_right_ = {max_row, max_col}; -} +bool MeshDeviceView::is_mesh_2d() const { return shape_2d_.has_value(); } -std::vector MeshDeviceView::get_line_coordinates( - size_t length, const Coordinate& offset, size_t num_rows, size_t num_cols) { - std::vector line_coords; - auto [row_index, col_index] = offset; +std::vector MeshDeviceView::get_line_coordinates(size_t length, const Shape2D& mesh_shape) { + // Iterate in a zigzag pattern from top-left to bottom-right. 
+ std::vector line_coords; + line_coords.reserve(length); + const auto [num_rows, num_cols] = mesh_shape; + int row_index = 0; + int col_index = 0; bool left_to_right = true; for (size_t i = 0; i < length && row_index < num_rows && col_index < num_cols; ++i) { - line_coords.emplace_back(Coordinate{row_index, col_index}); + line_coords.emplace_back(MeshCoordinate(row_index, col_index)); if (left_to_right && col_index < num_cols - 1) { col_index++; @@ -205,62 +162,55 @@ std::vector MeshDeviceView::get_line_coordinates( return line_coords; } -std::vector MeshDeviceView::get_ring_coordinates( - const MeshShape& ring_shape, const Coordinate& offset, size_t num_rows, size_t num_cols) const { - auto [start_row, start_col] = offset; - auto [ring_rows, ring_cols] = ring_shape; - auto end_row = start_row + ring_rows - 1; - auto end_col = start_col + ring_cols - 1; +std::vector MeshDeviceView::get_ring_coordinates(const Shape2D& ring_shape, const Shape2D& mesh_shape) { + const auto [ring_rows, ring_cols] = ring_shape; + const auto end_row = ring_rows - 1; + const auto end_col = ring_cols - 1; // Validate the specified subgrid - std::vector boundary_coords; - if (start_row + ring_rows > num_rows || start_col + ring_cols > num_cols) { - throw std::invalid_argument("Subgrid is out of mesh bounds."); + std::vector boundary_coords; + if (ring_rows > mesh_shape.height() || ring_cols > mesh_shape.width()) { + TT_THROW("Subgrid is out of mesh bounds."); } // Traverse the top row from left to right - for (size_t col = start_col; col <= end_col; ++col) { - boundary_coords.emplace_back(Coordinate{start_row, col}); + for (size_t col = 0; col <= end_col; ++col) { + boundary_coords.emplace_back(MeshCoordinate{0, col}); } // Traverse the rightmost column from top+1 to bottom - for (size_t row = start_row + 1; row <= end_row; ++row) { - boundary_coords.emplace_back(Coordinate{row, end_col}); + for (size_t row = 1; row <= end_row; ++row) { + boundary_coords.emplace_back(MeshCoordinate{row, end_col}); } // Traverse the bottom row from right to left, if there is more than one row if (ring_rows > 1 and ring_cols > 1) { // Traverse the bottom row from right to left - for (int col = static_cast(end_col - 1); col >= static_cast(start_col); --col) { - boundary_coords.emplace_back(Coordinate{end_row, static_cast(col)}); + for (int col = static_cast(end_col - 1); col >= 0; --col) { + boundary_coords.emplace_back(MeshCoordinate{end_row, static_cast(col)}); } // Traverse the leftmost column from bottom-1 to top+1 - for (int row = static_cast(end_row - 1); row > static_cast(start_row); --row) { - boundary_coords.emplace_back(Coordinate{static_cast(row), start_col}); + for (int row = static_cast(end_row - 1); row > 0; --row) { + boundary_coords.emplace_back(MeshCoordinate{static_cast(row), 0}); } } return boundary_coords; } -void MeshDeviceView::validate_coordinates() const { - if (top_left_.row > bottom_right_.row || top_left_.col > bottom_right_.col) { - throw std::invalid_argument("Invalid coordinates: top_left must be less than or equal to bottom_right"); - } -} - std::vector MeshDeviceView::get_line_devices() const { - auto boundary_coords = - get_line_coordinates(this->num_rows() * this->num_cols(), this->top_left_, this->num_rows(), this->num_cols()); + TT_FATAL(shape_2d_.has_value(), "MeshDeviceView is not 2D!"); + auto boundary_coords = get_line_coordinates(devices_.shape().mesh_size(), *shape_2d_); return get_devices_from_coordinates(*this, boundary_coords); } std::vector MeshDeviceView::get_ring_devices() const { - 
auto boundary_coords = get_ring_coordinates(shape(), this->top_left_, this->num_rows(), this->num_cols()); + TT_FATAL(shape_2d_.has_value(), "MeshDeviceView is not 2D!"); + auto boundary_coords = get_ring_coordinates(*shape_2d_, *shape_2d_); return get_devices_from_coordinates(*this, boundary_coords); } -MeshDeviceView::DeviceView MeshDeviceView::get_devices() const { return this->devices_; } +MeshDeviceView::DeviceView MeshDeviceView::get_devices() const { return this->devices_.values(); } } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index 20d912a3b1a..b2eff3b89d2 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -4,6 +4,7 @@ #include +#include "small_vector.hpp" #include "umd/device/types/cluster_descriptor_types.h" #include "tt_metal/distributed/coordinate_translation.hpp" @@ -89,34 +90,45 @@ chip_id_t SystemMesh::Impl::get_physical_device_id(const MeshCoordinate& coord) std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const MeshDeviceConfig& config) const { std::vector physical_device_ids; - // TODO: #17477 - Extend to ND. + TT_FATAL( - logical_mesh_shape_.dims() == 2, - "SystemMesh only supports 2D meshes; requested dimensions: {}", - logical_mesh_shape_.dims()); + config.mesh_shape.mesh_size() <= logical_mesh_shape_.mesh_size(), + "Requested mesh is too big: {}, SystemMesh {}", + config.mesh_shape.mesh_size(), + logical_mesh_shape_.mesh_size()); - auto [system_mesh_rows, system_mesh_cols] = std::make_tuple(logical_mesh_shape_[0], logical_mesh_shape_[1]); - auto [requested_num_rows, requested_num_cols] = config.mesh_shape; - auto [row_offset, col_offset] = config.offset; + const size_t system_dimensions = logical_mesh_shape_.dims(); - // First check if total size fits - TT_FATAL( - requested_num_rows * requested_num_cols <= system_mesh_rows * system_mesh_cols, - "Requested submesh is too big: {}x{}, SystemMesh shape: {}x{}", - requested_num_rows, - requested_num_cols, - system_mesh_rows, - system_mesh_cols); - - bool is_single_row_or_column = requested_num_rows == 1 or requested_num_cols == 1; - if (is_single_row_or_column) { - TT_FATAL(row_offset == 0 and col_offset == 0, "Row and column offsets unsupported for single row mesh"); - auto line_length = requested_num_rows * requested_num_cols; - auto line_coords = MeshDeviceView::get_line_coordinates( - line_length, Coordinate{row_offset, col_offset}, system_mesh_rows, system_mesh_cols); - for (const auto& logical_coordinate : line_coords) { - auto physical_device_id = - logical_to_device_id_.at(MeshCoordinate(logical_coordinate.row, logical_coordinate.col)); + const MeshCoordinate system_offset = [&config, system_dimensions]() { + if (config.offset.has_value()) { + TT_FATAL( + config.offset->dims() == system_dimensions, + "Provided offset dimensions mismatch: {} != {}", + config.offset, + system_dimensions); + return *config.offset; + } else { + return MeshCoordinate(tt::stl::SmallVector(system_dimensions, 0)); + } + }(); + + const bool line_topology = [&config]() { + const int non_unit_dims = + std::count_if(config.mesh_shape.cbegin(), config.mesh_shape.cend(), [](int dim) { return dim != 1; }); + return non_unit_dims <= 1; + }(); + if (line_topology) { + TT_FATAL( + std::all_of(system_offset.coords().begin(), system_offset.coords().end(), [](int dim) { return dim == 0; }), + "Offsets are unsupported for a line mesh"); + + // TODO: consider if we can do this in 3D. 
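As a worked illustration of the 2D line traversal used by `get_line_devices()` above and by the line-topology path here in `SystemMesh`: tracing the loop for a 2x4 mesh gives the order below. This is derived from the code, not stated in the patch.

```cpp
// Expected zigzag order for length 8 on a 2x4 mesh:
//   (0,0) -> (0,1) -> (0,2) -> (0,3) -> (1,3) -> (1,2) -> (1,1) -> (1,0)
// Each coordinate is adjacent to the previous one, which is the property the
// line-topology device mapping relies on.
auto line = MeshDeviceView::get_line_coordinates(/*length=*/8, Shape2D(2, 4));
```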
+ TT_FATAL(logical_mesh_shape_.dims() == 2, "Line topology is only supported for 2D meshes"); + Shape2D shape_2d(logical_mesh_shape_[0], logical_mesh_shape_[1]); + + auto line_length = config.mesh_shape.mesh_size(); + for (const auto& logical_coordinate : MeshDeviceView::get_line_coordinates(line_length, shape_2d)) { + auto physical_device_id = logical_to_device_id_.at(logical_coordinate); physical_device_ids.push_back(physical_device_id); log_debug( @@ -124,96 +136,63 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me } return physical_device_ids; } - bool requires_rotation = requested_num_rows > system_mesh_rows || requested_num_cols > system_mesh_cols; - - if (requires_rotation) { - bool can_rotate = requested_num_rows <= system_mesh_cols && requested_num_cols <= system_mesh_rows; - if (can_rotate) { - // Rotate requested shape; row_offset and col_offset refer to original orientation - std::swap(requested_num_rows, requested_num_cols); - } else { - TT_THROW( - "User has requested a submesh that is too big and is not rotatable: {}x{} and SystemMesh is {}x{}.", - requested_num_rows, - requested_num_cols, - system_mesh_rows, - system_mesh_cols); - } - } else { - // If no rotation, check dimensions directly - TT_FATAL( - requested_num_rows <= system_mesh_rows && requested_num_cols <= system_mesh_cols, - "Requested submesh is too big: {}x{} and SystemMesh is {}x{}", - requested_num_rows, - requested_num_cols, - system_mesh_rows, - system_mesh_cols); - } - size_t original_rows = system_mesh_rows; - size_t original_cols = system_mesh_cols; - - // Check that offsets fit in the original mesh - TT_FATAL( - row_offset + requested_num_rows <= original_rows, - "Row offset + requested rows exceeds mesh size: {} + {} > {}", - row_offset, - requested_num_rows, - original_rows); TT_FATAL( - col_offset + requested_num_cols <= original_cols, - "Column offset + requested columns exceeds mesh size: {} + {} > {}", - col_offset, - requested_num_cols, - original_cols); - - // Map each submesh coordinate to the original logical coordinates - for (size_t row = 0; row < requested_num_rows; row++) { - for (size_t col = 0; col < requested_num_cols; col++) { - Coordinate logical_coordinate; - if (requires_rotation) { - // After swapping requested_num_rows and requested_num_cols, - // (row, col) now iterate over the rotated shape. - size_t old_row = row_offset + row; // top row - size_t old_col = col_offset + col; // increasing columns horizontally - logical_coordinate = Coordinate{old_row, old_col}; - } else { - logical_coordinate = Coordinate{row + row_offset, col + col_offset}; + config.mesh_shape.dims() == system_dimensions, + "Requested mesh shape dimensions mismatch: {} != {}", + config.mesh_shape, + logical_mesh_shape_); + + // Attempt to fit the requested mesh into the system mesh, potentially rotating it. + auto requested_mesh_fits = [this, &system_offset](const tt::stl::SmallVector& rotated_shape) { + for (int i = 0; i < logical_mesh_shape_.dims(); ++i) { + if (system_offset[i] + rotated_shape[i] > logical_mesh_shape_[i]) { + return false; } + } + return true; + }; + + tt::stl::SmallVector rotated_shape(config.mesh_shape.cbegin(), config.mesh_shape.cend()); + size_t rotations = 0; + while (!requested_mesh_fits(rotated_shape) && rotations < system_dimensions) { + std::rotate(rotated_shape.begin(), rotated_shape.begin() + 1, rotated_shape.end()); + ++rotations; + } + // After rotating N times, no luck. The requested mesh it too big. 
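To make the rotation-based fitting above concrete (an inferred example, not part of the patch): on a 2x4 SystemMesh with a zero offset, a requested 4x2 mesh does not fit as-is (4 rows exceed the 2 available), but one rotation of the requested shape to 2x4 does fit, so the mapping proceeds with the rotated shape. Only when no rotation fits within the offset-adjusted bounds does the check below raise the "not rotatable" error.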
+ if (rotations == system_dimensions) { + TT_THROW( + "Requested mesh is too big and is not rotatable: {} and SystemMesh {}, offset {}", + config.mesh_shape, + logical_mesh_shape_, + system_offset); + } - TT_FATAL( - logical_coordinate.row < system_mesh_rows, - "Row coordinate out of bounds: {} >= {}", - logical_coordinate.row, - system_mesh_rows); - TT_FATAL( - logical_coordinate.col < system_mesh_cols, - "Column coordinate out of bounds: {} >= {}", - logical_coordinate.col, - system_mesh_cols); + tt::stl::SmallVector end_coord; + for (int i = 0; i < system_dimensions; ++i) { + end_coord.push_back(system_offset[i] + rotated_shape[i] - 1); + } - auto physical_device_id = - logical_to_device_id_.at(MeshCoordinate(logical_coordinate.row, logical_coordinate.col)); - physical_device_ids.push_back(physical_device_id); + MeshCoordinateRange system_range(system_offset, MeshCoordinate(end_coord)); - log_debug( - LogMetal, "Logical coordinate: {}, Physical device ID: {}", logical_coordinate, physical_device_id); - } + for (const auto& system_coord : system_range) { + auto physical_device_id = logical_to_device_id_.find(system_coord); + TT_FATAL( + physical_device_id != logical_to_device_id_.end(), + "Logical coordinate: {} not found in SystemMesh of shape {}", + system_coord, + logical_mesh_shape_); + physical_device_ids.push_back(physical_device_id->second); + log_debug(LogMetal, "Logical coordinate: {}, Physical device ID: {}", system_coord, physical_device_id->second); } return physical_device_ids; } std::vector SystemMesh::Impl::request_available_devices(const MeshDeviceConfig& config) const { - auto [requested_num_rows, requested_num_cols] = config.mesh_shape; - auto [row_offset, col_offset] = config.offset; - - log_debug( - LogMetal, - "Mapping MeshDevice ({}x{}) with offset: {}, {}", - requested_num_rows, - requested_num_cols, - row_offset, - col_offset); + log_debug(LogMetal, "Mapping MeshDevice ({})", config.mesh_shape); + if (config.offset.has_value()) { + log_debug(LogMetal, "Offset: {}", config.offset.value()); + } return config.physical_device_ids.empty() ? this->get_mapped_physical_device_ids(config) : config.physical_device_ids; diff --git a/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp b/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp index 922af6b7dcb..c15df5a5f95 100644 --- a/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp +++ b/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp @@ -3,13 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include // Stand-alone example demonstrating usage of native multi-device TT-Metalium APIs // for issuing a program dispatch across a mesh of devices. 
int main(int argc, char** argv) { using namespace tt::tt_metal::distributed; - auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape{2, 4}}); + auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 4)}); auto& cq = mesh_device->mesh_command_queue(); // In a typical single-device fashion, instantiate a program with diff --git a/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp b/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp index a1b17cec8d5..9a401213a4f 100644 --- a/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp +++ b/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp @@ -19,7 +19,7 @@ int main(int argc, char** argv) { using namespace tt::tt_metal::distributed; using tt::tt_metal::distributed::ShardedBufferConfig; - auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape{2, 4}}); + auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 4)}); auto& cq = mesh_device->mesh_command_queue(); // Define the shape of the shard and the distributed buffer. diff --git a/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp b/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp index 9dbf0bbbd61..7ed668c4c22 100644 --- a/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp +++ b/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp @@ -85,7 +85,7 @@ Program CreateEltwiseAddProgram( // The example showcases TT-Metalium's ability to abstract away the complexity // of distributed memory management and compute. int main(int argc, char** argv) { - auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape{2, 4}}); + auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 4)}); // Define the global buffer shape and shard shape for distributed buffers auto shard_shape = Shape2D{32, 32}; diff --git a/ttnn/cpp/ttnn/distributed/api.cpp b/ttnn/cpp/ttnn/distributed/api.cpp index 9133ec419ac..e8f2846b3ba 100644 --- a/ttnn/cpp/ttnn/distributed/api.cpp +++ b/ttnn/cpp/ttnn/distributed/api.cpp @@ -7,6 +7,7 @@ #include #include +#include "tt-metalium/mesh_coord.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor_utils.hpp" #include "ttnn/distributed/distributed_tensor_config.hpp" @@ -26,8 +27,10 @@ std::shared_ptr open_mesh_device( const DispatchCoreConfig& dispatch_core_config, const MeshOffset& offset, const std::vector& physical_device_ids) { - auto config = - MeshDeviceConfig{.mesh_shape = mesh_shape, .offset = offset, .physical_device_ids = physical_device_ids}; + std::optional offset_opt = + offset.row != 0 || offset.col != 0 ? 
std::make_optional(offset.row, offset.col) : std::nullopt; + auto config = MeshDeviceConfig{ + .mesh_shape = SimpleMeshShape(mesh_shape), .offset = offset_opt, .physical_device_ids = physical_device_ids}; return MeshDevice::create(config, l1_small_size, trace_region_size, num_command_queues, dispatch_core_config); } @@ -128,7 +131,7 @@ std::vector get_t3k_physical_device_ids_ring() { TT_FATAL(num_devices == 8, "T3000 ring topology only works with 8 devices"); auto physical_device_ids = - instance.get_mapped_physical_device_ids(MeshDeviceConfig{MeshShape{1, 8}, MeshOffset{0, 0}}); + instance.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}); return physical_device_ids; } diff --git a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp index 50ee1506df5..92c02b515c3 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp +++ b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp @@ -6,6 +6,7 @@ #include #include +#include "tt-metalium/mesh_coord.hpp" #include "ttnn/distributed/api.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/types.hpp" @@ -70,8 +71,10 @@ void py_module(py::module& module) { const std::vector& physical_device_ids) { return MeshDevice::create( MeshDeviceConfig{ - .mesh_shape = mesh_device_shape, - .offset = offset, + .mesh_shape = SimpleMeshShape(mesh_device_shape), + .offset = offset.row != 0 || offset.col != 0 + ? std::make_optional(offset.row, offset.col) + : std::nullopt, .physical_device_ids = physical_device_ids, }, l1_small_size, diff --git a/ttnn/cpp/ttnn/distributed/types.hpp b/ttnn/cpp/ttnn/distributed/types.hpp index c31993a3d01..de8ae02c43a 100644 --- a/ttnn/cpp/ttnn/distributed/types.hpp +++ b/ttnn/cpp/ttnn/distributed/types.hpp @@ -13,6 +13,8 @@ namespace ttnn::distributed { using MeshShape = tt::tt_metal::distributed::MeshShape; +using SimpleMeshShape = tt::tt_metal::distributed::SimpleMeshShape; +using MeshCoordinate = tt::tt_metal::distributed::MeshCoordinate; using MeshOffset = tt::tt_metal::distributed::MeshOffset; using DeviceIds = tt::tt_metal::distributed::DeviceIds; using MeshDevice = tt::tt_metal::distributed::MeshDevice; @@ -27,12 +29,14 @@ namespace ttnn { // These types are exported to the ttnn namespace for convenience. 
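A brief note on the conversion used in `open_mesh_device` and in the pybind binding above: the legacy `MeshOffset` of `{0, 0}` is treated as "no offset" and mapped to `std::nullopt`, letting `SystemMesh` use its default placement (including the line-topology path, which rejects explicit offsets); any non-zero offset is forwarded as a `MeshCoordinate`. This reading is inferred from the code rather than stated in the commit message.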
using ttnn::distributed::DeviceIds; +using ttnn::distributed::MeshCoordinate; using ttnn::distributed::MeshDevice; using ttnn::distributed::MeshDeviceConfig; using ttnn::distributed::MeshDeviceView; using ttnn::distributed::MeshOffset; using ttnn::distributed::MeshShape; using ttnn::distributed::MeshSubDeviceManagerId; +using ttnn::distributed::SimpleMeshShape; using ttnn::distributed::SystemMesh; } // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp index b763cab08f4..ae1939e7ae7 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp @@ -6,6 +6,7 @@ #include "ttnn/operations/math.hpp" #include +#include #include "ttnn/tensor/tensor_utils.hpp" @@ -360,18 +361,22 @@ Tensor all_gather( const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_device_tensor = input_tensors.at(0); + TT_FATAL( + mesh_view.is_mesh_2d(), + "all-gather invoked with cluster_axis API on >2D mesh, which is currently unsupported"); const auto coordinate = mesh_view.find_device(input_device_tensor.device()->id()); - const auto view_index = (cluster_axis == 0) ? coordinate.col : coordinate.row; - const auto device_index = (cluster_axis == 0) ? coordinate.row : coordinate.col; + const auto view_index = (cluster_axis == 0) ? coordinate[1] : coordinate[0]; + const auto device_index = (cluster_axis == 0) ? coordinate[0] : coordinate[1]; auto get_chip_id = [&](std::size_t line_index) -> std::optional { - auto new_coord = coordinate; + auto new_row = coordinate[0]; + auto new_col = coordinate[1]; if (cluster_axis == 0) { - new_coord.row = line_index % num_devices; + new_row = line_index % num_devices; } else { - new_coord.col = line_index % num_devices; + new_col = line_index % num_devices; } - return mesh_view.find_device_id(new_coord); + return mesh_view.find_device_id(MeshCoordinate(new_row, new_col)); }; bool is_last_chip_in_clockwise_direction = device_index == (num_devices - 1); diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp index 909a254df8d..af614f48b80 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp @@ -223,18 +223,22 @@ Tensor reduce_scatter( const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_device_tensor = input_tensors.at(0); + TT_FATAL( + mesh_view.is_mesh_2d(), + "reduce-scatter invoked with cluster_axis API on >2D mesh, which is currently unsupported"); const auto coordinate = mesh_view.find_device(input_device_tensor.device()->id()); - const auto view_index = (cluster_axis == 0) ? coordinate.col : coordinate.row; - const auto device_index = (cluster_axis == 0) ? coordinate.row : coordinate.col; + const auto view_index = (cluster_axis == 0) ? coordinate[1] : coordinate[0]; + const auto device_index = (cluster_axis == 0) ? 
coordinate[0] : coordinate[1]; auto get_chip_id = [&](std::size_t line_index) -> std::optional { - auto new_coord = coordinate; + auto new_row = coordinate[0]; + auto new_col = coordinate[1]; if (cluster_axis == 0) { - new_coord.row = line_index % num_devices; + new_row = line_index % num_devices; } else { - new_coord.col = line_index % num_devices; + new_col = line_index % num_devices; } - return mesh_view.find_device_id(new_coord); + return mesh_view.find_device_id(MeshCoordinate(new_row, new_col)); }; bool is_last_chip_in_clockwise_direction = device_index == (num_devices - 1); diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp index f295d317f64..eea3800c374 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp @@ -399,9 +399,12 @@ Tensor all_gather_async( const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_device_tensor = input_tensors.at(0); + TT_FATAL( + mesh_view.is_mesh_2d(), + "all-gather invoked with cluster_axis API on >2D mesh, which is currently unsupported"); const auto coordinate = mesh_view.find_device(input_device_tensor.device()->id()); - std::vector devices = (cluster_axis == 0) ? mesh_view.get_devices_on_column(coordinate.col) - : mesh_view.get_devices_on_row(coordinate.row); + std::vector devices = (cluster_axis == 0) ? mesh_view.get_devices_on_column(coordinate[1]) + : mesh_view.get_devices_on_row(coordinate[0]); const auto& input_tensor = input_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp index fe431c64c4b..eeb67c0f502 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp @@ -335,9 +335,12 @@ Tensor reduce_scatter( const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_device_tensor = input_tensors.at(0); + TT_FATAL( + mesh_view.is_mesh_2d(), + "reduce-scatter invoked with cluster_axis API on >2D mesh, which is currently unsupported"); const auto coordinate = mesh_view.find_device(input_device_tensor.device()->id()); - std::vector devices = (cluster_axis == 0) ? mesh_view.get_devices_on_column(coordinate.col) - : mesh_view.get_devices_on_row(coordinate.row); + std::vector devices = (cluster_axis == 0) ? 
mesh_view.get_devices_on_column(coordinate[1]) + : mesh_view.get_devices_on_row(coordinate[0]); const auto& input_tensor = input_tensors.at(0); diff --git a/ttnn/cpp/ttnn/tensor/storage.cpp b/ttnn/cpp/ttnn/tensor/storage.cpp index e86cc45a2d5..cd6fb20179d 100644 --- a/ttnn/cpp/ttnn/tensor/storage.cpp +++ b/ttnn/cpp/ttnn/tensor/storage.cpp @@ -34,7 +34,7 @@ MultiDeviceStorage::MultiDeviceStorage( for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { - auto buffer = mesh_buffer->get_device_buffer(distributed::Coordinate{row, col}); + auto buffer = mesh_buffer->get_device_buffer(distributed::MeshCoordinate(row, col)); const int device_id = buffer->device()->id(); ordered_device_ids.push_back(device_id); buffers.emplace(device_id, std::move(buffer)); diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp index edcf4a2ad4d..baae4fb53a4 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp @@ -592,7 +592,9 @@ Tensor to_host_mesh_tensor(const Tensor& tensor, bool blocking) { specs.reserve(num_buffers); buffers.reserve(num_buffers); shard_data_transfers.reserve(num_buffers); - distributed::Coordinate shard_coord = {0, 0}; + distributed::MeshCoordinateRange coord_range( + distributed::MeshCoordinate(0, 0), distributed::MeshCoordinate(num_rows - 1, num_cols - 1)); + auto shard_coord = coord_range.begin(); for (int id : storage.ordered_device_ids) { std::vector host_buffer; const auto& shard_tensor_spec = storage.specs.at(id); @@ -602,14 +604,10 @@ Tensor to_host_mesh_tensor(const Tensor& tensor, bool blocking) { buffers.push_back(owned_buffer::create(std::move(host_buffer))); shard_data_transfers.push_back(distributed::MeshCommandQueue::ShardDataTransfer{ - .shard_coord = shard_coord, + .shard_coord = *shard_coord, .host_data = std::visit([](auto& b) { return b.data(); }, buffers.back()), .region = BufferRegion(0, tensor_size_bytes)}); - - if (++shard_coord.col == num_cols) { - shard_coord.col = 0; - ++shard_coord.row; - } + ++shard_coord; } mesh_cq.enqueue_read_shards(shard_data_transfers, mesh_buffer, /*blocking=*/true); @@ -782,14 +780,17 @@ MultiDeviceStorage shard_to_mesh_buffer( std::vector shard_data_transfers; shard_data_transfers.reserve(storage.buffers.size()); - distributed::Coordinate shard_coord = {0, 0}; - for (int i = 0; i < storage.buffers.size(); i++) { + + distributed::MeshCoordinateRange coord_range( + distributed::MeshCoordinate(0, 0), distributed::MeshCoordinate(num_rows - 1, num_cols - 1)); + auto shard_coord = coord_range.begin(); + for (int i = 0; i < storage.buffers.size(); ++shard_coord, i++) { TensorSpec shard_tensor_spec( storage.specs[i].logical_shape(), storage.specs[i].tensor_layout().with_memory_config(tensor_spec.memory_config())); const auto& shard_host_buffer = storage.buffers[i]; - const auto& shard_buffer = mesh_buffer->get_device_buffer(shard_coord); + const auto& shard_buffer = mesh_buffer->get_device_buffer(*shard_coord); ordered_device_ids.push_back(shard_buffer->device()->id()); buffers.insert({shard_buffer->device()->id(), shard_buffer}); specs.insert({shard_buffer->device()->id(), shard_tensor_spec}); @@ -806,13 +807,9 @@ MultiDeviceStorage shard_to_mesh_buffer( expected_packed_buffer_size_bytes <= tensor_spec.compute_packed_buffer_size_bytes(), "Shard tensor size exceeds the global tensor size!"); shard_data_transfers.push_back(distributed::MeshCommandQueue::ShardDataTransfer{ - .shard_coord = shard_coord, + .shard_coord = *shard_coord, .host_data = 
data_to_write.data(), .region = BufferRegion(0, input_size_bytes)}); - if (++shard_coord.col == num_cols) { - shard_coord.col = 0; - ++shard_coord.row; - } } mesh_device->mesh_command_queue().enqueue_write_shards(mesh_buffer, shard_data_transfers, /*blocking=*/false); From 615fbc0ec6c87bd685c1fba3516c03c765a4e54f Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Thu, 20 Feb 2025 19:21:50 -0500 Subject: [PATCH 205/316] Fix CMake version check to also scan tests (#18119) ### Ticket None ### Problem description The CMake scan wasn't scanning the tests because they're off by default. ### What's changed Enable some additional paths through the CMake files. Dropped the toolchain because we can use the system defaults for our purposes. --- .github/workflows/all-static-checks.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/all-static-checks.yaml b/.github/workflows/all-static-checks.yaml index b3b45bad4b7..b10b5b0774c 100644 --- a/.github/workflows/all-static-checks.yaml +++ b/.github/workflows/all-static-checks.yaml @@ -132,5 +132,4 @@ jobs: # TODO: Use a lukka/run-cmake with a preset after upgrading to a more modern CMake run: | echo "Checking compatibility with $(cmake --version)" - # FIXME: Why is HAVE_STD_REGEX needed? Clean up when we solve it. - cmake -DCMAKE_TOOLCHAIN_FILE=cmake/x86_64-linux-clang-17-libcpp-toolchain.cmake -DHAVE_STD_REGEX=ON -B build . + cmake -D BUILD_PROGRAMMING_EXAMPLES=ON -D TT_METAL_BUILD_TESTS=ON -B build . From ab071c9096a5d00760522db794b82273ee81f586 Mon Sep 17 00:00:00 2001 From: Michael Chiou <156848643+ttmchiou@users.noreply.github.com> Date: Thu, 20 Feb 2025 16:41:36 -0800 Subject: [PATCH 206/316] #18115: Remove running grayskull tests in post-commit (#18116) ### Ticket https://github.com/tenstorrent/tt-metal/issues/18115 GS is deprecated as of v0.55 of tt-metal ### Problem description Stop CI running grayskull tests ### What's changed Remove running grayskull on post-commit workflow. 
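For anyone reproducing this static check locally, a minimal approximation is sketched below; it assumes a host toolchain and CMake version the repository already accepts, and the exact CI environment may differ.

```sh
# Configure-only scan that now also covers tests and programming examples.
cmake -D BUILD_PROGRAMMING_EXAMPLES=ON -D TT_METAL_BUILD_TESTS=ON -B build .
# The compatibility scan happens at configure time; building afterwards is optional:
# cmake --build build
```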
Cleanup of individual workflows to follow ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .github/workflows/_test-wheels-impl.yaml | 2 - .../workflows/all-post-commit-workflows.yaml | 6 --- ...atch-full-regressions-and-models-impl.yaml | 42 +++++++++---------- .../full-regressions-and-models.yaml | 2 +- .../workflows/metal-run-microbenchmarks.yaml | 1 - .../workflows/models-post-commit-wrapper.yaml | 1 - .github/workflows/models-post-commit.yaml | 2 - .github/workflows/perf-models-impl.yaml | 1 - .../run-profiler-regression-wrapper.yaml | 1 - .../workflows/run-profiler-regression.yaml | 2 - ...ss-fast-dispatch-build-and-unit-tests.yaml | 2 - ...ss-slow-dispatch-build-and-unit-tests.yaml | 2 - .github/workflows/test-dispatch.yaml | 3 +- .github/workflows/tt-metal-l2-nightly.yaml | 2 - .../workflows/ttnn-post-commit-wrapper.yaml | 1 - .github/workflows/ttnn-post-commit.yaml | 2 - .github/workflows/ttnn-run-sweeps.yaml | 6 --- .github/workflows/umd-unit-tests-wrapper.yaml | 1 - .github/workflows/umd-unit-tests.yaml | 2 - 19 files changed, 23 insertions(+), 58 deletions(-) diff --git a/.github/workflows/_test-wheels-impl.yaml b/.github/workflows/_test-wheels-impl.yaml index b61afa66161..6ad4eb24e3b 100644 --- a/.github/workflows/_test-wheels-impl.yaml +++ b/.github/workflows/_test-wheels-impl.yaml @@ -26,7 +26,6 @@ jobs: matrix: os: ${{ fromJson(inputs.from-precompiled && '["ubuntu-20.04"]' || '["ubuntu-20.04", "ubuntu-22.04"]') }} runner-hw-info: [ - {arch: grayskull}, {arch: wormhole_b0} ] runs-on: ${{ matrix.os }} @@ -52,7 +51,6 @@ jobs: # We only have this for non-Docker silicon runners right now os: [ubuntu-20.04] runner-hw-info: [ - {arch: grayskull, type: E150}, {arch: wormhole_b0, type: N150}, {arch: wormhole_b0, type: N300} ] diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml index 06cbc2652ec..c5f5b285a9d 100644 --- a/.github/workflows/all-post-commit-workflows.yaml +++ b/.github/workflows/all-post-commit-workflows.yaml @@ -64,7 +64,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] @@ -80,7 +79,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] @@ -112,7 +110,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] @@ -128,7 +125,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] @@ -144,7 
+140,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] @@ -179,7 +174,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml index 196bfe013f7..9294f3947a0 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml @@ -12,20 +12,20 @@ jobs: matrix: test-group: [ - { - name: "Common models GS", - arch: grayskull, - runs-on: ["cloud-virtual-machine", "E150", "in-service"], - cmd: tests/scripts/single_card/nightly/run_common_models.sh, - timeout: 40 - }, - { - name: "GS ttnn nightly", - arch: grayskull, - runs-on: ["cloud-virtual-machine", "E150", "in-service"], - cmd: tests/scripts/single_card/nightly/run_ttnn.sh, - timeout: 40 - }, + # { + # name: "Common models GS", + # arch: grayskull, + # runs-on: ["cloud-virtual-machine", "E150", "in-service"], + # cmd: tests/scripts/single_card/nightly/run_common_models.sh, + # timeout: 40 + # }, + # { + # name: "GS ttnn nightly", + # arch: grayskull, + # runs-on: ["cloud-virtual-machine", "E150", "in-service"], + # cmd: tests/scripts/single_card/nightly/run_ttnn.sh, + # timeout: 40 + # }, { name: "WH N150 ttnn nightly", arch: wormhole_b0, @@ -40,13 +40,13 @@ jobs: cmd: tests/scripts/single_card/nightly/run_ttnn.sh, timeout: 70 }, - { - name: "GS-only models", - arch: grayskull, - runs-on: ["cloud-virtual-machine", "E150", "in-service"], - cmd: tests/scripts/single_card/nightly/run_gs_only.sh, - timeout: 40 - }, + # { + # name: "GS-only models", + # arch: grayskull, + # runs-on: ["cloud-virtual-machine", "E150", "in-service"], + # cmd: tests/scripts/single_card/nightly/run_gs_only.sh, + # timeout: 40 + # }, ] name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }} env: diff --git a/.github/workflows/full-regressions-and-models.yaml b/.github/workflows/full-regressions-and-models.yaml index 6f6784136df..493f34fab84 100644 --- a/.github/workflows/full-regressions-and-models.yaml +++ b/.github/workflows/full-regressions-and-models.yaml @@ -17,7 +17,7 @@ jobs: # so we try not to get hanging machines fail-fast: false matrix: - arch: [grayskull, wormhole_b0] + arch: [wormhole_b0] frequent-type: [api] env: ARCH_NAME: ${{ matrix.arch }} diff --git a/.github/workflows/metal-run-microbenchmarks.yaml b/.github/workflows/metal-run-microbenchmarks.yaml index 7df326ba8d4..b5dd7892857 100644 --- a/.github/workflows/metal-run-microbenchmarks.yaml +++ b/.github/workflows/metal-run-microbenchmarks.yaml @@ -14,7 +14,6 @@ jobs: fail-fast: false matrix: runner-info: [ - {arch: grayskull, runs-on: ["E150", "pipeline-perf", "bare-metal", "in-service"]}, # Do not run N150 on microbenchmarks for now as we do not have the machines for it # {arch: wormhole_b0, runs-on: ["pipeline-perf", "N150", "bare-metal", "in-service"]}, # N300 diff --git a/.github/workflows/models-post-commit-wrapper.yaml b/.github/workflows/models-post-commit-wrapper.yaml index be31f38a4ce..b63c9fb6869 100644 --- a/.github/workflows/models-post-commit-wrapper.yaml +++ b/.github/workflows/models-post-commit-wrapper.yaml @@ -18,7 +18,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: 
grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] diff --git a/.github/workflows/models-post-commit.yaml b/.github/workflows/models-post-commit.yaml index 63de42d7ad6..0bb512a3dec 100644 --- a/.github/workflows/models-post-commit.yaml +++ b/.github/workflows/models-post-commit.yaml @@ -19,14 +19,12 @@ on: required: true type: choice options: - - grayskull - wormhole_b0 - blackhole runner-label: required: true type: choice options: - - E150 - N150 - N300 - BH diff --git a/.github/workflows/perf-models-impl.yaml b/.github/workflows/perf-models-impl.yaml index dab1338b772..2514c4c2142 100644 --- a/.github/workflows/perf-models-impl.yaml +++ b/.github/workflows/perf-models-impl.yaml @@ -11,7 +11,6 @@ jobs: fail-fast: false matrix: test-info: [ - {name: "GS", arch: grayskull, runs-on: ["E150", "pipeline-perf", "bare-metal", "in-service"], machine-type: "bare_metal"}, {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"], machine-type: "bare_metal"}, ] model-type: [llm_javelin, cnn_javelin, other] diff --git a/.github/workflows/run-profiler-regression-wrapper.yaml b/.github/workflows/run-profiler-regression-wrapper.yaml index 52248542b21..1bfc2106b43 100644 --- a/.github/workflows/run-profiler-regression-wrapper.yaml +++ b/.github/workflows/run-profiler-regression-wrapper.yaml @@ -16,7 +16,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] diff --git a/.github/workflows/run-profiler-regression.yaml b/.github/workflows/run-profiler-regression.yaml index 4cbc4224b45..12c33cbcab9 100644 --- a/.github/workflows/run-profiler-regression.yaml +++ b/.github/workflows/run-profiler-regression.yaml @@ -23,14 +23,12 @@ on: required: true type: choice options: - - grayskull - wormhole_b0 - blackhole runner-label: required: true type: choice options: - - E150 - N150 - N300 - BH diff --git a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml index 2a3e5717d0b..d85d29ac4a8 100644 --- a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml +++ b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml @@ -19,8 +19,6 @@ jobs: fail-fast: false matrix: runner-info: [ - # E150 - {arch: grayskull, runs-on: ["cloud-virtual-machine", "E150", "in-service"], machine-type: "virtual_machine", name: "E150"}, # N150 {arch: wormhole_b0, runs-on: ["cloud-virtual-machine", "N150", "in-service"], machine-type: "virtual_machine", name: "N150"}, # N300 diff --git a/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml index f75e6ea6aae..aade47120c6 100644 --- a/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml +++ b/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml @@ -19,8 +19,6 @@ jobs: fail-fast: false matrix: runner-info: [ - # E150 - {arch: grayskull, runs-on: ["cloud-virtual-machine", "E150", "in-service"], machine-type: "virtual_machine", name: "E150"}, # N150 {arch: wormhole_b0, runs-on: ["cloud-virtual-machine", "N150", "in-service"], machine-type: "virtual_machine", name: "N150"}, # N300 diff --git a/.github/workflows/test-dispatch.yaml b/.github/workflows/test-dispatch.yaml index 416970b809c..a3dbfe680a7 100644 --- a/.github/workflows/test-dispatch.yaml +++ 
b/.github/workflows/test-dispatch.yaml @@ -7,11 +7,10 @@ on: required: true type: choice options: - - grayskull - wormhole_b0 - blackhole runner-label: - description: 'Optional: N150, N300, E150, BH, config-t3000, config-tg, config-tgg' + description: 'Optional: N150, N300, BH, config-t3000, config-tg, config-tgg' required: true type: string default: '["in-service"]' diff --git a/.github/workflows/tt-metal-l2-nightly.yaml b/.github/workflows/tt-metal-l2-nightly.yaml index 35c08c107dd..7bdd961431c 100644 --- a/.github/workflows/tt-metal-l2-nightly.yaml +++ b/.github/workflows/tt-metal-l2-nightly.yaml @@ -19,14 +19,12 @@ on: required: true type: choice options: - - grayskull - wormhole_b0 - blackhole runner-label: required: true type: choice options: - - E150 - N150 - N300 - BH diff --git a/.github/workflows/ttnn-post-commit-wrapper.yaml b/.github/workflows/ttnn-post-commit-wrapper.yaml index 74a5c9575ea..52485735f6a 100644 --- a/.github/workflows/ttnn-post-commit-wrapper.yaml +++ b/.github/workflows/ttnn-post-commit-wrapper.yaml @@ -18,7 +18,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] diff --git a/.github/workflows/ttnn-post-commit.yaml b/.github/workflows/ttnn-post-commit.yaml index fe1c4a5ac61..2e3f57afe08 100644 --- a/.github/workflows/ttnn-post-commit.yaml +++ b/.github/workflows/ttnn-post-commit.yaml @@ -23,14 +23,12 @@ on: required: true type: choice options: - - grayskull - wormhole_b0 - blackhole runner-label: required: true type: choice options: - - E150 - N150 - N300 - BH diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml index 1b7ab7f1bbf..3c03bf85ead 100644 --- a/.github/workflows/ttnn-run-sweeps.yaml +++ b/.github/workflows/ttnn-run-sweeps.yaml @@ -586,12 +586,6 @@ jobs: matrix: test-group: [ - { - name: "Grayskull E150 Sweeps", - arch: grayskull, - runs-on: ["cloud-virtual-machine", "E150", "in-service"], - tt-smi-cmd: "tt-smi-metal -r 0" - }, { name: "Wormhole N150 Sweeps", arch: wormhole_b0, diff --git a/.github/workflows/umd-unit-tests-wrapper.yaml b/.github/workflows/umd-unit-tests-wrapper.yaml index ec1eab12684..d573b65a697 100644 --- a/.github/workflows/umd-unit-tests-wrapper.yaml +++ b/.github/workflows/umd-unit-tests-wrapper.yaml @@ -10,7 +10,6 @@ jobs: fail-fast: false matrix: test-group: [ - { arch: grayskull, runner-label: E150 }, { arch: wormhole_b0, runner-label: N150 }, { arch: wormhole_b0, runner-label: N300 }, ] diff --git a/.github/workflows/umd-unit-tests.yaml b/.github/workflows/umd-unit-tests.yaml index 460ec079503..4b23d103e1c 100644 --- a/.github/workflows/umd-unit-tests.yaml +++ b/.github/workflows/umd-unit-tests.yaml @@ -19,14 +19,12 @@ on: required: true type: choice options: - - grayskull - wormhole_b0 - blackhole runner-label: required: true type: choice options: - - E150 - N150 - N300 - BH From 53f3d05926fe8e119ca34e3d928e2fe3b9ffde05 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Thu, 20 Feb 2025 16:48:46 -0800 Subject: [PATCH 207/316] [skip ci] Move test wheels workflow to package and release (#18089) --- .github/workflows/_test-wheels-impl.yaml | 25 ------------------- .../workflows/all-post-commit-workflows.yaml | 6 ----- .github/workflows/package-and-release.yaml | 8 +++++- 3 files changed, 7 insertions(+), 32 deletions(-) diff --git a/.github/workflows/_test-wheels-impl.yaml b/.github/workflows/_test-wheels-impl.yaml index 6ad4eb24e3b..c5e2b7f7aca 
100644 --- a/.github/workflows/_test-wheels-impl.yaml +++ b/.github/workflows/_test-wheels-impl.yaml @@ -45,28 +45,3 @@ jobs: source tests/end_to_end_tests/env/bin/activate cd tests/end_to_end_tests pytest -c conftest.py . -m eager_host_side - test-wheels-silicon: - strategy: - matrix: - # We only have this for non-Docker silicon runners right now - os: [ubuntu-20.04] - runner-hw-info: [ - {arch: wormhole_b0, type: N150}, - {arch: wormhole_b0, type: N300} - ] - runs-on: ["cloud-virtual-machine", "${{ matrix.runner-hw-info.type }}", "in-service"] - steps: - - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 - with: - name: eager-dist-${{ matrix.os }}-any - - name: Set up end-to-end tests environment - run: ./tests/scripts/set_up_end_to_end_tests_env.sh - - name: Activate env and run release tests - silicon - timeout-minutes: 2 - shell: bash - run: | - source tests/end_to_end_tests/env/bin/activate - python3 -m ttnn.examples.usage.run_op_on_device - cd tests/end_to_end_tests - pytest -c conftest.py . -m eager_package_silicon diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml index c5f5b285a9d..b39ceed6881 100644 --- a/.github/workflows/all-post-commit-workflows.yaml +++ b/.github/workflows/all-post-commit-workflows.yaml @@ -50,12 +50,6 @@ jobs: build-type: ${{ inputs.build-type || 'Release' }} tracy: true secrets: inherit - test-wheels: - needs: build-artifact - uses: ./.github/workflows/_test-wheels-impl.yaml - with: - from-precompiled: true - secrets: inherit # Slow Dispatch Unit Tests sd-unit-tests: needs: build-artifact diff --git a/.github/workflows/package-and-release.yaml b/.github/workflows/package-and-release.yaml index b7676486ca8..47d679e81b1 100644 --- a/.github/workflows/package-and-release.yaml +++ b/.github/workflows/package-and-release.yaml @@ -31,6 +31,12 @@ jobs: uses: ./.github/workflows/build-artifact.yaml with: tracy: true + test-wheels: + needs: build-artifact + uses: ./.github/workflows/_test-wheels-impl.yaml + with: + from-precompiled: true + secrets: inherit single-card-demos: needs: build-artifact uses: ./.github/workflows/single-card-demo-tests-impl.yaml @@ -133,7 +139,7 @@ jobs: path: RELEASE_NOTES.txt # Candidate for breaking up create-and-upload-draft-release: - needs: [create-tag, create-release-notes, build-artifact] + needs: [create-tag, create-release-notes, build-artifact, test-wheels] # May accidentally create two releases without restricting to 1 job concurrency: create_upload_draft_release runs-on: ubuntu-latest From 9113d2e3e7557bffee0868e62c393a0733239614 Mon Sep 17 00:00:00 2001 From: asaigal Date: Thu, 6 Feb 2025 05:36:54 +0000 Subject: [PATCH 208/316] MeshTrace Initial Implementation - Add distributed APIs to trace MeshWorkloads in MeshDevice DRAM - Supports tracing heterogenous workloads and those running on a subset of the MeshDevice - Add an explicit MeshTrace assembly step that allows a single set of dispatch commands to be reused across physical devices running the same programs - Cleanup logic inside EnqueueTraceCommand and move it to a shared header between distributed and tt_metal/dispatch - Add tests for tracing: - Homogenous workloads - Heterogenous workloads - Workloads Running on SubDevices --- tests/tt_metal/distributed/CMakeLists.txt | 1 + .../tt_metal/distributed/test_mesh_trace.cpp | 522 ++++++++++++++++++ .../distributed/test_mesh_workload.cpp | 266 +-------- tests/tt_metal/distributed/utils.cpp | 276 ++++++++- tests/tt_metal/distributed/utils.hpp | 5 + 
.../tt_metal/common/multi_device_fixture.hpp | 13 +- .../tt_metal/stl/test_strong_type.cpp | 2 +- .../misc/sub_device/sync_and_add.cpp | 46 ++ .../misc/sub_device/sync_and_increment.cpp | 1 - tt_metal/api/tt-metalium/distributed.hpp | 8 + tt_metal/api/tt-metalium/mesh_buffer.hpp | 2 + .../api/tt-metalium/mesh_command_queue.hpp | 64 ++- tt_metal/api/tt-metalium/mesh_common.hpp | 23 + tt_metal/api/tt-metalium/mesh_device.hpp | 13 +- tt_metal/api/tt-metalium/mesh_device_view.hpp | 10 - tt_metal/api/tt-metalium/mesh_trace.hpp | 84 +++ .../tt-metalium}/strong_type.hpp | 0 tt_metal/api/tt-metalium/trace_buffer.hpp | 13 +- tt_metal/distributed/CMakeLists.txt | 1 + tt_metal/distributed/distributed.cpp | 16 + tt_metal/distributed/mesh_command_queue.cpp | 265 +++++++-- tt_metal/distributed/mesh_device.cpp | 38 +- tt_metal/distributed/mesh_trace.cpp | 156 ++++++ tt_metal/distributed/mesh_workload_utils.cpp | 53 +- tt_metal/distributed/mesh_workload_utils.hpp | 3 + tt_metal/impl/CMakeLists.txt | 1 + .../impl/dispatch/hardware_command_queue.cpp | 128 ++--- .../impl/dispatch/host_runtime_commands.cpp | 132 ----- .../impl/dispatch/host_runtime_commands.hpp | 30 - .../impl/flatbuffer/light_metal_binary.fbs | 8 +- .../impl/lightmetal/lightmetal_capture.cpp | 2 +- .../impl/lightmetal/lightmetal_replay.cpp | 2 +- tt_metal/impl/trace/dispatch.cpp | 255 +++++++++ tt_metal/impl/trace/dispatch.hpp | 74 +++ tt_metal/impl/trace/trace.cpp | 56 +- ttnn/cpp/ttnn/common/queue_id.hpp | 2 +- 36 files changed, 1930 insertions(+), 641 deletions(-) create mode 100644 tests/tt_metal/distributed/test_mesh_trace.cpp create mode 100644 tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_add.cpp create mode 100644 tt_metal/api/tt-metalium/mesh_common.hpp create mode 100644 tt_metal/api/tt-metalium/mesh_trace.hpp rename tt_metal/{tt_stl => api/tt-metalium}/strong_type.hpp (100%) create mode 100644 tt_metal/distributed/mesh_trace.cpp create mode 100644 tt_metal/impl/trace/dispatch.cpp create mode 100644 tt_metal/impl/trace/dispatch.hpp diff --git a/tests/tt_metal/distributed/CMakeLists.txt b/tests/tt_metal/distributed/CMakeLists.txt index 08fededb592..922e19ef993 100644 --- a/tests/tt_metal/distributed/CMakeLists.txt +++ b/tests/tt_metal/distributed/CMakeLists.txt @@ -6,6 +6,7 @@ set(UNIT_TESTS_DISTRIBUTED_SRC ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_sub_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_events.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_trace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp ) diff --git a/tests/tt_metal/distributed/test_mesh_trace.cpp b/tests/tt_metal/distributed/test_mesh_trace.cpp new file mode 100644 index 00000000000..f4ecf8259bd --- /dev/null +++ b/tests/tt_metal/distributed/test_mesh_trace.cpp @@ -0,0 +1,522 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include +#include + +#include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" +#include "tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp" +#include "tests/tt_metal/distributed/utils.hpp" + +namespace tt::tt_metal::distributed::test { +namespace { + +// Define custom fixtures initializing a trace region on the MeshDevice +class GenericMeshDeviceTraceFixture : public MeshDeviceFixtureBase { +protected: + GenericMeshDeviceTraceFixture() : MeshDeviceFixtureBase(Config{.num_cqs = 1, .trace_region_size = (64 << 20)}) {} +}; + +class T3000MeshDeviceTraceFixture : public MeshDeviceFixtureBase { +protected: + T3000MeshDeviceTraceFixture() : + MeshDeviceFixtureBase(Config{.mesh_device_type = MeshDeviceType::T3000, .trace_region_size = (64 << 20)}) {} +}; + +using MeshTraceTestT3000 = T3000MeshDeviceTraceFixture; +using MeshTraceTestSuite = GenericMeshDeviceTraceFixture; + +TEST_F(MeshTraceTestSuite, Sanity) { + auto random_seed = 10; + uint32_t seed = tt::parse_env("TT_METAL_SEED", random_seed); + log_info(tt::LogTest, "Using Test Seed: {}", seed); + srand(seed); + + uint32_t num_workloads_per_trace = 5; + uint32_t num_traces = 4; + uint32_t num_iters = 10; + + LogicalDeviceRange all_devices = + LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + + std::vector> mesh_workloads = {}; + for (int i = 0; i < num_workloads_per_trace * num_traces; i++) { + auto workload = std::make_shared(); + auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( + 1, mesh_device_->compute_with_storage_grid_size(), seed); + AddProgramToMeshWorkload(*workload, *programs[0], all_devices); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *workload, false); + mesh_workloads.push_back(workload); + } + + std::vector trace_ids = {}; + for (int trace_idx = 0; trace_idx < num_traces; trace_idx++) { + auto trace_id = BeginTraceCapture(mesh_device_.get(), 0); + for (int workload_idx = 0; workload_idx < num_workloads_per_trace; workload_idx++) { + EnqueueMeshWorkload( + mesh_device_->mesh_command_queue(), + *mesh_workloads[trace_idx * num_workloads_per_trace + workload_idx], + false); + } + EndTraceCapture(mesh_device_.get(), 0, trace_id); + trace_ids.push_back(trace_id); + } + + for (int i = 0; i < num_iters; i++) { + for (auto trace_id : trace_ids) { + ReplayTrace(mesh_device_.get(), 0, trace_id, false); + } + } + Finish(mesh_device_->mesh_command_queue()); + + for (auto trace_id : trace_ids) { + ReleaseTrace(mesh_device_.get(), trace_id); + } +} + +class MeshTraceSweepTest : public MeshTraceTestT3000, + public testing::WithParamInterface>> {}; + +TEST_P(MeshTraceSweepTest, Sweep) { + auto random_seed = 10; + uint32_t seed = tt::parse_env("TT_METAL_SEED", random_seed); + log_info(tt::LogTest, "Using Test Seed: {}", seed); + srand(seed); + + auto workload_grids = GetParam(); + uint32_t num_workloads = 10; + + std::vector> mesh_workloads = {}; + + for (auto& workload_grid : workload_grids) { + for (int i = 0; i < num_workloads; i++) { + auto workload = std::make_shared(); + for (auto& program_grid : workload_grid) { + auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( + 1, mesh_device_->compute_with_storage_grid_size(), seed); + AddProgramToMeshWorkload(*workload, *programs[0], program_grid); + } + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *workload, false); + mesh_workloads.push_back(workload); + } + } + auto trace_id 
= BeginTraceCapture(mesh_device_.get(), 0); + for (auto& workload : mesh_workloads) { + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *workload, false); + } + EndTraceCapture(mesh_device_.get(), 0, trace_id); + for (int i = 0; i < 50; i++) { + ReplayTrace(mesh_device_.get(), 0, trace_id, false); + } + Finish(mesh_device_->mesh_command_queue()); + ReleaseTrace(mesh_device_.get(), trace_id); +} + +INSTANTIATE_TEST_SUITE_P( + MeshTraceSweepTests, + MeshTraceSweepTest, + ::testing::Values( + std::vector>({ + {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + {LogicalDeviceRange({0, 0}, {3, 0}), LogicalDeviceRange({0, 1}, {3, 1})}, // Split grid into 2 rows + {LogicalDeviceRange({0, 0}, {1, 1}), LogicalDeviceRange({2, 0}, {3, 1})}, // Split grid into 2 columns + {LogicalDeviceRange({0, 0}, {1, 1}), + LogicalDeviceRange({2, 0}, {2, 1}), + LogicalDeviceRange({3, 0}, {3, 1})}, // Split grid into 3 columns + {LogicalDeviceRange({0, 0}, {0, 1}), + LogicalDeviceRange({1, 0}, {1, 1}), + LogicalDeviceRange({2, 0}, {2, 1}), + LogicalDeviceRange({3, 0}, {3, 1})}, // Split grid into 4 columns + }), + std::vector>({ + {LogicalDeviceRange({0, 0}, {0, 1}), + LogicalDeviceRange({1, 0}, {1, 1}), + LogicalDeviceRange({2, 0}, {2, 1}), + LogicalDeviceRange({3, 0}, {3, 1})}, // Split grid into 4 columns + {LogicalDeviceRange({0, 0}, {3, 0}), LogicalDeviceRange({0, 1}, {3, 1})}, // Split grid into 2 rows + {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + {LogicalDeviceRange({0, 0}, {3, 0})}, // Run on top row only + {LogicalDeviceRange({0, 1}, {3, 1})}, // Run on bottom row only + }), + std::vector>({ + {LogicalDeviceRange({0, 0}, {3, 0})}, // Run on top row only + {LogicalDeviceRange({0, 1}, {3, 1})}, // Run on bottom row only + {LogicalDeviceRange({0, 0}, {0, 1})}, // Run on left most column only + {LogicalDeviceRange({1, 0}, {3, 1})}, // Run on right most 3-columns only + {LogicalDeviceRange({0, 0}, {1, 1})}, // Run on left most 2-columns only + {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + }), + std::vector>({ + {LogicalDeviceRange({0, 0}, {0, 0}), + LogicalDeviceRange({1, 0}, {1, 0}), + LogicalDeviceRange({2, 0}, {2, 0}), + LogicalDeviceRange({3, 0}, {3, 0}), + LogicalDeviceRange({0, 1}, {0, 1}), + LogicalDeviceRange({1, 1}, {1, 1}), + LogicalDeviceRange({2, 1}, {2, 1}), + LogicalDeviceRange({3, 1}, {3, 1})}, // Run on individual devices + {LogicalDeviceRange({0, 0}, {3, 0})}, // Run on top row only + {LogicalDeviceRange({0, 1}, {3, 1})}, // Run on bottom row only + {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + }))); + +TEST_F(MeshTraceTestT3000, EltwiseBinaryMeshTrace) { + std::vector> src0_bufs = {}; + std::vector> src1_bufs = {}; + std::vector> intermed_bufs_0 = {}; + std::vector> intermed_bufs_1 = {}; + std::vector> output_bufs = {}; + + CoreCoord worker_grid_size = mesh_device_->compute_with_storage_grid_size(); + + // Separate Mesh into top and bottom rows + LogicalDeviceRange row_0 = LogicalDeviceRange({0, 0}, {3, 0}); + LogicalDeviceRange row_1 = LogicalDeviceRange({0, 1}, {3, 1}); + // Separate Mesh into 3 columns + LogicalDeviceRange col_0 = LogicalDeviceRange({0, 0}, {1, 1}); + LogicalDeviceRange col_1 = LogicalDeviceRange({2, 0}, {2, 1}); + LogicalDeviceRange col_2 = LogicalDeviceRange({3, 0}, {3, 1}); + + // Create first workload: running addition on top row and multiplication on bottom row + auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( + mesh_device_, src0_bufs, src1_bufs, intermed_bufs_0); + auto mesh_workload = 
CreateMeshWorkload(); + AddProgramToMeshWorkload(mesh_workload, *programs[0], row_0); + AddProgramToMeshWorkload(mesh_workload, *programs[1], row_1); + // Create second workload: running addition on top row (src1 + intermed0) and multiplication on + // bottom row (src1 * intermed0) + auto programs_1 = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( + mesh_device_, intermed_bufs_0, src1_bufs, intermed_bufs_1); + auto mesh_workload_1 = CreateMeshWorkload(); + AddProgramToMeshWorkload(mesh_workload_1, *programs_1[1], row_0); + AddProgramToMeshWorkload(mesh_workload_1, *programs_1[0], row_1); + // Create third workload: running addition on 1st col (src1 + intermed1), multiplication on + // second col (src1 * intermed1) and subtraction on the third col( src1 - intermed1) + auto programs_2 = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( + mesh_device_, intermed_bufs_1, src1_bufs, output_bufs); + auto mesh_workload_2 = CreateMeshWorkload(); + AddProgramToMeshWorkload(mesh_workload_2, *programs_2[0], col_0); + AddProgramToMeshWorkload(mesh_workload_2, *programs_2[1], col_1); + AddProgramToMeshWorkload(mesh_workload_2, *programs_2[2], col_2); + + // Initialize inputs + std::vector src0_vec = create_constant_vector_of_bfloat16(src0_bufs[0]->size(), 2); + std::vector src1_vec = create_constant_vector_of_bfloat16(src1_bufs[0]->size(), 3); + // Write inputs for all cores across the Mesh + for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { + for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { + EnqueueWriteMeshBuffer( + mesh_device_->mesh_command_queue(), src0_bufs[col_idx * worker_grid_size.y + row_idx], src0_vec); + EnqueueWriteMeshBuffer( + mesh_device_->mesh_command_queue(), src1_bufs[col_idx * worker_grid_size.y + row_idx], src1_vec); + } + } + // Compile workloads + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload_1, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload_2, false); + // Capture trace + auto trace_id = BeginTraceCapture(mesh_device_.get(), 0); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload_1, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload_2, false); + EndTraceCapture(mesh_device_.get(), 0, trace_id); + + // Run workload multiple times + for (int i = 0; i < 1000; i++) { + ReplayTrace(mesh_device_.get(), 0, trace_id, false); + } + // Verify outputs + std::vector expected_values = {18, 18, 45, 12, 12, 12, 27, 6}; + for (std::size_t logical_y = 0; logical_y < mesh_device_->num_rows(); logical_y++) { + for (std::size_t logical_x = 0; logical_x < mesh_device_->num_cols(); logical_x++) { + for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { + for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { + std::vector dst_vec = {}; + ReadShard( + mesh_device_->mesh_command_queue(), + dst_vec, + output_bufs[col_idx * worker_grid_size.y + row_idx], + MeshCoordinate(logical_y, logical_x)); + auto expected_value = expected_values[logical_x + logical_y * mesh_device_->num_cols()]; + for (int i = 0; i < dst_vec.size(); i++) { + EXPECT_EQ(dst_vec[i].to_float(), expected_value); + } + } + } + } + } + ReleaseTrace(mesh_device_.get(), trace_id); +} + +TEST_F(MeshTraceTestSuite, SyncWorkloadsOnSubDeviceTrace) { + SubDevice 
sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + + uint32_t num_iters = 5; + auto sub_device_manager = mesh_device_->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + mesh_device_->load_sub_device_manager(sub_device_manager); + + // Create three variants of the same program set - will be traced on the Mesh differently + auto [waiter_program_0, syncer_program_0, incrementer_program_0, global_sem_0] = + create_basic_sync_program(mesh_device_.get(), sub_device_1, sub_device_2); + + auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = + create_basic_sync_program(mesh_device_.get(), sub_device_1, sub_device_2); + + auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = + create_basic_sync_program(mesh_device_.get(), sub_device_1, sub_device_2); + + // Top row - first MeshWorkload set + LogicalDeviceRange top_row = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); + // Bottom row - second MeshWorkload set + LogicalDeviceRange bottom_row = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1}); + // All devices: third MeshWorkload set + LogicalDeviceRange all_devices = + LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + + // Initialize and construct all MeshWorkloads running on different SubDevices + auto waiter_0 = CreateMeshWorkload(); + auto syncer_0 = CreateMeshWorkload(); + auto incrementer_0 = CreateMeshWorkload(); + + auto waiter_1 = CreateMeshWorkload(); + auto syncer_1 = CreateMeshWorkload(); + auto incrementer_1 = CreateMeshWorkload(); + + auto waiter_2 = CreateMeshWorkload(); + auto syncer_2 = CreateMeshWorkload(); + auto incrementer_2 = CreateMeshWorkload(); + + AddProgramToMeshWorkload(waiter_0, waiter_program_0, top_row); + AddProgramToMeshWorkload(syncer_0, syncer_program_0, top_row); + AddProgramToMeshWorkload(incrementer_0, incrementer_program_0, top_row); + + AddProgramToMeshWorkload(waiter_1, waiter_program_1, bottom_row); + AddProgramToMeshWorkload(syncer_1, syncer_program_1, bottom_row); + AddProgramToMeshWorkload(incrementer_1, incrementer_program_1, bottom_row); + + AddProgramToMeshWorkload(waiter_2, waiter_program_2, all_devices); + AddProgramToMeshWorkload(syncer_2, syncer_program_2, all_devices); + AddProgramToMeshWorkload(incrementer_2, incrementer_program_2, all_devices); + + // Compile all MeshWorkloads + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), waiter_0, false); + mesh_device_->set_sub_device_stall_group({SubDeviceId{0}}); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), syncer_0, true); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), incrementer_0, false); + mesh_device_->reset_sub_device_stall_group(); + Finish(mesh_device_->mesh_command_queue()); + + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), waiter_1, false); + mesh_device_->set_sub_device_stall_group({SubDeviceId{0}}); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), syncer_1, true); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), incrementer_1, false); + mesh_device_->reset_sub_device_stall_group(); + Finish(mesh_device_->mesh_command_queue()); + + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), waiter_2, false); + mesh_device_->set_sub_device_stall_group({SubDeviceId{0}}); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), syncer_2, true); + 
EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), incrementer_2, false); + mesh_device_->reset_sub_device_stall_group(); + Finish(mesh_device_->mesh_command_queue()); + + // Capture trace + auto trace_id = BeginTraceCapture(mesh_device_.get(), 0); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), waiter_0, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), syncer_0, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), incrementer_0, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), waiter_1, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), syncer_1, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), incrementer_1, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), waiter_2, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), syncer_2, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), incrementer_2, false); + EndTraceCapture(mesh_device_.get(), 0, trace_id); + + // Run trace on all SubDevices in the Mesh + for (uint32_t i = 0; i < num_iters; i++) { + ReplayTrace(mesh_device_.get(), 0, trace_id, false); + } + Finish(mesh_device_->mesh_command_queue()); + ReleaseTrace(mesh_device_.get(), trace_id); +} + +TEST_F(MeshTraceTestSuite, DataCopyOnSubDevicesTrace) { + // Create 4 SubDevices + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {0, 0}))}); // Sync with host + SubDevice sub_device_2(std::array{CoreRangeSet(CoreRange({1, 1}, {1, 1}))}); // Run datacopy + SubDevice sub_device_3(std::array{CoreRangeSet( + CoreRange({2, 2}, {2, 2}))}); // Dummy - use this for blocking operations when using persistent kernels + SubDevice sub_device_4(std::array{CoreRangeSet(CoreRange({3, 3}, {3, 3}))}); // Run addition + + // Create and Load SubDeviceConfig on the mesh + auto sub_device_manager = + mesh_device_->create_sub_device_manager({sub_device_1, sub_device_2, sub_device_3, sub_device_4}, 3200); + mesh_device_->load_sub_device_manager(sub_device_manager); + + // Create IO Buffers + uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32); + uint32_t num_tiles = 32; + DeviceLocalBufferConfig per_device_buffer_config{ + .page_size = single_tile_size * num_tiles, + .buffer_type = tt_metal::BufferType::DRAM, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = true}; + + ReplicatedBufferConfig global_buffer_config{ + .size = single_tile_size * num_tiles, + }; + auto input_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get()); + auto output_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get()); + + // Query coords for syncer, datacopy and addition workloads + auto syncer_coord = sub_device_1.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; + auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); + auto syncer_core_phys = mesh_device_->worker_core_from_logical_core(syncer_coord); + auto datacopy_coord = sub_device_2.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; + auto datacopy_core = CoreRangeSet(CoreRange(datacopy_coord, datacopy_coord)); + auto datacopy_core_phys = mesh_device_->worker_core_from_logical_core(datacopy_coord); + auto add_coord = sub_device_4.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; + auto add_core = CoreRangeSet(CoreRange(add_coord, add_coord)); + auto add_core_phys = mesh_device_->worker_core_from_logical_core(add_coord); + + // Create global 
semaphore for syncing between programs + auto all_cores = syncer_core.merge(datacopy_core).merge(add_core); + auto global_sem = CreateGlobalSemaphore(mesh_device_.get(), all_cores, 0); + + // Program syncs with host and notifies downstream datacopy or addition program + Program sync_and_incr_program = CreateProgram(); + auto sync_kernel = CreateKernel( + sync_and_incr_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_increment.cpp", + syncer_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array sync_rt_args = {global_sem.address(), datacopy_core_phys.x, datacopy_core_phys.y}; + SetRuntimeArgs(sync_and_incr_program, sync_kernel, syncer_core, sync_rt_args); + // Program copies data from dram once notified + Program datacopy_program = CreateProgram(); + auto datacopy_kernel = CreateKernel( + datacopy_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_datacopy.cpp", + datacopy_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array datacopy_rt_args = { + global_sem.address(), 0, 0, input_buf->address(), output_buf->address(), num_tiles}; + SetRuntimeArgs(datacopy_program, datacopy_kernel, datacopy_core, datacopy_rt_args); + constexpr uint32_t src0_cb_index = CBIndex::c_0; + CircularBufferConfig cb_src0_config = + CircularBufferConfig(single_tile_size * num_tiles, {{src0_cb_index, DataFormat::UInt32}}) + .set_page_size(src0_cb_index, single_tile_size); + CBHandle cb_src0 = CreateCircularBuffer(datacopy_program, datacopy_core, cb_src0_config); + // Program copies data from DRAM, does addition in RISC once notified + Program add_program = CreateProgram(); + auto add_kernel = CreateKernel( + add_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_add.cpp", + datacopy_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array add_rt_args = { + global_sem.address(), + 0, + 0, + input_buf->address(), + output_buf->address(), + num_tiles, + add_core_phys.x, + add_core_phys.y, + 1}; + SetRuntimeArgs(add_program, add_kernel, datacopy_core, add_rt_args); + CBHandle add_cb = CreateCircularBuffer(add_program, datacopy_core, cb_src0_config); + // Same program as above, but runs on different SubDevice. 
Reads from DRAM, once + // notified by previous program + Program add_program_2 = CreateProgram(); + auto add_kernel_2 = CreateKernel( + add_program_2, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_add.cpp", + add_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array add_rt_args_2 = { + global_sem.address(), 0, 0, output_buf->address(), output_buf->address(), num_tiles, 0, 0, 2}; + SetRuntimeArgs(add_program_2, add_kernel_2, add_core, add_rt_args_2); + CBHandle add_cb_2 = CreateCircularBuffer(add_program_2, add_core, cb_src0_config); + + LogicalDeviceRange devices = + LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + LogicalDeviceRange top_row = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); + LogicalDeviceRange bottom_row = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1}); + + // Create and initialize MeshWorkloads + auto syncer_mesh_workload = CreateMeshWorkload(); + auto datacopy_mesh_workload = CreateMeshWorkload(); + auto add_mesh_workload = CreateMeshWorkload(); + // Sync program goes to entire Mesh + AddProgramToMeshWorkload(syncer_mesh_workload, sync_and_incr_program, devices); + // Datacopy goes to top row + AddProgramToMeshWorkload(datacopy_mesh_workload, datacopy_program, top_row); + // First addition goes to bottom row + AddProgramToMeshWorkload(datacopy_mesh_workload, add_program, bottom_row); + // Second addition goes to bottom row + AddProgramToMeshWorkload(add_mesh_workload, add_program_2, bottom_row); + + // Compile and load workloads + mesh_device_->set_sub_device_stall_group({SubDeviceId{2}}); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), syncer_mesh_workload, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), datacopy_mesh_workload, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), add_mesh_workload, false); + + for (auto device : mesh_device_->get_devices()) { + tt::llrt::write_hex_vec_to_core(device->id(), syncer_core_phys, std::vector{1}, global_sem.address()); + } + + // Capture Trace + auto trace_id = BeginTraceCapture(mesh_device_.get(), 0); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), syncer_mesh_workload, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), datacopy_mesh_workload, false); + EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), add_mesh_workload, false); + EndTraceCapture(mesh_device_.get(), 0, trace_id); + // Run trace and verify outputs + for (int i = 0; i < 50; i++) { + ReplayTrace(mesh_device_.get(), 0, trace_id, false); + + std::vector src_vec(input_buf->size() / sizeof(uint32_t)); + std::iota(src_vec.begin(), src_vec.end(), i); + // Block after this write on host, since the global semaphore update starting the + // program goes through an independent path (UMD) and can go out of order wrt the + // buffer data + mesh_device_->set_sub_device_stall_group({SubDeviceId{2}}); + EnqueueWriteMeshBuffer(mesh_device_->mesh_command_queue(), input_buf, src_vec, true); + + for (auto device : mesh_device_->get_devices()) { + tt::llrt::write_hex_vec_to_core( + device->id(), syncer_core_phys, std::vector{1}, global_sem.address()); + } + mesh_device_->reset_sub_device_stall_group(); + for (std::size_t logical_x = 0; logical_x < output_buf->device()->num_cols(); logical_x++) { + for (std::size_t logical_y = 0; logical_y < 1; logical_y++) { + std::vector dst_vec; + ReadShard(mesh_device_->mesh_command_queue(), dst_vec, output_buf, 
MeshCoordinate(logical_y, logical_x)); + EXPECT_EQ(dst_vec, src_vec); + } + } + for (std::size_t logical_x = 0; logical_x < output_buf->device()->num_cols(); logical_x++) { + for (std::size_t logical_y = 1; logical_y < 2; logical_y++) { + std::vector dst_vec; + ReadShard(mesh_device_->mesh_command_queue(), dst_vec, output_buf, MeshCoordinate(logical_y, logical_x)); + for (int j = 0; j < dst_vec.size(); j++) { + EXPECT_EQ(dst_vec[j], src_vec[j] + 3); + } + } + } + } + ReleaseTrace(mesh_device_.get(), trace_id); +} + +} // namespace +} // namespace tt::tt_metal::distributed::test diff --git a/tests/tt_metal/distributed/test_mesh_workload.cpp b/tests/tt_metal/distributed/test_mesh_workload.cpp index 66aa84357a6..5e88493d029 100644 --- a/tests/tt_metal/distributed/test_mesh_workload.cpp +++ b/tests/tt_metal/distributed/test_mesh_workload.cpp @@ -9,7 +9,6 @@ #include #include -#include "tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp" #include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" #include "tests/tt_metal/distributed/utils.hpp" @@ -23,257 +22,6 @@ struct CBConfig { tt::DataFormat data_format; }; -std::vector> create_random_programs( - uint32_t num_programs, - CoreCoord worker_grid_size, - uint32_t seed, - const std::unordered_set& active_eth_cores = {}) { - uint32_t MAX_LOOP = 100; - uint32_t page_size = 1024; - uint32_t max_eth_cores = 3; - - uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS; - uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP; - uint32_t TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP; - uint32_t ERISC_OUTER_LOOP, ERISC_MIDDLE_LOOP, ERISC_INNER_LOOP; - bool USE_MAX_RT_ARGS; - - CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set(cr); - - std::vector> programs; - - std::map data_movement_defines = {{"DATA_MOVEMENT", "1"}}; - std::map compute_defines = {{"COMPUTE", "1"}}; - std::map erisc_defines = {{"ERISC", "1"}}; - - for (uint32_t i = 0; i < num_programs; i++) { - Program& program = *programs.emplace_back(std::make_shared()); - // ========== Set configs for BRISC ========== - if (i == 0) { - // Ensures that we get at least one compilation with the max amount to - // ensure it compiles and runs - BRISC_OUTER_LOOP = MAX_LOOP; - BRISC_MIDDLE_LOOP = MAX_LOOP; - BRISC_INNER_LOOP = MAX_LOOP; - NUM_CBS = NUM_CIRCULAR_BUFFERS; - NUM_SEMS = NUM_SEMAPHORES; - USE_MAX_RT_ARGS = true; - } else { - BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - BRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1; - NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1; - USE_MAX_RT_ARGS = false; - } - // Create CBs - for (uint32_t j = 0; j < NUM_CBS; j++) { - CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}}) - .set_page_size(j, page_size * (j + 1)); - auto cb = CreateCircularBuffer(program, cr_set, cb_config); - } - - // Create Semaphores - for (uint32_t j = 0; j < NUM_SEMS; j++) { - CreateSemaphore(program, cr_set, j + 1); - uint32_t curr_idx = 0; - if (active_eth_cores.size()) { - auto active_eth_core = active_eth_cores.begin(); - for (int k = 0; k < max_eth_cores && active_eth_core != active_eth_cores.end(); - ++i, ++active_eth_core) { - CreateSemaphore(program, *active_eth_core, j + 1, CoreType::ETH); - } - } - } - - // Create RTAs - auto [brisc_unique_rtargs, brisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); - uint32_t 
num_brisc_unique_rtargs = brisc_unique_rtargs.size(); - uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size(); - std::vector brisc_compile_args = { - BRISC_OUTER_LOOP, - BRISC_MIDDLE_LOOP, - BRISC_INNER_LOOP, - NUM_CBS, - NUM_SEMS, - num_brisc_unique_rtargs, - num_brisc_common_rtargs, - page_size}; - - // ========== Set configs for NCRISC ========== - if (i == 0) { - NCRISC_OUTER_LOOP = MAX_LOOP; - NCRISC_MIDDLE_LOOP = MAX_LOOP; - NCRISC_INNER_LOOP = MAX_LOOP; - } else { - NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - } - - auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); - uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size(); - uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size(); - std::vector ncrisc_compile_args = { - NCRISC_OUTER_LOOP, - NCRISC_MIDDLE_LOOP, - NCRISC_INNER_LOOP, - NUM_CBS, - NUM_SEMS, - num_ncrisc_unique_rtargs, - num_ncrisc_common_rtargs, - page_size}; - - // ========== Set configs for TRISC ========== - if (i == 0) { - TRISC_OUTER_LOOP = MAX_LOOP; - TRISC_MIDDLE_LOOP = MAX_LOOP; - TRISC_INNER_LOOP = MAX_LOOP; - } else { - TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - } - - auto [trisc_unique_rtargs, trisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); - uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size(); - uint32_t num_trisc_common_rtargs = trisc_common_rtargs.size(); - std::vector trisc_compile_args = { - TRISC_OUTER_LOOP, - TRISC_MIDDLE_LOOP, - TRISC_INNER_LOOP, - NUM_CBS, - NUM_SEMS, - num_trisc_unique_rtargs, - num_trisc_common_rtargs, - page_size}; - - if (i == 0) { - ERISC_OUTER_LOOP = MAX_LOOP; - ERISC_MIDDLE_LOOP = MAX_LOOP; - ERISC_INNER_LOOP = MAX_LOOP; - } else { - ERISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - ERISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - ERISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - } - // Only setup RTAs on ERISC. No Common RTAs. 
- uint32_t max_erisc_rtas = 64; - uint32_t num_erisc_rtas = rand() % (max_erisc_rtas + 1); - auto [erisc_unique_rtargs, erisc_common_rtargs] = create_runtime_args(num_erisc_rtas, 0, 0, 0); - uint32_t num_erisc_unique_rtargs = erisc_unique_rtargs.size(); - uint32_t num_erisc_common_rt_args = erisc_common_rtargs.size(); - - std::vector erisc_compile_time_args = { - ERISC_OUTER_LOOP, - ERISC_MIDDLE_LOOP, - ERISC_INNER_LOOP, - 0, /* CBs are not supported on ERISC cores */ - NUM_SEMS, - num_erisc_unique_rtargs, - num_erisc_common_rt_args, - page_size}; - - // Create Kernels - bool at_least_one_kernel = false; - if (i == 0 or ((rand() % 2) == 0)) { - auto dummy_brisc_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", - cr_set, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default, - .compile_args = brisc_compile_args, - .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); - at_least_one_kernel = true; - } - - if (i == 0 or ((rand() % 2) == 0)) { - auto dummy_ncrisc_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", - cr_set, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = NOC::RISCV_1_default, - .compile_args = ncrisc_compile_args, - .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); - at_least_one_kernel = true; - } - - if (i == 0 or ((rand() % 2) == 0)) { - auto dummy_trisc_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", - cr_set, - ComputeConfig{ - .math_approx_mode = false, .compile_args = trisc_compile_args, .defines = compute_defines}); - SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); - at_least_one_kernel = true; - } - - if (not at_least_one_kernel) { - uint32_t random_risc = rand() % 3 + 1; - if (random_risc == 1) { - auto dummy_brisc_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", - cr_set, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default, - .compile_args = brisc_compile_args, - .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); - } else if (random_risc == 2) { - auto dummy_ncrisc_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", - cr_set, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = NOC::RISCV_1_default, - .compile_args = ncrisc_compile_args, - .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); - } else if (random_risc == 3) { - auto dummy_trisc_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", - cr_set, - ComputeConfig{ - 
.math_approx_mode = false, .compile_args = trisc_compile_args, .defines = compute_defines}); - SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); - } else { - TT_THROW("Invalid"); - } - } - if (active_eth_cores.size()) { - auto active_eth_core = active_eth_cores.begin(); - for (int k = 0; k < max_eth_cores && active_eth_core != active_eth_cores.end(); ++i, ++active_eth_core) { - auto dummy_erisc_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", - *active_eth_core, - EthernetConfig{ - .noc = NOC::NOC_0, .compile_args = erisc_compile_time_args, .defines = erisc_defines}); - SetRuntimeArgs(program, dummy_erisc_kernel, *active_eth_core, erisc_unique_rtargs); - } - } - } - return programs; -} - std::vector initialize_dummy_circular_buffers( Program& program, const CoreRangeSet& cr_set, const std::vector& cb_configs) { std::vector cb_handles; @@ -402,7 +150,7 @@ TEST_F(MeshWorkloadTestT3000, MeshWorkloadOnActiveEthAsserts) { for (std::size_t logical_x = 0; logical_x < x_end; logical_x++) { for (std::size_t logical_y = 0; logical_y < y_end; logical_y++) { IDevice* device = mesh_device_->get_device(logical_y, logical_x); - auto programs = create_random_programs( + auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( 1, mesh_device_->compute_with_storage_grid_size(), seed, device->get_active_ethernet_cores(true)); LogicalDeviceRange devices = {{logical_x, logical_y}, {logical_x, logical_y}}; AddProgramToMeshWorkload(*workload, *programs[0], devices); @@ -422,7 +170,8 @@ TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) { log_info("Create MeshWorkloads with multiple programs each"); - auto programs = create_random_programs(num_programs, mesh_device_->compute_with_storage_grid_size(), seed); + auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( + num_programs, mesh_device_->compute_with_storage_grid_size(), seed); std::vector> mesh_workloads = {}; log_info(tt::LogTest, "Compile and load {} MeshWorkloads", num_programs); @@ -442,7 +191,8 @@ TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) { EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *random_workload, false); mesh_workloads.push_back(random_workload); } - programs = create_random_programs(num_programs, mesh_device_->compute_with_storage_grid_size(), seed); + programs = tt::tt_metal::distributed::test::utils::create_random_programs( + num_programs, mesh_device_->compute_with_storage_grid_size(), seed); for (int i = 0; i < num_programs; i += 4) { std::shared_ptr random_workload = std::make_shared(); LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {0, 1}); @@ -456,7 +206,8 @@ TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) { EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *random_workload, false); mesh_workloads.push_back(random_workload); } - programs = create_random_programs(num_heterogeneous_programs, mesh_device_->compute_with_storage_grid_size(), seed); + programs = tt::tt_metal::distributed::test::utils::create_random_programs( + num_heterogeneous_programs, mesh_device_->compute_with_storage_grid_size(), seed); for (int i = 0; i < num_heterogeneous_programs; i += 8) { std::shared_ptr random_workload = std::make_shared(); LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {0, 0}); @@ -500,7 +251,8 @@ TEST_F(MeshWorkloadTestSuite, RandomizedMeshWorkload) { 
log_info(tt::LogTest, "Using Test Seed: {}", seed); srand(seed); log_info("Create {} MeshWorkloads", num_programs); - auto programs = create_random_programs(num_programs, mesh_device_->compute_with_storage_grid_size(), seed); + auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( + num_programs, mesh_device_->compute_with_storage_grid_size(), seed); std::mt19937 rng(seed); std::uniform_int_distribution gen_x(1, mesh_device_->num_cols()); std::uniform_int_distribution gen_y(1, mesh_device_->num_rows()); diff --git a/tests/tt_metal/distributed/utils.cpp b/tests/tt_metal/distributed/utils.cpp index c53f1c9d96a..871312d5303 100644 --- a/tests/tt_metal/distributed/utils.cpp +++ b/tests/tt_metal/distributed/utils.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "tests/tt_metal/distributed/utils.hpp" +#include "tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp" namespace tt::tt_metal::distributed::test::utils { @@ -11,12 +12,14 @@ std::vector> create_eltwise_bin_programs( std::vector>& src0_bufs, std::vector>& src1_bufs, std::vector>& output_bufs) { - const std::vector op_id_to_op_define = {"add_tiles", "mul_tiles"}; - const std::vector op_id_to_op_type_define = {"EltwiseBinaryType::ELWADD", "EltwiseBinaryType::ELWMUL"}; + const std::vector op_id_to_op_define = {"add_tiles", "mul_tiles", "sub_tiles"}; + const std::vector op_id_to_op_type_define = { + "EltwiseBinaryType::ELWADD", "EltwiseBinaryType::ELWMUL", "EltwiseBinaryType::ELWSUB"}; CoreCoord worker_grid_size = mesh_device->compute_with_storage_grid_size(); - std::vector> programs = {std::make_shared(), std::make_shared()}; + std::vector> programs = { + std::make_shared(), std::make_shared(), std::make_shared()}; auto full_grid = CoreRange({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); for (std::size_t eltwise_op = 0; eltwise_op < op_id_to_op_define.size(); eltwise_op++) { @@ -34,15 +37,17 @@ std::vector> create_eltwise_bin_programs( .buffer_layout = TensorMemoryLayout::INTERLEAVED, .bottom_up = true}; + bool allocate_bufs = src0_bufs.empty(); for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { - auto src0_dram_buffer = - MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); - src0_bufs.push_back(src0_dram_buffer); - - auto src1_dram_buffer = - MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); - src1_bufs.push_back(src1_dram_buffer); + if (allocate_bufs) { + auto src0_dram_buffer = + MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + src0_bufs.push_back(src0_dram_buffer); + auto src1_dram_buffer = + MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + src1_bufs.push_back(src1_dram_buffer); + } auto dst_dram_buffer = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); output_bufs.push_back(dst_dram_buffer); @@ -123,4 +128,255 @@ std::vector> create_eltwise_bin_programs( return programs; } +std::vector> create_random_programs( + uint32_t num_programs, + CoreCoord worker_grid_size, + uint32_t seed, + const std::unordered_set& active_eth_cores) { + uint32_t MAX_LOOP = 100; + uint32_t page_size = 1024; + uint32_t max_eth_cores = 3; + + uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS; + uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP; + uint32_t TRISC_OUTER_LOOP, 
TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP; + uint32_t ERISC_OUTER_LOOP, ERISC_MIDDLE_LOOP, ERISC_INNER_LOOP; + bool USE_MAX_RT_ARGS; + + CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); + CoreRangeSet cr_set(cr); + + std::vector> programs; + + std::map data_movement_defines = {{"DATA_MOVEMENT", "1"}}; + std::map compute_defines = {{"COMPUTE", "1"}}; + std::map erisc_defines = {{"ERISC", "1"}}; + + for (uint32_t i = 0; i < num_programs; i++) { + Program& program = *programs.emplace_back(std::make_shared()); + // ========== Set configs for BRISC ========== + if (i == 0) { + // Ensures that we get at least one compilation with the max amount to + // ensure it compiles and runs + BRISC_OUTER_LOOP = MAX_LOOP; + BRISC_MIDDLE_LOOP = MAX_LOOP; + BRISC_INNER_LOOP = MAX_LOOP; + NUM_CBS = NUM_CIRCULAR_BUFFERS; + NUM_SEMS = NUM_SEMAPHORES; + USE_MAX_RT_ARGS = true; + } else { + BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + BRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1; + NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1; + USE_MAX_RT_ARGS = false; + } + // Create CBs + for (uint32_t j = 0; j < NUM_CBS; j++) { + CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}}) + .set_page_size(j, page_size * (j + 1)); + auto cb = CreateCircularBuffer(program, cr_set, cb_config); + } + + // Create Semaphores + for (uint32_t j = 0; j < NUM_SEMS; j++) { + CreateSemaphore(program, cr_set, j + 1); + uint32_t curr_idx = 0; + if (active_eth_cores.size()) { + auto active_eth_core = active_eth_cores.begin(); + for (int k = 0; k < max_eth_cores && active_eth_core != active_eth_cores.end(); + ++i, ++active_eth_core) { + CreateSemaphore(program, *active_eth_core, j + 1, CoreType::ETH); + } + } + } + + // Create RTAs + auto [brisc_unique_rtargs, brisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_brisc_unique_rtargs = brisc_unique_rtargs.size(); + uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size(); + std::vector brisc_compile_args = { + BRISC_OUTER_LOOP, + BRISC_MIDDLE_LOOP, + BRISC_INNER_LOOP, + NUM_CBS, + NUM_SEMS, + num_brisc_unique_rtargs, + num_brisc_common_rtargs, + page_size}; + + // ========== Set configs for NCRISC ========== + if (i == 0) { + NCRISC_OUTER_LOOP = MAX_LOOP; + NCRISC_MIDDLE_LOOP = MAX_LOOP; + NCRISC_INNER_LOOP = MAX_LOOP; + } else { + NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + } + + auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size(); + uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size(); + std::vector ncrisc_compile_args = { + NCRISC_OUTER_LOOP, + NCRISC_MIDDLE_LOOP, + NCRISC_INNER_LOOP, + NUM_CBS, + NUM_SEMS, + num_ncrisc_unique_rtargs, + num_ncrisc_common_rtargs, + page_size}; + + // ========== Set configs for TRISC ========== + if (i == 0) { + TRISC_OUTER_LOOP = MAX_LOOP; + TRISC_MIDDLE_LOOP = MAX_LOOP; + TRISC_INNER_LOOP = MAX_LOOP; + } else { + TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + } + + auto [trisc_unique_rtargs, trisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size(); + uint32_t num_trisc_common_rtargs = 
trisc_common_rtargs.size(); + std::vector trisc_compile_args = { + TRISC_OUTER_LOOP, + TRISC_MIDDLE_LOOP, + TRISC_INNER_LOOP, + NUM_CBS, + NUM_SEMS, + num_trisc_unique_rtargs, + num_trisc_common_rtargs, + page_size}; + + if (i == 0) { + ERISC_OUTER_LOOP = MAX_LOOP; + ERISC_MIDDLE_LOOP = MAX_LOOP; + ERISC_INNER_LOOP = MAX_LOOP; + } else { + ERISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + ERISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + ERISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + } + // Only setup RTAs on ERISC. No Common RTAs. + uint32_t max_erisc_rtas = 64; + uint32_t num_erisc_rtas = rand() % (max_erisc_rtas + 1); + auto [erisc_unique_rtargs, erisc_common_rtargs] = create_runtime_args(num_erisc_rtas, 0, 0, 0); + uint32_t num_erisc_unique_rtargs = erisc_unique_rtargs.size(); + uint32_t num_erisc_common_rt_args = erisc_common_rtargs.size(); + + std::vector erisc_compile_time_args = { + ERISC_OUTER_LOOP, + ERISC_MIDDLE_LOOP, + ERISC_INNER_LOOP, + 0, /* CBs are not supported on ERISC cores */ + NUM_SEMS, + num_erisc_unique_rtargs, + num_erisc_common_rt_args, + page_size}; + + // Create Kernels + bool at_least_one_kernel = false; + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_brisc_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", + cr_set, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, + .noc = NOC::RISCV_0_default, + .compile_args = brisc_compile_args, + .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); + at_least_one_kernel = true; + } + + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_ncrisc_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", + cr_set, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, + .noc = NOC::RISCV_1_default, + .compile_args = ncrisc_compile_args, + .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); + at_least_one_kernel = true; + } + + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_trisc_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", + cr_set, + ComputeConfig{ + .math_approx_mode = false, .compile_args = trisc_compile_args, .defines = compute_defines}); + SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); + at_least_one_kernel = true; + } + + if (not at_least_one_kernel) { + uint32_t random_risc = rand() % 3 + 1; + if (random_risc == 1) { + auto dummy_brisc_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", + cr_set, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, + .noc = NOC::RISCV_0_default, + .compile_args = brisc_compile_args, + .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); + } else if (random_risc == 2) { + auto dummy_ncrisc_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", + cr_set, + 
DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, + .noc = NOC::RISCV_1_default, + .compile_args = ncrisc_compile_args, + .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); + } else if (random_risc == 3) { + auto dummy_trisc_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", + cr_set, + ComputeConfig{ + .math_approx_mode = false, .compile_args = trisc_compile_args, .defines = compute_defines}); + SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); + } else { + TT_THROW("Invalid"); + } + } + if (active_eth_cores.size()) { + auto active_eth_core = active_eth_cores.begin(); + for (int k = 0; k < max_eth_cores && active_eth_core != active_eth_cores.end(); ++i, ++active_eth_core) { + auto dummy_erisc_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", + *active_eth_core, + EthernetConfig{ + .noc = NOC::NOC_0, .compile_args = erisc_compile_time_args, .defines = erisc_defines}); + SetRuntimeArgs(program, dummy_erisc_kernel, *active_eth_core, erisc_unique_rtargs); + } + } + } + return programs; +} + } // namespace tt::tt_metal::distributed::test::utils diff --git a/tests/tt_metal/distributed/utils.hpp b/tests/tt_metal/distributed/utils.hpp index 36b1bbb2fdd..5240f5804b7 100644 --- a/tests/tt_metal/distributed/utils.hpp +++ b/tests/tt_metal/distributed/utils.hpp @@ -15,4 +15,9 @@ std::vector> create_eltwise_bin_programs( std::vector>& src1_bufs, std::vector>& output_bufs); +std::vector> create_random_programs( + uint32_t num_programs, + CoreCoord worker_grid_size, + uint32_t seed, + const std::unordered_set& active_eth_cores = {}); } // namespace tt::tt_metal::distributed::test::utils diff --git a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp index 752ada9b376..d7e9e9598ae 100644 --- a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp @@ -66,6 +66,7 @@ class MeshDeviceFixtureBase : public ::testing::Test { // The associated test will be run if the connected cluster corresponds to a supported topology. std::optional mesh_device_type; int num_cqs = 1; + uint32_t trace_region_size = 0; }; MeshDeviceFixtureBase(const Config& fixture_config) : config_(fixture_config) {} @@ -94,11 +95,14 @@ class MeshDeviceFixtureBase : public ::testing::Test { magic_enum::enum_name(*mesh_device_type), magic_enum::enum_name(*config_.mesh_device_type)); } - // Use ethernet dispatch for more than 1 CQ on T3K/N300 DispatchCoreType core_type = (config_.num_cqs >= 2) ? 
DispatchCoreType::ETH : DispatchCoreType::WORKER; mesh_device_ = MeshDevice::create( - MeshDeviceConfig{.mesh_shape = get_mesh_shape(*mesh_device_type)}, 0, 0, config_.num_cqs, core_type); + MeshDeviceConfig{.mesh_shape = get_mesh_shape(*mesh_device_type)}, + 0, + config_.trace_region_size, + config_.num_cqs, + core_type); } void TearDown() override { @@ -145,6 +149,11 @@ class GenericMultiCQMeshDeviceFixture : public MeshDeviceFixtureBase { GenericMultiCQMeshDeviceFixture() : MeshDeviceFixtureBase(Config{.num_cqs = 2}) {} }; +class GenericMeshDeviceTraceFixture : public MeshDeviceFixtureBase { +protected: + GenericMeshDeviceTraceFixture() : MeshDeviceFixtureBase(Config{.num_cqs = 1, .trace_region_size = (64 << 20)}) {} +}; + // Fixtures that specify the mesh device type explicitly. // The associated test will be run if the cluster topology matches // what is specified. diff --git a/tests/tt_metal/tt_metal/stl/test_strong_type.cpp b/tests/tt_metal/tt_metal/stl/test_strong_type.cpp index 6983cca7f84..3e543931cbe 100644 --- a/tests/tt_metal/tt_metal/stl/test_strong_type.cpp +++ b/tests/tt_metal/tt_metal/stl/test_strong_type.cpp @@ -7,7 +7,7 @@ #include -#include "tt_metal/tt_stl/strong_type.hpp" +#include using MyIntId = tt::stl::StrongType; using MyStringId = tt::stl::StrongType; diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_add.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_add.cpp new file mode 100644 index 00000000000..783a205d7a4 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_add.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// #include + +// #include "dataflow_api.h" + +void kernel_main() { + uint32_t local_sem_addr = get_arg_val(0); + uint32_t src_bank_id = get_arg_val(1); + uint32_t dst_bank_id = get_arg_val(2); + uint32_t src_dram_addr = get_arg_val(3); + uint32_t dst_dram_addr = get_arg_val(4); + uint32_t num_tiles = get_arg_val(5); + uint32_t incr_core_x = get_arg_val(6); + uint32_t incr_core_y = get_arg_val(7); + uint32_t add_val = get_arg_val(8); + + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // index=0 + uint32_t tile_size_bytes = get_tile_size(cb_id_in0) * num_tiles; + uint32_t l1_write_addr = get_write_ptr(cb_id_in0); + + uint64_t src_dram_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_dram_addr); + uint64_t dst_dram_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_dram_addr); + + volatile tt_l1_ptr uint32_t* local_sem = reinterpret_cast(local_sem_addr); + noc_semaphore_wait(local_sem, 1); + uint64_t noc_local_sem_addr = get_noc_addr(local_sem_addr); + noc_semaphore_inc(noc_local_sem_addr, -1); + noc_async_atomic_barrier(); + noc_async_read(src_dram_noc_addr, l1_write_addr, tile_size_bytes); + noc_async_read_barrier(); + uint32_t* data_addr = (uint32_t*)l1_write_addr; + for (uint32_t i = 0; i < tile_size_bytes / sizeof(uint32_t); i++) { + *(data_addr + i) = *(data_addr + i) + add_val; + } + noc_async_write(l1_write_addr, dst_dram_noc_addr, tile_size_bytes); + noc_async_write_barrier(); + // Increment global sem on downstream core, if remote specified + if (incr_core_x && incr_core_y) { + uint64_t noc_remote_sem_addr = get_noc_addr(incr_core_x, incr_core_y, local_sem_addr); + noc_semaphore_inc(noc_remote_sem_addr, 1); + noc_async_atomic_barrier(); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_increment.cpp 
b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_increment.cpp index 5a1362cbe39..0cd66c75dc2 100644 --- a/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_increment.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/sync_and_increment.cpp @@ -7,7 +7,6 @@ #include "dataflow_api.h" void kernel_main() { - DPRINT << "start syncer" << ENDL(); uint32_t sem_addr = get_arg_val(0); uint32_t remote_x = get_arg_val(1); uint32_t remote_y = get_arg_val(2); diff --git a/tt_metal/api/tt-metalium/distributed.hpp b/tt_metal/api/tt-metalium/distributed.hpp index c1a1fa62fe5..31e02050724 100644 --- a/tt_metal/api/tt-metalium/distributed.hpp +++ b/tt_metal/api/tt-metalium/distributed.hpp @@ -95,6 +95,14 @@ void EnqueueWaitForEvent(MeshCommandQueue& mesh_cq, const std::shared_ptr& event); +MeshTraceId BeginTraceCapture(MeshDevice* device, uint8_t cq_id); + +void EndTraceCapture(MeshDevice* device, uint8_t cq_id, const MeshTraceId& trace_id); + +void ReplayTrace(MeshDevice* device, uint8_t cq_id, const MeshTraceId& trace_id, bool blocking); + +void ReleaseTrace(MeshDevice* device, const MeshTraceId& trace_id); + void Finish(MeshCommandQueue& mesh_cq, tt::stl::Span sub_device_ids = {}); } // namespace distributed diff --git a/tt_metal/api/tt-metalium/mesh_buffer.hpp b/tt_metal/api/tt-metalium/mesh_buffer.hpp index 6ae394538ef..de14271da85 100644 --- a/tt_metal/api/tt-metalium/mesh_buffer.hpp +++ b/tt_metal/api/tt-metalium/mesh_buffer.hpp @@ -100,6 +100,8 @@ class MeshBuffer { uint32_t datum_size_bytes() const; Shape2D physical_shard_shape() const; std::pair replicated_dims() const; + uint32_t page_size() const { return device_local_config_.page_size; } + uint32_t num_pages() const { return page_size() == 0 ? 0 : device_local_size_ / page_size(); } private: // Creates an owning `MeshBuffer`, backed by an allocation made through `backing_buffer`. diff --git a/tt_metal/api/tt-metalium/mesh_command_queue.hpp b/tt_metal/api/tt-metalium/mesh_command_queue.hpp index aa3cbf3b414..386b5418aa4 100644 --- a/tt_metal/api/tt-metalium/mesh_command_queue.hpp +++ b/tt_metal/api/tt-metalium/mesh_command_queue.hpp @@ -12,6 +12,7 @@ #include "mesh_buffer.hpp" #include "mesh_device.hpp" #include "mesh_workload.hpp" +#include "mesh_trace.hpp" namespace tt::tt_metal::distributed { @@ -49,8 +50,61 @@ class MeshCommandQueue { tt::stl::Span sub_device_ids, bool notify_host, const std::optional& device_range = std::nullopt); + // Trace capture utility functions + // Captures dispatch commands associated with running a program on a Virtual Mesh subgrid + // inside the appropriate trace staging vector (corresponding to the specified subgrid) + void capture_program_trace_on_subgrid( + const LogicalDeviceRange& sub_grid, + ProgramCommandSequence& program_cmd_seq, + bool stall_first, + bool stall_before_program); + // For a given MeshWorkload, a subgrid is unused if no programs are run on it. Go signals + // must be sent to this subgrid, to ensure consistent global state across the Virtual Mesh. + // When running trace, the dispatch commands responsible for forwarding go signals must be + // captured on these subgrids. 
+ void capture_go_signal_trace_on_unused_subgrids( + std::vector& active_sub_grids, + const SubDeviceId& sub_device_id, + uint32_t expected_num_workers_completed, + bool mcast_go_signals, + bool unicast_go_signals); + // Workload dispatch utility functions + // Write dispatch commands associated with running a program on a Virtual Mesh subgrid + void write_program_cmds_to_subgrid( + const LogicalDeviceRange& sub_grid, + ProgramCommandSequence& program_cmd_seq, + bool stall_first, + bool stall_before_program, + std::unordered_set& chip_ids_in_workload); + // For a given MeshWorkload, a subgrid is unused if no programs are run on it. Go signals + // must be sent to this subgrid, to ensure consistent global state across the Virtual Mesh. + // This function generates and writes dispatch commands forwarding go signals to these subgrids. + void write_go_signal_to_unused_sub_grids( + std::unordered_set& chip_ids_in_workload, + const SubDeviceId& sub_device_id, + uint32_t expected_num_workers_completed, + bool mcast_go_signals, + bool unicast_go_signals); + // Access a reference system memory manager, which acts as a global host side state manager for + // specific MeshCommandQueue attributes (launch_message_buffer_state, event counter, etc.) + // TODO: All Mesh level host state managed by this class should be moved out, since it's not + // tied to system memory anyway. + SystemMemoryManager& reference_sysmem_manager(); + std::array config_buffer_mgr_; std::array expected_num_workers_completed_; + + std::array + worker_launch_message_buffer_state_reset_; + std::array expected_num_workers_completed_reset_; + std::array + config_buffer_mgr_reset_; + // The following data structures are only populated when the MeshCQ is being used to trace workloads + // i.e. between record_begin() and record_end() being called + std::optional trace_id_; + std::shared_ptr trace_ctx_; + std::vector ordered_mesh_trace_md_; + MeshDevice* mesh_device_ = nullptr; uint32_t id_ = 0; CoreCoord dispatch_core_; @@ -73,7 +127,11 @@ class MeshCommandQueue { // MeshBuffer Write APIs void enqueue_write_shard_to_sub_grid( - const MeshBuffer& buffer, const void* host_data, const LogicalDeviceRange& device_range, bool blocking); + const MeshBuffer& buffer, + const void* host_data, + const LogicalDeviceRange& device_range, + bool blocking, + std::optional region = std::nullopt); void enqueue_write_mesh_buffer(const std::shared_ptr& buffer, const void* host_data, bool blocking); void enqueue_write_shards( const std::shared_ptr& mesh_buffer, @@ -103,6 +161,10 @@ class MeshCommandQueue { bool reset_launch_msg_state, uint32_t num_sub_devices, const vector_memcpy_aligned& go_signal_noc_data); + void record_begin(const MeshTraceId& trace_id, const std::shared_ptr& ctx); + void record_end(); + const std::vector& get_mesh_trace_md(); + void enqueue_trace(const MeshTraceId& trace_id, bool blocking); }; } // namespace tt::tt_metal::distributed diff --git a/tt_metal/api/tt-metalium/mesh_common.hpp b/tt_metal/api/tt-metalium/mesh_common.hpp new file mode 100644 index 00000000000..c83e832f44b --- /dev/null +++ b/tt_metal/api/tt-metalium/mesh_common.hpp @@ -0,0 +1,23 @@ + +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +// Define common types used across TT-Mesh data-structures and APIs + +using MeshTraceId = tt::stl::StrongType; + +// TODO (Issue #17477): MeshWorkload and MeshEvent currently rely on the coordinate systems +// exposed below. 
These must be uplifted to an ND coordinate system (DeviceCoord and DeviceRange), +// keeping things more consistent across the stack. +// For now, since the LogicalDeviceRange concept is fundamentally identical to the CoreRange concept +// on a 2D Mesh use this definition. CoreRange contains several utility functions required +// in the MeshWorkload context. + +using DeviceCoord = CoreCoord; +using LogicalDeviceRange = CoreRange; diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index a2fe85910da..81b1310d527 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -11,6 +11,7 @@ #include "device.hpp" +#include "mesh_common.hpp" #include "mesh_config.hpp" #include "mesh_coord.hpp" #include "mesh_device_view.hpp" @@ -26,6 +27,7 @@ namespace distributed { class MeshCommandQueue; class MeshDeviceView; class MeshSubDeviceManagerId; +class MeshTraceBuffer; class MeshDevice : public IDevice, public std::enable_shared_from_this { private: @@ -62,7 +64,8 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this parent_mesh_; // Submesh created with reference to parent mesh std::vector> mesh_command_queues_; std::unique_ptr sub_device_manager_tracker_; - + std::unordered_map> trace_buffer_pool_; + uint32_t trace_buffers_size_ = 0; // This is a reference device used to query properties that are the same for all devices in the mesh. IDevice* reference_device() const; @@ -144,8 +147,16 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this get_trace(uint32_t tid) override; + + // MeshTrace Internal APIs - these should be used to deprecate the single device backed trace APIs + void begin_mesh_trace(uint8_t cq_id, const MeshTraceId& trace_id); + void end_mesh_trace(uint8_t cq_id, const MeshTraceId& trace_id); + void release_mesh_trace(const MeshTraceId& trace_id); + std::shared_ptr get_mesh_trace(const MeshTraceId& trace_id); + std::shared_ptr& create_mesh_trace(const MeshTraceId& trace_id); uint32_t get_trace_buffers_size() const override; void set_trace_buffers_size(uint32_t size) override; + // Light Metal void load_trace(uint8_t cq_id, uint32_t trace_id, const TraceDescriptor& trace_desc) override; diff --git a/tt_metal/api/tt-metalium/mesh_device_view.hpp b/tt_metal/api/tt-metalium/mesh_device_view.hpp index 99ed59b3607..afe2b49fb05 100644 --- a/tt_metal/api/tt-metalium/mesh_device_view.hpp +++ b/tt_metal/api/tt-metalium/mesh_device_view.hpp @@ -21,16 +21,6 @@ namespace tt::tt_metal::distributed { // Forward declaration of MeshDevice class MeshDevice; -// TODO (Issue #17477): MeshWorkload and MeshEvent currently rely on the coordinate systems -// exposed below. These must be uplifted to an ND coordinate system (DeviceCoord and DeviceRange), -// keeping things more consistent across the stack. -// For now, since the LogicalDeviceRange concept is fundamentally identical to the CoreRange concept -// on a 2D Mesh use this definition. CoreRange contains several utility functions required -// in the MeshWorkload context. - -using DeviceCoord = CoreCoord; -using LogicalDeviceRange = CoreRange; - /** * @brief The MeshDeviceView class provides a view of a specific sub-region within the MeshDevice. * diff --git a/tt_metal/api/tt-metalium/mesh_trace.hpp b/tt_metal/api/tt-metalium/mesh_trace.hpp new file mode 100644 index 00000000000..3d242248d45 --- /dev/null +++ b/tt_metal/api/tt-metalium/mesh_trace.hpp @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "mesh_buffer.hpp" +#include "trace_buffer.hpp" +#include "mesh_common.hpp" + +namespace tt::tt_metal::distributed { + +// MeshTrace capture consists of 3 steps: +// 1. Staging: Workload dispatch commands are recorded inside a host data structure +// and the MeshTraceStagingMetadata holds information for where the trace data/commands +// have been stored. The commands are not ready to be committed to device DRAM in this +// form, hence they are temporarily staged and will be processed downstream. +// 2. Assembly: Create a MeshTrace from the staged commands by moving all dispatch +// commands out of the staging structure, and consolidate them into a single MeshTrace +// that can be written out to DRAM. +// 3. Commit to Mesh: Write assembled trace to DRAM buffer. + +// Data structure containing MeshTrace staging information +// For each MeshWorkload in the trace, this contains: +// - The device_range each program in the MeshWorkload runs on +// - The sysmem_manager coordinate the associated dispatch commands are stored in +// - The offset and size of the dispatch commands in the sysmem_manager +// staging vector +struct MeshTraceStagingMetadata { + LogicalDeviceRange device_range = LogicalDeviceRange({0, 0}); + DeviceCoord sysmem_manager_coord = DeviceCoord(0, 0); + std::size_t offset = 0; + std::size_t size = 0; +}; + +// Finalized/Consolidated dispatch commands on a device_range, corresponding +// to a trace +struct MeshTraceData { + LogicalDeviceRange device_range = LogicalDeviceRange({0, 0}); + std::vector data = {}; +}; + +// Wrapper around the MeshTraceData. Captures the complete state of a MeshTrace +// (including the dispatch commands across devices, the SubDevices the trace runs on +// the size of the trace and the number of workers in the trace) on host +class MeshTraceDescriptor { +public: + // Mapping of sub_device_id to descriptor + std::unordered_map descriptors; + // Store the keys of the map in a vector after descriptor has finished being populated + // This is an optimization since we sometimes need to only pass the keys in a container + std::vector sub_device_ids; + // Trace data per logical Device in a Mesh. + std::vector ordered_trace_data; + uint32_t total_trace_size = 0; + // Once the trace is captured/staged inside the sysmem_managers on a MeshDevice, assemble all + // dispatch commands related to the MeshTrace + void assemble_dispatch_commands(MeshDevice* device, const std::vector& mesh_trace_md); +}; + +// Ties a MeshTraceDescriptor (host side state) to a MeshBuffer (device side state) +struct MeshTraceBuffer { + // The trace descriptor associated with a MeshTrace + std::shared_ptr desc; + // The MeshBuffer this trace will be serialized to, before being run on a + // MeshDevice + std::shared_ptr mesh_buffer; +}; + +// Top level class - Manages MeshTrace +class MeshTrace { +public: + // Get global (unique) ID for trace + static MeshTraceId next_id(); + // Create an empty MeshTraceBuffer, which needs to be populated + // with a MeshTraceDescriptor and a MeshBuffer, to get tied to a MeshDevice. + static std::shared_ptr create_empty_mesh_trace_buffer(); + // Once the Trace Data per logical device has been captured in the + // MeshTraceDescriptor corresponding to this MeshTraceBuffer, + // it can be binarized to a MeshDevice through a Command Queue. 
+ static void populate_mesh_buffer(MeshCommandQueue& mesh_cq, std::shared_ptr& trace_buffer); +}; + +} // namespace tt::tt_metal::distributed diff --git a/tt_metal/tt_stl/strong_type.hpp b/tt_metal/api/tt-metalium/strong_type.hpp similarity index 100% rename from tt_metal/tt_stl/strong_type.hpp rename to tt_metal/api/tt-metalium/strong_type.hpp diff --git a/tt_metal/api/tt-metalium/trace_buffer.hpp b/tt_metal/api/tt-metalium/trace_buffer.hpp index e304b2813e4..fb7667fc282 100644 --- a/tt_metal/api/tt-metalium/trace_buffer.hpp +++ b/tt_metal/api/tt-metalium/trace_buffer.hpp @@ -20,14 +20,15 @@ inline namespace v0 { class Buffer; } +struct TraceWorkerDescriptor { + uint32_t num_completion_worker_cores = 0; + uint32_t num_traced_programs_needing_go_signal_multicast = 0; + uint32_t num_traced_programs_needing_go_signal_unicast = 0; +}; + struct TraceDescriptor { - struct Descriptor { - uint32_t num_completion_worker_cores = 0; - uint32_t num_traced_programs_needing_go_signal_multicast = 0; - uint32_t num_traced_programs_needing_go_signal_unicast = 0; - }; // Mapping of sub_device_id to descriptor - std::unordered_map descriptors; + std::unordered_map descriptors; // Store the keys of the map in a vector after descriptor has finished being populated // This is an optimization since we sometimes need to only pass the keys in a container std::vector sub_device_ids; diff --git a/tt_metal/distributed/CMakeLists.txt b/tt_metal/distributed/CMakeLists.txt index ba9dbb1a442..3879a1648eb 100644 --- a/tt_metal/distributed/CMakeLists.txt +++ b/tt_metal/distributed/CMakeLists.txt @@ -8,6 +8,7 @@ set(DISTRIBUTED_SRC ${CMAKE_CURRENT_SOURCE_DIR}/mesh_workload_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mesh_command_queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mesh_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mesh_trace.cpp ) add_library(distributed OBJECT ${DISTRIBUTED_SRC}) diff --git a/tt_metal/distributed/distributed.cpp b/tt_metal/distributed/distributed.cpp index b92546832a1..8d067316db1 100644 --- a/tt_metal/distributed/distributed.cpp +++ b/tt_metal/distributed/distributed.cpp @@ -46,6 +46,22 @@ void EventSynchronize(const std::shared_ptr& event) { mesh_cq.verify_reported_events_after_draining(event); } +MeshTraceId BeginTraceCapture(MeshDevice* device, uint8_t cq_id) { + auto trace_id = MeshTrace::next_id(); + device->begin_mesh_trace(cq_id, trace_id); + return trace_id; +} + +void EndTraceCapture(MeshDevice* device, uint8_t cq_id, const MeshTraceId& trace_id) { + device->end_mesh_trace(cq_id, trace_id); +} + +void ReplayTrace(MeshDevice* device, uint8_t cq_id, const MeshTraceId& trace_id, bool blocking) { + device->mesh_command_queue(cq_id).enqueue_trace(trace_id, blocking); +} + +void ReleaseTrace(MeshDevice* device, const MeshTraceId& trace_id) { device->release_mesh_trace(trace_id); } + void Finish(MeshCommandQueue& mesh_cq, tt::stl::Span sub_device_ids) { mesh_cq.finish(sub_device_ids); } diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index 415e5418210..1a8d6a90766 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -12,9 +12,11 @@ #include "tt_metal/distributed/mesh_workload_utils.hpp" #include "tt_metal/impl/buffers/dispatch.hpp" #include "tt_metal/impl/program/dispatch.hpp" +#include "tt_metal/impl/trace/dispatch.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include "tt_cluster.hpp" + namespace tt::tt_metal::distributed { struct MeshReadEventDescriptor { @@ -69,7 +71,7 @@ void 
MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b auto sub_device_id = *(sub_device_ids.begin()); auto sub_device_index = sub_device_id.to_index(); auto mesh_device_id = this->mesh_device_->id(); - auto& sysmem_manager = mesh_device_->get_device(0, 0)->sysmem_manager(); + auto& sysmem_manager = this->reference_sysmem_manager(); auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); CoreType dispatch_core_type = dispatch_core_config.get_core_type(); @@ -91,29 +93,32 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b } program_dispatch::ProgramDispatchMetadata dispatch_metadata; + uint32_t expected_num_workers_completed = sysmem_manager.get_bypass_mode() + ? trace_ctx_->descriptors[sub_device_id].num_completion_worker_cores + : expected_num_workers_completed_[sub_device_index]; // Reserve space in the L1 Kernel Config Ring Buffer for this workload. program_dispatch::reserve_space_in_kernel_config_buffer( this->get_config_buffer_mgr(sub_device_index), mesh_workload.get_program_config_sizes(), mesh_workload.get_program_binary_status(mesh_device_id), num_workers, - expected_num_workers_completed_[sub_device_index], + expected_num_workers_completed, dispatch_metadata); std::unordered_set chip_ids_in_workload = {}; + std::vector active_sub_grids = {}; // Iterate over all programs. Update dispatch commands per program to reflect // current device state. Write the finalized program command sequence to each // physical device tied to the program. for (const auto& device_range : mesh_workload.get_logical_device_ranges()) { auto& program = mesh_workload.get_program_on_device_range(device_range); auto& program_cmd_seq = mesh_workload.get_dispatch_cmds_for_program(program); - program_dispatch::update_program_dispatch_commands( program, program_cmd_seq, sysmem_manager.get_worker_launch_message_buffer_state()[sub_device_index].get_mcast_wptr(), sysmem_manager.get_worker_launch_message_buffer_state()[sub_device_index].get_unicast_wptr(), - expected_num_workers_completed_[sub_device_index], + expected_num_workers_completed, this->virtual_program_dispatch_core(), dispatch_core_type, sub_device_id, @@ -123,35 +128,26 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b unicast_go_signals, mesh_device_->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, sub_device_id))); - for (std::size_t logical_x = device_range.start_coord.x; logical_x < device_range.end_coord.x + 1; - logical_x++) { - for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y + 1; - logical_y++) { - program_dispatch::write_program_command_sequence( - program_cmd_seq, - this->mesh_device_->get_device(logical_y, logical_x)->sysmem_manager(), - id_, - dispatch_core_type, - dispatch_metadata.stall_first, - dispatch_metadata.stall_before_program); - chip_ids_in_workload.insert(this->mesh_device_->get_device(logical_y, logical_x)->id()); - } + if (sysmem_manager.get_bypass_mode()) { + this->capture_program_trace_on_subgrid( + device_range, program_cmd_seq, dispatch_metadata.stall_first, dispatch_metadata.stall_before_program); + active_sub_grids.push_back(device_range); + } else { + this->write_program_cmds_to_subgrid( + device_range, + program_cmd_seq, + dispatch_metadata.stall_first, + dispatch_metadata.stall_before_program, + chip_ids_in_workload); } } // Send go signals to devices not running a program to ensure consistent global state - for (auto& device : 
this->mesh_device_->get_devices()) { - if (chip_ids_in_workload.find(device->id()) == chip_ids_in_workload.end()) { - write_go_signal( - id_, - device, - sub_device_id, - device->sysmem_manager(), - expected_num_workers_completed_[sub_device_index], - this->virtual_program_dispatch_core(), - mcast_go_signals, - unicast_go_signals, - mesh_device_->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, sub_device_id)); - } + if (not sysmem_manager.get_bypass_mode()) { + this->write_go_signal_to_unused_sub_grids( + chip_ids_in_workload, sub_device_id, expected_num_workers_completed, mcast_go_signals, unicast_go_signals); + } else { + this->capture_go_signal_trace_on_unused_subgrids( + active_sub_grids, sub_device_id, expected_num_workers_completed, mcast_go_signals, unicast_go_signals); } // Increment Launch Message Buffer Write Pointers if (mcast_go_signals) { @@ -160,8 +156,19 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b if (unicast_go_signals) { sysmem_manager.get_worker_launch_message_buffer_state()[sub_device_index].inc_unicast_wptr(1); } - // Update the expected number of workers dispatch must wait on - expected_num_workers_completed_[sub_device_index] += num_workers; + + if (sysmem_manager.get_bypass_mode()) { + if (mcast_go_signals) { + // The workload contains programs that required a go signal mcast. Capture this here + // to accurately update the launch msg ring buffer state post trace execution on all + // mcast cores. + trace_ctx_->descriptors[sub_device_id].num_traced_programs_needing_go_signal_multicast++; + } + // Update the expected number of workers dispatch must wait on + trace_ctx_->descriptors[sub_device_id].num_completion_worker_cores += num_workers; + } else { + expected_num_workers_completed_[sub_device_index] += num_workers; + } // From the dispatcher's perspective, binaries are now committed to DRAM mesh_workload.set_program_binary_status(mesh_device_id, ProgramBinaryStatus::Committed); mesh_workload.set_last_used_command_queue_for_testing(this); @@ -367,15 +374,19 @@ void MeshCommandQueue::read_sharded_buffer(MeshBuffer& buffer, void* dst) { } void MeshCommandQueue::enqueue_write_shard_to_sub_grid( - const MeshBuffer& buffer, const void* host_data, const LogicalDeviceRange& device_range, bool blocking) { + const MeshBuffer& buffer, + const void* host_data, + const LogicalDeviceRange& device_range, + bool blocking, + std::optional region) { if (buffer.global_layout() == MeshBufferLayout::REPLICATED) { for (std::size_t logical_x = device_range.start_coord.x; logical_x < device_range.end_coord.x + 1; logical_x++) { for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y + 1; logical_y++) { auto device_shard_view = buffer.get_device_buffer(MeshCoordinate(logical_y, logical_x)); - const BufferRegion region(0, device_shard_view->size()); - this->write_shard_to_device(device_shard_view, host_data, region); + const BufferRegion buffer_region = region.value_or(BufferRegion(0, device_shard_view->size())); + this->write_shard_to_device(device_shard_view, host_data, buffer_region); } } } else { @@ -438,7 +449,7 @@ void MeshCommandQueue::enqueue_record_event_helper( tt::stl::Span sub_device_ids, bool notify_host, const std::optional& device_range) { - auto& sysmem_manager = mesh_device_->get_device(0, 0)->sysmem_manager(); + auto& sysmem_manager = this->reference_sysmem_manager(); event->cq_id = id_; event->event_id = sysmem_manager.get_next_event(id_); event->device = mesh_device_; @@ -510,6 +521,7 @@ 
void MeshCommandQueue::drain_events_from_completion_queue() { uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); bool exit_condition = false; device->sysmem_manager().completion_queue_wait_front(id_, exit_condition); + event_dispatch::read_events_from_completion_queue( mesh_read_descriptor->single_device_descriptor, mmio_device_id, @@ -554,9 +566,188 @@ void MeshCommandQueue::reset_worker_state( program_dispatch::reset_config_buf_mgrs_and_expected_workers( config_buffer_mgr_, expected_num_workers_completed_, mesh_device_->num_sub_devices()); if (reset_launch_msg_state) { - auto& sysmem_manager = mesh_device_->get_device(0, 0)->sysmem_manager(); + auto& sysmem_manager = this->reference_sysmem_manager(); sysmem_manager.reset_worker_launch_message_buffer_state(num_sub_devices); } } +void MeshCommandQueue::write_program_cmds_to_subgrid( + const LogicalDeviceRange& sub_grid, + ProgramCommandSequence& program_cmd_seq, + bool stall_first, + bool stall_before_program, + std::unordered_set& chip_ids_in_workload) { + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + CoreType dispatch_core_type = dispatch_core_config.get_core_type(); + + for (std::size_t logical_x = sub_grid.start_coord.x; logical_x < sub_grid.end_coord.x + 1; logical_x++) { + for (std::size_t logical_y = sub_grid.start_coord.y; logical_y < sub_grid.end_coord.y + 1; logical_y++) { + program_dispatch::write_program_command_sequence( + program_cmd_seq, + this->mesh_device_->get_device(logical_y, logical_x)->sysmem_manager(), + id_, + dispatch_core_type, + stall_first, + stall_before_program); + chip_ids_in_workload.insert(this->mesh_device_->get_device(logical_y, logical_x)->id()); + } + } +} + +void MeshCommandQueue::write_go_signal_to_unused_sub_grids( + std::unordered_set& chip_ids_in_workload, + const SubDeviceId& sub_device_id, + uint32_t expected_num_workers_completed, + bool mcast_go_signals, + bool unicast_go_signals) { + for (auto& device : this->mesh_device_->get_devices()) { + if (chip_ids_in_workload.find(device->id()) == chip_ids_in_workload.end()) { + write_go_signal( + id_, + mesh_device_, + sub_device_id, + device->sysmem_manager(), + expected_num_workers_completed, + this->virtual_program_dispatch_core(), + mcast_go_signals, + unicast_go_signals, + mesh_device_->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, sub_device_id)); + } + } +} + +void MeshCommandQueue::capture_program_trace_on_subgrid( + const LogicalDeviceRange& sub_grid, + ProgramCommandSequence& program_cmd_seq, + bool stall_first, + bool stall_before_program) { + auto start_coord = sub_grid.start_coord; + auto& sysmem_manager_for_trace = mesh_device_->get_device(start_coord.y, start_coord.x)->sysmem_manager(); + uint32_t sysmem_manager_offset = sysmem_manager_for_trace.get_issue_queue_write_ptr(id_); + + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + CoreType dispatch_core_type = dispatch_core_config.get_core_type(); + + program_dispatch::write_program_command_sequence( + program_cmd_seq, sysmem_manager_for_trace, id_, dispatch_core_type, stall_first, stall_before_program); + auto mesh_trace_md = MeshTraceStagingMetadata{ + sub_grid, + start_coord, + sysmem_manager_offset, + sysmem_manager_for_trace.get_issue_queue_write_ptr(id_) - sysmem_manager_offset}; + ordered_mesh_trace_md_.push_back(mesh_trace_md); +} + +void MeshCommandQueue::capture_go_signal_trace_on_unused_subgrids( + std::vector& active_sub_grids, + const 
SubDeviceId& sub_device_id, + uint32_t expected_num_workers_completed, + bool mcast_go_signals, + bool unicast_go_signals) { + CoreRangeSet active_ranges = active_sub_grids[0]; + for (int i = 1; i < active_sub_grids.size(); i++) { + active_ranges = active_ranges.merge(active_sub_grids[i]); + } + TT_FATAL(active_ranges.size() == 1, "Cannot support non convex grids"); + CoreRange active_grid = active_ranges.bounding_box(); + CoreRange full_grid = CoreRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + if (active_grid != full_grid) { + CoreRange unused_grid = convex_relative_complement(full_grid, active_grid); + + auto start_coord = unused_grid.start_coord; + auto& sysmem_manager_for_trace = mesh_device_->get_device(start_coord.y, start_coord.x)->sysmem_manager(); + uint32_t sysmem_manager_offset = sysmem_manager_for_trace.get_issue_queue_write_ptr(id_); + write_go_signal( + id_, + mesh_device_, + sub_device_id, + sysmem_manager_for_trace, + expected_num_workers_completed, + this->virtual_program_dispatch_core(), + mcast_go_signals, + unicast_go_signals, + mesh_device_->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, sub_device_id)); + auto mesh_trace_md = MeshTraceStagingMetadata{ + unused_grid, + start_coord, + sysmem_manager_offset, + sysmem_manager_for_trace.get_issue_queue_write_ptr(id_) - sysmem_manager_offset}; + ordered_mesh_trace_md_.push_back(mesh_trace_md); + } +} + +void MeshCommandQueue::enqueue_trace(const MeshTraceId& trace_id, bool blocking) { + auto trace_inst = mesh_device_->get_mesh_trace(trace_id); + auto descriptor = trace_inst->desc; + auto buffer = trace_inst->mesh_buffer; + uint32_t num_sub_devices = descriptor->sub_device_ids.size(); + + auto cmd_sequence_sizeB = trace_dispatch::compute_trace_cmd_size(num_sub_devices); + + trace_dispatch::TraceDispatchMetadata dispatch_md( + cmd_sequence_sizeB, + descriptor->descriptors, + descriptor->sub_device_ids, + buffer->page_size(), + buffer->num_pages(), + buffer->address()); + + for (auto device : mesh_device_->get_devices()) { + trace_dispatch::issue_trace_commands( + mesh_device_, device->sysmem_manager(), dispatch_md, id_, expected_num_workers_completed_, dispatch_core_); + } + trace_dispatch::update_worker_state_post_trace_execution( + trace_inst->desc->descriptors, + this->reference_sysmem_manager(), + config_buffer_mgr_, + expected_num_workers_completed_); + + if (blocking) { + this->finish(); + } +} + +void MeshCommandQueue::record_begin(const MeshTraceId& trace_id, const std::shared_ptr& ctx) { + trace_dispatch::reset_host_dispatch_state_for_trace( + mesh_device_->num_sub_devices(), + this->reference_sysmem_manager(), + expected_num_workers_completed_, + config_buffer_mgr_, + worker_launch_message_buffer_state_reset_, + expected_num_workers_completed_reset_, + config_buffer_mgr_reset_); + + trace_id_ = trace_id; + trace_ctx_ = ctx; + for (auto device : mesh_device_->get_devices()) { + device->sysmem_manager().set_bypass_mode(/*enable*/ true, /*clear*/ true); + } +} + +void MeshCommandQueue::record_end() { + trace_ctx_->assemble_dispatch_commands(this->device(), this->get_mesh_trace_md()); + trace_id_ = std::nullopt; + trace_ctx_ = nullptr; + + trace_dispatch::load_host_dispatch_state( + mesh_device_->num_sub_devices(), + this->reference_sysmem_manager(), + expected_num_workers_completed_, + config_buffer_mgr_, + worker_launch_message_buffer_state_reset_, + expected_num_workers_completed_reset_, + config_buffer_mgr_reset_); + + ordered_mesh_trace_md_.clear(); + for (auto device : 
mesh_device_->get_devices()) { + device->sysmem_manager().set_bypass_mode(/*enable*/ false, /*clear*/ true); + } +} + +const std::vector& MeshCommandQueue::get_mesh_trace_md() { return ordered_mesh_trace_md_; } + +SystemMemoryManager& MeshCommandQueue::reference_sysmem_manager() { + return mesh_device_->get_device(0, 0)->sysmem_manager(); +} + } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 7b90778d157..5c731e8bd30 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -584,18 +584,40 @@ void MeshDevice::release_trace(const uint32_t tid) { device->release_trace(tid); } } + +std::shared_ptr& MeshDevice::create_mesh_trace(const MeshTraceId& trace_id) { + auto [trace, emplaced] = trace_buffer_pool_.emplace(trace_id, MeshTrace::create_empty_mesh_trace_buffer()); + TT_FATAL(emplaced, "Trace buffer with tid {} already exists", *trace_id); + return trace->second; +} + +void MeshDevice::release_mesh_trace(const MeshTraceId& trace_id) { trace_buffer_pool_.erase(trace_id); } + +std::shared_ptr MeshDevice::get_mesh_trace(const MeshTraceId& trace_id) { + auto trace = trace_buffer_pool_.find(trace_id); + if (trace != trace_buffer_pool_.end()) { + return trace->second; + } + TT_THROW("Trace Instance with ID {} is not initialized", *trace_id); +} + +void MeshDevice::begin_mesh_trace(uint8_t cq_id, const MeshTraceId& trace_id) { + auto& mesh_trace_buffer = this->create_mesh_trace(trace_id); + mesh_command_queues_[cq_id]->record_begin(trace_id, mesh_trace_buffer->desc); +} + +void MeshDevice::end_mesh_trace(uint8_t cq_id, const MeshTraceId& trace_id) { + auto trace_buffer = this->get_mesh_trace(trace_id); + mesh_command_queues_[cq_id]->record_end(); + MeshTrace::populate_mesh_buffer(*(mesh_command_queues_[cq_id]), trace_buffer); +} + std::shared_ptr MeshDevice::get_trace(uint32_t tid) { TT_THROW("get_trace() is not supported on MeshDevice - use individual devices instead"); return reference_device()->get_trace(tid); } -uint32_t MeshDevice::get_trace_buffers_size() const { - TT_THROW("get_trace_buffers_size() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->get_trace_buffers_size(); -} -void MeshDevice::set_trace_buffers_size(uint32_t size) { - TT_THROW("set_trace_buffers_size() is not supported on MeshDevice - use individual devices instead"); - reference_device()->set_trace_buffers_size(size); -} +uint32_t MeshDevice::get_trace_buffers_size() const { return trace_buffers_size_; } +void MeshDevice::set_trace_buffers_size(uint32_t size) { trace_buffers_size_ = size; } // Light Metal void MeshDevice::load_trace(const uint8_t cq_id, const uint32_t trace_id, const TraceDescriptor& trace_desc) { diff --git a/tt_metal/distributed/mesh_trace.cpp b/tt_metal/distributed/mesh_trace.cpp new file mode 100644 index 00000000000..49cd6f1a779 --- /dev/null +++ b/tt_metal/distributed/mesh_trace.cpp @@ -0,0 +1,156 @@ + +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "tt_metal/distributed/mesh_workload_utils.hpp" +#include "tt_metal/impl/trace/dispatch.hpp" + +namespace tt::tt_metal::distributed { + +MeshTraceId MeshTrace::next_id() { + static std::atomic global_trace_id{0}; + return MeshTraceId(global_trace_id++); +} + +void MeshTraceDescriptor::assemble_dispatch_commands( + MeshDevice* mesh_device, const std::vector& mesh_trace_md) { + auto& trace_data = this->ordered_trace_data; + for (auto& trace_md : mesh_trace_md) { + auto& sysmem_mgr_coord = trace_md.sysmem_manager_coord; + auto& sysmem_manager = mesh_device->get_device(sysmem_mgr_coord.y, sysmem_mgr_coord.x)->sysmem_manager(); + auto trace_data_word_offset = trace_md.offset / sizeof(uint32_t); + auto trace_data_size_words = trace_md.size / sizeof(uint32_t); + auto& bypass_data = sysmem_manager.get_bypass_data(); + bool intersection_found = false; + + std::vector intermed_trace_data = {}; + std::vector program_cmds_vector( + std::make_move_iterator(bypass_data.begin() + trace_data_word_offset), + std::make_move_iterator(bypass_data.begin() + trace_data_word_offset + trace_data_size_words)); + std::vector device_ranges_to_invalidate = {}; + for (auto& program : trace_data) { + if (program.device_range.intersects(trace_md.device_range)) { + // The current program intersects with a program that was previously + // placed on the Mesh. + intersection_found = true; + auto intersection = program.device_range.intersection(trace_md.device_range).value(); + if (intersection == program.device_range) { + // Intersection matches the originally placed program. + program.data.insert( + program.data.end(), + std::make_move_iterator(program_cmds_vector.begin()), + std::make_move_iterator(program_cmds_vector.end())); + } else { + // Intersection is a subset of the originally placed program. + auto compliment_ = convex_relative_complement(program.device_range, intersection); + intermed_trace_data.push_back(MeshTraceData{compliment_, program.data}); + intermed_trace_data.push_back(MeshTraceData{intersection, program.data}); + auto& intersection_data = intermed_trace_data.back().data; + intersection_data.insert( + intersection_data.end(), + std::make_move_iterator(program_cmds_vector.begin()), + std::make_move_iterator(program_cmds_vector.end())); + device_ranges_to_invalidate.push_back(program.device_range); + } + } + } + if (intermed_trace_data.size()) { + // Invalidate programs with partial intersections with current programs. + for (auto& program : trace_data) { + if (std::find( + device_ranges_to_invalidate.begin(), device_ranges_to_invalidate.end(), program.device_range) == + device_ranges_to_invalidate.end()) { + intermed_trace_data.push_back(std::move(program)); + } + } + trace_data = intermed_trace_data; + } + if (not intersection_found) { + // Intersection not found, place program on Mesh. 
+ trace_data.push_back(MeshTraceData{trace_md.device_range, std::move(program_cmds_vector)}); + } + this->total_trace_size += trace_md.size; + } + auto bcast_device_range = LogicalDeviceRange({0, 0}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); + std::vector exec_buf_end = {}; + + DeviceCommand command_sequence(hal.get_alignment(HalMemType::HOST)); + command_sequence.add_prefetch_exec_buf_end(); + + for (int i = 0; i < command_sequence.size_bytes() / sizeof(uint32_t); i++) { + exec_buf_end.push_back(((uint32_t*)command_sequence.data())[i]); + } + + for (auto& program : trace_data) { + if (program.device_range.intersects(bcast_device_range)) { + program.data.insert(program.data.end(), exec_buf_end.begin(), exec_buf_end.end()); + } + } + this->total_trace_size += command_sequence.size_bytes(); + + this->sub_device_ids.reserve(this->descriptors.size()); + for (const auto& [id, _] : this->descriptors) { + this->sub_device_ids.push_back(id); + } +} + +std::shared_ptr MeshTrace::create_empty_mesh_trace_buffer() { + return std::make_shared(std::make_shared(), nullptr); +} + +void MeshTrace::populate_mesh_buffer(MeshCommandQueue& mesh_cq, std::shared_ptr& trace_buffer) { + auto mesh_device = mesh_cq.device(); + uint64_t unpadded_size = trace_buffer->desc->total_trace_size; + size_t page_size = trace_dispatch::compute_interleaved_trace_buf_page_size( + unpadded_size, mesh_cq.device()->allocator()->get_num_banks(BufferType::DRAM)); + size_t padded_size = round_up(unpadded_size, page_size); + + const auto current_trace_buffers_size = mesh_cq.device()->get_trace_buffers_size(); + mesh_cq.device()->set_trace_buffers_size(current_trace_buffers_size + padded_size); + auto trace_region_size = mesh_cq.device()->allocator()->get_config().trace_region_size; + TT_FATAL( + mesh_cq.device()->get_trace_buffers_size() <= trace_region_size, + "Creating trace buffers of size {}B on MeshDevice {}, but only {}B is allocated for trace region.", + mesh_cq.device()->get_trace_buffers_size(), + mesh_cq.device()->id(), + trace_region_size); + + DeviceLocalBufferConfig device_local_trace_buf_config = { + .page_size = page_size, + .buffer_type = BufferType::TRACE, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + }; + + ReplicatedBufferConfig global_trace_buf_config = { + .size = padded_size, + }; + + trace_buffer->mesh_buffer = + MeshBuffer::create(global_trace_buf_config, device_local_trace_buf_config, mesh_cq.device()); + + std::unordered_map write_offset_per_device_range = {}; + for (auto& mesh_trace_data : trace_buffer->desc->ordered_trace_data) { + auto& device_range = mesh_trace_data.device_range; + if (write_offset_per_device_range.find(device_range) == write_offset_per_device_range.end()) { + write_offset_per_device_range.insert({device_range, 0}); + } + std::vector write_data = mesh_trace_data.data; + auto unpadded_data_size = write_data.size() * sizeof(uint32_t); + auto padded_data_size = round_up(unpadded_data_size, page_size); + size_t numel_padding = (padded_data_size - unpadded_data_size) / sizeof(uint32_t); + if (numel_padding > 0) { + write_data.resize(write_data.size() + numel_padding, 0); + } + auto write_region = + BufferRegion(write_offset_per_device_range.at(device_range), write_data.size() * sizeof(uint32_t)); + mesh_cq.enqueue_write_shard_to_sub_grid( + *(trace_buffer->mesh_buffer), write_data.data(), device_range, true, write_region); + write_offset_per_device_range.at(device_range) += mesh_trace_data.data.size() * sizeof(uint32_t); + } +} + +} // namespace 
tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_workload_utils.cpp b/tt_metal/distributed/mesh_workload_utils.cpp index c51a99c957a..21be612bdb0 100644 --- a/tt_metal/distributed/mesh_workload_utils.cpp +++ b/tt_metal/distributed/mesh_workload_utils.cpp @@ -7,6 +7,7 @@ #include "tt_metal/impl/program/dispatch.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" +#include "tt_metal/distributed/mesh_workload_utils.hpp" namespace tt::tt_metal::distributed { @@ -29,19 +30,23 @@ void write_go_signal( void* cmd_region = sysmem_manager.issue_queue_reserve(cmd_sequence_sizeB, cq_id); + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + CoreType dispatch_core_type = dispatch_core_config.get_core_type(); + auto sub_device_index = sub_device_id.to_index(); + HugepageDeviceCommand go_signal_cmd_sequence(cmd_region, cmd_sequence_sizeB); go_msg_t run_program_go_signal; - run_program_go_signal.signal = RUN_MSG_GO; run_program_go_signal.master_x = dispatch_core.x; run_program_go_signal.master_y = dispatch_core.y; - run_program_go_signal.dispatch_message_offset = 0; + run_program_go_signal.dispatch_message_offset = + (uint8_t)DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(sub_device_index); - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - uint32_t dispatch_message_addr = DispatchMemMap::get(dispatch_core_type) - .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); + uint32_t dispatch_message_addr = + DispatchMemMap::get(dispatch_core_type) + .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE) + + DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(sub_device_index); - auto sub_device_index = sub_device_id.to_index(); // When running with dispatch_s enabled: // - dispatch_d must notify dispatch_s that a go signal can be sent // - dispatch_s then mcasts the go signal to all workers. @@ -49,11 +54,13 @@ void write_go_signal( // - dispatch_d handles sending the go signal to all workers // There is no need for dispatch_d to barrier before sending the dispatch_s notification or go signal, // since this go signal is not preceeded by NOC txns for program config data + DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER; if (DispatchQueryManager::instance().dispatch_s_enabled()) { uint16_t index_bitmask = 1 << sub_device_index; go_signal_cmd_sequence.add_notify_dispatch_s_go_signal_cmd( 0, /* wait */ index_bitmask /* index_bitmask */); // When running on sub devices, we must account for this + dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE; } go_signal_cmd_sequence.add_dispatch_go_signal_mcast( expected_num_workers_completed, @@ -62,7 +69,7 @@ void write_go_signal( send_mcast ? device->num_noc_mcast_txns(sub_device_id) : 0, send_unicasts ? ((num_unicast_txns > 0) ? 
num_unicast_txns : device->num_noc_unicast_txns(sub_device_id)) : 0, device->noc_data_start_index(sub_device_id, send_mcast, send_unicasts), /* noc_data_start_idx */ - DispatcherSelect::DISPATCH_SLAVE); + dispatcher_for_go_signal); sysmem_manager.issue_queue_push_back(cmd_sequence_sizeB, cq_id); @@ -70,4 +77,36 @@ void write_go_signal( sysmem_manager.fetch_queue_write(cmd_sequence_sizeB, cq_id); } +bool is_row_major_intersection(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { + return intersection.grid_size().x == parent.grid_size().x; +} + +LogicalDeviceRange convex_relative_complement( + const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { + TT_FATAL(parent.contains(intersection), "Parent must contain intersection"); + auto intersection_grid_size = intersection.grid_size(); + auto parent_grid_size = parent.grid_size(); + TT_FATAL( + intersection_grid_size.x == parent_grid_size.x || intersection_grid_size.y == parent_grid_size.y, + "Non convex grids not supported"); + + if (is_row_major_intersection(parent, intersection)) { + if (intersection.start_coord.y == parent.start_coord.y) { + return LogicalDeviceRange( + {parent.start_coord.x, intersection.end_coord.y + 1}, {parent.end_coord.x, parent.end_coord.y}); + } else { + return LogicalDeviceRange( + {parent.start_coord.x, parent.start_coord.y}, {parent.end_coord.x, intersection.start_coord.y - 1}); + } + } else { + if (intersection.start_coord.x == parent.start_coord.x) { + return LogicalDeviceRange( + {intersection.end_coord.x + 1, parent.start_coord.y}, {parent.end_coord.x, parent.end_coord.y}); + } else { + return LogicalDeviceRange( + {parent.start_coord.x, parent.start_coord.y}, {intersection.start_coord.x - 1, parent.end_coord.y}); + } + } +} + } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_workload_utils.hpp b/tt_metal/distributed/mesh_workload_utils.hpp index 1461aad13f8..c4fd759a5c6 100644 --- a/tt_metal/distributed/mesh_workload_utils.hpp +++ b/tt_metal/distributed/mesh_workload_utils.hpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include // Utility functions for dispatch MeshWorkloads // Used by MeshCommandQueue @@ -19,4 +20,6 @@ void write_go_signal( bool send_unicasts, int num_unicast_txns = -1); +LogicalDeviceRange convex_relative_complement(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection); + } // namespace tt::tt_metal::distributed diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index 7cd2d6bc3cf..db78ed6d2cb 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -48,6 +48,7 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/debug/watcher_device_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/trace/trace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/trace/trace_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/trace/dispatch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event/dispatch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/flatbuffer/base_types_from_flatbuffer.cpp diff --git a/tt_metal/impl/dispatch/hardware_command_queue.cpp b/tt_metal/impl/dispatch/hardware_command_queue.cpp index ebbcca6781d..c7223bb3b72 100644 --- a/tt_metal/impl/dispatch/hardware_command_queue.cpp +++ b/tt_metal/impl/dispatch/hardware_command_queue.cpp @@ -20,6 +20,7 @@ #include "tt_metal/impl/debug/watcher_server.hpp" #include "tt_metal/impl/program/dispatch.hpp" +#include "tt_metal/impl/trace/dispatch.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" namespace 
tt::tt_metal { @@ -405,38 +406,31 @@ void HWCommandQueue::enqueue_trace(const uint32_t trace_id, bool blocking) { ZoneScopedN("HWCommandQueue_enqueue_trace"); auto trace_inst = this->device_->get_trace(trace_id); - auto command = EnqueueTraceCommand( - this->id_, - this->device_, - this->manager, - trace_inst->desc, - *trace_inst->buffer, + auto descriptor = trace_inst->desc; + auto buffer = trace_inst->buffer; + uint32_t num_sub_devices = descriptor->sub_device_ids.size(); + + auto cmd_sequence_sizeB = trace_dispatch::compute_trace_cmd_size(num_sub_devices); + + trace_dispatch::TraceDispatchMetadata dispatch_md( + cmd_sequence_sizeB, + descriptor->descriptors, + descriptor->sub_device_ids, + buffer->page_size(), + buffer->num_pages(), + buffer->address()); + + trace_dispatch::issue_trace_commands( + device_, + device_->sysmem_manager(), + dispatch_md, + id_, this->expected_num_workers_completed, - this->noc_index_, - this->virtual_enqueue_program_dispatch_core_); + virtual_enqueue_program_dispatch_core_); - this->enqueue_command(command, false, {}); + trace_dispatch::update_worker_state_post_trace_execution( + trace_inst->desc->descriptors, this->manager, this->config_buffer_mgr, this->expected_num_workers_completed); - for (const auto& [id, desc] : trace_inst->desc->descriptors) { - auto index = id.to_index(); - // Increment the expected worker cores counter due to trace programs completion - this->expected_num_workers_completed[index] += desc.num_completion_worker_cores; - // After trace runs, the rdptr on each worker will be incremented by the number of programs in the trace - // Update the wptr on host to match state. If the trace doesn't execute on a - // class of worker (unicast or multicast), it doesn't reset or modify the - // state for those workers. - auto& worker_launch_message_buffer_state = this->manager.get_worker_launch_message_buffer_state()[index]; - if (desc.num_traced_programs_needing_go_signal_multicast) { - worker_launch_message_buffer_state.set_mcast_wptr(desc.num_traced_programs_needing_go_signal_multicast); - } - if (desc.num_traced_programs_needing_go_signal_unicast) { - worker_launch_message_buffer_state.set_unicast_wptr(desc.num_traced_programs_needing_go_signal_unicast); - } - // The config buffer manager is unaware of what memory is used inside the trace, so mark all memory as used so - // that it will force a stall and avoid stomping on in-use state. - // TODO(jbauman): Reuse old state from the trace. - this->config_buffer_mgr[index].mark_completely_full(this->expected_num_workers_completed[index]); - } if (blocking) { this->finish(trace_inst->desc->sub_device_ids); } @@ -534,68 +528,32 @@ const CoreCoord& HWCommandQueue::virtual_enqueue_program_dispatch_core() const { } void HWCommandQueue::record_begin(const uint32_t tid, const std::shared_ptr& ctx) { - auto num_sub_devices = this->device_->num_sub_devices(); - // Record the original value of expected_num_workers_completed, and reset it to 0. - std::copy( - this->expected_num_workers_completed.begin(), - this->expected_num_workers_completed.begin() + num_sub_devices, - this->expected_num_workers_completed_reset.begin()); - std::fill( - this->expected_num_workers_completed.begin(), - this->expected_num_workers_completed.begin() + num_sub_devices, - 0); + // Clear host dispatch state, since when trace runs we will reset the launch_msg_ring_buffer, + // worker_config_buffer, etc. 
+ trace_dispatch::reset_host_dispatch_state_for_trace( + device_->num_sub_devices(), + this->manager, + this->expected_num_workers_completed, + this->config_buffer_mgr, + this->worker_launch_message_buffer_state_reset, + this->expected_num_workers_completed_reset, + this->config_buffer_mgr_reset); + // Record commands using bypass mode this->tid_ = tid; this->trace_ctx = std::move(ctx); - // Record original value of launch msg buffer - auto& worker_launch_message_buffer_state = this->manager.get_worker_launch_message_buffer_state(); - std::copy( - worker_launch_message_buffer_state.begin(), - worker_launch_message_buffer_state.begin() + num_sub_devices, - this->worker_launch_message_buffer_state_reset.begin()); - for (uint32_t i = 0; i < num_sub_devices; ++i) { - // Set launch msg wptr to 0. Every time trace runs on device, it will ensure that the workers - // reset their rptr to be in sync with device. - worker_launch_message_buffer_state[i].reset(); - } - this->manager.set_bypass_mode(true, true); // start - // Record original value of config buffer manager - std::copy( - this->config_buffer_mgr.begin(), - this->config_buffer_mgr.begin() + num_sub_devices, - this->config_buffer_mgr_reset.begin()); - for (uint32_t i = 0; i < num_sub_devices; ++i) { - // Sync values in the trace need to match up with the counter starting at 0 again. - this->config_buffer_mgr[i].mark_completely_full(this->expected_num_workers_completed[i]); - } + this->manager.set_bypass_mode(true, true); // start trace capture } void HWCommandQueue::record_end() { auto& trace_data = this->trace_ctx->data; trace_data = std::move(this->manager.get_bypass_data()); - // Add command to terminate the trace buffer + // Add trace end command to terminate the trace buffer DeviceCommand command_sequence(hal.get_alignment(HalMemType::HOST)); command_sequence.add_prefetch_exec_buf_end(); for (int i = 0; i < command_sequence.size_bytes() / sizeof(uint32_t); i++) { trace_data.push_back(((uint32_t*)command_sequence.data())[i]); } - // Reset the expected workers, launch msg buffer state, and config buffer mgr to their original value, - // so device can run programs after a trace was captured. This is needed since trace capture modifies the state on - // host, even though device doesn't run any programs. - auto num_sub_devices = this->device_->num_sub_devices(); - std::copy( - this->expected_num_workers_completed_reset.begin(), - this->expected_num_workers_completed_reset.begin() + num_sub_devices, - this->expected_num_workers_completed.begin()); - std::copy( - this->worker_launch_message_buffer_state_reset.begin(), - this->worker_launch_message_buffer_state_reset.begin() + num_sub_devices, - this->manager.get_worker_launch_message_buffer_state().begin()); - std::copy( - this->config_buffer_mgr_reset.begin(), - this->config_buffer_mgr_reset.begin() + num_sub_devices, - this->config_buffer_mgr.begin()); - // Copy the desc keys into a separate vector. When enqueuing traces, we sometimes need to pass sub-device ids // separately this->trace_ctx->sub_device_ids.reserve(this->trace_ctx->descriptors.size()); @@ -605,7 +563,19 @@ void HWCommandQueue::record_end() { } this->tid_ = std::nullopt; this->trace_ctx = nullptr; - this->manager.set_bypass_mode(false, true); // stop + + // Reset the expected workers, launch msg buffer state, and config buffer mgr to their original value, + // so device can run programs after a trace was captured. 
This is needed since trace capture modifies the state on + // host, even though device doesn't run any programs. + trace_dispatch::load_host_dispatch_state( + device_->num_sub_devices(), + this->manager, + this->expected_num_workers_completed, + this->config_buffer_mgr, + this->worker_launch_message_buffer_state_reset, + this->expected_num_workers_completed_reset, + this->config_buffer_mgr_reset); + this->manager.set_bypass_mode(false, true); // stop trace capture } void HWCommandQueue::terminate() { diff --git a/tt_metal/impl/dispatch/host_runtime_commands.cpp b/tt_metal/impl/dispatch/host_runtime_commands.cpp index 368bc663199..7f03a8608fa 100644 --- a/tt_metal/impl/dispatch/host_runtime_commands.cpp +++ b/tt_metal/impl/dispatch/host_runtime_commands.cpp @@ -173,138 +173,6 @@ void EnqueueProgramCommand::process() { program.set_program_binary_status(device->id(), ProgramBinaryStatus::Committed); } -EnqueueTraceCommand::EnqueueTraceCommand( - uint32_t command_queue_id, - IDevice* device, - SystemMemoryManager& manager, - std::shared_ptr& descriptor, - Buffer& buffer, - std::array& expected_num_workers_completed, - NOC noc_index, - CoreCoord dispatch_core) : - command_queue_id(command_queue_id), - buffer(buffer), - device(device), - manager(manager), - descriptor(descriptor), - expected_num_workers_completed(expected_num_workers_completed), - clear_count(true), - noc_index(noc_index), - dispatch_core(dispatch_core) {} - -void EnqueueTraceCommand::process() { - uint32_t num_sub_devices = descriptor->descriptors.size(); - uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); - uint32_t go_signals_cmd_size = - align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment) * descriptor->descriptors.size(); - - uint32_t cmd_sequence_sizeB = - DispatchQueryManager::instance().dispatch_s_enabled() * - hal.get_alignment( - HalMemType::HOST) + // dispatch_d -> dispatch_s sem update (send only if dispatch_s is running) - go_signals_cmd_size + // go signal cmd - (hal.get_alignment( - HalMemType::HOST) + // wait to ensure that reset go signal was processed (dispatch_d) - // when dispatch_s and dispatch_d are running on 2 cores, workers update dispatch_s. - // dispatch_s is responsible for resetting worker count and giving dispatch_d the - // latest worker state. 
This is encapsulated in the dispatch_s wait command (only to - // be sent when dispatch is distributed on 2 cores) - (DispatchQueryManager::instance().distributed_dispatcher()) * hal.get_alignment(HalMemType::HOST)) * - num_sub_devices + - hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_EXEC_BUF - - void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); - - HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); - - DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER; - if (DispatchQueryManager::instance().dispatch_s_enabled()) { - uint16_t index_bitmask = 0; - for (const auto& id : descriptor->sub_device_ids) { - index_bitmask |= 1 << id.to_index(); - } - command_sequence.add_notify_dispatch_s_go_signal_cmd(false, index_bitmask); - dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE; - } - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - uint32_t dispatch_message_base_addr = - DispatchMemMap::get(dispatch_core_type) - .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); - go_msg_t reset_launch_message_read_ptr_go_signal; - reset_launch_message_read_ptr_go_signal.signal = RUN_MSG_RESET_READ_PTR; - reset_launch_message_read_ptr_go_signal.master_x = (uint8_t)this->dispatch_core.x; - reset_launch_message_read_ptr_go_signal.master_y = (uint8_t)this->dispatch_core.y; - for (const auto& [id, desc] : descriptor->descriptors) { - const auto& noc_data_start_idx = device->noc_data_start_index( - id, - desc.num_traced_programs_needing_go_signal_multicast, - desc.num_traced_programs_needing_go_signal_unicast); - const auto& num_noc_mcast_txns = - desc.num_traced_programs_needing_go_signal_multicast ? device->num_noc_mcast_txns(id) : 0; - const auto& num_noc_unicast_txns = - desc.num_traced_programs_needing_go_signal_unicast ? device->num_noc_unicast_txns(id) : 0; - reset_launch_message_read_ptr_go_signal.dispatch_message_offset = - (uint8_t)DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(id.to_index()); - uint32_t dispatch_message_addr = - dispatch_message_base_addr + - DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(id.to_index()); - auto index = id.to_index(); - // Wait to ensure that all kernels have completed. Then send the reset_rd_ptr go_signal. - command_sequence.add_dispatch_go_signal_mcast( - this->expected_num_workers_completed[index], - *reinterpret_cast(&reset_launch_message_read_ptr_go_signal), - dispatch_message_addr, - num_noc_mcast_txns, - num_noc_unicast_txns, - noc_data_start_idx, - dispatcher_for_go_signal); - if (desc.num_traced_programs_needing_go_signal_multicast) { - this->expected_num_workers_completed[index] += - device->num_worker_cores(HalProgrammableCoreType::TENSIX, id); - } - if (desc.num_traced_programs_needing_go_signal_unicast) { - this->expected_num_workers_completed[index] += - device->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, id); - } - } - // Wait to ensure that all workers have reset their read_ptr. dispatch_d will stall until all workers have completed - // this step, before sending kernel config data to workers or notifying dispatch_s that its safe to send the - // go_signal. Clear the dispatch <--> worker semaphore, since trace starts at 0. 
- for (const auto& id : descriptor->sub_device_ids) { - auto index = id.to_index(); - uint32_t dispatch_message_addr = - dispatch_message_base_addr + DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(index); - if (DispatchQueryManager::instance().distributed_dispatcher()) { - command_sequence.add_dispatch_wait( - false, - dispatch_message_addr, - this->expected_num_workers_completed[index], - this->clear_count, - false, - true, - 1); - } - command_sequence.add_dispatch_wait( - false, dispatch_message_addr, this->expected_num_workers_completed[index], this->clear_count); - if (this->clear_count) { - this->expected_num_workers_completed[index] = 0; - } - } - - uint32_t page_size = buffer.page_size(); - uint32_t page_size_log2 = __builtin_ctz(page_size); - TT_ASSERT((page_size & (page_size - 1)) == 0, "Page size must be a power of 2"); - - command_sequence.add_prefetch_exec_buf(buffer.address(), page_size_log2, buffer.num_pages()); - - this->manager.issue_queue_push_back(cmd_sequence_sizeB, this->command_queue_id); - - this->manager.fetch_queue_reserve_back(this->command_queue_id); - - const bool stall_prefetcher = true; - this->manager.fetch_queue_write(cmd_sequence_sizeB, this->command_queue_id, stall_prefetcher); -} - EnqueueTerminateCommand::EnqueueTerminateCommand( uint32_t command_queue_id, IDevice* device, SystemMemoryManager& manager) : command_queue_id(command_queue_id), device(device), manager(manager) {} diff --git a/tt_metal/impl/dispatch/host_runtime_commands.hpp b/tt_metal/impl/dispatch/host_runtime_commands.hpp index 6a62c3a2053..61cf2604fed 100644 --- a/tt_metal/impl/dispatch/host_runtime_commands.hpp +++ b/tt_metal/impl/dispatch/host_runtime_commands.hpp @@ -96,36 +96,6 @@ class EnqueueProgramCommand : public Command { constexpr bool has_side_effects() { return true; } }; -class EnqueueTraceCommand : public Command { -private: - uint32_t command_queue_id; - Buffer& buffer; - IDevice* device; - SystemMemoryManager& manager; - std::shared_ptr& descriptor; - std::array& expected_num_workers_completed; - bool clear_count; - NOC noc_index; - CoreCoord dispatch_core; - -public: - EnqueueTraceCommand( - uint32_t command_queue_id, - IDevice* device, - SystemMemoryManager& manager, - std::shared_ptr& descriptor, - Buffer& buffer, - std::array& expected_num_workers_completed, - NOC noc_index, - CoreCoord dispatch_core); - - void process(); - - EnqueueCommandType type() { return EnqueueCommandType::ENQUEUE_TRACE; } - - constexpr bool has_side_effects() { return true; } -}; - class EnqueueTerminateCommand : public Command { private: uint32_t command_queue_id; diff --git a/tt_metal/impl/flatbuffer/light_metal_binary.fbs b/tt_metal/impl/flatbuffer/light_metal_binary.fbs index 619e69bf01c..17f8f38f46e 100644 --- a/tt_metal/impl/flatbuffer/light_metal_binary.fbs +++ b/tt_metal/impl/flatbuffer/light_metal_binary.fbs @@ -2,17 +2,17 @@ include "flatbuffer/command.fbs"; namespace tt.tt_metal.flatbuffer; -// Represents the Descriptor struct inside TraceDescriptor, given slightly less vague name here. 
-table TraceDescriptorMetaData { +// Represents the TraceWorkerDescriptor struct +table TraceWorkerDescriptor { num_completion_worker_cores: uint32; num_traced_programs_needing_go_signal_multicast: uint32; num_traced_programs_needing_go_signal_unicast: uint32; } -// Represents a key-value pair for SubDeviceId -> TraceDescriptorMetaData mapping +// Represents a key-value pair for SubDeviceId -> TraceWorkerDescriptor mapping table SubDeviceDescriptorMapping { sub_device_id: uint8; - descriptor: TraceDescriptorMetaData; + descriptor: TraceWorkerDescriptor; } // Matches C++ struct TraceDescriptor diff --git a/tt_metal/impl/lightmetal/lightmetal_capture.cpp b/tt_metal/impl/lightmetal/lightmetal_capture.cpp index 8ac29b15e33..c6dc136f11e 100644 --- a/tt_metal/impl/lightmetal/lightmetal_capture.cpp +++ b/tt_metal/impl/lightmetal/lightmetal_capture.cpp @@ -201,7 +201,7 @@ TraceDescriptorByTraceIdOffset to_flatbuffer( std::vector> sub_device_descriptor_offsets; for (const auto& [sub_device_id, descriptor] : trace_desc.descriptors) { - auto descriptor_offset = tt::tt_metal::flatbuffer::CreateTraceDescriptorMetaData( + auto descriptor_offset = tt::tt_metal::flatbuffer::CreateTraceWorkerDescriptor( builder, descriptor.num_completion_worker_cores, descriptor.num_traced_programs_needing_go_signal_multicast, diff --git a/tt_metal/impl/lightmetal/lightmetal_replay.cpp b/tt_metal/impl/lightmetal/lightmetal_replay.cpp index d42805161ae..b028faf4bb0 100644 --- a/tt_metal/impl/lightmetal/lightmetal_replay.cpp +++ b/tt_metal/impl/lightmetal/lightmetal_replay.cpp @@ -40,7 +40,7 @@ TraceDescriptor from_flatbuffer(const tt::tt_metal::flatbuffer::TraceDescriptor* if (auto sub_device_descriptors_fb = fb_desc->sub_device_descriptors()) { for (const auto* mapping : *sub_device_descriptors_fb) { if (mapping) { - TraceDescriptor::Descriptor descriptor; + TraceWorkerDescriptor descriptor; descriptor.num_completion_worker_cores = mapping->descriptor()->num_completion_worker_cores(); descriptor.num_traced_programs_needing_go_signal_multicast = mapping->descriptor()->num_traced_programs_needing_go_signal_multicast(); diff --git a/tt_metal/impl/trace/dispatch.cpp b/tt_metal/impl/trace/dispatch.cpp new file mode 100644 index 00000000000..19d08460004 --- /dev/null +++ b/tt_metal/impl/trace/dispatch.cpp @@ -0,0 +1,255 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/impl/trace/dispatch.hpp" +#include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" + +namespace tt::tt_metal::trace_dispatch { + +void reset_host_dispatch_state_for_trace( + uint32_t num_sub_devices, + SystemMemoryManager& sysmem_manager, + std::array& expected_num_workers_completed, + std::array& config_buffer_mgr, + std::array& + worker_launch_message_buffer_state_reset, + std::array& expected_num_workers_completed_reset, + std::array& config_buffer_mgr_reset) { + // Record the original value of expected_num_workers_completed, and reset it to 0. 
+ std::copy( + expected_num_workers_completed.begin(), + expected_num_workers_completed.begin() + num_sub_devices, + expected_num_workers_completed_reset.begin()); + std::fill(expected_num_workers_completed.begin(), expected_num_workers_completed.begin() + num_sub_devices, 0); + + // Record original value of launch msg buffer + auto& worker_launch_message_buffer_state = sysmem_manager.get_worker_launch_message_buffer_state(); + std::copy( + worker_launch_message_buffer_state.begin(), + worker_launch_message_buffer_state.begin() + num_sub_devices, + worker_launch_message_buffer_state_reset.begin()); + for (uint32_t i = 0; i < num_sub_devices; ++i) { + // Set launch msg wptr to 0. Every time trace runs on device, it will ensure that the workers + // reset their rptr to be in sync with device. + worker_launch_message_buffer_state[i].reset(); + } + // Record original value of config buffer manager + std::copy(config_buffer_mgr.begin(), config_buffer_mgr.begin() + num_sub_devices, config_buffer_mgr_reset.begin()); + for (uint32_t i = 0; i < num_sub_devices; ++i) { + // Sync values in the trace need to match up with the counter starting at 0 again. + config_buffer_mgr[i].mark_completely_full(expected_num_workers_completed[i]); + } +} + +void load_host_dispatch_state( + uint32_t num_sub_devices, + SystemMemoryManager& sysmem_manager, + std::array& expected_num_workers_completed, + std::array& config_buffer_mgr, + std::array& + worker_launch_message_buffer_state_reset, + std::array& expected_num_workers_completed_reset, + std::array& config_buffer_mgr_reset) { + std::copy( + expected_num_workers_completed_reset.begin(), + expected_num_workers_completed_reset.begin() + num_sub_devices, + expected_num_workers_completed.begin()); + std::copy( + worker_launch_message_buffer_state_reset.begin(), + worker_launch_message_buffer_state_reset.begin() + num_sub_devices, + sysmem_manager.get_worker_launch_message_buffer_state().begin()); + std::copy( + config_buffer_mgr_reset.begin(), config_buffer_mgr_reset.begin() + num_sub_devices, config_buffer_mgr.begin()); +} + +void issue_trace_commands( + IDevice* device, + SystemMemoryManager& sysmem_manager, + const TraceDispatchMetadata& dispatch_md, + uint8_t cq_id, + const std::array& expected_num_workers_completed, + CoreCoord dispatch_core) { + void* cmd_region = sysmem_manager.issue_queue_reserve(dispatch_md.cmd_sequence_sizeB, cq_id); + + HugepageDeviceCommand command_sequence(cmd_region, dispatch_md.cmd_sequence_sizeB); + + DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER; + if (DispatchQueryManager::instance().dispatch_s_enabled()) { + uint16_t index_bitmask = 0; + for (const auto& id : dispatch_md.sub_device_ids) { + index_bitmask |= 1 << id.to_index(); + } + command_sequence.add_notify_dispatch_s_go_signal_cmd(false, index_bitmask); + dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE; + } + auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); + auto dispatch_core_type = dispatch_core_config.get_core_type(); + + uint32_t dispatch_message_base_addr = + DispatchMemMap::get(dispatch_core_type) + .get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); + + go_msg_t reset_launch_message_read_ptr_go_signal; + reset_launch_message_read_ptr_go_signal.signal = RUN_MSG_RESET_READ_PTR; + reset_launch_message_read_ptr_go_signal.master_x = (uint8_t)dispatch_core.x; + reset_launch_message_read_ptr_go_signal.master_y = (uint8_t)dispatch_core.y; + + for (const auto& [id, desc] 
: dispatch_md.trace_worker_descriptors) { + const auto& noc_data_start_idx = device->noc_data_start_index( + id, + desc.num_traced_programs_needing_go_signal_multicast, + desc.num_traced_programs_needing_go_signal_unicast); + + const auto& num_noc_mcast_txns = + desc.num_traced_programs_needing_go_signal_multicast ? device->num_noc_mcast_txns(id) : 0; + const auto& num_noc_unicast_txns = + desc.num_traced_programs_needing_go_signal_unicast ? device->num_noc_unicast_txns(id) : 0; + reset_launch_message_read_ptr_go_signal.dispatch_message_offset = + (uint8_t)DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(id.to_index()); + uint32_t dispatch_message_addr = + dispatch_message_base_addr + + DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(id.to_index()); + auto index = id.to_index(); + + // Wait to ensure that all kernels have completed. Then send the reset_rd_ptr go_signal. + command_sequence.add_dispatch_go_signal_mcast( + expected_num_workers_completed[index], + *reinterpret_cast(&reset_launch_message_read_ptr_go_signal), + dispatch_message_addr, + num_noc_mcast_txns, + num_noc_unicast_txns, + noc_data_start_idx, + dispatcher_for_go_signal); + } + + // Wait to ensure that all workers have reset their read_ptr. dispatch_d will stall until all workers have completed + // this step, before sending kernel config data to workers or notifying dispatch_s that its safe to send the + // go_signal. Clear the dispatch <--> worker semaphore, since trace starts at 0. + constexpr bool clear_count = true; + for (const auto& [id, desc] : dispatch_md.trace_worker_descriptors) { + auto index = id.to_index(); + uint32_t expected_num_workers = expected_num_workers_completed[index]; + if (desc.num_traced_programs_needing_go_signal_multicast) { + expected_num_workers += device->num_worker_cores(HalProgrammableCoreType::TENSIX, id); + } + if (desc.num_traced_programs_needing_go_signal_unicast) { + expected_num_workers += device->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, id); + } + uint32_t dispatch_message_addr = + dispatch_message_base_addr + DispatchMemMap::get(dispatch_core_type).get_dispatch_message_offset(index); + + if (DispatchQueryManager::instance().distributed_dispatcher()) { + command_sequence.add_dispatch_wait( + false, dispatch_message_addr, expected_num_workers, clear_count, false, true, 1); + } + command_sequence.add_dispatch_wait(false, dispatch_message_addr, expected_num_workers, clear_count); + } + + uint32_t page_size_log2 = __builtin_ctz(dispatch_md.trace_buffer_page_size); + TT_ASSERT( + (dispatch_md.trace_buffer_page_size & (dispatch_md.trace_buffer_page_size - 1)) == 0, + "Page size must be a power of 2"); + + command_sequence.add_prefetch_exec_buf( + dispatch_md.trace_buffer_address, page_size_log2, dispatch_md.trace_buffer_num_pages); + + sysmem_manager.issue_queue_push_back(dispatch_md.cmd_sequence_sizeB, cq_id); + + sysmem_manager.fetch_queue_reserve_back(cq_id); + + const bool stall_prefetcher = true; + sysmem_manager.fetch_queue_write(dispatch_md.cmd_sequence_sizeB, cq_id, stall_prefetcher); +} + +uint32_t compute_trace_cmd_size(uint32_t num_sub_devices) { + uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); + uint32_t go_signals_cmd_size = + align(sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd), pcie_alignment) * num_sub_devices; + + uint32_t cmd_sequence_sizeB = + DispatchQueryManager::instance().dispatch_s_enabled() * + hal.get_alignment( + HalMemType::HOST) + // dispatch_d -> dispatch_s sem update (send only if dispatch_s 
is running) + go_signals_cmd_size + // go signal cmd + (hal.get_alignment( + HalMemType::HOST) + // wait to ensure that reset go signal was processed (dispatch_d) + // when dispatch_s and dispatch_d are running on 2 cores, workers update dispatch_s. + // dispatch_s is responsible for resetting worker count and giving dispatch_d the + // latest worker state. This is encapsulated in the dispatch_s wait command (only to + // be sent when dispatch is distributed on 2 cores) + (DispatchQueryManager::instance().distributed_dispatcher()) * hal.get_alignment(HalMemType::HOST)) * + num_sub_devices + + hal.get_alignment(HalMemType::HOST); // CQ_PREFETCH_CMD_EXEC_BUF + + return cmd_sequence_sizeB; +} + +void update_worker_state_post_trace_execution( + const std::unordered_map& trace_worker_descriptors, + SystemMemoryManager& manager, + std::array& config_buffer_mgr, + std::array& expected_num_workers_completed) { + for (const auto& [id, desc] : trace_worker_descriptors) { + auto index = id.to_index(); + // Update the expected worker cores counter due to trace programs completion + expected_num_workers_completed[index] = desc.num_completion_worker_cores; + // After trace runs, the rdptr on each worker will be incremented by the number of programs in the trace + // Update the wptr on host to match state. If the trace doesn't execute on a + // class of worker (unicast or multicast), it doesn't reset or modify the + // state for those workers. + auto& worker_launch_message_buffer_state = manager.get_worker_launch_message_buffer_state()[index]; + if (desc.num_traced_programs_needing_go_signal_multicast) { + worker_launch_message_buffer_state.set_mcast_wptr(desc.num_traced_programs_needing_go_signal_multicast); + } + if (desc.num_traced_programs_needing_go_signal_unicast) { + worker_launch_message_buffer_state.set_unicast_wptr(desc.num_traced_programs_needing_go_signal_unicast); + } + // The config buffer manager is unaware of what memory is used inside the trace, so mark all memory as used so + // that it will force a stall and avoid stomping on in-use state. + // TODO(jbauman): Reuse old state from the trace. + config_buffer_mgr[index].mark_completely_full(expected_num_workers_completed[index]); + } +} + +// Assumes pages are interleaved across all banks starting at 0 +std::size_t compute_interleaved_trace_buf_page_size(uint32_t buf_size, const uint32_t num_banks) { + // Tuneable parameters for the trace buffer - heavily affect prefetcher + // read performance. TODO: Explore ideal page size for the trace buffer + // to maximize read bandwidth. + // Min size is bounded by NOC transfer efficiency + // Max size is bounded by Prefetcher CmdDatQ size + constexpr uint32_t kExecBufPageMin = 1024; + constexpr uint32_t kExecBufPageMax = 4096; + // The algorithm below currently minimizes the amount of wasted space due to + // padding. TODO: Tune for performance. 
+ std::vector candidates; + candidates.reserve(__builtin_clz(kExecBufPageMin) - __builtin_clz(kExecBufPageMax) + 1); + for (uint32_t size = 1; size <= kExecBufPageMax; size <<= 1) { + if (size >= kExecBufPageMin) { + candidates.push_back(size); + } + } + uint32_t min_waste = -1; + uint32_t pick = 0; + // Pick the largest size that minimizes waste + for (const uint32_t size : candidates) { + // Pad data to the next fully banked size + uint32_t fully_banked = num_banks * size; + uint32_t padded_size = (buf_size + fully_banked - 1) / fully_banked * fully_banked; + uint32_t waste = padded_size - buf_size; + if (waste <= min_waste) { + min_waste = waste; + pick = size; + } + } + TT_FATAL( + pick >= kExecBufPageMin and pick <= kExecBufPageMax, + "pick {} not between min_size {} and max_size {}", + pick, + kExecBufPageMin, + kExecBufPageMax); + return pick; +} + +} // namespace tt::tt_metal::trace_dispatch diff --git a/tt_metal/impl/trace/dispatch.hpp b/tt_metal/impl/trace/dispatch.hpp new file mode 100644 index 00000000000..f84d0c3bbef --- /dev/null +++ b/tt_metal/impl/trace/dispatch.hpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +namespace tt::tt_metal::trace_dispatch { + +struct TraceDispatchMetadata { + uint32_t cmd_sequence_sizeB; + std::unordered_map& trace_worker_descriptors; + std::vector& sub_device_ids; + uint32_t trace_buffer_page_size = 0; + uint32_t trace_buffer_num_pages = 0; + uint32_t trace_buffer_address = 0; + + TraceDispatchMetadata( + uint32_t cmd_size, + std::unordered_map& descriptors, + std::vector& sub_devices, + uint32_t buf_page_size, + uint32_t buf_num_pages, + uint32_t buf_address) : + cmd_sequence_sizeB(cmd_size), + trace_worker_descriptors(descriptors), + sub_device_ids(sub_devices), + trace_buffer_page_size(buf_page_size), + trace_buffer_num_pages(buf_num_pages), + trace_buffer_address(buf_address) {} +}; + +void reset_host_dispatch_state_for_trace( + uint32_t num_sub_devices, + SystemMemoryManager& sysmem_manager, + std::array& expected_num_workers_completed, + std::array& config_buffer_mgr, + std::array& + worker_launch_message_buffer_state_reset, + std::array& expected_num_workers_completed_reset, + std::array& config_buffer_mgr_reset); + +void load_host_dispatch_state( + uint32_t num_sub_devices, + SystemMemoryManager& sysmem_manager, + std::array& expected_num_workers_completed, + std::array& config_buffer_mgr, + std::array& + worker_launch_message_buffer_state_reset, + std::array& expected_num_workers_completed_reset, + std::array& config_buffer_mgr_reset); + +void issue_trace_commands( + IDevice* device, + SystemMemoryManager& sysmem_manager, + const TraceDispatchMetadata& dispatch_md, + uint8_t cq_id, + const std::array& expected_num_workers_completed, + CoreCoord dispatch_core); + +uint32_t compute_trace_cmd_size(uint32_t num_sub_devices); + +void update_worker_state_post_trace_execution( + const std::unordered_map& trace_worker_descriptors, + SystemMemoryManager& manager, + std::array& config_buffer_mgr, + std::array& expected_num_workers_completed); + +std::size_t compute_interleaved_trace_buf_page_size(uint32_t buf_size, const uint32_t num_banks); + +} // namespace tt::tt_metal::trace_dispatch diff --git a/tt_metal/impl/trace/trace.cpp b/tt_metal/impl/trace/trace.cpp index 3e8e5e235d9..2789223c307 100644 --- a/tt_metal/impl/trace/trace.cpp +++ b/tt_metal/impl/trace/trace.cpp @@ -13,55 +13,7 @@ #include #include #include 
"tt_metal/trace.hpp" - -namespace { -// Labels to make the code more readable -static constexpr bool kBlocking = true; -static constexpr bool kNonBlocking = false; - -// Min size is bounded by NOC transfer efficiency -// Max size is bounded by Prefetcher CmdDatQ size -static constexpr uint32_t kExecBufPageMin = 1024; -static constexpr uint32_t kExecBufPageMax = 4096; - -// Assumes pages are interleaved across all banks starting at 0 -size_t interleaved_page_size( - const uint32_t buf_size, const uint32_t num_banks, const uint32_t min_size, const uint32_t max_size) { - // Populate power of 2 numbers within min and max as candidates - TT_FATAL( - min_size > 0 and min_size <= max_size, - "min_size {} not positive and less than or equal to max_size {}.", - min_size, - max_size); - std::vector candidates; - candidates.reserve(__builtin_clz(min_size) - __builtin_clz(max_size) + 1); - for (uint32_t size = 1; size <= max_size; size <<= 1) { - if (size >= min_size) { - candidates.push_back(size); - } - } - uint32_t min_waste = -1; - uint32_t pick = 0; - // Pick the largest size that minimizes waste - for (const uint32_t size : candidates) { - // Pad data to the next fully banked size - uint32_t fully_banked = num_banks * size; - uint32_t padded_size = (buf_size + fully_banked - 1) / fully_banked * fully_banked; - uint32_t waste = padded_size - buf_size; - if (waste <= min_waste) { - min_waste = waste; - pick = size; - } - } - TT_FATAL( - pick >= min_size and pick <= max_size, - "pick {} not between min_size {} and max_size {}", - pick, - min_size, - max_size); - return pick; -} -} // namespace +#include "tt_metal/impl/trace/dispatch.hpp" namespace tt::tt_metal { @@ -76,8 +28,8 @@ std::shared_ptr Trace::create_empty_trace_buffer() { void Trace::initialize_buffer(CommandQueue& cq, const std::shared_ptr& trace_buffer) { std::vector& trace_data = trace_buffer->desc->data; uint64_t unpadded_size = trace_data.size() * sizeof(uint32_t); - size_t page_size = interleaved_page_size( - unpadded_size, cq.device()->allocator()->get_num_banks(BufferType::DRAM), kExecBufPageMin, kExecBufPageMax); + size_t page_size = trace_dispatch::compute_interleaved_trace_buf_page_size( + unpadded_size, cq.device()->allocator()->get_num_banks(BufferType::DRAM)); uint64_t padded_size = round_up(unpadded_size, page_size); size_t numel_padding = (padded_size - unpadded_size) / sizeof(uint32_t); if (numel_padding > 0) { @@ -95,7 +47,7 @@ void Trace::initialize_buffer(CommandQueue& cq, const std::shared_ptrbuffer = Buffer::create(cq.device(), padded_size, page_size, BufferType::TRACE, TensorMemoryLayout::INTERLEAVED); - EnqueueWriteBuffer(cq, trace_buffer->buffer, trace_data, kBlocking); + EnqueueWriteBuffer(cq, trace_buffer->buffer, trace_data, true /* blocking */); log_trace( LogMetalTrace, "Trace issue buffer unpadded size={}, padded size={}, num_pages={}", diff --git a/ttnn/cpp/ttnn/common/queue_id.hpp b/ttnn/cpp/ttnn/common/queue_id.hpp index 6b5f2cd33b0..dc9d801bbc6 100644 --- a/ttnn/cpp/ttnn/common/queue_id.hpp +++ b/ttnn/cpp/ttnn/common/queue_id.hpp @@ -4,7 +4,7 @@ #pragma once -#include +#include namespace ttnn { /* From eb06c15f53577e367a4dd917b48472fc6987ea4e Mon Sep 17 00:00:00 2001 From: Noah Hein <60185486+nhein-tt@users.noreply.github.com> Date: Thu, 20 Feb 2025 20:15:24 -0600 Subject: [PATCH 209/316] [skip ci] Bounty program (#18051) ### Ticket n/A ### Problem description As part of the bug bounty program that Devrel runs, this PR addresses all legal requirements needed for external developers to accept payment for 
work done as a part of that program. ### What's changed n/a no code changes, only docs. ### Checklist n/a --------- Co-authored-by: Shubham Saboo <31396011+Shubhamsaboo@users.noreply.github.com> Co-authored-by: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com> --- BOUNTY_TERMS.md | 126 ++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 3 ++ 2 files changed, 129 insertions(+) create mode 100644 BOUNTY_TERMS.md diff --git a/BOUNTY_TERMS.md b/BOUNTY_TERMS.md new file mode 100644 index 00000000000..28e1e035146 --- /dev/null +++ b/BOUNTY_TERMS.md @@ -0,0 +1,126 @@ +# TENSTORRENT BOUNTY PROGRAM TERMS AND CONDITIONS + +Please read these terms and conditions (these “**Terms**”), which form a legally binding contract between Tenstorrent AI ULC and its affiliates (“**Tenstorrent**,” “**us**,” or “**our**”) and qualifying individuals (“**Participant**,” “**you**,” or “**your**”) who wish to participate in Tenstorrent’s contribution program (the “**Program**”) and help improve our in-scope open-source projects by addressing issues, implementing new features, or resolving performance challenges (“**Contributions**”). Participants that submit Accepted Contributions shall be eligible to earn a payout (a “**Bounty**”), as determined solely in Tenstorrent’s discretion, in accordance with these Terms. + +These Terms include important clauses, including without limitation, instances where Participants may be liable to Tenstorrent, a class action waiver, and other limitations of your rights and remedies. Disputes will be adjudicated solely in the courts of the State of California. By participating in the Program, all Participants must agree to be bound by these Terms and comply with these Terms. If an individual does not wish to, or cannot comply with these Terms, they are ineligible for a Bounty Payout and must not participate in the Program. + +--- + +## About the Program + +Tenstorrent offers this Program as an initiative for our community members that are helping us improve our open-source software. The Program is not a competition. No fees are payable or purchase is necessary to participate in the Program. All Program communication and updates will be shared via the relevant Tenstorrent open-source repository. + +This Program is a discretionary initiative. Tenstorrent, in our sole discretion, may modify these Terms at any time and may modify, restrict, suspend, terminate, or otherwise change any aspect of this Program, including the fulfillment of any Bounty Payouts at any time. If Tenstorrent changes these Terms, by continuing to participate in the Program, you are deemed to have accepted the changes. + +--- +## Participation Eligibility + +To be eligible to participate in the Program you must: + +* be the legal age of majority in your country and have the legal capacity to enter into, and be bound by, these Terms; +* if you are participating in the Program as an entity, have the legal authority to accept these Terms on the applicable entity’s behalf (in which case “you” will mean the foregoing entity); +* not be subject to legal obligations that prevent you from participating in the Program (for example, under your employment contract or ethical rules); +* not be a sanctioned person or a citizen or resident of a sanctioned country under applicable law, including under U.S. 
embargo or sanctions; +* not be in violation of any applicable laws or regulations when participating in the Program; +* not ask for payment in exchange for issue details or dispute the applicability of the Program to you, including the amount of any proposed or actual payment or categorization of a Contribution; and +* not be a current employee, vendor, contractor, or agent for Tenstorrent. + +You may be required to provide Tenstorrent with proof of compliance and eligibility in the form requested with regard to any of your obligations hereunder. Tenstorrent reserves the right to limit or refuse your eligibility to participate in the Program for any reason in its sole discretion, including but not limited to where your participation is prohibited by any applicable law. If Tenstorrent becomes aware of any violation of these Terms, Tenstorrent may elect to, among other things, (a) withhold, amend, or cancel the benefits of or payments under the Program or (b) require return of any payment made to you, including taking any action at law to obtain such payment. + +--- +## Scope of Contributions + +The Bounty will be applicable for Accepted Contributions in [tt-metal](https://github.com/tenstorrent/tt-metal). An **"Accepted Contribution"** refers to merged pull requests that address an open GitHub issue which is tagged with both (1) “bounty” and (2) one of the categories listed in Exhibit A. + +--- + +## Bounty Payment + +Subject to these Terms, you will receive payments based on the category of Contribution in accordance with Exhibit A. In order to receive a Bounty payment, you: + +* must not be in breach of these Terms; +* must be assigned on GitHub to the issue for which you are submitting a pull request, and your pull request must be submitted while you are still assigned to the issue (you have forfeited your right to any Bounty once the issue is re-assigned to another contributor). Tenstorrent reserves the right to re-assign any issue if the assigned contributor becomes unresponsive for over two (2) weeks or if the assigned contributor explicitly forfeits the assignment; +* must release your Contributions under the license of the repository in which you are submitting a pull request; +* provide additional information as may be required by us (such as payment information) and meet all requirements to receive such Bounty as may be required by applicable law and regulations. If you do not provide such additional information or meet such requirements, we may not provide payment; and +* may not designate someone else to receive your Bounty payout. + +--- + +## Your Obligations + +You shall: + +* only participate in the Program solely for the intended purpose of disclosing or resolving issues to Tenstorrent as described in these Terms and any related documentation; +* participate in the Program for lawful purposes only and shall comply with all applicable laws and regulations; and +* only access, disclose or modify your own user data and be solely responsible for the accuracy, completeness, appropriateness, and legality of any data or Contributions you upload or provide through your participation in the Program.
+ +You shall not: + +* attempt to gain access to another user’s account or data; +* transmit any viruses or exploits through your participation in the Program, except for the sole purpose of discovery and submission of Contributions and subject to compliance with these Terms; +* upload, input, access, store, distribute or any material that: (i) is unlawful, harmful, threatening, defamatory, obscene, infringing, harassing or racially or ethnically offensive; (ii) facilitates illegal activity; (iii) is otherwise illegal (including without limitation infringement of any third party intellectual property rights or any other rights); or (iv) causes damage or injury to any person or property; or +* upload or input or otherwise disclose to Tenstorrent any information which you do not have the rights to or which you are under an existing contractual or other legal obligation to maintain in confidence. + +--- + +## Intellectual Property Rights + +You acknowledge and agree that all your Contributions under the Program shall be released under the license of the repository in which you are submitting a pull request. You represent and warrant that your Contribution is your own work, that you haven’t used information owned by another person or entity, and that you have the legal right to submit the Contribution to Tenstorrent. + +Your rights with respect to our software, related documentation, and any updates, developments, or improvements thereto are governed by the license included in the applicable GitHub repository. + +--- + +## Your Information + +You will provide us with all information as we may reasonably require for you to participate in the Program and, where relevant, receive a Bounty award. Tenstorrent shall only use the information you provide us to permit your participation in the Program and to tender Bounty payouts. Except for our obligations under applicable data protection laws with respect to our processing of any personal data you may provide us through your participation in Program, we disclaim all liability of any kind with respect to (i) any information, data or materials you upload or otherwise provide through your participation in the Program, (ii) third party information, (iii) any other material or services which may be accessed when participating in the Program, or (iv) for any fraud committed in connection with the Program. + +--- + +## No Warranties + +TENSTORRENT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO THE PROGRAM. YOU UNDERSTAND THAT YOUR PARTICIPATION IN THE PROGRAM IS AT YOUR OWN RISK. TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, WE EXCLUDE ANY IMPLIED WARRANTIES IN CONNECTION WITH THE PROGRAM. YOU MAY HAVE CERTAIN RIGHTS UNDER YOUR LOCAL LAW. NOTHING IN THESE TERMS IS INTENDED TO AFFECT THOSE RIGHTS, IF THEY ARE APPLICABLE. + +--- + +## Limitation of Liability and Disclaimer + +Should your participation in the Program be found to breach legal obligations you may have with other third parties or any other rights or in the event of a breach of these Terms, we may terminate your participation in the Program and may further deem you to be ineligible for a Bounty payment. You agree to defend, indemnify and hold harmless Tenstorrent and its respective officers, directors, employees, agents, licensors, and suppliers, from and against all claims, actions or demands, liabilities, and settlements, including, without limitation, reasonable legal and accounting fees, arising in connection with such breach. 
+ +TO THE MAXIMUM EXTENT PERMITTED BY LAW, (A) WE SHALL NOT BE LIABLE TO YOU FOR ANY DAMAGES, CLAIMS, EXPENSES OR OTHER COSTS (INCLUDING, WITHOUT LIMITATION, ATTORNEYS’ FEES) YOU SUFFER OR INCUR AS A RESULT OF THIRD-PARTY CLAIMS RELATING TO YOUR PARTICIPATION IN THE PROGRAM, (B) UNDER NO CIRCUMSTANCES WILL WE BE LIABLE FOR ANY INDIRECT, SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, AND (C) OUR MAXIMUM AGGREGATE LIABILITY TO YOU ARISING OUT OF OR IN CONNECTION WITH THESE TERMS SHALL BE LIMITED TO $100, REGARDLESS OF THE CAUSE. WE DO NOT EXCLUDE OR LIMIT OUR LIABILITY FOR FRAUD OR FOR ANY OTHER LIABILITY WHICH CANNOT BE LIMITED OR EXCLUDED BY APPLICABLE LAW.    + +--- + +## Issues + +If you encounter any issues with your participation in the Program, please reach out to us at bounties@tenstorrent.com. + +--- + +## Applicability of these Terms + +These Terms shall apply for as long as you are participating in the Program pursuant to these Terms. Cancellation of the Program, termination of these Terms, or your explicit withdrawal from the Program shall not affect Tenstorrent’s rights and your obligations under these Terms prior to such cancellation or termination, which shall continue to apply, unless otherwise agreed in writing. + +--- + +## Governing Law & Disputes + +These Terms shall be governed by and construed in accordance with the laws of the State of California and any federal laws applicable therein and shall be binding upon the parties hereto in California and worldwide. The parties consent to the exclusive jurisdiction of the courts of the State of California for any dispute arising out of this Agreement. Except where prohibited, as a condition of participating in this Program, each Participant agrees that between the parties, any and all disputes, claims, and causes of action arising out of or connected with this Program, or the Bounty Payout awarded must be resolved individually, without resort to any form of class action. + +--- + +## General + +These Terms will be binding on and will inure to the benefit of the legal representatives, successors and assigns of the parties hereto. These Terms (and any policies referenced herein and incorporated by reference) constitute the entire agreement between you and us with respect to the subject matter hereof, and you have not relied upon any promises or representations by us with respect to the subject matter except as set forth herein. You may not assign these Terms or assign any rights or delegate any obligations hereunder, in whole or in part, whether voluntarily or by operation of law. The governing language of these Terms is English. A person who is not a party to these Terms has no rights to enforce, or to enjoy the benefit of, any term of these Terms. + +--- + +## Exhibit A – Tenstorrent Bounty Rewards Chart + +| Category | Definition | Examples | Payment Range (US Dollars) | +| :---- | :---- | :---- | :---- | +| difficulty/warmup | Tasks suitable for first-time contributors. Straightforward and low complexity. | \- Minor bug fixes. \- Documentation improvements. \- Adding or fixing a test case. \- Basic logging updates. \- Updating a README or sample script. | 1 – 200 | +| difficulty/easy | Tasks requiring basic familiarity with the repo and some understanding of the architecture. | \- Extending an existing feature. \- Updating API calls. \- Simple refactoring tasks. \- Adding a new test suite. 
| 201 – 500 | +| difficulty/medium | Tasks requiring significant familiarity with the code base, architecture, or domain knowledge  | \- Implementing a new feature. \- Adding support for a new model. \- Debugging and fixing non-trivial performance issues. \- Integration of a library or external tool. | 501 – 1999 | +| difficulty/hard | Complex tasks demanding deep architectural understanding and significant effort. | \- Major feature implementation. \- Core system redesign. \- Implementing a new kernel or low-level ops. \- Optimizing performance-critical code paths. | 2000 – 3000 | + diff --git a/README.md b/README.md index db6c978ea98..9ff79c7fb7e 100644 --- a/README.md +++ b/README.md @@ -166,3 +166,6 @@ Get started with [simple kernels](https://docs.tenstorrent.com/tt-metal/latest/t - [Matmul OP on Multi_core (Basic)](./tech_reports/prog_examples/matmul_multi_core/matmul_multi_core.md) - [Matmul Multi_core Reuse (Optimized)](./tech_reports/prog_examples/matmul_multi_core_optimized/data_reuse.md) - [Matmul Multi_core Multi-Cast (Optimized)](./tech_reports/prog_examples/matmul_multi_core_optimized/data_mcast.md) + +### Tenstorrent Bounty Program Terms and Conditions +This repo is a part of Tenstorrent’s bounty program. If you are interested in helping to improve tt-metal, please make sure to read the [Tenstorrent Bounty Program Terms and Conditions](https://github.com/tenstorrent/tt-metal/blob/main/BOUNTY_TERMS.md) before heading to the issues tab. Look for the issues that are tagged with both “bounty” and difficulty level! From fd2a5e5e58abf202fa7d6c40fb1b50132f79f9b5 Mon Sep 17 00:00:00 2001 From: asaigal Date: Thu, 20 Feb 2025 17:55:40 -0800 Subject: [PATCH 210/316] Support non-convex intersections between SubGrids when capturing and assembling MeshTrace commands --- .../tt_metal/distributed/test_mesh_trace.cpp | 30 ++++++++++++ tt_metal/api/tt-metalium/mesh_common.hpp | 1 + tt_metal/distributed/mesh_command_queue.cpp | 49 ++++++++++--------- tt_metal/distributed/mesh_trace.cpp | 6 ++- tt_metal/distributed/mesh_workload_utils.cpp | 40 +++++++++++++-- tt_metal/distributed/mesh_workload_utils.hpp | 2 +- 6 files changed, 98 insertions(+), 30 deletions(-) diff --git a/tests/tt_metal/distributed/test_mesh_trace.cpp b/tests/tt_metal/distributed/test_mesh_trace.cpp index f4ecf8259bd..b3e51f352c2 100644 --- a/tests/tt_metal/distributed/test_mesh_trace.cpp +++ b/tests/tt_metal/distributed/test_mesh_trace.cpp @@ -121,6 +121,22 @@ INSTANTIATE_TEST_SUITE_P( MeshTraceSweepTests, MeshTraceSweepTest, ::testing::Values( + std::vector>({ + {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + {LogicalDeviceRange({1, 0}, {1, 1})}, // Run on single center column + {LogicalDeviceRange({2, 0}, {2, 0})}, // Run on single device - top row, center + {LogicalDeviceRange({3, 1}, {3, 1})}, // Run on bottom right device + {LogicalDeviceRange({0, 0}, {0, 0})}, // Run on top left device + {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + }), + std::vector>({ + {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + {LogicalDeviceRange({1, 0}, {1, 1}), + LogicalDeviceRange({2, 0}, {2, 1}), + LogicalDeviceRange({3, 0}, {3, 1}), + LogicalDeviceRange({0, 0}, {0, 1})}, // Split grid into 4 columns + {LogicalDeviceRange({0, 0}, {3, 0}), LogicalDeviceRange({0, 1}, {3, 1})}, // Split grid into 2 rows + }), std::vector>({ {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid {LogicalDeviceRange({0, 0}, {3, 0}), LogicalDeviceRange({0, 1}, {3, 1})}, // Split grid into 2 rows @@ -133,6 +149,20 @@ INSTANTIATE_TEST_SUITE_P( 
LogicalDeviceRange({2, 0}, {2, 1}), LogicalDeviceRange({3, 0}, {3, 1})}, // Split grid into 4 columns }), + std::vector>({ + {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + {LogicalDeviceRange({0, 0}, {0, 0}), + LogicalDeviceRange({1, 0}, {1, 0}), + LogicalDeviceRange({2, 0}, {2, 0}), + LogicalDeviceRange({3, 0}, {3, 0}), + LogicalDeviceRange({0, 1}, {0, 1}), + LogicalDeviceRange({1, 1}, {1, 1}), + LogicalDeviceRange({2, 1}, {2, 1}), + LogicalDeviceRange({3, 1}, {3, 1})}, // Run on individual devices + {LogicalDeviceRange({1, 0}, {2, 1})}, // Run on 2 center columns + {LogicalDeviceRange({2, 0}, {2, 1})}, // Run on single center column + {LogicalDeviceRange({1, 1}, {2, 1})}, // Run on 2 devices on the bottom row + }), std::vector>({ {LogicalDeviceRange({0, 0}, {0, 1}), LogicalDeviceRange({1, 0}, {1, 1}), diff --git a/tt_metal/api/tt-metalium/mesh_common.hpp b/tt_metal/api/tt-metalium/mesh_common.hpp index c83e832f44b..5433e133d99 100644 --- a/tt_metal/api/tt-metalium/mesh_common.hpp +++ b/tt_metal/api/tt-metalium/mesh_common.hpp @@ -21,3 +21,4 @@ using MeshTraceId = tt::stl::StrongType; using DeviceCoord = CoreCoord; using LogicalDeviceRange = CoreRange; +using LogicalDeviceRangeSet = CoreRangeSet; diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index 1a8d6a90766..5e971d42a51 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -106,7 +106,7 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b dispatch_metadata); std::unordered_set chip_ids_in_workload = {}; - std::vector active_sub_grids = {}; + std::vector active_sub_grids = {}; // Iterate over all programs. Update dispatch commands per program to reflect // current device state. Write the finalized program command sequence to each // physical device tied to the program. 
@@ -639,12 +639,12 @@ void MeshCommandQueue::capture_program_trace_on_subgrid( } void MeshCommandQueue::capture_go_signal_trace_on_unused_subgrids( - std::vector& active_sub_grids, + std::vector& active_sub_grids, const SubDeviceId& sub_device_id, uint32_t expected_num_workers_completed, bool mcast_go_signals, bool unicast_go_signals) { - CoreRangeSet active_ranges = active_sub_grids[0]; + LogicalDeviceRangeSet active_ranges = active_sub_grids[0]; for (int i = 1; i < active_sub_grids.size(); i++) { active_ranges = active_ranges.merge(active_sub_grids[i]); } @@ -652,27 +652,28 @@ void MeshCommandQueue::capture_go_signal_trace_on_unused_subgrids( CoreRange active_grid = active_ranges.bounding_box(); CoreRange full_grid = CoreRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); if (active_grid != full_grid) { - CoreRange unused_grid = convex_relative_complement(full_grid, active_grid); - - auto start_coord = unused_grid.start_coord; - auto& sysmem_manager_for_trace = mesh_device_->get_device(start_coord.y, start_coord.x)->sysmem_manager(); - uint32_t sysmem_manager_offset = sysmem_manager_for_trace.get_issue_queue_write_ptr(id_); - write_go_signal( - id_, - mesh_device_, - sub_device_id, - sysmem_manager_for_trace, - expected_num_workers_completed, - this->virtual_program_dispatch_core(), - mcast_go_signals, - unicast_go_signals, - mesh_device_->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, sub_device_id)); - auto mesh_trace_md = MeshTraceStagingMetadata{ - unused_grid, - start_coord, - sysmem_manager_offset, - sysmem_manager_for_trace.get_issue_queue_write_ptr(id_) - sysmem_manager_offset}; - ordered_mesh_trace_md_.push_back(mesh_trace_md); + LogicalDeviceRangeSet unused_grids = relative_complement(full_grid, active_grid); + for (auto& unused_grid : unused_grids.ranges()) { + auto start_coord = unused_grid.start_coord; + auto& sysmem_manager_for_trace = mesh_device_->get_device(start_coord.y, start_coord.x)->sysmem_manager(); + uint32_t sysmem_manager_offset = sysmem_manager_for_trace.get_issue_queue_write_ptr(id_); + write_go_signal( + id_, + mesh_device_, + sub_device_id, + sysmem_manager_for_trace, + expected_num_workers_completed, + this->virtual_program_dispatch_core(), + mcast_go_signals, + unicast_go_signals, + mesh_device_->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, sub_device_id)); + auto mesh_trace_md = MeshTraceStagingMetadata{ + unused_grid, + start_coord, + sysmem_manager_offset, + sysmem_manager_for_trace.get_issue_queue_write_ptr(id_) - sysmem_manager_offset}; + ordered_mesh_trace_md_.push_back(mesh_trace_md); + } } } diff --git a/tt_metal/distributed/mesh_trace.cpp b/tt_metal/distributed/mesh_trace.cpp index 49cd6f1a779..536f48bd977 100644 --- a/tt_metal/distributed/mesh_trace.cpp +++ b/tt_metal/distributed/mesh_trace.cpp @@ -46,8 +46,10 @@ void MeshTraceDescriptor::assemble_dispatch_commands( std::make_move_iterator(program_cmds_vector.end())); } else { // Intersection is a subset of the originally placed program. 
- auto compliment_ = convex_relative_complement(program.device_range, intersection); - intermed_trace_data.push_back(MeshTraceData{compliment_, program.data}); + auto complement = relative_complement(program.device_range, intersection); + for (auto& complement_range : complement.ranges()) { + intermed_trace_data.push_back(MeshTraceData{complement_range, program.data}); + } intermed_trace_data.push_back(MeshTraceData{intersection, program.data}); auto& intersection_data = intermed_trace_data.back().data; intersection_data.insert( diff --git a/tt_metal/distributed/mesh_workload_utils.cpp b/tt_metal/distributed/mesh_workload_utils.cpp index 21be612bdb0..2bbc713c87c 100644 --- a/tt_metal/distributed/mesh_workload_utils.cpp +++ b/tt_metal/distributed/mesh_workload_utils.cpp @@ -80,15 +80,27 @@ void write_go_signal( bool is_row_major_intersection(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { return intersection.grid_size().x == parent.grid_size().x; } +bool matching_dimensions(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { + auto intersection_grid_size = intersection.grid_size(); + auto parent_grid_size = parent.grid_size(); + return intersection_grid_size.x == parent_grid_size.x || intersection_grid_size.y == parent_grid_size.y; +} + +bool matching_vertices(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { + return (intersection.start_coord.x == parent.start_coord.x && intersection.start_coord.y == parent.start_coord.y) || + (intersection.end_coord.x == parent.end_coord.x && intersection.end_coord.y == parent.end_coord.y); +} + +bool has_convex_relative_complement(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { + return matching_dimensions(parent, intersection) && matching_vertices(parent, intersection); +} LogicalDeviceRange convex_relative_complement( const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { TT_FATAL(parent.contains(intersection), "Parent must contain intersection"); auto intersection_grid_size = intersection.grid_size(); auto parent_grid_size = parent.grid_size(); - TT_FATAL( - intersection_grid_size.x == parent_grid_size.x || intersection_grid_size.y == parent_grid_size.y, - "Non convex grids not supported"); + TT_FATAL(has_convex_relative_complement(parent, intersection), "Non convex grids not supported"); if (is_row_major_intersection(parent, intersection)) { if (intersection.start_coord.y == parent.start_coord.y) { @@ -109,4 +121,26 @@ LogicalDeviceRange convex_relative_complement( } } +LogicalDeviceRangeSet relative_complement(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { + TT_FATAL(parent.contains(intersection), "Parent must contain intersection"); + if (has_convex_relative_complement(parent, intersection)) { + return convex_relative_complement(parent, intersection); + } + std::vector relative_complement = {}; + std::unordered_set devices_in_intersection = {}; + for (auto& intersection_device : intersection) { + devices_in_intersection.insert(intersection_device); + } + for (auto& parent_device : parent) { + if (devices_in_intersection.find(parent_device) == devices_in_intersection.end()) { + relative_complement.push_back(CoreRange(parent_device)); + } + } + LogicalDeviceRangeSet merged_complement = relative_complement[0]; + for (int i = 1; i < relative_complement.size(); i++) { + merged_complement = merged_complement.merge(relative_complement[i]); + } + return merged_complement; +} + } // namespace 
tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_workload_utils.hpp b/tt_metal/distributed/mesh_workload_utils.hpp index c4fd759a5c6..577aff84af7 100644 --- a/tt_metal/distributed/mesh_workload_utils.hpp +++ b/tt_metal/distributed/mesh_workload_utils.hpp @@ -20,6 +20,6 @@ void write_go_signal( bool send_unicasts, int num_unicast_txns = -1); -LogicalDeviceRange convex_relative_complement(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection); +LogicalDeviceRangeSet relative_complement(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection); } // namespace tt::tt_metal::distributed From 99e8f45516093967fd56ff3de98efa47868a3a02 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Thu, 20 Feb 2025 19:27:36 -0800 Subject: [PATCH 211/316] [skip ci] Remove Taskflow from tt-train dependencies (#18078) --- tt-train/cmake/dependencies.cmake | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tt-train/cmake/dependencies.cmake b/tt-train/cmake/dependencies.cmake index c29e4a9231f..c98149d5bdd 100644 --- a/tt-train/cmake/dependencies.cmake +++ b/tt-train/cmake/dependencies.cmake @@ -84,11 +84,6 @@ CPMAddPackage( "XTENSOR_ENABLE_TESTS OFF" ) -CPMAddPackage(NAME Taskflow GITHUB_REPOSITORY taskflow/taskflow GIT_TAG v3.7.0 OPTIONS "TF_BUILD_TESTS OFF") -if(Taskflow_ADDED AND NOT TARGET Taskflow::Taskflow) - add_library(Taskflow::Taskflow ALIAS Taskflow) -endif() - include(${PROJECT_SOURCE_DIR}/cmake/fetch_cli11.cmake) # gersemi: off From d9263f289e069fdeda588154e066ff3ab4ea4426 Mon Sep 17 00:00:00 2001 From: Aleksandar Djordjevic Date: Fri, 21 Feb 2025 11:48:57 +0100 Subject: [PATCH 212/316] Printing packer's and unpacker's configuration registers (#17368) ### Ticket [Link to Github Issue] (https://github.com/tenstorrent/tt-metal/issues/16229) ### Problem description Implementing dprint functions for configuration registers for packer and unpacker. ### What's changed Used readers previously implemented in LLK to get configuration registers and then used DPRINT to print these. 
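For illustration only (this snippet is not part of the diff below), a minimal compute-kernel sketch of how the new helpers might be called. The header paths and the no-argument signatures come from the files added in this patch; the `NAMESPACE`/`MAIN` scaffolding is just the usual compute-kernel boilerplate assumed for the example:

```cpp
// Hedged usage sketch: dump a few packer/unpacker config registers via DPRINT.
#include "debug/dprint_tensix_pack.h"
#include "debug/dprint_tensix_unpack.h"

namespace NAMESPACE {
void MAIN {
    // Unpacker-side configuration registers.
    dprint_tensix_unpack_tile_descriptor();
    dprint_tensix_unpack_config();

    // Packer-side configuration registers.
    dprint_tensix_pack_config();
    dprint_tensix_pack_counters();
}
}  // namespace NAMESPACE
```

Each helper reads the corresponding configuration registers through the existing LLK readers and prints the decoded fields.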
Added two new files (dprint_tensix_pack.h and dprint_tensix_unpack.h) containing APIs with the following names: - dprint_tensix_alu_config - dprint_tensix_unpack_tile_descriptor - dprint_tensix_unpack_config - dprint_tensix_pack_config - dprint_tensix_pack_relu_config - dprint_tensix_dest_rd_ctrl - dprint_tensix_pack_edge_offset - dprint_tensix_pack_counters - dprint_tensix_pack_strides ### Checklist - [ ] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests pass - [ ] New/Existing tests provide coverage for changes --- .../tt_metal/debug_tools/CMakeLists.txt | 1 + .../dprint/test_print_config_register.cpp | 595 ++++++++++++++++ .../dataflow/writer_config_reg.cpp | 362 ++++++++++ tt_metal/hw/inc/debug/dprint_tensix.h | 77 +++ tt_metal/hw/inc/debug/dprint_tensix_pack.h | 634 ++++++++++++++++++ tt_metal/hw/inc/debug/dprint_tensix_unpack.h | 508 ++++++++++++++ tt_metal/third_party/tt_llk_grayskull | 2 +- 7 files changed, 2178 insertions(+), 1 deletion(-) create mode 100644 tests/tt_metal/tt_metal/debug_tools/dprint/test_print_config_register.cpp create mode 100644 tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp create mode 100644 tt_metal/hw/inc/debug/dprint_tensix_pack.h create mode 100644 tt_metal/hw/inc/debug/dprint_tensix_unpack.h diff --git a/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt b/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt index 7244ca3e45a..7c7f56bb74d 100644 --- a/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt @@ -11,6 +11,7 @@ set(UNIT_TESTS_DEBUG_TOOLS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_tensix_dest.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_tiles.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_raise_wait.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_config_register.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_assert.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_link_training.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize_delays.cpp diff --git a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_config_register.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_config_register.cpp new file mode 100644 index 00000000000..60212f12e89 --- /dev/null +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_config_register.cpp @@ -0,0 +1,595 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include +#include "debug_tools_fixture.hpp" +#include "gtest/gtest.h" +#include "debug_tools_test_utils.hpp" +#include +#include +#include "tt_metal/test_utils/df/df.hpp" +#include "tt_metal/test_utils/stimulus.hpp" +////////////////////////////////////////////////////////////////////////////////////////// +// A test for checking dprint +////////////////////////////////////////////////////////////////////////////////////////// +using namespace tt; +using namespace tt::tt_metal; +using namespace tt::test_utils; +using namespace tt::test_utils::df; + +// Register names +#define ALU_CONFIG 0 +#define UNPACK_TILE_DESCRIPTOR 1 +#define UNPACK_CONFIG 2 +#define PACK_CONFIG 3 +#define RELU_CONFIG 4 +#define DEST_RD_CTRL 5 +#define PACK_EDGE_OFFSET 6 +#define PACK_COUNTERS 7 +#define PACK_STRIDES 8 + +// Type of prints +const std::unordered_set format_fields = {"ALU_FORMAT_SPEC_REG0_SrcA", "ALU_FORMAT_SPEC_REG1_SrcB", + "ALU_FORMAT_SPEC_REG2_Dstacc", "in_data_format", "out_data_format"}; +const std::unordered_set decimal_fields = { + "blobs_per_xy_plane", + "x_dim", + "y_dim", + "z_dim", + "w_dim", + "blobs_y_start", + "digest_size", + "upsample_rate", + "shift_amount", + "fifo_size", + "row_ptr_section_size", + "exp_section_size", + "pack_per_xy_plane", + "downsample_shift_count", + "exp_threshold", + "STACC_RELU_ReluThreshold", + "pack_reads_per_xy_plane", + "pack_xys_per_til", + "pack_per_xy_plane_offset", + "sub_l1_tile_header_size", + "add_tile_header_size"}; + +// ALU CONFIG +const std::vector field_names_alu_config_all = { + "ALU_ROUNDING_MODE_Fpu_srnd_en", + "ALU_ROUNDING_MODE_Gasket_srnd_en", + "ALU_ROUNDING_MODE_Packer_srnd_en", + "ALU_ROUNDING_MODE_Padding", + "ALU_ROUNDING_MODE_GS_LF", + "ALU_ROUNDING_MODE_Bfp8_HF", + "ALU_FORMAT_SPEC_REG0_SrcAUnsigned", + "ALU_FORMAT_SPEC_REG0_SrcBUnsigned", + "ALU_FORMAT_SPEC_REG0_SrcA", + "ALU_FORMAT_SPEC_REG1_SrcB", + "ALU_FORMAT_SPEC_REG2_Dstacc", + "ALU_ACC_CTRL_Fp32_enabled", + "ALU_ACC_CTRL_SFPU_Fp32_enabled", + "ALU_ACC_CTRL_INT8_math_enabled"}; +const std::vector field_values_alu_config_all = {1, 0, 1, 15, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1}; + +// PACK_EDGE_OFFSET +const std::vector field_names_pack_edge_offset_all = { + "mask", + "mode", + "tile_row_set_select_pack0", + "tile_row_set_select_pack1", + "tile_row_set_select_pack2", + "tile_row_set_select_pack3", + "reserved"}; +const std::vector field_values_pack_edge_offset_all = {16, 1, 0, 1, 2, 3, 0}; + +// PACK_COUNTERS +const std::vector field_names_pack_counters_all = { + "pack_per_xy_plane", + "pack_reads_per_xy_plane", + "pack_xys_per_til", + "pack_yz_transposed", + "pack_per_xy_plane_offset"}; +const std::vector field_values_pack_counters_all = {4, 8, 2, 0, 6}; + +// RELU_CONFIG +const std::vector field_names_relu_config_all = { + "ALU_ACC_CTRL_Zero_Flag_disabled_src", + "ALU_ACC_CTRL_Zero_Flag_disabled_dst", + "STACC_RELU_ApplyRelu", + "STACC_RELU_ReluThreshold", + "DISABLE_RISC_BP_Disable_main", + "DISABLE_RISC_BP_Disable_trisc", + "DISABLE_RISC_BP_Disable_ncrisc", + "DISABLE_RISC_BP_Disable_bmp_clear_main", + "DISABLE_RISC_BP_Disable_bmp_clear_trisc", + "DISABLE_RISC_BP_Disable_bmp_clear_ncrisc"}; +const std::vector field_values_relu_config_all = {0, 0, 1, 8, 0, 0, 0, 0, 0, 0}; + +// PACK_DEST_RD_CTRL +const std::vector field_names_dest_rd_ctrl_all = { + "PCK_DEST_RD_CTRL_Read_32b_data", + "PCK_DEST_RD_CTRL_Read_unsigned", + "PCK_DEST_RD_CTRL_Read_int8", + "PCK_DEST_RD_CTRL_Round_10b_mant", + 
"PCK_DEST_RD_CTRL_Reserved"}; +const std::vector field_values_dest_rd_ctrl_all = {1, 0, 1, 1, 0}; + +// UNPACK TILE DESCRIPTOR +const std::vector field_names_unpack_tile_descriptor_grayskull = { + "in_data_format", + "uncompressed", + "reserved_0", + "blobs_per_xy_plane", + "reserved_1", + "x_dim", + "y_dim", + "z_dim", + "w_dim", + "blobs_y_start", + "digest_type", + "digest_size"}; +const std::vector field_values_unpack_tile_descriptor_grayskull = {5, 1, 2, 10, 7, 2, 4, 8, 16, 32, 0, 0}; + +// UNPACK CONFIG +const std::vector field_names_unpack_config_grayskull = { + "out_data_format", + "throttle_mode", + "context_count", + "haloize_mode", + "tileize_mode", + "force_shared_exp", + "reserved_0", + "upsample_rate", + "upsample_and_interlave", + "shift_amount", + "uncompress_cntx0_3", + "reserved_1", + "uncompress_cntx4_7", + "reserved_2", + "limit_addr", + "fifo_size"}; +const std::vector field_values_unpack_config_grayskull = {0, 1, 2, 0, 1, 0, 0, 3, 0, 16, 5, 0, 2, 0, 28, 29}; + +// PACK CONFIG +const std::vector field_names_pack_config_grayskull = { + "row_ptr_section_size", + "exp_section_size", + "l1_dest_addr", + "uncompress", + "add_l1_dest_addr_offset", + "reserved_0", + "out_data_format", + "in_data_format", + "reserved_1", + "src_if_sel", + "pack_per_xy_plane", + "l1_src_addr", + "downsample_mask", + "downsample_shift_count", + "read_mode", + "exp_threshold_en", + "reserved_2", + "exp_threshold"}; +const std::vector field_values_pack_config_grayskull = { + 12, 24, 16, 0, 1, 0, 5, 5, 0, 1, 0, 8, 12, 4, 0, 1, 0, 12}; + +// UNPACK TILE DESCRIPTOR +const std::vector field_names_unpack_tile_descriptor_wormhole_or_blackhole = { + "in_data_format", + "uncompressed", + "reserved_0", + "blobs_per_xy_plane", + "reserved_1", + "x_dim", + "y_dim", + "z_dim", + "w_dim", + "blobs_y_start_lo", + "blobs_y_start_hi", + "digest_type", + "digest_size"}; +const std::vector field_values_unpack_tile_descriptor_wormhole_or_blackhole = { + 5, 1, 0, 10, 7, 2, 4, 8, 16, 32, 0, 0, 0}; + +// UNPACK CONFIG +const std::vector field_names_unpack_config_wormhole_or_blackhole = { + "out_data_format", + "throttle_mode", + "context_count", + "haloize_mode", + "tileize_mode", + "unpack_src_reg_set_update", + "unpack_if_sel", + "upsample_rate", + "reserved_1", + "upsample_and_interlave", + "shift_amount", + "uncompress_cntx0_3", + "unpack_if_sel_cntx0_3", + "force_shared_exp", + "reserved_2", + "uncompress_cntx4_7", + "unpack_if_sel_cntx4_7", + "reserved_3", + "limit_addr", + "reserved_4", + "fifo_size", + "reserved_5"}; +const std::vector field_values_unpack_config_wormhole_or_blackhole = {0, 1, 2, 0, 1, 1, 0, 3, 0, 0, 16, + 5, 6, 0, 0, 2, 3, 0, 28, 0, 29, 0}; + +const std::vector field_names_pack_config_blackhole = { + "row_ptr_section_size", + "exp_section_size", + "l1_dest_addr", + "uncompress", + "add_l1_dest_addr_offset", + "disable_pack_zero_flag", + "reserved_0", + "out_data_format", + "in_data_format", + "dis_shared_exp_assembler", + "auto_set_last_pacr_intf_sel", + "enable_out_fifo", + "sub_l1_tile_header_size", + "src_if_sel", + "pack_start_intf_pos", + "all_pack_disable_zero_compress_ovrd", + "add_tile_header_size", + "pack_dis_y_pos_start_offset", + "l1_src_addr"}; +const std::vector field_values_pack_config_blackhole = { + 12, 24, 16, 0, 1, 1, 0, 5, 5, 0, 0, 1, 0, 1, 2, 0, 1, 0, 8}; +// PACK CONFIG +const std::vector field_names_pack_config_wormhole = { + "row_ptr_section_size", + "exp_section_size", + "l1_dest_addr", + "uncompress", + "add_l1_dest_addr_offset", + "reserved_0", + "out_data_format", + 
"in_data_format", + "reserved_1", + "src_if_sel", + "pack_per_xy_plane", + "l1_src_addr", + "downsample_mask", + "downsample_shift_count", + "read_mode", + "exp_threshold_en", + "pack_l1_acc_disable_pack_zero_flag", + "reserved_2", + "exp_threshold"}; +const std::vector field_values_pack_config_wormhole = { + 12, 24, 16, 0, 1, 0, 5, 5, 0, 1, 0, 8, 12, 4, 0, 1, 2, 0, 12}; + +// Configuration for Data Flow Test involving Reader, Datacopy, and Writer +struct ConfigRegPrintTestConfig { + CoreCoord core = {}; + std::string write_kernel; + std::string print_kernel; + int num_of_registers; + std::vector field_names; + std::vector field_values; + uint32_t register_name; +}; + +// Dprints data format as string given an uint +static std::string data_format_to_string(uint8_t data_format) { + switch (data_format) { + case (uint8_t) DataFormat::Float32: + return "Float32"; + case (uint8_t) DataFormat::Float16: + return "Float16"; + case (uint8_t) DataFormat::Bfp8: + return "Bfp8"; + case (uint8_t) DataFormat::Bfp4: + return "Bfp4"; + case (uint8_t) DataFormat::Bfp2: + return "Bfp2"; + case (uint8_t) DataFormat::Float16_b: + return "Float16_b"; + case (uint8_t) DataFormat::Bfp8_b: + return "Bfp8_b"; + case (uint8_t) DataFormat::Bfp4_b: + return "Bfp4_b"; + case (uint8_t) DataFormat::Bfp2_b: + return "Bfp2_b"; + case (uint8_t) DataFormat::Lf8: + return "Lf8"; + case (uint8_t) DataFormat::Int8: + return "Int8"; + case (uint8_t) DataFormat::UInt8: + return "UInt8"; + case (uint8_t) DataFormat::UInt16: + return "UInt16"; + case (uint8_t) DataFormat::Int32: + return "Int32"; + case (uint8_t) DataFormat::UInt32: + return "UInt32"; + case (uint8_t) DataFormat::Tf32: + return "Tf32"; + default: + return "INVALID DATA FORMAT"; + } +} + +static std::string int_to_hex(int value) { + std::stringstream ss; + ss << std::hex << value; // Convert to hexadecimal + return ss.str(); +} + +// Prepares the compute kernel with the specified program and test configuration +static KernelHandle prepare_writer(tt_metal::Program& program, const ConfigRegPrintTestConfig& config) { + return tt_metal::CreateKernel( + program, + config.write_kernel, + config.core, + tt_metal::ComputeConfig{ + .compile_args = { config.register_name }}); +} + +static std::string generate_golden_output(const std::vector& field_names, const std::vector& values, uint num_of_registers, uint32_t register_name) { + std::string golden_output; + bool multiple_registers = num_of_registers > 1; + for (uint reg_id = 1; reg_id <= num_of_registers; reg_id++) { + if (multiple_registers) golden_output += "REG_ID: " + std::to_string(reg_id) + "\n"; + for (size_t i = 0; i < field_names.size(); i++) { + if (field_names[i] == "blobs_y_start_lo") continue; + if (field_names[i] == "blobs_y_start_hi") { + uint32_t val = (values[i] << 16) | values[i-1]; + golden_output += "blobs_y_start: " + std::to_string(val) + "\n"; + continue; + } + if (format_fields.find(field_names[i]) != format_fields.end()) + golden_output += field_names[i] + ": " + data_format_to_string(values[i]) + "\n"; + else if (decimal_fields.find(field_names[i]) != format_fields.end()) + golden_output += field_names[i] + ": " + std::to_string(values[i]) + "\n"; + else { + golden_output += field_names[i] + ": 0x" + int_to_hex(values[i]) + "\n"; + } + + if (register_name == PACK_EDGE_OFFSET && reg_id > 1) break; + } + if (reg_id != num_of_registers) golden_output += "\n"; + } + return golden_output; +} + +static void print_config_reg( + DPrintFixture* fixture, tt_metal::IDevice* device, const 
ConfigRegPrintTestConfig& config) { + // Create program + tt_metal::Program program = tt_metal::CreateProgram(); + + // Prepare write kernel + auto write_kernel = prepare_writer(program, config); + + // Generate golden output + std::string golden_output = generate_golden_output(config.field_names, config.field_values, config.num_of_registers, config.register_name); + + // Run the program + fixture->RunProgram(device, program); + + // Check the print log against golden output. + EXPECT_TRUE(FilesMatchesString(DPrintFixture::dprint_file_name, golden_output)); +} + +TEST_F(DPrintFixture, ConfigRegAluTestPrint) { + std::vector field_names_alu_config = field_names_alu_config_all; + std::vector field_values_alu_config = field_values_alu_config_all; + + // Setup test configuration + ConfigRegPrintTestConfig test_config = { + .core = CoreCoord(0, 0), + .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", + .num_of_registers = 1, + .field_names = field_names_alu_config, + .field_values = field_values_alu_config, + .register_name = ALU_CONFIG}; + + if (this->arch_ == ARCH::GRAYSKULL) { + GTEST_SKIP() << "Printing ALU CONFIG is not supported on grayskull."; + } + + // Run the test on the device + this->RunTestOnDevice( + [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, + this->devices_[0]); +} + +TEST_F(DPrintFixture, ConfigRegTileDescriptorTestPrint) { + // Setup test configuration + + std::vector field_names_unpack_tile_descriptor; + std::vector field_values_unpack_tile_descriptor; + + if (this->arch_ == ARCH::GRAYSKULL) { + field_names_unpack_tile_descriptor = field_names_unpack_tile_descriptor_grayskull; + field_values_unpack_tile_descriptor = field_values_unpack_tile_descriptor_grayskull; + } else { + field_names_unpack_tile_descriptor = field_names_unpack_tile_descriptor_wormhole_or_blackhole; + field_values_unpack_tile_descriptor = field_values_unpack_tile_descriptor_wormhole_or_blackhole; + } + + ConfigRegPrintTestConfig test_config = { + .core = CoreCoord(0, 0), + .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", + .num_of_registers = 2, + .field_names = field_names_unpack_tile_descriptor, + .field_values = field_values_unpack_tile_descriptor, + .register_name = UNPACK_TILE_DESCRIPTOR}; + + // Run the test on the device + this->RunTestOnDevice( + [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, + this->devices_[0]); +} + +TEST_F(DPrintFixture, ConfigRegUnpackTestPrint) { + std::vector field_names_unpack_config; + std::vector field_values_unpack_config; + + if (this->arch_ == ARCH::GRAYSKULL) { + field_names_unpack_config = field_names_unpack_config_grayskull; + field_values_unpack_config = field_values_unpack_config_grayskull; + } else { + field_names_unpack_config = field_names_unpack_config_wormhole_or_blackhole; + field_values_unpack_config = field_values_unpack_config_wormhole_or_blackhole; + } + + // Setup test configuration + ConfigRegPrintTestConfig test_config = { + .core = CoreCoord(0, 0), + .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", + .num_of_registers = 2, + .field_names = field_names_unpack_config, + .field_values = field_values_unpack_config, + .register_name = UNPACK_CONFIG}; + + // Run the test on the device + this->RunTestOnDevice( + [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, + this->devices_[0]); +} + 
+TEST_F(DPrintFixture, ConfigRegPackTestPrint) { + std::vector field_names_pack_config; + std::vector field_values_pack_config; + + if (this->arch_ == ARCH::GRAYSKULL) { + field_names_pack_config = field_names_pack_config_grayskull; + field_values_pack_config = field_values_pack_config_grayskull; + } else if (this->arch_ == ARCH::WORMHOLE_B0) { + field_names_pack_config = field_names_pack_config_wormhole; + field_values_pack_config = field_values_pack_config_wormhole; + } else { + field_names_pack_config = field_names_pack_config_blackhole; + field_values_pack_config = field_values_pack_config_blackhole; + } + + int num_of_registers; + if (this->arch_ == ARCH::BLACKHOLE) { + num_of_registers = 1; + } else { + num_of_registers = 4; + } + + // Setup test configuration + ConfigRegPrintTestConfig test_config = { + .core = CoreCoord(0, 0), + .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", + .num_of_registers = num_of_registers, + .field_names = field_names_pack_config, + .field_values = field_values_pack_config, + .register_name = PACK_CONFIG}; + + // Run the test on the device + this->RunTestOnDevice( + [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, + this->devices_[0]); +} + +TEST_F(DPrintFixture, ConfigRegReluTestPrint) { + std::vector field_names_relu_config = field_names_relu_config_all; + std::vector field_values_relu_config = field_values_relu_config_all; + + // Setup test configuration + ConfigRegPrintTestConfig test_config = { + .core = CoreCoord(0, 0), + .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", + .num_of_registers = 1, + .field_names = field_names_relu_config, + .field_values = field_values_relu_config, + .register_name = RELU_CONFIG}; + + if (this->arch_ == ARCH::GRAYSKULL) { + GTEST_SKIP() << "Printing RELU CONFIG is not supported on grayskull."; + } + + // Run the test on the device + this->RunTestOnDevice( + [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, + this->devices_[0]); +} + +TEST_F(DPrintFixture, ConfigRegDestRdCtrlTestPrint) { + std::vector field_names_dest_rd_ctrl = field_names_dest_rd_ctrl_all; + std::vector field_values_dest_rd_ctrl = field_values_dest_rd_ctrl_all; + + // Setup test configuration + ConfigRegPrintTestConfig test_config = { + .core = CoreCoord(0, 0), + .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", + .num_of_registers = 1, + .field_names = field_names_dest_rd_ctrl, + .field_values = field_values_dest_rd_ctrl, + .register_name = DEST_RD_CTRL}; + + if (this->arch_ == ARCH::GRAYSKULL) { + GTEST_SKIP() << "Printing DEST RD CTRL is not supported on grayskull."; + } + + // Run the test on the device + this->RunTestOnDevice( + [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, + this->devices_[0]); +} + +TEST_F(DPrintFixture, ConfigRegPackEdgeOffsetTestPrint) { + std::vector field_names_pack_edge_offset = field_names_pack_edge_offset_all; + std::vector field_values_pack_edge_offset = field_values_pack_edge_offset_all; + + int num_of_registers; + if (this->arch_ == ARCH::BLACKHOLE) { + num_of_registers = 1; + } else { + num_of_registers = 4; + } + + // Setup test configuration + ConfigRegPrintTestConfig test_config = { + .core = CoreCoord(0, 0), + .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", + .num_of_registers = num_of_registers, + .field_names = 
field_names_pack_edge_offset, + .field_values = field_values_pack_edge_offset, + .register_name = PACK_EDGE_OFFSET}; + + // Run the test on the device + this->RunTestOnDevice( + [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, + this->devices_[0]); +} + +TEST_F(DPrintFixture, ConfigRegPackCountersTestPrint) { + std::vector field_names_pack_counters = field_names_pack_counters_all; + std::vector field_values_pack_counters = field_values_pack_counters_all; + + int num_of_registers; + if (this->arch_ == ARCH::BLACKHOLE) { + num_of_registers = 1; + } else { + num_of_registers = 4; + } + + // Setup test configuration + ConfigRegPrintTestConfig test_config = { + .core = CoreCoord(0, 0), + .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", + .num_of_registers = num_of_registers, + .field_names = field_names_pack_counters, + .field_values = field_values_pack_counters, + .register_name = PACK_COUNTERS}; + + // Run the test on the device + this->RunTestOnDevice( + [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, + this->devices_[0]); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp new file mode 100644 index 00000000000..8124417544a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp @@ -0,0 +1,362 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "debug/dprint_tensix_pack.h" +#include "debug/dprint_tensix_unpack.h" + +#include + +// Register names +#define ALU_CONFIG 0 +#define UNPACK_TILE_DESCRIPTOR 1 +#define UNPACK_CONFIG 2 +#define PACK_CONFIG 3 +#define RELU_CONFIG 4 +#define DEST_RD_CTRL 5 +#define PACK_EDGE_OFFSET 6 +#define PACK_COUNTERS 7 +#define PACK_STRIDES 8 + +namespace NAMESPACE { +#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) +void generate_alu_config(ckernel::unpacker::alu_config_t& config) { + config.ALU_ROUNDING_MODE_Fpu_srnd_en = 1; + config.ALU_ROUNDING_MODE_Gasket_srnd_en = 0; + config.ALU_ROUNDING_MODE_Packer_srnd_en = 1; + config.ALU_ROUNDING_MODE_Padding = 15; + config.ALU_ROUNDING_MODE_GS_LF = 0; + config.ALU_ROUNDING_MODE_Bfp8_HF = 1; + config.ALU_FORMAT_SPEC_REG0_SrcAUnsigned = 1; + config.ALU_FORMAT_SPEC_REG0_SrcBUnsigned = 0; + config.ALU_FORMAT_SPEC_REG0_SrcA = 0; + config.ALU_FORMAT_SPEC_REG1_SrcB = 1; + config.ALU_FORMAT_SPEC_REG2_Dstacc = 0; + config.ALU_ACC_CTRL_Fp32_enabled = 0; + config.ALU_ACC_CTRL_SFPU_Fp32_enabled = 0; + config.ALU_ACC_CTRL_INT8_math_enabled = 1; +} +#endif + +void generate_unpack_tile_descriptor(ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + tile_descriptor.in_data_format = 5; + tile_descriptor.uncompressed = 1; + tile_descriptor.reserved_0 = 0; + tile_descriptor.blobs_per_xy_plane = 10; + tile_descriptor.reserved_1 = 7; + tile_descriptor.x_dim = 2; + tile_descriptor.y_dim = 4; + tile_descriptor.z_dim = 8; + tile_descriptor.w_dim = 16; +#ifdef ARCH_GRAYSKULL + tile_descriptor.blobs_y_start = 32; +#else // ARCH_WORMHOLE or ARCH_BLACKHOLE + tile_descriptor.blobs_y_start_lo = 32; + tile_descriptor.blobs_y_start_hi = 0; +#endif + tile_descriptor.digest_type = 0; + tile_descriptor.digest_size = 0; +} + +void generate_unpack_config(ckernel::unpacker::unpack_config_t& config) { + config.out_data_format = 0; + config.throttle_mode = 1; + config.context_count = 2; + config.haloize_mode = 0; + 
config.tileize_mode = 1; + config.upsample_rate = 3; + config.reserved_1 = 0; + config.upsamle_and_interlave = 0; + config.shift_amount = 16; + config.uncompress_cntx0_3 = 5; + config.force_shared_exp = 0; + config.reserved_2 = 0; + config.uncompress_cntx4_7 = 2; + config.limit_addr = 28; + config.fifo_size = 29; + +#ifdef ARCH_GRAYSKULL + config.reserved_0 = 0; +#else // ARCH_WORMHOLE or ARCH_BLACKHOLE + config.reserved_3 = 0; + config.reserved_4 = 0; + config.reserved_5 = 0; + config.unpack_if_sel_cntx0_3 = 6; + config.unpack_if_sel_cntx4_7 = 3; + config.unpack_src_reg_set_update = 1; + config.unpack_if_sel = 0; +#endif +} + +void generate_pack_config(ckernel::packer::pack_config_t& config) { + config.row_ptr_section_size = 12; + config.exp_section_size = 24; + config.l1_dest_addr = 16; + config.uncompress = 0; + config.add_l1_dest_addr_offset = 1; + config.reserved_0 = 0; + config.out_data_format = 5; + config.in_data_format = 5; + config.src_if_sel = 1; + config.l1_src_addr = 8; +#if defined(ARCH_WORMHOLE) or defined(ARCH_GRAYSKULL) + config.reserved_1 = 0; + config.pack_per_xy_plane = 0; + config.downsample_mask = 12; + config.downsample_shift_count = 4; + config.read_mode = 0; + config.exp_threshold_en = 1; +#ifdef ARCH_WORMHOLE + config.pack_l1_acc_disable_pack_zero_flag = 2; +#endif + config.reserved_2 = 0; + config.exp_threshold = 12; +#endif +#ifdef ARCH_BLACKHOLE + config.disable_pack_zero_flag = 1; + config.dis_shared_exp_assembler = 0; + config.auto_set_last_pacr_intf_sel = 0; + config.enable_out_fifo = 1; + config.sub_l1_tile_header_size = 0; + config.pack_start_intf_pos = 2; + config.all_pack_disable_zero_compress_ovrd = 0; + config.add_tile_header_size = 1; + config.pack_dis_y_pos_start_offset = 0; +#endif +} + +#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) +void generate_relu_config(ckernel::packer::relu_config_t& config) { + config.ALU_ACC_CTRL_Zero_Flag_disabled_src = 0; + config.ALU_ACC_CTRL_Zero_Flag_disabled_dst = 0; + config.STACC_RELU_ApplyRelu = 1; + config.STACC_RELU_ReluThreshold = 8; + config.DISABLE_RISC_BP_Disable_main = 0; + config.DISABLE_RISC_BP_Disable_trisc = 0; + config.DISABLE_RISC_BP_Disable_ncrisc = 0; + config.DISABLE_RISC_BP_Disable_bmp_clear_main = 0; + config.DISABLE_RISC_BP_Disable_bmp_clear_trisc = 0; + config.DISABLE_RISC_BP_Disable_bmp_clear_ncrisc = 0; +} +#endif + +#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) +void generate_dest_rd_ctrl(ckernel::packer::dest_rd_ctrl_t& dest) { + dest.PCK_DEST_RD_CTRL_Read_32b_data = 1; + dest.PCK_DEST_RD_CTRL_Read_unsigned = 0; + dest.PCK_DEST_RD_CTRL_Read_int8 = 1; + dest.PCK_DEST_RD_CTRL_Round_10b_mant = 1; + dest.PCK_DEST_RD_CTRL_Reserved = 0; +} +#endif + +void generate_pack_edge_offset(ckernel::packer::pck_edge_offset_t& edge) { + edge.mask = 16; + edge.mode = 1; + edge.tile_row_set_select_pack0 = 0; + edge.tile_row_set_select_pack1 = 1; + edge.tile_row_set_select_pack2 = 2; + edge.tile_row_set_select_pack3 = 3; + edge.reserved = 0; +} + +void generate_pack_counters(ckernel::packer::pack_counters_t& counter) { + counter.pack_per_xy_plane = 4; + counter.pack_reads_per_xy_plane = 8; + counter.pack_xys_per_til = 2; + counter.pack_yz_transposed = 0; + counter.pack_per_xy_plane_offset = 6; +} + +#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) +void write_alu_config(volatile uint tt_reg_ptr* cfg, uint32_t address, const ckernel::unpacker::alu_config_u &config) { + cfg[address] = config.val; +} +#endif + +void write_unpack_tile_descriptor(volatile uint tt_reg_ptr* cfg, uint32_t address, 
uint num_of_words, const ckernel::unpacker::unpack_tile_descriptor_u &tile_descriptor) { + for (uint i = 0; i < num_of_words; i++) + cfg[address + i] = tile_descriptor.val[i]; +} + +void write_unpack_config(volatile uint tt_reg_ptr* cfg, uint32_t address, uint num_of_words, const ckernel::unpacker::unpack_config_u &config) { + for (uint i = 0; i < num_of_words; i++) + cfg[address + i] = config.val[i]; +} + +void write_pack_config(volatile uint tt_reg_ptr* cfg, uint32_t address, uint num_of_words, const ckernel::packer::pack_config_u &config) { + for (uint i = 0; i < num_of_words; i++) + cfg[address + i] = config.val[i]; +} + +#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) +void write_relu_config(volatile uint tt_reg_ptr* cfg, uint32_t address, uint num_of_words, const ckernel::packer::relu_config_u &config) { + for (uint i = 0; i < num_of_words; i++) + cfg[address + i] = config.val[i]; +} +#endif + +#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) +void write_dest_rd_ctrl(volatile uint tt_reg_ptr* cfg, uint32_t address, const ckernel::packer::dest_rd_ctrl_u &dest) { + cfg[address] = dest.val; +} +#endif + +void write_pack_edge_offset(volatile uint tt_reg_ptr* cfg, uint32_t address, const ckernel::packer::pck_edge_offset_u &edge) { + cfg[address] = edge.val; +} + +void write_pack_counters(volatile uint tt_reg_ptr* cfg, uint32_t address, const ckernel::packer::pack_counters_u &counter) { + cfg[address] = counter.val; +} + +void MAIN { + uint32_t register_name = get_compile_time_arg_val(0); + + // Get pointer to registers for current state ID + volatile uint tt_reg_ptr* cfg = get_cfg_pointer(); + + switch (register_name) { + #if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) + case ALU_CONFIG: + ckernel::unpacker::alu_config_u alu_config; + generate_alu_config(alu_config.f); + ckernel::unpacker::alu_config_u alu_config_original; + alu_config_original.f = ckernel::unpacker::read_alu_config(); + write_alu_config(cfg, ALU_ROUNDING_MODE_Fpu_srnd_en_ADDR32, alu_config); + dprint_tensix_alu_config(); + write_alu_config(cfg, ALU_ROUNDING_MODE_Fpu_srnd_en_ADDR32, alu_config_original); + break; + #endif + case UNPACK_TILE_DESCRIPTOR: + ckernel::unpacker::unpack_tile_descriptor_u tile_descriptor; + generate_unpack_tile_descriptor(tile_descriptor.f); + std::array tile_descriptor_vec; + tile_descriptor_vec = ckernel::unpacker::read_unpack_tile_descriptor(); + write_unpack_tile_descriptor(cfg, THCON_SEC0_REG0_TileDescriptor_ADDR32, 4, tile_descriptor); + write_unpack_tile_descriptor(cfg, THCON_SEC1_REG0_TileDescriptor_ADDR32, 4, tile_descriptor); + dprint_tensix_unpack_tile_descriptor(); + tile_descriptor.f = tile_descriptor_vec[0]; + write_unpack_tile_descriptor(cfg, THCON_SEC0_REG0_TileDescriptor_ADDR32, 4, tile_descriptor); + tile_descriptor.f = tile_descriptor_vec[1]; + write_unpack_tile_descriptor(cfg, THCON_SEC1_REG0_TileDescriptor_ADDR32, 4, tile_descriptor); + break; + case UNPACK_CONFIG: + uint num_of_words_unpack_config; + #ifdef ARCH_GRAYSKULL + num_of_words_unpack_config = 3; + #else + num_of_words_unpack_config = 4; + #endif + ckernel::unpacker::unpack_config_u unpack_config; + generate_unpack_config(unpack_config.f); + std::array unpack_config_vec; + unpack_config_vec = ckernel::unpacker::read_unpack_config(); + write_unpack_config(cfg, THCON_SEC0_REG2_Out_data_format_ADDR32, num_of_words_unpack_config, unpack_config); + write_unpack_config(cfg, THCON_SEC1_REG2_Out_data_format_ADDR32, num_of_words_unpack_config, unpack_config); + dprint_tensix_unpack_config(); + 
unpack_config.f = unpack_config_vec[0]; + write_unpack_config(cfg, THCON_SEC0_REG2_Out_data_format_ADDR32, num_of_words_unpack_config, unpack_config); + unpack_config.f = unpack_config_vec[1]; + write_unpack_config(cfg, THCON_SEC1_REG2_Out_data_format_ADDR32, num_of_words_unpack_config, unpack_config); + break; + case PACK_CONFIG: + uint num_of_words_pack_config; + #ifdef ARCH_BLACKHOLE + num_of_words_pack_config = 3; + #else + num_of_words_pack_config = 4; + #endif + ckernel::packer::pack_config_u pack_config; + generate_pack_config(pack_config.f); + std::array pack_config_vec; + pack_config_vec = ckernel::packer::read_pack_config(); + write_pack_config(cfg, THCON_SEC0_REG1_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); + #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) + write_pack_config(cfg, THCON_SEC0_REG8_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); + write_pack_config(cfg, THCON_SEC1_REG1_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); + write_pack_config(cfg, THCON_SEC1_REG8_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); + #endif + dprint_tensix_pack_config(); + pack_config.f = pack_config_vec[0]; + write_pack_config(cfg, THCON_SEC0_REG1_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); + #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) + pack_config.f = pack_config_vec[1]; + write_pack_config(cfg, THCON_SEC0_REG8_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); + pack_config.f = pack_config_vec[2]; + write_pack_config(cfg, THCON_SEC1_REG1_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); + pack_config.f = pack_config_vec[3]; + write_pack_config(cfg, THCON_SEC1_REG8_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); + #endif + break; + #if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) + case RELU_CONFIG: + ckernel::packer::relu_config_u relu_config; + generate_relu_config(relu_config.r); + ckernel::packer::relu_config_u relu_config_original; + relu_config_original.r = ckernel::packer::read_relu_config(); + write_relu_config(cfg, ALU_ACC_CTRL_Zero_Flag_disabled_src_ADDR32, 1, relu_config); + dprint_tensix_pack_relu_config(); + write_relu_config(cfg, ALU_ACC_CTRL_Zero_Flag_disabled_src_ADDR32, 1, relu_config_original); + break; + #endif + #if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) + case DEST_RD_CTRL: + ckernel::packer::dest_rd_ctrl_u dest; + generate_dest_rd_ctrl(dest.f); + ckernel::packer::dest_rd_ctrl_u dest_original; + dest_original.f = ckernel::packer::read_dest_rd_ctrl(); + write_dest_rd_ctrl(cfg, PCK_DEST_RD_CTRL_Read_32b_data_ADDR32, dest); + dprint_tensix_dest_rd_ctrl(); + write_dest_rd_ctrl(cfg, PCK_DEST_RD_CTRL_Read_32b_data_ADDR32, dest_original); + break; + #endif + case PACK_EDGE_OFFSET: + ckernel::packer::pck_edge_offset_u edge; + generate_pack_edge_offset(edge.f); + std::array edge_vec; + edge_vec = ckernel::packer::read_pack_edge_offset(); + write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC0_mask_ADDR32, edge); + #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) + write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC1_mask_ADDR32, edge); + write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC2_mask_ADDR32, edge); + write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC3_mask_ADDR32, edge); + #endif + dprint_tensix_pack_edge_offset(); + edge.f = edge_vec[0]; + write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC0_mask_ADDR32, edge); + #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) 
+ edge.f = edge_vec[1]; + write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC1_mask_ADDR32, edge); + edge.f = edge_vec[2]; + write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC2_mask_ADDR32, edge); + edge.f = edge_vec[3]; + write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC3_mask_ADDR32, edge); + #endif + break; + case PACK_COUNTERS: + ckernel::packer::pack_counters_u counter; + generate_pack_counters(counter.f); + std::array counter_vec; + counter_vec = ckernel::packer::read_pack_counters(); + write_pack_counters(cfg, PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32, counter); + #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) + write_pack_counters(cfg, PACK_COUNTERS_SEC1_pack_per_xy_plane_ADDR32, counter); + write_pack_counters(cfg, PACK_COUNTERS_SEC2_pack_per_xy_plane_ADDR32, counter); + write_pack_counters(cfg, PACK_COUNTERS_SEC3_pack_per_xy_plane_ADDR32, counter); + #endif + dprint_tensix_pack_counters(); + counter.f = counter_vec[0]; + write_pack_counters(cfg, PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32, counter); + #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) + counter.f = counter_vec[1]; + write_pack_counters(cfg, PACK_COUNTERS_SEC1_pack_per_xy_plane_ADDR32, counter); + counter.f = counter_vec[2]; + write_pack_counters(cfg, PACK_COUNTERS_SEC2_pack_per_xy_plane_ADDR32, counter); + counter.f = counter_vec[3]; + write_pack_counters(cfg, PACK_COUNTERS_SEC3_pack_per_xy_plane_ADDR32, counter); + #endif + break; + } +} +} // namespace NAMESPACE diff --git a/tt_metal/hw/inc/debug/dprint_tensix.h b/tt_metal/hw/inc/debug/dprint_tensix.h index 4c1dead3047..2ea056d80d6 100644 --- a/tt_metal/hw/inc/debug/dprint_tensix.h +++ b/tt_metal/hw/inc/debug/dprint_tensix.h @@ -41,6 +41,63 @@ inline void dprint_array_with_data_type(uint32_t data_format, uint32_t* data, ui << ENDL(); } +// Dprints data format as string given an uint +inline void dprint_data_format(uint8_t data_format) { + switch (data_format) { + case (uint8_t) DataFormat::Float32: + DPRINT << "Float32"; + break; + case (uint8_t) DataFormat::Float16: + DPRINT << "Float16"; + break; + case (uint8_t) DataFormat::Bfp8: + DPRINT << "Bfp8"; + break; + case (uint8_t) DataFormat::Bfp4: + DPRINT << "Bfp4"; + break; + case (uint8_t) DataFormat::Bfp2: + DPRINT << "Bfp2"; + break; + case (uint8_t) DataFormat::Float16_b: + DPRINT << "Float16_b"; + break; + case (uint8_t) DataFormat::Bfp8_b: + DPRINT << "Bfp8_b"; + break; + case (uint8_t) DataFormat::Bfp4_b: + DPRINT << "Bfp4_b"; + break; + case (uint8_t) DataFormat::Bfp2_b: + DPRINT << "Bfp2_b"; + break; + case (uint8_t) DataFormat::Lf8: + DPRINT << "Lf8"; + break; + case (uint8_t) DataFormat::Int8: + DPRINT << "Int8"; + break; + case (uint8_t) DataFormat::UInt8: + DPRINT << "UInt8"; + break; + case (uint8_t) DataFormat::UInt16: + DPRINT << "UInt16"; + break; + case (uint8_t) DataFormat::Int32: + DPRINT << "Int32"; + break; + case (uint8_t) DataFormat::UInt32: + DPRINT << "UInt32"; + break; + case (uint8_t) DataFormat::Tf32: + DPRINT << "Tf32"; + break; + default: + DPRINT << "INVALID DATA FORMAT"; + break; + } +} + // if flag DEST_ACCESS_CFG_remap_addrs is enabled // destination register row identifiers are remmaped // bits 5:3 are rotated 543 -> 354 @@ -197,3 +254,23 @@ void dprint_tensix_dest_reg(int tile_id = 0) { uint32_t reg_val = dbg_read_cfgreg(ckernel::dbg_cfgreg::bank, reg_field_name##_ADDR32); \ DPRINT << #reg_field_name << " = " << HEX() << reg_val << ENDL(); \ } + +// Print the content of the register field given the value in the register. 
+#define DPRINT_TENSIX_CONFIG_FIELD(reg_val, reg_field_name, name, printDec) \ + { \ + uint32_t field_value = (reg_val & reg_field_name##_MASK) >> reg_field_name##_SHAMT; \ + DPRINT << name << " = "; \ + if (printDec) DPRINT << DEC(); \ + else DPRINT << "0x" << HEX(); \ + DPRINT << field_value << "; "; \ + } + +inline void dprint_tensix_struct_field(uint32_t word, uint32_t mask, uint8_t shamt, const char* name, bool printDec = false) +{ + DPRINT << name << ": "; + if (printDec) DPRINT << DEC(); + else { + DPRINT << "0x" << HEX(); + } + DPRINT << ((word & mask) >> shamt) << ENDL(); +} diff --git a/tt_metal/hw/inc/debug/dprint_tensix_pack.h b/tt_metal/hw/inc/debug/dprint_tensix_pack.h new file mode 100644 index 00000000000..7d55557c890 --- /dev/null +++ b/tt_metal/hw/inc/debug/dprint_tensix_pack.h @@ -0,0 +1,634 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "dprint.h" +#include "dprint_tensix.h" +#include "cpack_common.h" + +// NOTE: FUNCTIONS WITHOUT HELPER SUFIX ARE INTENDED TO BE USED + +// PACK CONFIG + +// These function's argument should be return value of read_pack_config() + +inline void dprint_tensix_pack_config_row_ptr_section_size(const ckernel::packer::pack_config_t& config) { + DPRINT << DEC() << config.row_ptr_section_size << ENDL(); +} + +inline void dprint_tensix_pack_config_exp_section_size(const ckernel::packer::pack_config_t& config) { + DPRINT << DEC() << config.exp_section_size << ENDL(); +} + +inline void dprint_tensix_pack_config_l1_dest_addr(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.l1_dest_addr << ENDL(); +} + +inline void dprint_tensix_pack_config_uncompressed(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.uncompress << ENDL(); +} + +inline void dprint_tensix_pack_config_add_l1_dest_addr_offset(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.add_l1_dest_addr_offset << ENDL(); +} + +inline void dprint_tensix_pack_config_reserved_0(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.reserved_0 << ENDL(); +} + +inline void dprint_tensix_pack_config_out_data_format(const ckernel::packer::pack_config_t& config) { + dprint_data_format(config.out_data_format); + DPRINT << ENDL(); +} + +inline void dprint_tensix_pack_config_in_data_format(const ckernel::packer::pack_config_t& config) { + dprint_data_format(config.in_data_format); + DPRINT << ENDL(); +} + +#if defined(ARCH_GRAYSKULL) || defined(ARCH_WORMHOLE) +inline void dprint_tensix_pack_config_reserved_1(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.reserved_1 << ENDL(); +} +#endif + +inline void dprint_tensix_pack_config_src_if_sel(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.src_if_sel << ENDL(); +} + +#if defined(ARCH_GRAYSKULL) || defined(ARCH_WORMHOLE) +inline void dprint_tensix_pack_config_pack_per_xy_plane(const ckernel::packer::pack_config_t& config) { + DPRINT << DEC() << config.pack_per_xy_plane << ENDL(); +} +#endif + +inline void dprint_tensix_pack_config_l1_src_addr(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.l1_src_addr << ENDL(); +} + +#if defined(ARCH_GRAYSKULL) || defined(ARCH_WORMHOLE) +inline void dprint_tensix_pack_config_downsample_mask(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.downsample_mask << ENDL(); +} + 
+inline void dprint_tensix_pack_config_downsample_shift_count(const ckernel::packer::pack_config_t& config) { + DPRINT << DEC() << config.downsample_shift_count << ENDL(); +} + +inline void dprint_tensix_pack_config_read_mode(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.read_mode << ENDL(); +} + +inline void dprint_tensix_pack_config_exp_threshold_en(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.exp_threshold_en << ENDL(); +} + +inline void dprint_tensix_pack_config_reserved_2(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.reserved_2 << ENDL(); +} + +inline void dprint_tensix_pack_config_exp_threshold(const ckernel::packer::pack_config_t& config) { + DPRINT << DEC() << config.exp_threshold << ENDL(); +} +#endif + +#ifdef ARCH_WORMHOLE +inline void dprint_tensix_pack_config_l1_acc_disable_pack_zero_flag(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.pack_l1_acc_disable_pack_zero_flag << ENDL(); +} +#endif + +#ifdef ARCH_BLACKHOLE +inline void dprint_tensix_pack_config_disable_pack_zero_flag(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.disable_pack_zero_flag << ENDL(); +} + +inline void dprint_tensix_pack_config_dis_shared_exp_assembler(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.dis_shared_exp_assembler << ENDL(); +} + +inline void dprint_tensix_pack_config_auto_set_last_pacr_intf_sel(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.auto_set_last_pacr_intf_sel << ENDL(); +} + +inline void dprint_tensix_pack_config_enable_out_fifo(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.enable_out_fifo << ENDL(); +} + +inline void dprint_tensix_pack_config_sub_l1_tile_header_size(const ckernel::packer::pack_config_t& config) { + DPRINT << DEC() << config.sub_l1_tile_header_size << ENDL(); +} + +inline void dprint_tensix_pack_config_pack_start_intf_pos(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.pack_start_intf_pos << ENDL(); +} + +inline void dprint_tensix_pack_config_all_pack_disable_zero_compress_ovrd( + const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.all_pack_disable_zero_compress_ovrd << ENDL(); +} + +inline void dprint_tensix_pack_config_add_tile_header_size(const ckernel::packer::pack_config_t& config) { + DPRINT << DEC() << config.add_tile_header_size << ENDL(); +} + +inline void dprint_tensix_pack_config_pack_dis_y_pos_start_offset(const ckernel::packer::pack_config_t& config) { + DPRINT << "0x" << HEX() << config.pack_dis_y_pos_start_offset << ENDL(); +} +#endif + +#ifdef ARCH_GRAYSKULL + +inline void dprint_tensix_pack_config_helper(const ckernel::packer::pack_config_t& config) { + DPRINT << "row_ptr_section_size: "; + dprint_tensix_pack_config_row_ptr_section_size(config); + DPRINT << "exp_section_size: "; + dprint_tensix_pack_config_exp_section_size(config); + DPRINT << "l1_dest_addr: "; + dprint_tensix_pack_config_l1_dest_addr(config); + DPRINT << "uncompress: "; + dprint_tensix_pack_config_uncompress(config); + DPRINT << "add_l1_dest_addr_offset: "; + dprint_tensix_pack_config_add_l1_dest_addr_offset(config); + DPRINT << "reserved_0: "; + dprint_tensix_pack_config_reserved_0(config); + DPRINT << "out_data_format: "; + dprint_tensix_pack_config_out_data_format(config); + DPRINT << "in_data_format: "; + 
dprint_tensix_pack_config_in_data_format(config); + DPRINT << "reserved_1: "; + dprint_tensix_pack_config_reserved_1(config); + DPRINT << "src_if_sel: "; + dprint_tensix_pack_config_src_if_sel(config); + DPRINT << "pack_per_xy_plane: "; + dprint_tensix_pack_config_pack_per_xy_plane(config); + DPRINT << "l1_src_addr: "; + dprint_tensix_pack_conifg_l1_src_addr(config); + DPRINT << "downsample_mask: "; + dprint_tensix_pack_config_downsample_mask(config); + DPRINT << "downsample_shift_count: "; + dprint_tensix_pack_config_downsample_shift_count(config); + DPRINT << "read_mode: "; + dprint_tensix_pack_config_read_mode(config); + DPRINT << "exp_threshold_en: "; + dprint_tensix_pack_config_exp_threshold_en(config); + DPRINT << "reserved_2: "; + dprint_tensix_pack_config_reserved_2(config); + DPRINT << "exp_threshold: "; + dprint_tensix_pack_config_exp_threshold(config); +} + +#else // ARCH_WORMHOLE or ARCH_BLACKHOLE + +#ifdef ARCH_WORMHOLE +inline void dprint_tensix_pack_config_helper(const ckernel::packer::pack_config_t& config) { + DPRINT << "row_ptr_section_size: "; + dprint_tensix_pack_config_row_ptr_section_size(config); + DPRINT << "exp_section_size: "; + dprint_tensix_pack_config_exp_section_size(config); + DPRINT << "l1_dest_addr: "; + dprint_tensix_pack_config_l1_dest_addr(config); + DPRINT << "uncompress: "; + dprint_tensix_pack_config_uncompressed(config); + DPRINT << "add_l1_dest_addr_offset: "; + dprint_tensix_pack_config_add_l1_dest_addr_offset(config); + DPRINT << "reserved_0: "; + dprint_tensix_pack_config_reserved_0(config); + DPRINT << "out_data_format: "; + dprint_tensix_pack_config_out_data_format(config); + DPRINT << "in_data_format: "; + dprint_tensix_pack_config_in_data_format(config); + DPRINT << "reserved_1: "; + dprint_tensix_pack_config_reserved_1(config); + DPRINT << "src_if_sel: "; + dprint_tensix_pack_config_src_if_sel(config); + DPRINT << "pack_per_xy_plane: "; + dprint_tensix_pack_config_pack_per_xy_plane(config); + DPRINT << "l1_src_addr: "; + dprint_tensix_pack_config_l1_src_addr(config); + DPRINT << "downsample_mask: "; + dprint_tensix_pack_config_downsample_mask(config); + DPRINT << "downsample_shift_count: "; + dprint_tensix_pack_config_downsample_shift_count(config); + DPRINT << "read_mode: "; + dprint_tensix_pack_config_read_mode(config); + DPRINT << "exp_threshold_en: "; + dprint_tensix_pack_config_exp_threshold_en(config); + DPRINT << "pack_l1_acc_disable_pack_zero_flag: "; + dprint_tensix_pack_config_l1_acc_disable_pack_zero_flag(config); + DPRINT << "reserved_2: "; + dprint_tensix_pack_config_reserved_2(config); + DPRINT << "exp_threshold: "; + dprint_tensix_pack_config_exp_threshold(config); +} +#endif // ARCH_WORMHOLE + +#ifdef ARCH_BLACKHOLE +inline void dprint_tensix_pack_config_helper(const ckernel::packer::pack_config_t& config) { + DPRINT << "row_ptr_section_size: "; + dprint_tensix_pack_config_row_ptr_section_size(config); + DPRINT << "exp_section_size: "; + dprint_tensix_pack_config_exp_section_size(config); + DPRINT << "l1_dest_addr: "; + dprint_tensix_pack_config_l1_dest_addr(config); + DPRINT << "uncompress: "; + dprint_tensix_pack_config_uncompressed(config); + DPRINT << "add_l1_dest_addr_offset: "; + dprint_tensix_pack_config_add_l1_dest_addr_offset(config); + DPRINT << "disable_pack_zero_flag: "; + dprint_tensix_pack_config_disable_pack_zero_flag(config); + DPRINT << "reserved_0: "; + dprint_tensix_pack_config_reserved_0(config); + DPRINT << "out_data_format: "; + dprint_tensix_pack_config_out_data_format(config); + DPRINT << 
"in_data_format: "; + dprint_tensix_pack_config_in_data_format(config); + DPRINT << "dis_shared_exp_assembler: "; + dprint_tensix_pack_config_dis_shared_exp_assembler(config); + DPRINT << "auto_set_last_pacr_intf_sel: "; + dprint_tensix_pack_config_auto_set_last_pacr_intf_sel(config); + DPRINT << "enable_out_fifo: "; + dprint_tensix_pack_config_enable_out_fifo(config); + DPRINT << "sub_l1_tile_header_size: "; + dprint_tensix_pack_config_sub_l1_tile_header_size(config); + DPRINT << "src_if_sel: "; + dprint_tensix_pack_config_src_if_sel(config); + DPRINT << "pack_start_intf_pos: "; + dprint_tensix_pack_config_pack_start_intf_pos(config); + DPRINT << "all_pack_disable_zero_compress_ovrd: "; + dprint_tensix_pack_config_all_pack_disable_zero_compress_ovrd(config); + DPRINT << "add_tile_header_size: "; + dprint_tensix_pack_config_add_tile_header_size(config); + DPRINT << "pack_dis_y_pos_start_offset: "; + dprint_tensix_pack_config_pack_dis_y_pos_start_offset(config); + DPRINT << "l1_src_addr: "; + dprint_tensix_pack_config_l1_src_addr(config); +} +#endif // ARCH_BLACKHOLE + +// PACK RELU CONFIG + +// These functions' argument should be return value of read_relu_config() + +inline void dprint_tensix_pack_relu_config_alu_acc_ctrl_zero_flag_disabled_src( + const ckernel::packer::relu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_Zero_Flag_disabled_src << ENDL(); +} + +inline void dprint_tensix_pack_relu_config_alu_acc_ctrl_zero_flag_disabled_dst( + const ckernel::packer::relu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_Zero_Flag_disabled_dst << ENDL(); +} + +inline void dprint_tensix_pack_relu_config_stacc_relu_apply_relu(const ckernel::packer::relu_config_t& config) { + DPRINT << "0x" << HEX() << config.STACC_RELU_ApplyRelu << ENDL(); +} + +inline void dprint_tensix_pack_relu_config_stacc_relu_relu_threshold(const ckernel::packer::relu_config_t& config) { + DPRINT << DEC() << config.STACC_RELU_ReluThreshold << ENDL(); +} + +inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_main(const ckernel::packer::relu_config_t& config) { + DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_main << ENDL(); +} + +inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_trisc(const ckernel::packer::relu_config_t& config) { + DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_trisc << ENDL(); +} + +inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_ncrisc( + const ckernel::packer::relu_config_t& config) { + DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_ncrisc << ENDL(); +} + +inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_main( + const ckernel::packer::relu_config_t& config) { + DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_bmp_clear_main << ENDL(); +} + +inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_trisc( + const ckernel::packer::relu_config_t& config) { + DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_bmp_clear_trisc << ENDL(); +} + +inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_ncrisc( + const ckernel::packer::relu_config_t& config) { + DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_bmp_clear_ncrisc << ENDL(); +} + +inline void dprint_tensix_pack_relu_config() { + MATH(ckernel::packer::relu_config_t config = ckernel::packer::read_relu_config(); + + DPRINT << "ALU_ACC_CTRL_Zero_Flag_disabled_src: "; + 
dprint_tensix_pack_relu_config_alu_acc_ctrl_zero_flag_disabled_src(config); + DPRINT << "ALU_ACC_CTRL_Zero_Flag_disabled_dst: "; + dprint_tensix_pack_relu_config_alu_acc_ctrl_zero_flag_disabled_dst(config); + DPRINT << "STACC_RELU_ApplyRelu: "; + dprint_tensix_pack_relu_config_stacc_relu_apply_relu(config); + DPRINT << "STACC_RELU_ReluThreshold: "; + dprint_tensix_pack_relu_config_stacc_relu_relu_threshold(config); + DPRINT << "DISABLE_RISC_BP_Disable_main: "; + dprint_tensix_pack_relu_config_disable_risc_bp_disable_main(config); + DPRINT << "DISABLE_RISC_BP_Disable_trisc: "; + dprint_tensix_pack_relu_config_disable_risc_bp_disable_trisc(config); + DPRINT << "DISABLE_RISC_BP_Disable_ncrisc: "; + dprint_tensix_pack_relu_config_disable_risc_bp_disable_ncrisc(config); + DPRINT << "DISABLE_RISC_BP_Disable_bmp_clear_main: "; + dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_main(config); + DPRINT << "DISABLE_RISC_BP_Disable_bmp_clear_trisc: "; + dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_trisc(config); + DPRINT << "DISABLE_RISC_BP_Disable_bmp_clear_ncrisc: "; + dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_ncrisc(config);) +} + +// PACK DEST RD CTRL + +// These functions' argument should be return value of read_dest_rd_ctrl() + +inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_32b_data( + const ckernel::packer::dest_rd_ctrl_t& dest) { + DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Read_32b_data << ENDL(); +} + +inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_unsigned( + const ckernel::packer::dest_rd_ctrl_t& dest) { + DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Read_unsigned << ENDL(); +} + +inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_int8(const ckernel::packer::dest_rd_ctrl_t& dest) { + DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Read_int8 << ENDL(); +} + +inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_round_10b_mant( + const ckernel::packer::dest_rd_ctrl_t& dest) { + DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Round_10b_mant << ENDL(); +} + +inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_reserved(const ckernel::packer::dest_rd_ctrl_t& dest) { + DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Reserved << ENDL(); +} + +// Printing dest control bits +inline void dprint_tensix_dest_rd_ctrl() { + PACK(ckernel::packer::dest_rd_ctrl_t dest = ckernel::packer::read_dest_rd_ctrl(); + + DPRINT << "PCK_DEST_RD_CTRL_Read_32b_data: "; + dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_32b_data(dest); + DPRINT << "PCK_DEST_RD_CTRL_Read_unsigned: "; + dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_unsigned(dest); + DPRINT << "PCK_DEST_RD_CTRL_Read_int8: "; + dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_int8(dest); + DPRINT << "PCK_DEST_RD_CTRL_Round_10b_mant: "; + dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_round_10b_mant(dest); + DPRINT << "PCK_DEST_RD_CTRL_Reserved: "; + dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_reserved(dest);) +} + +#endif // END OF ELSE + +// PACK STRIDES +#ifdef ARCH_BLACKHOLE +inline void dprint_tensix_pack_strides_x_stride(const uint32_t& word) { + dprint_tensix_struct_field(word, 0xffff, 0, "x_stride", true); // decimal +} + +inline void dprint_tensix_pack_strides_y_stride(const uint32_t& word) { + dprint_tensix_struct_field(word, 0xffff0000, 16, "y_stride", true); // decimal +} + +inline void dprint_tensix_pack_strides_z_stride(const uint32_t& word) { + dprint_tensix_struct_field(word, 
0xffff, 0, "z_stride", true); // decimal +} + +inline void dprint_tensix_pack_strides_w_stride(const uint32_t& word) { + dprint_tensix_struct_field(word, 0xffff0000, 16, "w_stride", true); // decimal +} +#else +inline void dprint_tensix_pack_strides_x_stride(const uint32_t& word) { + dprint_tensix_struct_field(word, 0xffff, 0, "x_stride", true); // decimal +} + +inline void dprint_tensix_pack_strides_y_stride(const uint32_t& word) { + dprint_tensix_struct_field(word, 0xffff0000, 16, "y_stride", true); // decimal +} + +inline void dprint_tensix_pack_strides_z_stride(const uint32_t& word) { + dprint_tensix_struct_field(word, 0xffff, 0, "z_stride", true); // decimal +} + +inline void dprint_tensix_pack_strides_w_stride(const uint32_t& word) { + dprint_tensix_struct_field(word, 0xffff0000, 16, "w_stride", true); // decimal +} +#endif + +// Printing packer strides +inline void dprint_tensix_pack_strides_helper(uint reg_id, const volatile uint tt_reg_ptr* cfg) { + uint32_t reg_addr = 0; + switch (reg_id) { + case 1: reg_addr = PCK0_ADDR_CTRL_XY_REG_0_Xstride_ADDR32; break; + case 2: reg_addr = PCK0_ADDR_CTRL_XY_REG_1_Xstride_ADDR32; break; + default: DPRINT << "Aborting! Invalid register id (valid ids are between 1 and 2)" << ENDL(); break; + } + + // word 0 xy_stride + uint32_t word = cfg[reg_addr]; + dprint_tensix_pack_strides_x_stride(word); + dprint_tensix_pack_strides_y_stride(word); + + // word 1 zw_stride + word = cfg[reg_addr + 1]; + dprint_tensix_pack_strides_z_stride(word); + dprint_tensix_pack_strides_w_stride(word); +} + +// PCK_EDGE_OFFSET + +// These function's argument should be return value of read_pack_edge_offset() + +inline void dprint_tensix_pack_edge_offset_mask(const ckernel::packer::pck_edge_offset_t& edge) { + DPRINT << "0x" << HEX() << edge.mask << ENDL(); +} + +inline void dprint_tensix_pack_edge_offset_mode(const ckernel::packer::pck_edge_offset_t& edge) { + DPRINT << "0x" << HEX() << edge.mode << ENDL(); +} + +inline void dprint_tensix_pack_edge_offset_tile_row_set_select_pack0(const ckernel::packer::pck_edge_offset_t& edge) { + DPRINT << "0x" << HEX() << edge.tile_row_set_select_pack0 << ENDL(); +} + +inline void dprint_tensix_pack_edge_offset_tile_row_set_select_pack1(const ckernel::packer::pck_edge_offset_t& edge) { + DPRINT << "0x" << HEX() << edge.tile_row_set_select_pack1 << ENDL(); +} + +inline void dprint_tensix_pack_edge_offset_tile_row_set_select_pack2(const ckernel::packer::pck_edge_offset_t& edge) { + DPRINT << "0x" << HEX() << edge.tile_row_set_select_pack2 << ENDL(); +} + +inline void dprint_tensix_pack_edge_offset_tile_row_set_select_pack3(const ckernel::packer::pck_edge_offset_t& edge) { + DPRINT << "0x" << HEX() << edge.tile_row_set_select_pack3 << ENDL(); +} + +inline void dprint_tensix_pack_edge_offset_reserved(const ckernel::packer::pck_edge_offset_t& edge) { + DPRINT << "0x" << HEX() << edge.reserved << ENDL(); +} + +// Printing packer edge offset +inline void dprint_tensix_pack_edge_offset_helper(const ckernel::packer::pck_edge_offset_t& edge, uint reg_id) { + DPRINT << "mask: "; + dprint_tensix_pack_edge_offset_mask(edge); + if (reg_id == 1) { + DPRINT << "mode: "; + dprint_tensix_pack_edge_offset_mode(edge); + DPRINT << "tile_row_set_select_pack0: "; + dprint_tensix_pack_edge_offset_tile_row_set_select_pack0(edge); + DPRINT << "tile_row_set_select_pack1: "; + dprint_tensix_pack_edge_offset_tile_row_set_select_pack1(edge); + DPRINT << "tile_row_set_select_pack2: "; + dprint_tensix_pack_edge_offset_tile_row_set_select_pack2(edge); + DPRINT << 
"tile_row_set_select_pack3: "; + dprint_tensix_pack_edge_offset_tile_row_set_select_pack3(edge); + DPRINT << "reserved: "; + dprint_tensix_pack_edge_offset_reserved(edge); + } +} + +// Choose what register you want printed with reg_id (1-4), 0 for all +inline void dprint_tensix_pack_edge_offset(uint reg_id = 0) { + std::array edge_vec; + PACK( + edge_vec = ckernel::packer::read_pack_edge_offset(); + if (reg_id >= 1 && reg_id <= ckernel::packer::NUM_PACKERS) { + if (ckernel::packer::NUM_PACKERS > 1) { + DPRINT << "REG_ID: " << reg_id << ENDL(); + } + dprint_tensix_pack_edge_offset_helper(edge_vec[reg_id - 1], reg_id); + } + // Print all registers + else if (reg_id == 0) { + for (uint i = 1; i <= ckernel::packer::NUM_PACKERS; i++) { + if (ckernel::packer::NUM_PACKERS > 1) { + DPRINT << "REG_ID: " << i << ENDL(); + } + dprint_tensix_pack_edge_offset_helper(edge_vec[i - 1], i); + if (i != ckernel::packer::NUM_PACKERS) { + DPRINT << ENDL(); + } + } + } else DPRINT + << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::packer::NUM_PACKERS << "." + << ENDL();) +} + +// PACK COUNTERS + +// These functions' argument should be return value of read_pack_counters() + +inline void dprint_tensix_pack_counters_pack_per_xy_plane(const ckernel::packer::pack_counters_t& counters) { + DPRINT << DEC() << counters.pack_per_xy_plane << ENDL(); +} + +inline void dprint_tensix_pack_counters_pack_reads_per_xy_plane(const ckernel::packer::pack_counters_t& counters) { + DPRINT << DEC() << counters.pack_reads_per_xy_plane << ENDL(); +} + +inline void dprint_tensix_pack_counters_pack_xys_per_til(const ckernel::packer::pack_counters_t& counters) { + DPRINT << DEC() << counters.pack_xys_per_til << ENDL(); +} + +inline void dprint_tensix_pack_counters_pack_yz_transposed(const ckernel::packer::pack_counters_t& counters) { + DPRINT << "0x" << HEX() << counters.pack_yz_transposed << ENDL(); +} + +inline void dprint_tensix_pack_counters_pack_per_xy_plane_offset(const ckernel::packer::pack_counters_t& counters) { + DPRINT << DEC() << counters.pack_per_xy_plane_offset << ENDL(); +} + +// Printing packer counters +inline void dprint_tensix_pack_counters_helper(const ckernel::packer::pack_counters_t& counters) { + DPRINT << "pack_per_xy_plane: "; + dprint_tensix_pack_counters_pack_per_xy_plane(counters); + DPRINT << "pack_reads_per_xy_plane: "; + dprint_tensix_pack_counters_pack_reads_per_xy_plane(counters); + DPRINT << "pack_xys_per_til: "; + dprint_tensix_pack_counters_pack_xys_per_til(counters); + DPRINT << "pack_yz_transposed: "; + dprint_tensix_pack_counters_pack_yz_transposed(counters); + DPRINT << "pack_per_xy_plane_offset: "; + dprint_tensix_pack_counters_pack_per_xy_plane_offset(counters); +} + +// Choose what register you want printed with reg_id (1-4), 0 for all +inline void dprint_tensix_pack_counters(uint reg_id = 0) { + std::array counters_vec; + PACK( + counters_vec = ckernel::packer::read_pack_counters(); + if (reg_id >= 1 && reg_id <= ckernel::packer::NUM_PACKERS) { + if (ckernel::packer::NUM_PACKERS > 1) { + DPRINT << "REG_ID: " << reg_id << ENDL(); + } + dprint_tensix_pack_counters_helper(counters_vec[reg_id - 1]); + } + // Print all registers + else if (reg_id == 0) { + for (uint i = 1; i <= ckernel::packer::NUM_PACKERS; i++) { + if (ckernel::packer::NUM_PACKERS > 1) { + DPRINT << "REG_ID: " << i << ENDL(); + } + dprint_tensix_pack_counters_helper(counters_vec[i - 1]); + if (i != ckernel::packer::NUM_PACKERS) { + DPRINT << ENDL(); + } + } + } else DPRINT + << "INVALID REGISTER ID! 
PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::packer::NUM_PACKERS << "." + << ENDL();) +} + +// Choose what register you want by id (1-4). 0 for all. +inline void dprint_tensix_pack_config(uint reg_id = 0) { + std::array config_vec; + MATH( + config_vec = ckernel::packer::read_pack_config(); if (reg_id >= 1 && reg_id <= ckernel::packer::NUM_PACKERS) { + if (ckernel::packer::NUM_PACKERS > 1) { + DPRINT << "REG_ID: " << reg_id << ENDL(); + } + dprint_tensix_pack_config_helper(config_vec[reg_id - 1]); + } else if (reg_id == 0) for (uint i = 1; i <= ckernel::packer::NUM_PACKERS; i++) { + if (ckernel::packer::NUM_PACKERS > 1) { + DPRINT << "REG_ID: " << i << ENDL(); + } + dprint_tensix_pack_config_helper(config_vec[i - 1]); + if (i != ckernel::packer::NUM_PACKERS) { + DPRINT << ENDL(); + } + } else DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " + << ckernel::packer::NUM_PACKERS << "." << ENDL();) +} + +// Choose what register you want printed (1-2). 0 for all. +inline void dprint_tensix_pack_strides(uint reg_id = 0) { + PACK( + // Get pointer to registers for current state ID + volatile uint tt_reg_ptr* cfg = get_cfg_pointer(); + + if (reg_id >= 1 && reg_id <= 2) { + DPRINT << "REG_ID: " << reg_id << ENDL(); + dprint_tensix_pack_strides_helper(reg_id, cfg); + } + // Print all registers + else if (reg_id == 0) { + for (uint i = 1; i <= 2; i++) { + DPRINT << "REG_ID: " << i << ENDL(); + dprint_tensix_pack_strides_helper(i, cfg); + if (i != 2) { + DPRINT << ENDL(); + } + } + } else DPRINT + << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND 2." << ENDL();) +} diff --git a/tt_metal/hw/inc/debug/dprint_tensix_unpack.h b/tt_metal/hw/inc/debug/dprint_tensix_unpack.h new file mode 100644 index 00000000000..261797fa86d --- /dev/null +++ b/tt_metal/hw/inc/debug/dprint_tensix_unpack.h @@ -0,0 +1,508 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "dprint.h" +#include "dprint_tensix.h" +#include "cunpack_common.h" + +// NOTE: FUNCTIONS WITHOUT THE HELPER SUFFIX ARE INTENDED TO BE USED + +// UNPACK TILE DESCRIPTOR + +// These functions' argument should be the return value of read_unpack_tile_descriptor() + +inline void dprint_tensix_unpack_tile_descriptor_in_data_format( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + dprint_data_format(tile_descriptor.in_data_format); + DPRINT << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_uncompressed( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << "0x" << HEX() << tile_descriptor.uncompressed << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_reserved_0( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << "0x" << HEX() << tile_descriptor.reserved_0 << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_blobs_per_xy_plane( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << DEC() << tile_descriptor.blobs_per_xy_plane << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_reserved_1( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << "0x" << HEX() << tile_descriptor.reserved_1 << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_x_dim( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << DEC() << tile_descriptor.x_dim << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_y_dim( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << DEC() << tile_descriptor.y_dim << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_z_dim( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << DEC() << tile_descriptor.z_dim << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_w_dim( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << DEC() << tile_descriptor.w_dim << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_blobs_y_start( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { +#ifdef ARCH_GRAYSKULL + DPRINT << DEC() << tile_descriptor.blobs_y_start << ENDL(); +#else + DPRINT << DEC() << ((tile_descriptor.blobs_y_start_hi << 16) | tile_descriptor.blobs_y_start_lo) << ENDL(); +#endif +} + +inline void dprint_tensix_unpack_tile_descriptor_digest_type( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << "0x" << HEX() << tile_descriptor.digest_type << ENDL(); +} + +inline void dprint_tensix_unpack_tile_descriptor_digest_size( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << DEC() << tile_descriptor.digest_size << ENDL(); +} + +// UNPACK CONFIG + +// These functions' argument should be the return value of read_unpack_config() + +inline void dprint_tensix_unpack_config_out_data_format(const ckernel::unpacker::unpack_config_t& config) { + dprint_data_format(config.out_data_format); + DPRINT << ENDL(); +} + +inline void dprint_tensix_unpack_config_throttle_mode(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.throttle_mode << ENDL(); +} + +inline void dprint_tensix_unpack_config_context_count(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.context_count << ENDL(); +} + +inline void 
dprint_tensix_unpack_config_haloize_mode(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.haloize_mode << ENDL(); +} + +inline void dprint_tensix_unpack_config_tileize_mode(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.tileize_mode << ENDL(); +} + +inline void dprint_tensix_unpack_config_force_shared_exp(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.force_shared_exp << ENDL(); +} + +#ifdef ARCH_GRAYSKULL +inline void dprint_tensix_unpack_config_reserved_0(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.reserved_0 << ENDL(); +} +#endif + +inline void dprint_tensix_unpack_config_upsample_rate(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << DEC() << config.upsample_rate << ENDL(); +} + +inline void dprint_tensix_unpack_config_upsample_and_interlave(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.upsamle_and_interlave << ENDL(); +} + +inline void dprint_tensix_unpack_config_shift_amount(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << DEC() << config.shift_amount << ENDL(); +} + +inline void dprint_tensix_unpack_config_uncompress_cntx0_3(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.uncompress_cntx0_3 << ENDL(); +} + +inline void dprint_tensix_unpack_config_reserved_1(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.reserved_1 << ENDL(); +} + +inline void dprint_tensix_unpack_config_uncompress_cntx4_7(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.uncompress_cntx4_7 << ENDL(); +} + +inline void dprint_tensix_unpack_config_reserved_2(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.reserved_2 << ENDL(); +} + +inline void dprint_tensix_unpack_config_limit_addr(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.limit_addr << ENDL(); +} + +inline void dprint_tensix_unpack_config_fifo_size(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << DEC() << config.fifo_size << ENDL(); +} + +#if defined(ARCH_WORMHOLE) || defined(ARCH_BLACKHOLE) +inline void dprint_tensix_unpack_config_unpack_src_reg_set_update(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.unpack_src_reg_set_update << ENDL(); +} + +inline void dprint_tensix_unpack_config_unpack_if_sel(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.unpack_if_sel << ENDL(); +} + +inline void dprint_tensix_unpack_config_unpack_if_sel_cntx0_3(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.unpack_if_sel_cntx0_3 << ENDL(); +} + +inline void dprint_tensix_unpack_config_unpack_if_sel_cntx4_7(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.unpack_if_sel_cntx4_7 << ENDL(); +} + +inline void dprint_tensix_unpack_config_reserved_3(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.reserved_3 << ENDL(); +} + +inline void dprint_tensix_unpack_config_reserved_4(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.reserved_4 << ENDL(); +} + +inline void dprint_tensix_unpack_config_reserved_5(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "0x" << HEX() << config.reserved_5 << ENDL(); +} 
+#endif + +// HARDWARE SPECIFIC FUNCTIONS + +#ifdef ARCH_GRAYSKULL +inline void dprint_tensix_unpack_tile_descriptor_helper( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << "in_data_format: "; + dprint_tensix_unpack_tile_descriptor_in_data_format(tile_descriptor); + DPRINT << "uncompressed: "; + dprint_tensix_unpack_tile_descriptor_uncompressed(tile_descriptor); + DPRINT << "reserved_0: "; + dprint_tensix_unpack_tile_descriptor_reserved_0(tile_descriptor); + DPRINT << "blobs_per_xy_plane: "; dprint_tensix_unpack_tile_descriptor_blobs_per_xy_plane(tile_descriptor); + DPRINT << "reserved_1: "; + dprint_tensix_unpack_tile_descriptor_reserved_1(tile_descriptor); + DPRINT << "x_dim: "; + dprint_tensix_unpack_tile_descriptor_x_dim(tile_descriptor); + DPRINT << "y_dim: "; + dprint_tensix_unpack_tile_descriptor_y_dim(tile_descriptor); + DPRINT << "z_dim: "; + dprint_tensix_unpack_tile_descriptor_z_dim(tile_descriptor); + DPRINT << "w_dim: "; + dprint_tensix_unpack_tile_descriptor_w_dim(tile_descriptor); + DPRINT << "blobs_y_start: "; + dprint_tensix_unpack_tile_descriptor_blobs_y_start(tile_descriptor); + DPRINT << "digest_type: "; + dprint_tensix_unpack_tile_descriptor_digest_type(tile_descriptor); + DPRINT << "digest_size: "; + dprint_tensix_unpack_tile_descriptor_digest_size(tile_descriptor); +} + +inline void dprint_tensix_unpack_tile_descriptor(uint reg_id = 0) { + std::array tile_descriptor_vec; + UNPACK( + tile_descriptor_vec = ckernel::unpacker::read_unpack_tile_descriptor(); + if (reg_id >= 1 && reg_id <= ckernel::unpacker::NUM_UNPACKERS) { + DPRINT << "REG_ID: " << reg_id << ENDL(); + dprint_tensix_unpack_tile_descriptor_helper(tile_descriptor_vec[reg_id - 1]); + } else if (reg_id == 0) { + for (uint i = 1; i <= ckernel::unpacker::NUM_UNPACKERS; i++) { + DPRINT << "REG_ID: " << i << ENDL(); + dprint_tensix_unpack_tile_descriptor_helper(tile_descriptor_vec[i - 1]); + if (i != ckernel::unpacker::NUM_UNPACKERS) { + DPRINT << ENDL(); + } + } + } else { + DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::unpacker::NUM_UNPACKERS << "." 
<< ENDL(); + } + ) +} + +inline void dprint_tensix_unpack_config_helper(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "out_data_format: "; + dprint_tensix_unpack_config_out_data_format(config); + DPRINT << "throttle_mode: "; + dprint_tensix_unpack_config_throttle_mode(config); + DPRINT << "context_count: "; + dprint_tensix_unpack_config_context_count(config); + DPRINT << "haloize_mode: "; + dprint_tensix_unpack_config_haloize_mode(config); + DPRINT << "tileize_mode: "; + dprint_tensix_unpack_config_tileize_mode(config); + DPRINT << "force_shared_exp: "; + dprint_tensix_unpack_config_force_shared_exp(config); DPRINT << "reserved_0: "; + dprint_tensix_unpack_config_reserved_0(config); + DPRINT << "upsample_rate: "; + dprint_tensix_unpack_config_upsample_rate(config); + DPRINT << "upsample_and_interlave: "; + dprint_tensix_unpack_config_upsample_and_interlave(config); + DPRINT << "shift_amount: "; + dprint_tensix_unpack_config_shift_amount(config); + DPRINT << "uncompress_cntx0_3: "; + dprint_tensix_unpack_config_uncompress_cntx0_3(config); + DPRINT << "reserved_1: "; + dprint_tensix_unpack_config_reserved_1(config); + DPRINT << "uncompress_cntx4_7: "; + dprint_tensix_unpack_config_uncompress_cntx4_7(config); + DPRINT << "reserved_2: "; + dprint_tensix_unpack_config_reserved_2(config); + DPRINT << "limit_addr: "; + dprint_tensix_unpack_config_limit_addr(config); + DPRINT << "fifo_size: "; + dprint_tensix_unpack_config_fifo_size(config); +} + +inline void dprint_tensix_unpack_config(uint reg_id = 0) { + std::array config_vec; + UNPACK( + config_vec = ckernel::unpacker::read_unpack_config(); + if (reg_id >= 1 && reg_id <= ckernel::unpacker::NUM_UNPACKERS) { + DPRINT << "REG_ID: " << reg_id << ENDL(); + dprint_tensix_unpack_config_helper(config_vec[reg_id - 1]); + } else if (reg_id == 0) { + for (uint i = 1; i <= ckernel::unpacker::NUM_UNPACKERS; i++) { + DPRINT << "REG_ID: " << i << ENDL(); + dprint_tensix_unpack_config_helper(config_vec[i - 1]); + if (i != ckernel::unpacker::NUM_UNPACKERS) { + DPRINT << ENDL(); + } + } + } else { + DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::unpacker::NUM_UNPACKERS << "." 
<< ENDL(); + } + ) +} + +#else // ARCH_WORMHOLE or ARCH_BLACKHOLE +inline void dprint_tensix_unpack_tile_descriptor_helper( + const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { + DPRINT << "in_data_format: "; + dprint_tensix_unpack_tile_descriptor_in_data_format(tile_descriptor); + DPRINT << "uncompressed: "; + dprint_tensix_unpack_tile_descriptor_uncompressed(tile_descriptor); + DPRINT << "reserved_0: "; + dprint_tensix_unpack_tile_descriptor_reserved_0(tile_descriptor); + DPRINT << "blobs_per_xy_plane: "; + dprint_tensix_unpack_tile_descriptor_blobs_per_xy_plane(tile_descriptor); + DPRINT << "reserved_1: "; + dprint_tensix_unpack_tile_descriptor_reserved_1(tile_descriptor); + DPRINT << "x_dim: "; + dprint_tensix_unpack_tile_descriptor_x_dim(tile_descriptor); + DPRINT << "y_dim: "; + dprint_tensix_unpack_tile_descriptor_y_dim(tile_descriptor); + DPRINT << "z_dim: "; + dprint_tensix_unpack_tile_descriptor_z_dim(tile_descriptor); + DPRINT << "w_dim: "; + dprint_tensix_unpack_tile_descriptor_w_dim(tile_descriptor); + DPRINT << "blobs_y_start: "; + dprint_tensix_unpack_tile_descriptor_blobs_y_start(tile_descriptor); + DPRINT << "digest_type: "; + dprint_tensix_unpack_tile_descriptor_digest_type(tile_descriptor); + DPRINT << "digest_size: "; + dprint_tensix_unpack_tile_descriptor_digest_size(tile_descriptor); +} + +// Choose which register you want (1-2). 0 for both. +inline void dprint_tensix_unpack_tile_descriptor(uint reg_id = 0) { + std::array tile_descriptor_vec; + UNPACK( + tile_descriptor_vec = ckernel::unpacker::read_unpack_tile_descriptor(); + if (reg_id >= 1 && reg_id <= ckernel::unpacker::NUM_UNPACKERS) { + DPRINT << "REG_ID: " << reg_id << ENDL(); + dprint_tensix_unpack_tile_descriptor_helper(tile_descriptor_vec[reg_id - 1]); + } else if (reg_id == 0) { + for (uint i = 1; i <= ckernel::unpacker::NUM_UNPACKERS; i++) { + DPRINT << "REG_ID: " << i << ENDL(); + dprint_tensix_unpack_tile_descriptor_helper(tile_descriptor_vec[i - 1]); + if (i != ckernel::unpacker::NUM_UNPACKERS) { + DPRINT << ENDL(); + } + } + } else { + DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::unpacker::NUM_UNPACKERS << "." 
<< ENDL(); + } + ) +} + +inline void dprint_tensix_unpack_config_helper(const ckernel::unpacker::unpack_config_t& config) { + DPRINT << "out_data_format: "; + dprint_tensix_unpack_config_out_data_format(config); + DPRINT << "throttle_mode: "; + dprint_tensix_unpack_config_throttle_mode(config); + DPRINT << "context_count: "; + dprint_tensix_unpack_config_context_count(config); + DPRINT << "haloize_mode: "; + dprint_tensix_unpack_config_haloize_mode(config); + DPRINT << "tileize_mode: "; + dprint_tensix_unpack_config_tileize_mode(config); + DPRINT << "unpack_src_reg_set_update: "; + dprint_tensix_unpack_config_unpack_src_reg_set_update(config); + DPRINT << "unpack_if_sel: "; + dprint_tensix_unpack_config_unpack_if_sel(config); + DPRINT << "upsample_rate: "; + dprint_tensix_unpack_config_upsample_rate(config); + DPRINT << "reserved_1: "; + dprint_tensix_unpack_config_reserved_1(config); + DPRINT << "upsample_and_interlave: "; + dprint_tensix_unpack_config_upsample_and_interlave(config); + DPRINT << "shift_amount: "; + dprint_tensix_unpack_config_shift_amount(config); + DPRINT << "uncompress_cntx0_3: "; + dprint_tensix_unpack_config_uncompress_cntx0_3(config); + DPRINT << "unpack_if_sel_cntx0_3: "; + dprint_tensix_unpack_config_unpack_if_sel_cntx0_3(config); + DPRINT << "force_shared_exp: "; + dprint_tensix_unpack_config_force_shared_exp(config); + DPRINT << "reserved_2: "; + dprint_tensix_unpack_config_reserved_2(config); + DPRINT << "uncompress_cntx4_7: "; + dprint_tensix_unpack_config_uncompress_cntx4_7(config); + DPRINT << "unpack_if_sel_cntx4_7: "; + dprint_tensix_unpack_config_unpack_if_sel_cntx4_7(config); + DPRINT << "reserved_3: "; + dprint_tensix_unpack_config_reserved_3(config); + DPRINT << "limit_addr: "; + dprint_tensix_unpack_config_limit_addr(config); + DPRINT << "reserved_4: "; + dprint_tensix_unpack_config_reserved_4(config); + DPRINT << "fifo_size: "; + dprint_tensix_unpack_config_fifo_size(config); + DPRINT << "reserved_5: "; + dprint_tensix_unpack_config_reserved_5(config); +} + +// Choose which register you want (1-2). 0 for both. +inline void dprint_tensix_unpack_config(uint reg_id = 0) { + std::array config_vec; + UNPACK( + config_vec = ckernel::unpacker::read_unpack_config(); + if (reg_id >= 1 && reg_id <= ckernel::unpacker::NUM_UNPACKERS) { + DPRINT << "REG_ID: " << reg_id << ENDL(); + dprint_tensix_unpack_config_helper(config_vec[reg_id - 1]); + } else if (reg_id == 0) { + for (uint i = 1; i <= ckernel::unpacker::NUM_UNPACKERS; i++) { + DPRINT << "REG_ID: " << i << ENDL(); + dprint_tensix_unpack_config_helper(config_vec[i - 1]); + if (i != ckernel::unpacker::NUM_UNPACKERS) { + DPRINT << ENDL(); + } + } + } else { + DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::unpacker::NUM_UNPACKERS << "." 
<< ENDL(); + } + ) +} + +// ALU CONFIG + +// These functions' argument should be return value of read_alu_config() + +inline void dprint_tensix_alu_config_alu_rounding_mode_fpu_srnd_en(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Fpu_srnd_en << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_rounding_mode_gasket_srnd_en(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Gasket_srnd_en << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_rounding_mode_packer_srnd_en(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Packer_srnd_en << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_rounding_mode_padding(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Padding << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_rounding_mode_gs_lf(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_GS_LF << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_rounding_mode_bfp8_hf(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Bfp8_HF << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_format_spec_reg0_srcaunsigned(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_FORMAT_SPEC_REG0_SrcAUnsigned << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_format_spec_reg0_srcbunsigned(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_FORMAT_SPEC_REG0_SrcBUnsigned << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_format_spec_reg0_srca(const ckernel::unpacker::alu_config_t& config) { + dprint_data_format(config.ALU_FORMAT_SPEC_REG0_SrcA); + DPRINT << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_format_spec_reg1_srcb(const ckernel::unpacker::alu_config_t& config) { + dprint_data_format(config.ALU_FORMAT_SPEC_REG1_SrcB); + DPRINT << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_format_spec_reg2_dstacc(const ckernel::unpacker::alu_config_t& config) { + dprint_data_format(config.ALU_FORMAT_SPEC_REG2_Dstacc); + DPRINT << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_acc_ctrl_fp32_enabled(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_Fp32_enabled << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_acc_ctrl_sfpu_fp32_enabled(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_SFPU_Fp32_enabled << ENDL(); +} + +inline void dprint_tensix_alu_config_alu_acc_ctrl_int8_math_enabled(const ckernel::unpacker::alu_config_t& config) { + DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_INT8_math_enabled << ENDL(); +} + +// Print content of the register field by field. 
+inline void dprint_tensix_alu_config() { + MATH(ckernel::unpacker::alu_config_t config = ckernel::unpacker::read_alu_config(); + + DPRINT << "ALU_ROUNDING_MODE_Fpu_srnd_en: "; + dprint_tensix_alu_config_alu_rounding_mode_fpu_srnd_en(config); + DPRINT << "ALU_ROUNDING_MODE_Gasket_srnd_en: "; + dprint_tensix_alu_config_alu_rounding_mode_gasket_srnd_en(config); + DPRINT << "ALU_ROUNDING_MODE_Packer_srnd_en: "; + dprint_tensix_alu_config_alu_rounding_mode_packer_srnd_en(config); + DPRINT << "ALU_ROUNDING_MODE_Padding: "; + dprint_tensix_alu_config_alu_rounding_mode_padding(config); + DPRINT << "ALU_ROUNDING_MODE_GS_LF: "; + dprint_tensix_alu_config_alu_rounding_mode_gs_lf(config); + DPRINT << "ALU_ROUNDING_MODE_Bfp8_HF: "; + dprint_tensix_alu_config_alu_rounding_mode_bfp8_hf(config); + DPRINT << "ALU_FORMAT_SPEC_REG0_SrcAUnsigned: "; + dprint_tensix_alu_config_alu_format_spec_reg0_srcaunsigned(config); + DPRINT << "ALU_FORMAT_SPEC_REG0_SrcBUnsigned: "; + dprint_tensix_alu_config_alu_format_spec_reg0_srcbunsigned(config); + DPRINT << "ALU_FORMAT_SPEC_REG0_SrcA: "; + dprint_tensix_alu_config_alu_format_spec_reg0_srca(config); + DPRINT << "ALU_FORMAT_SPEC_REG1_SrcB: "; + dprint_tensix_alu_config_alu_format_spec_reg1_srcb(config); + DPRINT << "ALU_FORMAT_SPEC_REG2_Dstacc: "; + dprint_tensix_alu_config_alu_format_spec_reg2_dstacc(config); + DPRINT << "ALU_ACC_CTRL_Fp32_enabled: "; + dprint_tensix_alu_config_alu_acc_ctrl_fp32_enabled(config); + DPRINT << "ALU_ACC_CTRL_SFPU_Fp32_enabled: "; + dprint_tensix_alu_config_alu_acc_ctrl_sfpu_fp32_enabled(config); + DPRINT << "ALU_ACC_CTRL_INT8_math_enabled: "; + dprint_tensix_alu_config_alu_acc_ctrl_int8_math_enabled(config);) +} + +#endif // END OF ELSE diff --git a/tt_metal/third_party/tt_llk_grayskull b/tt_metal/third_party/tt_llk_grayskull index 0c04db64275..be2b32e22f9 160000 --- a/tt_metal/third_party/tt_llk_grayskull +++ b/tt_metal/third_party/tt_llk_grayskull @@ -1 +1 @@ -Subproject commit 0c04db64275a4bd36a7e14d3c533855cb33f6a20 +Subproject commit be2b32e22f939526cb2c0bef021f636312c4f1d2 From 4abbec50c81768ac51d1afe88f3862b1df856d80 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Thu, 20 Feb 2025 21:09:05 +0000 Subject: [PATCH 213/316] #18045: Increase dispatch s page size to fit llama sub-device use case. Add better host and device asserts for when data exceeds the page size of dispatch s --- tt_metal/api/tt-metalium/device_command.hpp | 12 ++++++++++-- tt_metal/api/tt-metalium/dispatch_settings.hpp | 6 +++--- tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp | 2 ++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tt_metal/api/tt-metalium/device_command.hpp b/tt_metal/api/tt-metalium/device_command.hpp index 94da4304c57..905dcc41b45 100644 --- a/tt_metal/api/tt-metalium/device_command.hpp +++ b/tt_metal/api/tt-metalium/device_command.hpp @@ -267,8 +267,8 @@ class DeviceCommand { if constexpr (inline_data) { TT_ASSERT(data != nullptr); // compiled out? 
this->add_data(data, data_sizeB, data_sizeB); - // this->cmd_write_offsetB has been incremented by sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd) + data_sizeB - // need to ensure this is aligned for next cmds to be written at the correct location + // this->cmd_write_offsetB has been incremented by sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd) + + // data_sizeB need to ensure this is aligned for next cmds to be written at the correct location this->cmd_write_offsetB = tt::align(this->cmd_write_offsetB, this->pcie_alignment); } } else { @@ -454,6 +454,14 @@ class DeviceCommand { DispatchSettings::DISPATCH_GO_SIGNAL_NOC_DATA_ENTRIES); auto data_sizeB = noc_mcast_unicast_data.size() * sizeof(uint32_t); uint32_t lengthB = sizeof(CQDispatchCmd) + data_sizeB; + if (dispatcher_type == DispatcherSelect::DISPATCH_SLAVE) { + constexpr uint32_t dispatch_page_size = 1 << DispatchSettings::DISPATCH_S_BUFFER_LOG_PAGE_SIZE; + TT_FATAL( + lengthB <= dispatch_page_size, + "Data to set go signal noc data {} must fit within one dispatch page {} when sending to dispatch_s", + lengthB, + dispatch_page_size); + } this->add_prefetch_relay_inline(true, lengthB, dispatcher_type); auto initialize_set_go_signal_noc_data_cmd = [&](CQDispatchCmd* set_go_signal_noc_data_cmd) { set_go_signal_noc_data_cmd->base.cmd_id = CQ_DISPATCH_SET_GO_SIGNAL_NOC_DATA; diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index fe91d61183f..d7a7161741a 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -117,9 +117,9 @@ class DispatchSettings { static constexpr uint32_t DISPATCH_GO_SIGNAL_NOC_DATA_ENTRIES = 64; - // dispatch_s CB page size is 128 bytes. This should currently be enough to accomodate all commands that - // are sent to it. Change as needed, once this endpoint is required to handle more than go signal mcasts. - static constexpr uint32_t DISPATCH_S_BUFFER_LOG_PAGE_SIZE = 7; + // dispatch_s CB page size is 256 bytes. This should currently be enough to accomodate all commands that + // are sent to it. Change as needed. + static constexpr uint32_t DISPATCH_S_BUFFER_LOG_PAGE_SIZE = 8; static constexpr uint32_t GO_SIGNAL_BITS_PER_TXN_TYPE = 4; diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp index 1520beb8d0c..3b27f9cd4a1 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp @@ -283,6 +283,8 @@ void kernel_main() { case CQ_DISPATCH_CMD_TERMINATE: done = true; break; default: DPRINT << "dispatcher_s invalid command" << ENDL(); ASSERT(0); } + // Dispatch s only supports single page commands for now + ASSERT(cmd_ptr <= ((uint32_t)cmd + cb_page_size)); cmd_ptr = round_up_pow2(cmd_ptr, cb_page_size); // Release a single page to prefetcher. Assumption is that all dispatch_s commands fit inside a single page for // now. From 00fb7ad3a4ce1db88f788b96a65de7739ef52ed3 Mon Sep 17 00:00:00 2001 From: William Ly Date: Fri, 21 Feb 2025 10:33:29 -0500 Subject: [PATCH 214/316] =?UTF-8?q?Revert=20"Printing=20packer's=20and=20u?= =?UTF-8?q?npacker's=20configuration=20registers=20(#17=E2=80=A6=20(#18142?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …368)" This reverts commit d9263f289e069fdeda588154e066ff3ab4ea4426. ### Ticket Link to Github Issue ### Problem description Provide context for the problem. 
### What's changed Describe the approach used to solve the problem. Summarize the changes made and its impact. ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../tt_metal/debug_tools/CMakeLists.txt | 1 - .../dprint/test_print_config_register.cpp | 595 ---------------- .../dataflow/writer_config_reg.cpp | 362 ---------- tt_metal/hw/inc/debug/dprint_tensix.h | 77 --- tt_metal/hw/inc/debug/dprint_tensix_pack.h | 634 ------------------ tt_metal/hw/inc/debug/dprint_tensix_unpack.h | 508 -------------- tt_metal/third_party/tt_llk_grayskull | 2 +- 7 files changed, 1 insertion(+), 2178 deletions(-) delete mode 100644 tests/tt_metal/tt_metal/debug_tools/dprint/test_print_config_register.cpp delete mode 100644 tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp delete mode 100644 tt_metal/hw/inc/debug/dprint_tensix_pack.h delete mode 100644 tt_metal/hw/inc/debug/dprint_tensix_unpack.h diff --git a/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt b/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt index 7c7f56bb74d..7244ca3e45a 100644 --- a/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt @@ -11,7 +11,6 @@ set(UNIT_TESTS_DEBUG_TOOLS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_tensix_dest.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_tiles.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_raise_wait.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_config_register.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_assert.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_link_training.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize_delays.cpp diff --git a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_config_register.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_config_register.cpp deleted file mode 100644 index 60212f12e89..00000000000 --- a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_config_register.cpp +++ /dev/null @@ -1,595 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include -#include "debug_tools_fixture.hpp" -#include "gtest/gtest.h" -#include "debug_tools_test_utils.hpp" -#include -#include -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/stimulus.hpp" -////////////////////////////////////////////////////////////////////////////////////////// -// A test for checking dprint -////////////////////////////////////////////////////////////////////////////////////////// -using namespace tt; -using namespace tt::tt_metal; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -// Register names -#define ALU_CONFIG 0 -#define UNPACK_TILE_DESCRIPTOR 1 -#define UNPACK_CONFIG 2 -#define PACK_CONFIG 3 -#define RELU_CONFIG 4 -#define DEST_RD_CTRL 5 -#define PACK_EDGE_OFFSET 6 -#define PACK_COUNTERS 7 -#define PACK_STRIDES 8 - -// Type of prints -const std::unordered_set format_fields = {"ALU_FORMAT_SPEC_REG0_SrcA", "ALU_FORMAT_SPEC_REG1_SrcB", - "ALU_FORMAT_SPEC_REG2_Dstacc", "in_data_format", "out_data_format"}; -const std::unordered_set decimal_fields = { - "blobs_per_xy_plane", - "x_dim", - "y_dim", - "z_dim", - "w_dim", - "blobs_y_start", - "digest_size", - "upsample_rate", - "shift_amount", - "fifo_size", - "row_ptr_section_size", - "exp_section_size", - "pack_per_xy_plane", - "downsample_shift_count", - "exp_threshold", - "STACC_RELU_ReluThreshold", - "pack_reads_per_xy_plane", - "pack_xys_per_til", - "pack_per_xy_plane_offset", - "sub_l1_tile_header_size", - "add_tile_header_size"}; - -// ALU CONFIG -const std::vector field_names_alu_config_all = { - "ALU_ROUNDING_MODE_Fpu_srnd_en", - "ALU_ROUNDING_MODE_Gasket_srnd_en", - "ALU_ROUNDING_MODE_Packer_srnd_en", - "ALU_ROUNDING_MODE_Padding", - "ALU_ROUNDING_MODE_GS_LF", - "ALU_ROUNDING_MODE_Bfp8_HF", - "ALU_FORMAT_SPEC_REG0_SrcAUnsigned", - "ALU_FORMAT_SPEC_REG0_SrcBUnsigned", - "ALU_FORMAT_SPEC_REG0_SrcA", - "ALU_FORMAT_SPEC_REG1_SrcB", - "ALU_FORMAT_SPEC_REG2_Dstacc", - "ALU_ACC_CTRL_Fp32_enabled", - "ALU_ACC_CTRL_SFPU_Fp32_enabled", - "ALU_ACC_CTRL_INT8_math_enabled"}; -const std::vector field_values_alu_config_all = {1, 0, 1, 15, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1}; - -// PACK_EDGE_OFFSET -const std::vector field_names_pack_edge_offset_all = { - "mask", - "mode", - "tile_row_set_select_pack0", - "tile_row_set_select_pack1", - "tile_row_set_select_pack2", - "tile_row_set_select_pack3", - "reserved"}; -const std::vector field_values_pack_edge_offset_all = {16, 1, 0, 1, 2, 3, 0}; - -// PACK_COUNTERS -const std::vector field_names_pack_counters_all = { - "pack_per_xy_plane", - "pack_reads_per_xy_plane", - "pack_xys_per_til", - "pack_yz_transposed", - "pack_per_xy_plane_offset"}; -const std::vector field_values_pack_counters_all = {4, 8, 2, 0, 6}; - -// RELU_CONFIG -const std::vector field_names_relu_config_all = { - "ALU_ACC_CTRL_Zero_Flag_disabled_src", - "ALU_ACC_CTRL_Zero_Flag_disabled_dst", - "STACC_RELU_ApplyRelu", - "STACC_RELU_ReluThreshold", - "DISABLE_RISC_BP_Disable_main", - "DISABLE_RISC_BP_Disable_trisc", - "DISABLE_RISC_BP_Disable_ncrisc", - "DISABLE_RISC_BP_Disable_bmp_clear_main", - "DISABLE_RISC_BP_Disable_bmp_clear_trisc", - "DISABLE_RISC_BP_Disable_bmp_clear_ncrisc"}; -const std::vector field_values_relu_config_all = {0, 0, 1, 8, 0, 0, 0, 0, 0, 0}; - -// PACK_DEST_RD_CTRL -const std::vector field_names_dest_rd_ctrl_all = { - "PCK_DEST_RD_CTRL_Read_32b_data", - "PCK_DEST_RD_CTRL_Read_unsigned", - "PCK_DEST_RD_CTRL_Read_int8", - "PCK_DEST_RD_CTRL_Round_10b_mant", - 
"PCK_DEST_RD_CTRL_Reserved"}; -const std::vector field_values_dest_rd_ctrl_all = {1, 0, 1, 1, 0}; - -// UNPACK TILE DESCRIPTOR -const std::vector field_names_unpack_tile_descriptor_grayskull = { - "in_data_format", - "uncompressed", - "reserved_0", - "blobs_per_xy_plane", - "reserved_1", - "x_dim", - "y_dim", - "z_dim", - "w_dim", - "blobs_y_start", - "digest_type", - "digest_size"}; -const std::vector field_values_unpack_tile_descriptor_grayskull = {5, 1, 2, 10, 7, 2, 4, 8, 16, 32, 0, 0}; - -// UNPACK CONFIG -const std::vector field_names_unpack_config_grayskull = { - "out_data_format", - "throttle_mode", - "context_count", - "haloize_mode", - "tileize_mode", - "force_shared_exp", - "reserved_0", - "upsample_rate", - "upsample_and_interlave", - "shift_amount", - "uncompress_cntx0_3", - "reserved_1", - "uncompress_cntx4_7", - "reserved_2", - "limit_addr", - "fifo_size"}; -const std::vector field_values_unpack_config_grayskull = {0, 1, 2, 0, 1, 0, 0, 3, 0, 16, 5, 0, 2, 0, 28, 29}; - -// PACK CONFIG -const std::vector field_names_pack_config_grayskull = { - "row_ptr_section_size", - "exp_section_size", - "l1_dest_addr", - "uncompress", - "add_l1_dest_addr_offset", - "reserved_0", - "out_data_format", - "in_data_format", - "reserved_1", - "src_if_sel", - "pack_per_xy_plane", - "l1_src_addr", - "downsample_mask", - "downsample_shift_count", - "read_mode", - "exp_threshold_en", - "reserved_2", - "exp_threshold"}; -const std::vector field_values_pack_config_grayskull = { - 12, 24, 16, 0, 1, 0, 5, 5, 0, 1, 0, 8, 12, 4, 0, 1, 0, 12}; - -// UNPACK TILE DESCRIPTOR -const std::vector field_names_unpack_tile_descriptor_wormhole_or_blackhole = { - "in_data_format", - "uncompressed", - "reserved_0", - "blobs_per_xy_plane", - "reserved_1", - "x_dim", - "y_dim", - "z_dim", - "w_dim", - "blobs_y_start_lo", - "blobs_y_start_hi", - "digest_type", - "digest_size"}; -const std::vector field_values_unpack_tile_descriptor_wormhole_or_blackhole = { - 5, 1, 0, 10, 7, 2, 4, 8, 16, 32, 0, 0, 0}; - -// UNPACK CONFIG -const std::vector field_names_unpack_config_wormhole_or_blackhole = { - "out_data_format", - "throttle_mode", - "context_count", - "haloize_mode", - "tileize_mode", - "unpack_src_reg_set_update", - "unpack_if_sel", - "upsample_rate", - "reserved_1", - "upsample_and_interlave", - "shift_amount", - "uncompress_cntx0_3", - "unpack_if_sel_cntx0_3", - "force_shared_exp", - "reserved_2", - "uncompress_cntx4_7", - "unpack_if_sel_cntx4_7", - "reserved_3", - "limit_addr", - "reserved_4", - "fifo_size", - "reserved_5"}; -const std::vector field_values_unpack_config_wormhole_or_blackhole = {0, 1, 2, 0, 1, 1, 0, 3, 0, 0, 16, - 5, 6, 0, 0, 2, 3, 0, 28, 0, 29, 0}; - -const std::vector field_names_pack_config_blackhole = { - "row_ptr_section_size", - "exp_section_size", - "l1_dest_addr", - "uncompress", - "add_l1_dest_addr_offset", - "disable_pack_zero_flag", - "reserved_0", - "out_data_format", - "in_data_format", - "dis_shared_exp_assembler", - "auto_set_last_pacr_intf_sel", - "enable_out_fifo", - "sub_l1_tile_header_size", - "src_if_sel", - "pack_start_intf_pos", - "all_pack_disable_zero_compress_ovrd", - "add_tile_header_size", - "pack_dis_y_pos_start_offset", - "l1_src_addr"}; -const std::vector field_values_pack_config_blackhole = { - 12, 24, 16, 0, 1, 1, 0, 5, 5, 0, 0, 1, 0, 1, 2, 0, 1, 0, 8}; -// PACK CONFIG -const std::vector field_names_pack_config_wormhole = { - "row_ptr_section_size", - "exp_section_size", - "l1_dest_addr", - "uncompress", - "add_l1_dest_addr_offset", - "reserved_0", - "out_data_format", - 
"in_data_format", - "reserved_1", - "src_if_sel", - "pack_per_xy_plane", - "l1_src_addr", - "downsample_mask", - "downsample_shift_count", - "read_mode", - "exp_threshold_en", - "pack_l1_acc_disable_pack_zero_flag", - "reserved_2", - "exp_threshold"}; -const std::vector field_values_pack_config_wormhole = { - 12, 24, 16, 0, 1, 0, 5, 5, 0, 1, 0, 8, 12, 4, 0, 1, 2, 0, 12}; - -// Configuration for Data Flow Test involving Reader, Datacopy, and Writer -struct ConfigRegPrintTestConfig { - CoreCoord core = {}; - std::string write_kernel; - std::string print_kernel; - int num_of_registers; - std::vector field_names; - std::vector field_values; - uint32_t register_name; -}; - -// Dprints data format as string given an uint -static std::string data_format_to_string(uint8_t data_format) { - switch (data_format) { - case (uint8_t) DataFormat::Float32: - return "Float32"; - case (uint8_t) DataFormat::Float16: - return "Float16"; - case (uint8_t) DataFormat::Bfp8: - return "Bfp8"; - case (uint8_t) DataFormat::Bfp4: - return "Bfp4"; - case (uint8_t) DataFormat::Bfp2: - return "Bfp2"; - case (uint8_t) DataFormat::Float16_b: - return "Float16_b"; - case (uint8_t) DataFormat::Bfp8_b: - return "Bfp8_b"; - case (uint8_t) DataFormat::Bfp4_b: - return "Bfp4_b"; - case (uint8_t) DataFormat::Bfp2_b: - return "Bfp2_b"; - case (uint8_t) DataFormat::Lf8: - return "Lf8"; - case (uint8_t) DataFormat::Int8: - return "Int8"; - case (uint8_t) DataFormat::UInt8: - return "UInt8"; - case (uint8_t) DataFormat::UInt16: - return "UInt16"; - case (uint8_t) DataFormat::Int32: - return "Int32"; - case (uint8_t) DataFormat::UInt32: - return "UInt32"; - case (uint8_t) DataFormat::Tf32: - return "Tf32"; - default: - return "INVALID DATA FORMAT"; - } -} - -static std::string int_to_hex(int value) { - std::stringstream ss; - ss << std::hex << value; // Convert to hexadecimal - return ss.str(); -} - -// Prepares the compute kernel with the specified program and test configuration -static KernelHandle prepare_writer(tt_metal::Program& program, const ConfigRegPrintTestConfig& config) { - return tt_metal::CreateKernel( - program, - config.write_kernel, - config.core, - tt_metal::ComputeConfig{ - .compile_args = { config.register_name }}); -} - -static std::string generate_golden_output(const std::vector& field_names, const std::vector& values, uint num_of_registers, uint32_t register_name) { - std::string golden_output; - bool multiple_registers = num_of_registers > 1; - for (uint reg_id = 1; reg_id <= num_of_registers; reg_id++) { - if (multiple_registers) golden_output += "REG_ID: " + std::to_string(reg_id) + "\n"; - for (size_t i = 0; i < field_names.size(); i++) { - if (field_names[i] == "blobs_y_start_lo") continue; - if (field_names[i] == "blobs_y_start_hi") { - uint32_t val = (values[i] << 16) | values[i-1]; - golden_output += "blobs_y_start: " + std::to_string(val) + "\n"; - continue; - } - if (format_fields.find(field_names[i]) != format_fields.end()) - golden_output += field_names[i] + ": " + data_format_to_string(values[i]) + "\n"; - else if (decimal_fields.find(field_names[i]) != format_fields.end()) - golden_output += field_names[i] + ": " + std::to_string(values[i]) + "\n"; - else { - golden_output += field_names[i] + ": 0x" + int_to_hex(values[i]) + "\n"; - } - - if (register_name == PACK_EDGE_OFFSET && reg_id > 1) break; - } - if (reg_id != num_of_registers) golden_output += "\n"; - } - return golden_output; -} - -static void print_config_reg( - DPrintFixture* fixture, tt_metal::IDevice* device, const 
ConfigRegPrintTestConfig& config) { - // Create program - tt_metal::Program program = tt_metal::CreateProgram(); - - // Prepare write kernel - auto write_kernel = prepare_writer(program, config); - - // Generate golden output - std::string golden_output = generate_golden_output(config.field_names, config.field_values, config.num_of_registers, config.register_name); - - // Run the program - fixture->RunProgram(device, program); - - // Check the print log against golden output. - EXPECT_TRUE(FilesMatchesString(DPrintFixture::dprint_file_name, golden_output)); -} - -TEST_F(DPrintFixture, ConfigRegAluTestPrint) { - std::vector field_names_alu_config = field_names_alu_config_all; - std::vector field_values_alu_config = field_values_alu_config_all; - - // Setup test configuration - ConfigRegPrintTestConfig test_config = { - .core = CoreCoord(0, 0), - .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", - .num_of_registers = 1, - .field_names = field_names_alu_config, - .field_values = field_values_alu_config, - .register_name = ALU_CONFIG}; - - if (this->arch_ == ARCH::GRAYSKULL) { - GTEST_SKIP() << "Printing ALU CONFIG is not supported on grayskull."; - } - - // Run the test on the device - this->RunTestOnDevice( - [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, - this->devices_[0]); -} - -TEST_F(DPrintFixture, ConfigRegTileDescriptorTestPrint) { - // Setup test configuration - - std::vector field_names_unpack_tile_descriptor; - std::vector field_values_unpack_tile_descriptor; - - if (this->arch_ == ARCH::GRAYSKULL) { - field_names_unpack_tile_descriptor = field_names_unpack_tile_descriptor_grayskull; - field_values_unpack_tile_descriptor = field_values_unpack_tile_descriptor_grayskull; - } else { - field_names_unpack_tile_descriptor = field_names_unpack_tile_descriptor_wormhole_or_blackhole; - field_values_unpack_tile_descriptor = field_values_unpack_tile_descriptor_wormhole_or_blackhole; - } - - ConfigRegPrintTestConfig test_config = { - .core = CoreCoord(0, 0), - .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", - .num_of_registers = 2, - .field_names = field_names_unpack_tile_descriptor, - .field_values = field_values_unpack_tile_descriptor, - .register_name = UNPACK_TILE_DESCRIPTOR}; - - // Run the test on the device - this->RunTestOnDevice( - [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, - this->devices_[0]); -} - -TEST_F(DPrintFixture, ConfigRegUnpackTestPrint) { - std::vector field_names_unpack_config; - std::vector field_values_unpack_config; - - if (this->arch_ == ARCH::GRAYSKULL) { - field_names_unpack_config = field_names_unpack_config_grayskull; - field_values_unpack_config = field_values_unpack_config_grayskull; - } else { - field_names_unpack_config = field_names_unpack_config_wormhole_or_blackhole; - field_values_unpack_config = field_values_unpack_config_wormhole_or_blackhole; - } - - // Setup test configuration - ConfigRegPrintTestConfig test_config = { - .core = CoreCoord(0, 0), - .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", - .num_of_registers = 2, - .field_names = field_names_unpack_config, - .field_values = field_values_unpack_config, - .register_name = UNPACK_CONFIG}; - - // Run the test on the device - this->RunTestOnDevice( - [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, - this->devices_[0]); -} - 
-TEST_F(DPrintFixture, ConfigRegPackTestPrint) { - std::vector field_names_pack_config; - std::vector field_values_pack_config; - - if (this->arch_ == ARCH::GRAYSKULL) { - field_names_pack_config = field_names_pack_config_grayskull; - field_values_pack_config = field_values_pack_config_grayskull; - } else if (this->arch_ == ARCH::WORMHOLE_B0) { - field_names_pack_config = field_names_pack_config_wormhole; - field_values_pack_config = field_values_pack_config_wormhole; - } else { - field_names_pack_config = field_names_pack_config_blackhole; - field_values_pack_config = field_values_pack_config_blackhole; - } - - int num_of_registers; - if (this->arch_ == ARCH::BLACKHOLE) { - num_of_registers = 1; - } else { - num_of_registers = 4; - } - - // Setup test configuration - ConfigRegPrintTestConfig test_config = { - .core = CoreCoord(0, 0), - .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", - .num_of_registers = num_of_registers, - .field_names = field_names_pack_config, - .field_values = field_values_pack_config, - .register_name = PACK_CONFIG}; - - // Run the test on the device - this->RunTestOnDevice( - [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, - this->devices_[0]); -} - -TEST_F(DPrintFixture, ConfigRegReluTestPrint) { - std::vector field_names_relu_config = field_names_relu_config_all; - std::vector field_values_relu_config = field_values_relu_config_all; - - // Setup test configuration - ConfigRegPrintTestConfig test_config = { - .core = CoreCoord(0, 0), - .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", - .num_of_registers = 1, - .field_names = field_names_relu_config, - .field_values = field_values_relu_config, - .register_name = RELU_CONFIG}; - - if (this->arch_ == ARCH::GRAYSKULL) { - GTEST_SKIP() << "Printing RELU CONFIG is not supported on grayskull."; - } - - // Run the test on the device - this->RunTestOnDevice( - [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, - this->devices_[0]); -} - -TEST_F(DPrintFixture, ConfigRegDestRdCtrlTestPrint) { - std::vector field_names_dest_rd_ctrl = field_names_dest_rd_ctrl_all; - std::vector field_values_dest_rd_ctrl = field_values_dest_rd_ctrl_all; - - // Setup test configuration - ConfigRegPrintTestConfig test_config = { - .core = CoreCoord(0, 0), - .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", - .num_of_registers = 1, - .field_names = field_names_dest_rd_ctrl, - .field_values = field_values_dest_rd_ctrl, - .register_name = DEST_RD_CTRL}; - - if (this->arch_ == ARCH::GRAYSKULL) { - GTEST_SKIP() << "Printing DEST RD CTRL is not supported on grayskull."; - } - - // Run the test on the device - this->RunTestOnDevice( - [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, - this->devices_[0]); -} - -TEST_F(DPrintFixture, ConfigRegPackEdgeOffsetTestPrint) { - std::vector field_names_pack_edge_offset = field_names_pack_edge_offset_all; - std::vector field_values_pack_edge_offset = field_values_pack_edge_offset_all; - - int num_of_registers; - if (this->arch_ == ARCH::BLACKHOLE) { - num_of_registers = 1; - } else { - num_of_registers = 4; - } - - // Setup test configuration - ConfigRegPrintTestConfig test_config = { - .core = CoreCoord(0, 0), - .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", - .num_of_registers = num_of_registers, - .field_names = 
field_names_pack_edge_offset, - .field_values = field_values_pack_edge_offset, - .register_name = PACK_EDGE_OFFSET}; - - // Run the test on the device - this->RunTestOnDevice( - [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, - this->devices_[0]); -} - -TEST_F(DPrintFixture, ConfigRegPackCountersTestPrint) { - std::vector field_names_pack_counters = field_names_pack_counters_all; - std::vector field_values_pack_counters = field_values_pack_counters_all; - - int num_of_registers; - if (this->arch_ == ARCH::BLACKHOLE) { - num_of_registers = 1; - } else { - num_of_registers = 4; - } - - // Setup test configuration - ConfigRegPrintTestConfig test_config = { - .core = CoreCoord(0, 0), - .write_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp", - .num_of_registers = num_of_registers, - .field_names = field_names_pack_counters, - .field_values = field_values_pack_counters, - .register_name = PACK_COUNTERS}; - - // Run the test on the device - this->RunTestOnDevice( - [&](DPrintFixture* fixture, IDevice* device) { print_config_reg(fixture, device, test_config); }, - this->devices_[0]); -} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp deleted file mode 100644 index 8124417544a..00000000000 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_config_reg.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "debug/dprint_tensix_pack.h" -#include "debug/dprint_tensix_unpack.h" - -#include - -// Register names -#define ALU_CONFIG 0 -#define UNPACK_TILE_DESCRIPTOR 1 -#define UNPACK_CONFIG 2 -#define PACK_CONFIG 3 -#define RELU_CONFIG 4 -#define DEST_RD_CTRL 5 -#define PACK_EDGE_OFFSET 6 -#define PACK_COUNTERS 7 -#define PACK_STRIDES 8 - -namespace NAMESPACE { -#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) -void generate_alu_config(ckernel::unpacker::alu_config_t& config) { - config.ALU_ROUNDING_MODE_Fpu_srnd_en = 1; - config.ALU_ROUNDING_MODE_Gasket_srnd_en = 0; - config.ALU_ROUNDING_MODE_Packer_srnd_en = 1; - config.ALU_ROUNDING_MODE_Padding = 15; - config.ALU_ROUNDING_MODE_GS_LF = 0; - config.ALU_ROUNDING_MODE_Bfp8_HF = 1; - config.ALU_FORMAT_SPEC_REG0_SrcAUnsigned = 1; - config.ALU_FORMAT_SPEC_REG0_SrcBUnsigned = 0; - config.ALU_FORMAT_SPEC_REG0_SrcA = 0; - config.ALU_FORMAT_SPEC_REG1_SrcB = 1; - config.ALU_FORMAT_SPEC_REG2_Dstacc = 0; - config.ALU_ACC_CTRL_Fp32_enabled = 0; - config.ALU_ACC_CTRL_SFPU_Fp32_enabled = 0; - config.ALU_ACC_CTRL_INT8_math_enabled = 1; -} -#endif - -void generate_unpack_tile_descriptor(ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - tile_descriptor.in_data_format = 5; - tile_descriptor.uncompressed = 1; - tile_descriptor.reserved_0 = 0; - tile_descriptor.blobs_per_xy_plane = 10; - tile_descriptor.reserved_1 = 7; - tile_descriptor.x_dim = 2; - tile_descriptor.y_dim = 4; - tile_descriptor.z_dim = 8; - tile_descriptor.w_dim = 16; -#ifdef ARCH_GRAYSKULL - tile_descriptor.blobs_y_start = 32; -#else // ARCH_WORMHOLE or ARCH_BLACKHOLE - tile_descriptor.blobs_y_start_lo = 32; - tile_descriptor.blobs_y_start_hi = 0; -#endif - tile_descriptor.digest_type = 0; - tile_descriptor.digest_size = 0; -} - -void generate_unpack_config(ckernel::unpacker::unpack_config_t& config) { - config.out_data_format = 0; - config.throttle_mode = 1; - config.context_count = 2; - config.haloize_mode = 0; 
- config.tileize_mode = 1; - config.upsample_rate = 3; - config.reserved_1 = 0; - config.upsamle_and_interlave = 0; - config.shift_amount = 16; - config.uncompress_cntx0_3 = 5; - config.force_shared_exp = 0; - config.reserved_2 = 0; - config.uncompress_cntx4_7 = 2; - config.limit_addr = 28; - config.fifo_size = 29; - -#ifdef ARCH_GRAYSKULL - config.reserved_0 = 0; -#else // ARCH_WORMHOLE or ARCH_BLACKHOLE - config.reserved_3 = 0; - config.reserved_4 = 0; - config.reserved_5 = 0; - config.unpack_if_sel_cntx0_3 = 6; - config.unpack_if_sel_cntx4_7 = 3; - config.unpack_src_reg_set_update = 1; - config.unpack_if_sel = 0; -#endif -} - -void generate_pack_config(ckernel::packer::pack_config_t& config) { - config.row_ptr_section_size = 12; - config.exp_section_size = 24; - config.l1_dest_addr = 16; - config.uncompress = 0; - config.add_l1_dest_addr_offset = 1; - config.reserved_0 = 0; - config.out_data_format = 5; - config.in_data_format = 5; - config.src_if_sel = 1; - config.l1_src_addr = 8; -#if defined(ARCH_WORMHOLE) or defined(ARCH_GRAYSKULL) - config.reserved_1 = 0; - config.pack_per_xy_plane = 0; - config.downsample_mask = 12; - config.downsample_shift_count = 4; - config.read_mode = 0; - config.exp_threshold_en = 1; -#ifdef ARCH_WORMHOLE - config.pack_l1_acc_disable_pack_zero_flag = 2; -#endif - config.reserved_2 = 0; - config.exp_threshold = 12; -#endif -#ifdef ARCH_BLACKHOLE - config.disable_pack_zero_flag = 1; - config.dis_shared_exp_assembler = 0; - config.auto_set_last_pacr_intf_sel = 0; - config.enable_out_fifo = 1; - config.sub_l1_tile_header_size = 0; - config.pack_start_intf_pos = 2; - config.all_pack_disable_zero_compress_ovrd = 0; - config.add_tile_header_size = 1; - config.pack_dis_y_pos_start_offset = 0; -#endif -} - -#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) -void generate_relu_config(ckernel::packer::relu_config_t& config) { - config.ALU_ACC_CTRL_Zero_Flag_disabled_src = 0; - config.ALU_ACC_CTRL_Zero_Flag_disabled_dst = 0; - config.STACC_RELU_ApplyRelu = 1; - config.STACC_RELU_ReluThreshold = 8; - config.DISABLE_RISC_BP_Disable_main = 0; - config.DISABLE_RISC_BP_Disable_trisc = 0; - config.DISABLE_RISC_BP_Disable_ncrisc = 0; - config.DISABLE_RISC_BP_Disable_bmp_clear_main = 0; - config.DISABLE_RISC_BP_Disable_bmp_clear_trisc = 0; - config.DISABLE_RISC_BP_Disable_bmp_clear_ncrisc = 0; -} -#endif - -#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) -void generate_dest_rd_ctrl(ckernel::packer::dest_rd_ctrl_t& dest) { - dest.PCK_DEST_RD_CTRL_Read_32b_data = 1; - dest.PCK_DEST_RD_CTRL_Read_unsigned = 0; - dest.PCK_DEST_RD_CTRL_Read_int8 = 1; - dest.PCK_DEST_RD_CTRL_Round_10b_mant = 1; - dest.PCK_DEST_RD_CTRL_Reserved = 0; -} -#endif - -void generate_pack_edge_offset(ckernel::packer::pck_edge_offset_t& edge) { - edge.mask = 16; - edge.mode = 1; - edge.tile_row_set_select_pack0 = 0; - edge.tile_row_set_select_pack1 = 1; - edge.tile_row_set_select_pack2 = 2; - edge.tile_row_set_select_pack3 = 3; - edge.reserved = 0; -} - -void generate_pack_counters(ckernel::packer::pack_counters_t& counter) { - counter.pack_per_xy_plane = 4; - counter.pack_reads_per_xy_plane = 8; - counter.pack_xys_per_til = 2; - counter.pack_yz_transposed = 0; - counter.pack_per_xy_plane_offset = 6; -} - -#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) -void write_alu_config(volatile uint tt_reg_ptr* cfg, uint32_t address, const ckernel::unpacker::alu_config_u &config) { - cfg[address] = config.val; -} -#endif - -void write_unpack_tile_descriptor(volatile uint tt_reg_ptr* cfg, uint32_t 
address, uint num_of_words, const ckernel::unpacker::unpack_tile_descriptor_u &tile_descriptor) { - for (uint i = 0; i < num_of_words; i++) - cfg[address + i] = tile_descriptor.val[i]; -} - -void write_unpack_config(volatile uint tt_reg_ptr* cfg, uint32_t address, uint num_of_words, const ckernel::unpacker::unpack_config_u &config) { - for (uint i = 0; i < num_of_words; i++) - cfg[address + i] = config.val[i]; -} - -void write_pack_config(volatile uint tt_reg_ptr* cfg, uint32_t address, uint num_of_words, const ckernel::packer::pack_config_u &config) { - for (uint i = 0; i < num_of_words; i++) - cfg[address + i] = config.val[i]; -} - -#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) -void write_relu_config(volatile uint tt_reg_ptr* cfg, uint32_t address, uint num_of_words, const ckernel::packer::relu_config_u &config) { - for (uint i = 0; i < num_of_words; i++) - cfg[address + i] = config.val[i]; -} -#endif - -#if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) -void write_dest_rd_ctrl(volatile uint tt_reg_ptr* cfg, uint32_t address, const ckernel::packer::dest_rd_ctrl_u &dest) { - cfg[address] = dest.val; -} -#endif - -void write_pack_edge_offset(volatile uint tt_reg_ptr* cfg, uint32_t address, const ckernel::packer::pck_edge_offset_u &edge) { - cfg[address] = edge.val; -} - -void write_pack_counters(volatile uint tt_reg_ptr* cfg, uint32_t address, const ckernel::packer::pack_counters_u &counter) { - cfg[address] = counter.val; -} - -void MAIN { - uint32_t register_name = get_compile_time_arg_val(0); - - // Get pointer to registers for current state ID - volatile uint tt_reg_ptr* cfg = get_cfg_pointer(); - - switch (register_name) { - #if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) - case ALU_CONFIG: - ckernel::unpacker::alu_config_u alu_config; - generate_alu_config(alu_config.f); - ckernel::unpacker::alu_config_u alu_config_original; - alu_config_original.f = ckernel::unpacker::read_alu_config(); - write_alu_config(cfg, ALU_ROUNDING_MODE_Fpu_srnd_en_ADDR32, alu_config); - dprint_tensix_alu_config(); - write_alu_config(cfg, ALU_ROUNDING_MODE_Fpu_srnd_en_ADDR32, alu_config_original); - break; - #endif - case UNPACK_TILE_DESCRIPTOR: - ckernel::unpacker::unpack_tile_descriptor_u tile_descriptor; - generate_unpack_tile_descriptor(tile_descriptor.f); - std::array tile_descriptor_vec; - tile_descriptor_vec = ckernel::unpacker::read_unpack_tile_descriptor(); - write_unpack_tile_descriptor(cfg, THCON_SEC0_REG0_TileDescriptor_ADDR32, 4, tile_descriptor); - write_unpack_tile_descriptor(cfg, THCON_SEC1_REG0_TileDescriptor_ADDR32, 4, tile_descriptor); - dprint_tensix_unpack_tile_descriptor(); - tile_descriptor.f = tile_descriptor_vec[0]; - write_unpack_tile_descriptor(cfg, THCON_SEC0_REG0_TileDescriptor_ADDR32, 4, tile_descriptor); - tile_descriptor.f = tile_descriptor_vec[1]; - write_unpack_tile_descriptor(cfg, THCON_SEC1_REG0_TileDescriptor_ADDR32, 4, tile_descriptor); - break; - case UNPACK_CONFIG: - uint num_of_words_unpack_config; - #ifdef ARCH_GRAYSKULL - num_of_words_unpack_config = 3; - #else - num_of_words_unpack_config = 4; - #endif - ckernel::unpacker::unpack_config_u unpack_config; - generate_unpack_config(unpack_config.f); - std::array unpack_config_vec; - unpack_config_vec = ckernel::unpacker::read_unpack_config(); - write_unpack_config(cfg, THCON_SEC0_REG2_Out_data_format_ADDR32, num_of_words_unpack_config, unpack_config); - write_unpack_config(cfg, THCON_SEC1_REG2_Out_data_format_ADDR32, num_of_words_unpack_config, unpack_config); - dprint_tensix_unpack_config(); - 
unpack_config.f = unpack_config_vec[0]; - write_unpack_config(cfg, THCON_SEC0_REG2_Out_data_format_ADDR32, num_of_words_unpack_config, unpack_config); - unpack_config.f = unpack_config_vec[1]; - write_unpack_config(cfg, THCON_SEC1_REG2_Out_data_format_ADDR32, num_of_words_unpack_config, unpack_config); - break; - case PACK_CONFIG: - uint num_of_words_pack_config; - #ifdef ARCH_BLACKHOLE - num_of_words_pack_config = 3; - #else - num_of_words_pack_config = 4; - #endif - ckernel::packer::pack_config_u pack_config; - generate_pack_config(pack_config.f); - std::array pack_config_vec; - pack_config_vec = ckernel::packer::read_pack_config(); - write_pack_config(cfg, THCON_SEC0_REG1_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); - #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) - write_pack_config(cfg, THCON_SEC0_REG8_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); - write_pack_config(cfg, THCON_SEC1_REG1_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); - write_pack_config(cfg, THCON_SEC1_REG8_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); - #endif - dprint_tensix_pack_config(); - pack_config.f = pack_config_vec[0]; - write_pack_config(cfg, THCON_SEC0_REG1_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); - #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) - pack_config.f = pack_config_vec[1]; - write_pack_config(cfg, THCON_SEC0_REG8_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); - pack_config.f = pack_config_vec[2]; - write_pack_config(cfg, THCON_SEC1_REG1_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); - pack_config.f = pack_config_vec[3]; - write_pack_config(cfg, THCON_SEC1_REG8_Row_start_section_size_ADDR32, num_of_words_pack_config, pack_config); - #endif - break; - #if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) - case RELU_CONFIG: - ckernel::packer::relu_config_u relu_config; - generate_relu_config(relu_config.r); - ckernel::packer::relu_config_u relu_config_original; - relu_config_original.r = ckernel::packer::read_relu_config(); - write_relu_config(cfg, ALU_ACC_CTRL_Zero_Flag_disabled_src_ADDR32, 1, relu_config); - dprint_tensix_pack_relu_config(); - write_relu_config(cfg, ALU_ACC_CTRL_Zero_Flag_disabled_src_ADDR32, 1, relu_config_original); - break; - #endif - #if defined(ARCH_WORMHOLE) or defined(ARCH_BLACKHOLE) - case DEST_RD_CTRL: - ckernel::packer::dest_rd_ctrl_u dest; - generate_dest_rd_ctrl(dest.f); - ckernel::packer::dest_rd_ctrl_u dest_original; - dest_original.f = ckernel::packer::read_dest_rd_ctrl(); - write_dest_rd_ctrl(cfg, PCK_DEST_RD_CTRL_Read_32b_data_ADDR32, dest); - dprint_tensix_dest_rd_ctrl(); - write_dest_rd_ctrl(cfg, PCK_DEST_RD_CTRL_Read_32b_data_ADDR32, dest_original); - break; - #endif - case PACK_EDGE_OFFSET: - ckernel::packer::pck_edge_offset_u edge; - generate_pack_edge_offset(edge.f); - std::array edge_vec; - edge_vec = ckernel::packer::read_pack_edge_offset(); - write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC0_mask_ADDR32, edge); - #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) - write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC1_mask_ADDR32, edge); - write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC2_mask_ADDR32, edge); - write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC3_mask_ADDR32, edge); - #endif - dprint_tensix_pack_edge_offset(); - edge.f = edge_vec[0]; - write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC0_mask_ADDR32, edge); - #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) 
- edge.f = edge_vec[1]; - write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC1_mask_ADDR32, edge); - edge.f = edge_vec[2]; - write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC2_mask_ADDR32, edge); - edge.f = edge_vec[3]; - write_pack_edge_offset(cfg, PCK_EDGE_OFFSET_SEC3_mask_ADDR32, edge); - #endif - break; - case PACK_COUNTERS: - ckernel::packer::pack_counters_u counter; - generate_pack_counters(counter.f); - std::array counter_vec; - counter_vec = ckernel::packer::read_pack_counters(); - write_pack_counters(cfg, PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32, counter); - #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) - write_pack_counters(cfg, PACK_COUNTERS_SEC1_pack_per_xy_plane_ADDR32, counter); - write_pack_counters(cfg, PACK_COUNTERS_SEC2_pack_per_xy_plane_ADDR32, counter); - write_pack_counters(cfg, PACK_COUNTERS_SEC3_pack_per_xy_plane_ADDR32, counter); - #endif - dprint_tensix_pack_counters(); - counter.f = counter_vec[0]; - write_pack_counters(cfg, PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32, counter); - #if defined(ARCH_GRAYSKULL) or defined(ARCH_WORMHOLE) - counter.f = counter_vec[1]; - write_pack_counters(cfg, PACK_COUNTERS_SEC1_pack_per_xy_plane_ADDR32, counter); - counter.f = counter_vec[2]; - write_pack_counters(cfg, PACK_COUNTERS_SEC2_pack_per_xy_plane_ADDR32, counter); - counter.f = counter_vec[3]; - write_pack_counters(cfg, PACK_COUNTERS_SEC3_pack_per_xy_plane_ADDR32, counter); - #endif - break; - } -} -} // namespace NAMESPACE diff --git a/tt_metal/hw/inc/debug/dprint_tensix.h b/tt_metal/hw/inc/debug/dprint_tensix.h index 2ea056d80d6..4c1dead3047 100644 --- a/tt_metal/hw/inc/debug/dprint_tensix.h +++ b/tt_metal/hw/inc/debug/dprint_tensix.h @@ -41,63 +41,6 @@ inline void dprint_array_with_data_type(uint32_t data_format, uint32_t* data, ui << ENDL(); } -// Dprints data format as string given an uint -inline void dprint_data_format(uint8_t data_format) { - switch (data_format) { - case (uint8_t) DataFormat::Float32: - DPRINT << "Float32"; - break; - case (uint8_t) DataFormat::Float16: - DPRINT << "Float16"; - break; - case (uint8_t) DataFormat::Bfp8: - DPRINT << "Bfp8"; - break; - case (uint8_t) DataFormat::Bfp4: - DPRINT << "Bfp4"; - break; - case (uint8_t) DataFormat::Bfp2: - DPRINT << "Bfp2"; - break; - case (uint8_t) DataFormat::Float16_b: - DPRINT << "Float16_b"; - break; - case (uint8_t) DataFormat::Bfp8_b: - DPRINT << "Bfp8_b"; - break; - case (uint8_t) DataFormat::Bfp4_b: - DPRINT << "Bfp4_b"; - break; - case (uint8_t) DataFormat::Bfp2_b: - DPRINT << "Bfp2_b"; - break; - case (uint8_t) DataFormat::Lf8: - DPRINT << "Lf8"; - break; - case (uint8_t) DataFormat::Int8: - DPRINT << "Int8"; - break; - case (uint8_t) DataFormat::UInt8: - DPRINT << "UInt8"; - break; - case (uint8_t) DataFormat::UInt16: - DPRINT << "UInt16"; - break; - case (uint8_t) DataFormat::Int32: - DPRINT << "Int32"; - break; - case (uint8_t) DataFormat::UInt32: - DPRINT << "UInt32"; - break; - case (uint8_t) DataFormat::Tf32: - DPRINT << "Tf32"; - break; - default: - DPRINT << "INVALID DATA FORMAT"; - break; - } -} - // if flag DEST_ACCESS_CFG_remap_addrs is enabled // destination register row identifiers are remmaped // bits 5:3 are rotated 543 -> 354 @@ -254,23 +197,3 @@ void dprint_tensix_dest_reg(int tile_id = 0) { uint32_t reg_val = dbg_read_cfgreg(ckernel::dbg_cfgreg::bank, reg_field_name##_ADDR32); \ DPRINT << #reg_field_name << " = " << HEX() << reg_val << ENDL(); \ } - -// Print the content of the register field given the value in the register. 
-#define DPRINT_TENSIX_CONFIG_FIELD(reg_val, reg_field_name, name, printDec) \ - { \ - uint32_t field_value = (reg_val & reg_field_name##_MASK) >> reg_field_name##_SHAMT; \ - DPRINT << name << " = "; \ - if (printDec) DPRINT << DEC(); \ - else DPRINT << "0x" << HEX(); \ - DPRINT << field_value << "; "; \ - } - -inline void dprint_tensix_struct_field(uint32_t word, uint32_t mask, uint8_t shamt, const char* name, bool printDec = false) -{ - DPRINT << name << ": "; - if (printDec) DPRINT << DEC(); - else { - DPRINT << "0x" << HEX(); - } - DPRINT << ((word & mask) >> shamt) << ENDL(); -} diff --git a/tt_metal/hw/inc/debug/dprint_tensix_pack.h b/tt_metal/hw/inc/debug/dprint_tensix_pack.h deleted file mode 100644 index 7d55557c890..00000000000 --- a/tt_metal/hw/inc/debug/dprint_tensix_pack.h +++ /dev/null @@ -1,634 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -#include "dprint.h" -#include "dprint_tensix.h" -#include "cpack_common.h" - -// NOTE: FUNCTIONS WITHOUT HELPER SUFIX ARE INTENDED TO BE USED - -// PACK CONFIG - -// These function's argument should be return value of read_pack_config() - -inline void dprint_tensix_pack_config_row_ptr_section_size(const ckernel::packer::pack_config_t& config) { - DPRINT << DEC() << config.row_ptr_section_size << ENDL(); -} - -inline void dprint_tensix_pack_config_exp_section_size(const ckernel::packer::pack_config_t& config) { - DPRINT << DEC() << config.exp_section_size << ENDL(); -} - -inline void dprint_tensix_pack_config_l1_dest_addr(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.l1_dest_addr << ENDL(); -} - -inline void dprint_tensix_pack_config_uncompressed(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.uncompress << ENDL(); -} - -inline void dprint_tensix_pack_config_add_l1_dest_addr_offset(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.add_l1_dest_addr_offset << ENDL(); -} - -inline void dprint_tensix_pack_config_reserved_0(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.reserved_0 << ENDL(); -} - -inline void dprint_tensix_pack_config_out_data_format(const ckernel::packer::pack_config_t& config) { - dprint_data_format(config.out_data_format); - DPRINT << ENDL(); -} - -inline void dprint_tensix_pack_config_in_data_format(const ckernel::packer::pack_config_t& config) { - dprint_data_format(config.in_data_format); - DPRINT << ENDL(); -} - -#if defined(ARCH_GRAYSKULL) || defined(ARCH_WORMHOLE) -inline void dprint_tensix_pack_config_reserved_1(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.reserved_1 << ENDL(); -} -#endif - -inline void dprint_tensix_pack_config_src_if_sel(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.src_if_sel << ENDL(); -} - -#if defined(ARCH_GRAYSKULL) || defined(ARCH_WORMHOLE) -inline void dprint_tensix_pack_config_pack_per_xy_plane(const ckernel::packer::pack_config_t& config) { - DPRINT << DEC() << config.pack_per_xy_plane << ENDL(); -} -#endif - -inline void dprint_tensix_pack_config_l1_src_addr(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.l1_src_addr << ENDL(); -} - -#if defined(ARCH_GRAYSKULL) || defined(ARCH_WORMHOLE) -inline void dprint_tensix_pack_config_downsample_mask(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.downsample_mask << ENDL(); 
-} - -inline void dprint_tensix_pack_config_downsample_shift_count(const ckernel::packer::pack_config_t& config) { - DPRINT << DEC() << config.downsample_shift_count << ENDL(); -} - -inline void dprint_tensix_pack_config_read_mode(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.read_mode << ENDL(); -} - -inline void dprint_tensix_pack_config_exp_threshold_en(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.exp_threshold_en << ENDL(); -} - -inline void dprint_tensix_pack_config_reserved_2(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.reserved_2 << ENDL(); -} - -inline void dprint_tensix_pack_config_exp_threshold(const ckernel::packer::pack_config_t& config) { - DPRINT << DEC() << config.exp_threshold << ENDL(); -} -#endif - -#ifdef ARCH_WORMHOLE -inline void dprint_tensix_pack_config_l1_acc_disable_pack_zero_flag(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.pack_l1_acc_disable_pack_zero_flag << ENDL(); -} -#endif - -#ifdef ARCH_BLACKHOLE -inline void dprint_tensix_pack_config_disable_pack_zero_flag(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.disable_pack_zero_flag << ENDL(); -} - -inline void dprint_tensix_pack_config_dis_shared_exp_assembler(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.dis_shared_exp_assembler << ENDL(); -} - -inline void dprint_tensix_pack_config_auto_set_last_pacr_intf_sel(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.auto_set_last_pacr_intf_sel << ENDL(); -} - -inline void dprint_tensix_pack_config_enable_out_fifo(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.enable_out_fifo << ENDL(); -} - -inline void dprint_tensix_pack_config_sub_l1_tile_header_size(const ckernel::packer::pack_config_t& config) { - DPRINT << DEC() << config.sub_l1_tile_header_size << ENDL(); -} - -inline void dprint_tensix_pack_config_pack_start_intf_pos(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.pack_start_intf_pos << ENDL(); -} - -inline void dprint_tensix_pack_config_all_pack_disable_zero_compress_ovrd( - const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.all_pack_disable_zero_compress_ovrd << ENDL(); -} - -inline void dprint_tensix_pack_config_add_tile_header_size(const ckernel::packer::pack_config_t& config) { - DPRINT << DEC() << config.add_tile_header_size << ENDL(); -} - -inline void dprint_tensix_pack_config_pack_dis_y_pos_start_offset(const ckernel::packer::pack_config_t& config) { - DPRINT << "0x" << HEX() << config.pack_dis_y_pos_start_offset << ENDL(); -} -#endif - -#ifdef ARCH_GRAYSKULL - -inline void dprint_tensix_pack_config_helper(const ckernel::packer::pack_config_t& config) { - DPRINT << "row_ptr_section_size: "; - dprint_tensix_pack_config_row_ptr_section_size(config); - DPRINT << "exp_section_size: "; - dprint_tensix_pack_config_exp_section_size(config); - DPRINT << "l1_dest_addr: "; - dprint_tensix_pack_config_l1_dest_addr(config); - DPRINT << "uncompress: "; - dprint_tensix_pack_config_uncompress(config); - DPRINT << "add_l1_dest_addr_offset: "; - dprint_tensix_pack_config_add_l1_dest_addr_offset(config); - DPRINT << "reserved_0: "; - dprint_tensix_pack_config_reserved_0(config); - DPRINT << "out_data_format: "; - dprint_tensix_pack_config_out_data_format(config); - DPRINT << "in_data_format: "; - 
dprint_tensix_pack_config_in_data_format(config); - DPRINT << "reserved_1: "; - dprint_tensix_pack_config_reserved_1(config); - DPRINT << "src_if_sel: "; - dprint_tensix_pack_config_src_if_sel(config); - DPRINT << "pack_per_xy_plane: "; - dprint_tensix_pack_config_pack_per_xy_plane(config); - DPRINT << "l1_src_addr: "; - dprint_tensix_pack_conifg_l1_src_addr(config); - DPRINT << "downsample_mask: "; - dprint_tensix_pack_config_downsample_mask(config); - DPRINT << "downsample_shift_count: "; - dprint_tensix_pack_config_downsample_shift_count(config); - DPRINT << "read_mode: "; - dprint_tensix_pack_config_read_mode(config); - DPRINT << "exp_threshold_en: "; - dprint_tensix_pack_config_exp_threshold_en(config); - DPRINT << "reserved_2: "; - dprint_tensix_pack_config_reserved_2(config); - DPRINT << "exp_threshold: "; - dprint_tensix_pack_config_exp_threshold(config); -} - -#else // ARCH_WORMHOLE or ARCH_BLACKHOLE - -#ifdef ARCH_WORMHOLE -inline void dprint_tensix_pack_config_helper(const ckernel::packer::pack_config_t& config) { - DPRINT << "row_ptr_section_size: "; - dprint_tensix_pack_config_row_ptr_section_size(config); - DPRINT << "exp_section_size: "; - dprint_tensix_pack_config_exp_section_size(config); - DPRINT << "l1_dest_addr: "; - dprint_tensix_pack_config_l1_dest_addr(config); - DPRINT << "uncompress: "; - dprint_tensix_pack_config_uncompressed(config); - DPRINT << "add_l1_dest_addr_offset: "; - dprint_tensix_pack_config_add_l1_dest_addr_offset(config); - DPRINT << "reserved_0: "; - dprint_tensix_pack_config_reserved_0(config); - DPRINT << "out_data_format: "; - dprint_tensix_pack_config_out_data_format(config); - DPRINT << "in_data_format: "; - dprint_tensix_pack_config_in_data_format(config); - DPRINT << "reserved_1: "; - dprint_tensix_pack_config_reserved_1(config); - DPRINT << "src_if_sel: "; - dprint_tensix_pack_config_src_if_sel(config); - DPRINT << "pack_per_xy_plane: "; - dprint_tensix_pack_config_pack_per_xy_plane(config); - DPRINT << "l1_src_addr: "; - dprint_tensix_pack_config_l1_src_addr(config); - DPRINT << "downsample_mask: "; - dprint_tensix_pack_config_downsample_mask(config); - DPRINT << "downsample_shift_count: "; - dprint_tensix_pack_config_downsample_shift_count(config); - DPRINT << "read_mode: "; - dprint_tensix_pack_config_read_mode(config); - DPRINT << "exp_threshold_en: "; - dprint_tensix_pack_config_exp_threshold_en(config); - DPRINT << "pack_l1_acc_disable_pack_zero_flag: "; - dprint_tensix_pack_config_l1_acc_disable_pack_zero_flag(config); - DPRINT << "reserved_2: "; - dprint_tensix_pack_config_reserved_2(config); - DPRINT << "exp_threshold: "; - dprint_tensix_pack_config_exp_threshold(config); -} -#endif // ARCH_WORMHOLE - -#ifdef ARCH_BLACKHOLE -inline void dprint_tensix_pack_config_helper(const ckernel::packer::pack_config_t& config) { - DPRINT << "row_ptr_section_size: "; - dprint_tensix_pack_config_row_ptr_section_size(config); - DPRINT << "exp_section_size: "; - dprint_tensix_pack_config_exp_section_size(config); - DPRINT << "l1_dest_addr: "; - dprint_tensix_pack_config_l1_dest_addr(config); - DPRINT << "uncompress: "; - dprint_tensix_pack_config_uncompressed(config); - DPRINT << "add_l1_dest_addr_offset: "; - dprint_tensix_pack_config_add_l1_dest_addr_offset(config); - DPRINT << "disable_pack_zero_flag: "; - dprint_tensix_pack_config_disable_pack_zero_flag(config); - DPRINT << "reserved_0: "; - dprint_tensix_pack_config_reserved_0(config); - DPRINT << "out_data_format: "; - dprint_tensix_pack_config_out_data_format(config); - DPRINT << 
"in_data_format: "; - dprint_tensix_pack_config_in_data_format(config); - DPRINT << "dis_shared_exp_assembler: "; - dprint_tensix_pack_config_dis_shared_exp_assembler(config); - DPRINT << "auto_set_last_pacr_intf_sel: "; - dprint_tensix_pack_config_auto_set_last_pacr_intf_sel(config); - DPRINT << "enable_out_fifo: "; - dprint_tensix_pack_config_enable_out_fifo(config); - DPRINT << "sub_l1_tile_header_size: "; - dprint_tensix_pack_config_sub_l1_tile_header_size(config); - DPRINT << "src_if_sel: "; - dprint_tensix_pack_config_src_if_sel(config); - DPRINT << "pack_start_intf_pos: "; - dprint_tensix_pack_config_pack_start_intf_pos(config); - DPRINT << "all_pack_disable_zero_compress_ovrd: "; - dprint_tensix_pack_config_all_pack_disable_zero_compress_ovrd(config); - DPRINT << "add_tile_header_size: "; - dprint_tensix_pack_config_add_tile_header_size(config); - DPRINT << "pack_dis_y_pos_start_offset: "; - dprint_tensix_pack_config_pack_dis_y_pos_start_offset(config); - DPRINT << "l1_src_addr: "; - dprint_tensix_pack_config_l1_src_addr(config); -} -#endif // ARCH_BLACKHOLE - -// PACK RELU CONFIG - -// These functions' argument should be return value of read_relu_config() - -inline void dprint_tensix_pack_relu_config_alu_acc_ctrl_zero_flag_disabled_src( - const ckernel::packer::relu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_Zero_Flag_disabled_src << ENDL(); -} - -inline void dprint_tensix_pack_relu_config_alu_acc_ctrl_zero_flag_disabled_dst( - const ckernel::packer::relu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_Zero_Flag_disabled_dst << ENDL(); -} - -inline void dprint_tensix_pack_relu_config_stacc_relu_apply_relu(const ckernel::packer::relu_config_t& config) { - DPRINT << "0x" << HEX() << config.STACC_RELU_ApplyRelu << ENDL(); -} - -inline void dprint_tensix_pack_relu_config_stacc_relu_relu_threshold(const ckernel::packer::relu_config_t& config) { - DPRINT << DEC() << config.STACC_RELU_ReluThreshold << ENDL(); -} - -inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_main(const ckernel::packer::relu_config_t& config) { - DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_main << ENDL(); -} - -inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_trisc(const ckernel::packer::relu_config_t& config) { - DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_trisc << ENDL(); -} - -inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_ncrisc( - const ckernel::packer::relu_config_t& config) { - DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_ncrisc << ENDL(); -} - -inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_main( - const ckernel::packer::relu_config_t& config) { - DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_bmp_clear_main << ENDL(); -} - -inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_trisc( - const ckernel::packer::relu_config_t& config) { - DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_bmp_clear_trisc << ENDL(); -} - -inline void dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_ncrisc( - const ckernel::packer::relu_config_t& config) { - DPRINT << "0x" << HEX() << config.DISABLE_RISC_BP_Disable_bmp_clear_ncrisc << ENDL(); -} - -inline void dprint_tensix_pack_relu_config() { - MATH(ckernel::packer::relu_config_t config = ckernel::packer::read_relu_config(); - - DPRINT << "ALU_ACC_CTRL_Zero_Flag_disabled_src: "; - 
dprint_tensix_pack_relu_config_alu_acc_ctrl_zero_flag_disabled_src(config); - DPRINT << "ALU_ACC_CTRL_Zero_Flag_disabled_dst: "; - dprint_tensix_pack_relu_config_alu_acc_ctrl_zero_flag_disabled_dst(config); - DPRINT << "STACC_RELU_ApplyRelu: "; - dprint_tensix_pack_relu_config_stacc_relu_apply_relu(config); - DPRINT << "STACC_RELU_ReluThreshold: "; - dprint_tensix_pack_relu_config_stacc_relu_relu_threshold(config); - DPRINT << "DISABLE_RISC_BP_Disable_main: "; - dprint_tensix_pack_relu_config_disable_risc_bp_disable_main(config); - DPRINT << "DISABLE_RISC_BP_Disable_trisc: "; - dprint_tensix_pack_relu_config_disable_risc_bp_disable_trisc(config); - DPRINT << "DISABLE_RISC_BP_Disable_ncrisc: "; - dprint_tensix_pack_relu_config_disable_risc_bp_disable_ncrisc(config); - DPRINT << "DISABLE_RISC_BP_Disable_bmp_clear_main: "; - dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_main(config); - DPRINT << "DISABLE_RISC_BP_Disable_bmp_clear_trisc: "; - dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_trisc(config); - DPRINT << "DISABLE_RISC_BP_Disable_bmp_clear_ncrisc: "; - dprint_tensix_pack_relu_config_disable_risc_bp_disable_bmp_clear_ncrisc(config);) -} - -// PACK DEST RD CTRL - -// These functions' argument should be return value of read_dest_rd_ctrl() - -inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_32b_data( - const ckernel::packer::dest_rd_ctrl_t& dest) { - DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Read_32b_data << ENDL(); -} - -inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_unsigned( - const ckernel::packer::dest_rd_ctrl_t& dest) { - DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Read_unsigned << ENDL(); -} - -inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_int8(const ckernel::packer::dest_rd_ctrl_t& dest) { - DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Read_int8 << ENDL(); -} - -inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_round_10b_mant( - const ckernel::packer::dest_rd_ctrl_t& dest) { - DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Round_10b_mant << ENDL(); -} - -inline void dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_reserved(const ckernel::packer::dest_rd_ctrl_t& dest) { - DPRINT << "0x" << HEX() << dest.PCK_DEST_RD_CTRL_Reserved << ENDL(); -} - -// Printing dest control bits -inline void dprint_tensix_dest_rd_ctrl() { - PACK(ckernel::packer::dest_rd_ctrl_t dest = ckernel::packer::read_dest_rd_ctrl(); - - DPRINT << "PCK_DEST_RD_CTRL_Read_32b_data: "; - dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_32b_data(dest); - DPRINT << "PCK_DEST_RD_CTRL_Read_unsigned: "; - dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_unsigned(dest); - DPRINT << "PCK_DEST_RD_CTRL_Read_int8: "; - dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_read_int8(dest); - DPRINT << "PCK_DEST_RD_CTRL_Round_10b_mant: "; - dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_round_10b_mant(dest); - DPRINT << "PCK_DEST_RD_CTRL_Reserved: "; - dprint_tensix_pack_dest_rd_ctrl_pck_dest_rd_ctrl_reserved(dest);) -} - -#endif // END OF ELSE - -// PACK STRIDES -#ifdef ARCH_BLACKHOLE -inline void dprint_tensix_pack_strides_x_stride(const uint32_t& word) { - dprint_tensix_struct_field(word, 0xffff, 0, "x_stride", true); // decimal -} - -inline void dprint_tensix_pack_strides_y_stride(const uint32_t& word) { - dprint_tensix_struct_field(word, 0xffff0000, 16, "y_stride", true); // decimal -} - -inline void dprint_tensix_pack_strides_z_stride(const uint32_t& word) { - dprint_tensix_struct_field(word, 
0xffff, 0, "z_stride", true); // decimal -} - -inline void dprint_tensix_pack_strides_w_stride(const uint32_t& word) { - dprint_tensix_struct_field(word, 0xffff0000, 16, "w_stride", true); // decimal -} -#else -inline void dprint_tensix_pack_strides_x_stride(const uint32_t& word) { - dprint_tensix_struct_field(word, 0xffff, 0, "x_stride", true); // decimal -} - -inline void dprint_tensix_pack_strides_y_stride(const uint32_t& word) { - dprint_tensix_struct_field(word, 0xffff0000, 16, "y_stride", true); // decimal -} - -inline void dprint_tensix_pack_strides_z_stride(const uint32_t& word) { - dprint_tensix_struct_field(word, 0xffff, 0, "z_stride", true); // decimal -} - -inline void dprint_tensix_pack_strides_w_stride(const uint32_t& word) { - dprint_tensix_struct_field(word, 0xffff0000, 16, "w_stride", true); // decimal -} -#endif - -// Printing packer strides -inline void dprint_tensix_pack_strides_helper(uint reg_id, const volatile uint tt_reg_ptr* cfg) { - uint32_t reg_addr = 0; - switch (reg_id) { - case 1: reg_addr = PCK0_ADDR_CTRL_XY_REG_0_Xstride_ADDR32; break; - case 2: reg_addr = PCK0_ADDR_CTRL_XY_REG_1_Xstride_ADDR32; break; - default: DPRINT << "Aborting! Invalid register id (valid ids are between 1 and 2)" << ENDL(); break; - } - - // word 0 xy_stride - uint32_t word = cfg[reg_addr]; - dprint_tensix_pack_strides_x_stride(word); - dprint_tensix_pack_strides_y_stride(word); - - // word 1 zw_stride - word = cfg[reg_addr + 1]; - dprint_tensix_pack_strides_z_stride(word); - dprint_tensix_pack_strides_w_stride(word); -} - -// PCK_EDGE_OFFSET - -// These function's argument should be return value of read_pack_edge_offset() - -inline void dprint_tensix_pack_edge_offset_mask(const ckernel::packer::pck_edge_offset_t& edge) { - DPRINT << "0x" << HEX() << edge.mask << ENDL(); -} - -inline void dprint_tensix_pack_edge_offset_mode(const ckernel::packer::pck_edge_offset_t& edge) { - DPRINT << "0x" << HEX() << edge.mode << ENDL(); -} - -inline void dprint_tensix_pack_edge_offset_tile_row_set_select_pack0(const ckernel::packer::pck_edge_offset_t& edge) { - DPRINT << "0x" << HEX() << edge.tile_row_set_select_pack0 << ENDL(); -} - -inline void dprint_tensix_pack_edge_offset_tile_row_set_select_pack1(const ckernel::packer::pck_edge_offset_t& edge) { - DPRINT << "0x" << HEX() << edge.tile_row_set_select_pack1 << ENDL(); -} - -inline void dprint_tensix_pack_edge_offset_tile_row_set_select_pack2(const ckernel::packer::pck_edge_offset_t& edge) { - DPRINT << "0x" << HEX() << edge.tile_row_set_select_pack2 << ENDL(); -} - -inline void dprint_tensix_pack_edge_offset_tile_row_set_select_pack3(const ckernel::packer::pck_edge_offset_t& edge) { - DPRINT << "0x" << HEX() << edge.tile_row_set_select_pack3 << ENDL(); -} - -inline void dprint_tensix_pack_edge_offset_reserved(const ckernel::packer::pck_edge_offset_t& edge) { - DPRINT << "0x" << HEX() << edge.reserved << ENDL(); -} - -// Printing packer edge offset -inline void dprint_tensix_pack_edge_offset_helper(const ckernel::packer::pck_edge_offset_t& edge, uint reg_id) { - DPRINT << "mask: "; - dprint_tensix_pack_edge_offset_mask(edge); - if (reg_id == 1) { - DPRINT << "mode: "; - dprint_tensix_pack_edge_offset_mode(edge); - DPRINT << "tile_row_set_select_pack0: "; - dprint_tensix_pack_edge_offset_tile_row_set_select_pack0(edge); - DPRINT << "tile_row_set_select_pack1: "; - dprint_tensix_pack_edge_offset_tile_row_set_select_pack1(edge); - DPRINT << "tile_row_set_select_pack2: "; - dprint_tensix_pack_edge_offset_tile_row_set_select_pack2(edge); - DPRINT << 
"tile_row_set_select_pack3: "; - dprint_tensix_pack_edge_offset_tile_row_set_select_pack3(edge); - DPRINT << "reserved: "; - dprint_tensix_pack_edge_offset_reserved(edge); - } -} - -// Choose what register you want printed with reg_id (1-4), 0 for all -inline void dprint_tensix_pack_edge_offset(uint reg_id = 0) { - std::array edge_vec; - PACK( - edge_vec = ckernel::packer::read_pack_edge_offset(); - if (reg_id >= 1 && reg_id <= ckernel::packer::NUM_PACKERS) { - if (ckernel::packer::NUM_PACKERS > 1) { - DPRINT << "REG_ID: " << reg_id << ENDL(); - } - dprint_tensix_pack_edge_offset_helper(edge_vec[reg_id - 1], reg_id); - } - // Print all registers - else if (reg_id == 0) { - for (uint i = 1; i <= ckernel::packer::NUM_PACKERS; i++) { - if (ckernel::packer::NUM_PACKERS > 1) { - DPRINT << "REG_ID: " << i << ENDL(); - } - dprint_tensix_pack_edge_offset_helper(edge_vec[i - 1], i); - if (i != ckernel::packer::NUM_PACKERS) { - DPRINT << ENDL(); - } - } - } else DPRINT - << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::packer::NUM_PACKERS << "." - << ENDL();) -} - -// PACK COUNTERS - -// These functions' argument should be return value of read_pack_counters() - -inline void dprint_tensix_pack_counters_pack_per_xy_plane(const ckernel::packer::pack_counters_t& counters) { - DPRINT << DEC() << counters.pack_per_xy_plane << ENDL(); -} - -inline void dprint_tensix_pack_counters_pack_reads_per_xy_plane(const ckernel::packer::pack_counters_t& counters) { - DPRINT << DEC() << counters.pack_reads_per_xy_plane << ENDL(); -} - -inline void dprint_tensix_pack_counters_pack_xys_per_til(const ckernel::packer::pack_counters_t& counters) { - DPRINT << DEC() << counters.pack_xys_per_til << ENDL(); -} - -inline void dprint_tensix_pack_counters_pack_yz_transposed(const ckernel::packer::pack_counters_t& counters) { - DPRINT << "0x" << HEX() << counters.pack_yz_transposed << ENDL(); -} - -inline void dprint_tensix_pack_counters_pack_per_xy_plane_offset(const ckernel::packer::pack_counters_t& counters) { - DPRINT << DEC() << counters.pack_per_xy_plane_offset << ENDL(); -} - -// Printing packer counters -inline void dprint_tensix_pack_counters_helper(const ckernel::packer::pack_counters_t& counters) { - DPRINT << "pack_per_xy_plane: "; - dprint_tensix_pack_counters_pack_per_xy_plane(counters); - DPRINT << "pack_reads_per_xy_plane: "; - dprint_tensix_pack_counters_pack_reads_per_xy_plane(counters); - DPRINT << "pack_xys_per_til: "; - dprint_tensix_pack_counters_pack_xys_per_til(counters); - DPRINT << "pack_yz_transposed: "; - dprint_tensix_pack_counters_pack_yz_transposed(counters); - DPRINT << "pack_per_xy_plane_offset: "; - dprint_tensix_pack_counters_pack_per_xy_plane_offset(counters); -} - -// Choose what register you want printed with reg_id (1-4), 0 for all -inline void dprint_tensix_pack_counters(uint reg_id = 0) { - std::array counters_vec; - PACK( - counters_vec = ckernel::packer::read_pack_counters(); - if (reg_id >= 1 && reg_id <= ckernel::packer::NUM_PACKERS) { - if (ckernel::packer::NUM_PACKERS > 1) { - DPRINT << "REG_ID: " << reg_id << ENDL(); - } - dprint_tensix_pack_counters_helper(counters_vec[reg_id - 1]); - } - // Print all registers - else if (reg_id == 0) { - for (uint i = 1; i <= ckernel::packer::NUM_PACKERS; i++) { - if (ckernel::packer::NUM_PACKERS > 1) { - DPRINT << "REG_ID: " << i << ENDL(); - } - dprint_tensix_pack_counters_helper(counters_vec[i - 1]); - if (i != ckernel::packer::NUM_PACKERS) { - DPRINT << ENDL(); - } - } - } else DPRINT - << "INVALID REGISTER ID! 
PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::packer::NUM_PACKERS << "." - << ENDL();) -} - -// Choose what register you want by id (1-4). 0 for all. -inline void dprint_tensix_pack_config(uint reg_id = 0) { - std::array config_vec; - MATH( - config_vec = ckernel::packer::read_pack_config(); if (reg_id >= 1 && reg_id <= ckernel::packer::NUM_PACKERS) { - if (ckernel::packer::NUM_PACKERS > 1) { - DPRINT << "REG_ID: " << reg_id << ENDL(); - } - dprint_tensix_pack_config_helper(config_vec[reg_id - 1]); - } else if (reg_id == 0) for (uint i = 1; i <= ckernel::packer::NUM_PACKERS; i++) { - if (ckernel::packer::NUM_PACKERS > 1) { - DPRINT << "REG_ID: " << i << ENDL(); - } - dprint_tensix_pack_config_helper(config_vec[i - 1]); - if (i != ckernel::packer::NUM_PACKERS) { - DPRINT << ENDL(); - } - } else DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " - << ckernel::packer::NUM_PACKERS << "." << ENDL();) -} - -// Choose what register you want printed (1-2). 0 for all. -inline void dprint_tensix_pack_strides(uint reg_id = 0) { - PACK( - // Get pointer to registers for current state ID - volatile uint tt_reg_ptr* cfg = get_cfg_pointer(); - - if (reg_id >= 1 && reg_id <= 2) { - DPRINT << "REG_ID: " << reg_id << ENDL(); - dprint_tensix_pack_strides_helper(reg_id, cfg); - } - // Print all registers - else if (reg_id == 0) { - for (uint i = 1; i <= 2; i++) { - DPRINT << "REG_ID: " << i << ENDL(); - dprint_tensix_pack_strides_helper(i, cfg); - if (i != 2) { - DPRINT << ENDL(); - } - } - } else DPRINT - << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND 2." << ENDL();) -} diff --git a/tt_metal/hw/inc/debug/dprint_tensix_unpack.h b/tt_metal/hw/inc/debug/dprint_tensix_unpack.h deleted file mode 100644 index 261797fa86d..00000000000 --- a/tt_metal/hw/inc/debug/dprint_tensix_unpack.h +++ /dev/null @@ -1,508 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -#include "dprint.h" -#include "dprint_tensix.h" -#include "cunpack_common.h" - -// NOTE: FUNCTIONS WITHOUT HELPER SUFIX ARE INTENDED TO BE USED - -// UNPACK TILE DESCRIPTOR - -// These function's argument should be return value of read_unpack_tile_descriptor() - -inline void dprint_tensix_unpack_tile_descriptor_in_data_format( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - dprint_data_format(tile_descriptor.in_data_format); - DPRINT << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_uncompressed( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << "0x" << HEX() << tile_descriptor.uncompressed << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_reserved_0( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << "0x" << HEX() << tile_descriptor.reserved_0 << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_blobs_per_xy_plane( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << DEC() << tile_descriptor.blobs_per_xy_plane << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_reserved_1( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << "0x" << HEX() << tile_descriptor.reserved_1 << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_x_dim( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << DEC() << tile_descriptor.x_dim << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_y_dim( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << DEC() << tile_descriptor.y_dim << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_z_dim( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << DEC() << tile_descriptor.z_dim << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_w_dim( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << DEC() << tile_descriptor.w_dim << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_blobs_y_start( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { -#ifdef ARCH_GRAYSKULL - DPRINT << DEC() << tile_descriptor.blobs_y_start << ENDL(); -#else - DPRINT << DEC() << ((tile_descriptor.blobs_y_start_hi << 16) | tile_descriptor.blobs_y_start_lo) << ENDL(); -#endif -} - -inline void dprint_tensix_unpack_tile_descriptor_digest_type( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << "0x" << HEX() << tile_descriptor.digest_type << ENDL(); -} - -inline void dprint_tensix_unpack_tile_descriptor_digest_size( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << DEC() << tile_descriptor.digest_size << ENDL(); -} - -// UNPACK CONFIG - -// These function's argument should be return value of read_unpack_config() - -inline void dprint_tensix_unpack_config_out_data_format(const ckernel::unpacker::unpack_config_t& config) { - dprint_data_format(config.out_data_format); - DPRINT << ENDL(); -} - -inline void dprint_tensix_unpack_config_throttle_mode(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.throttle_mode << ENDL(); -} - -inline void dprint_tensix_unpack_config_context_count(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.context_count << ENDL(); -} - -inline void 
dprint_tensix_unpack_config_haloize_mode(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.haloize_mode << ENDL(); -} - -inline void dprint_tensix_unpack_config_tileize_mode(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.tileize_mode << ENDL(); -} - -inline void dprint_tensix_unpack_config_force_shared_exp(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.force_shared_exp << ENDL(); -} - -#ifdef ARCH_GRAYSKULL -inline void dprint_tensix_unpack_config_reserved_0(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.reserved_0 << ENDL(); -} -#endif - -inline void dprint_tensix_unpack_config_upsample_rate(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << DEC() << config.upsample_rate << ENDL(); -} - -inline void dprint_tensix_unpack_config_upsample_and_interlave(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.upsamle_and_interlave << ENDL(); -} - -inline void dprint_tensix_unpack_config_shift_amount(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << DEC() << config.shift_amount << ENDL(); -} - -inline void dprint_tensix_unpack_config_uncompress_cntx0_3(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.uncompress_cntx0_3 << ENDL(); -} - -inline void dprint_tensix_unpack_config_reserved_1(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.reserved_1 << ENDL(); -} - -inline void dprint_tensix_unpack_config_uncompress_cntx4_7(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.uncompress_cntx4_7 << ENDL(); -} - -inline void dprint_tensix_unpack_config_reserved_2(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.reserved_2 << ENDL(); -} - -inline void dprint_tensix_unpack_config_limit_addr(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.limit_addr << ENDL(); -} - -inline void dprint_tensix_unpack_config_fifo_size(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << DEC() << config.fifo_size << ENDL(); -} - -#if defined(ARCH_WORMHOLE) || defined(ARCH_BLACKHOLE) -inline void dprint_tensix_unpack_config_unpack_src_reg_set_update(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.unpack_src_reg_set_update << ENDL(); -} - -inline void dprint_tensix_unpack_config_unpack_if_sel(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.unpack_if_sel << ENDL(); -} - -inline void dprint_tensix_unpack_config_unpack_if_sel_cntx0_3(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.unpack_if_sel_cntx0_3 << ENDL(); -} - -inline void dprint_tensix_unpack_config_unpack_if_sel_cntx4_7(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.unpack_if_sel_cntx4_7 << ENDL(); -} - -inline void dprint_tensix_unpack_config_reserved_3(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.reserved_3 << ENDL(); -} - -inline void dprint_tensix_unpack_config_reserved_4(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.reserved_4 << ENDL(); -} - -inline void dprint_tensix_unpack_config_reserved_5(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "0x" << HEX() << config.reserved_5 << ENDL(); -} 
-#endif - -// HARDWARE SPECIFIC FUNCTIONS - -#ifdef ARCH_GRAYSKULL -inline void dprint_tensix_unpack_tile_descriptor_helper( - const ckernel::unpacker::tile_descriptor_t& tile_descriptor) { - DPRINT << "in_data_format: "; - dprint_tensix_unpack_tile_descriptor_in_data_format(tile_descriptor); - DPRINT << "uncompressed: "; - dprint_tensix_unpack_tile_descriptor_uncompressed(tile_descriptor); - DPRINT << "reserved_0: "; - dprint_tensix_unpack_tile_descriptor_reserved_0(tile_descriptor); - DPRINT << "blobs_per_xy_plane: " dprint_tensix_unpack_tile_descriptor_blobs_per_xy_plane(tile_descriptor); - DPRINT << "reserved_1: "; - dprint_tensix_unpack_tile_descriptor_reserved_1(tile_descriptor); - DPRINT << "x_dim: "; - dprint_tensix_unpack_tile_descriptor_x_dim(tile_descriptor); - DPRINT << "y_dim: "; - dprint_tensix_unpacK_tile_descriptor_y_dim(tile_descriptor); - DPRINT << "z_dim: "; - dprint_tensix_unpack_tile_descriptor_z_dim(tile_descriptor); - DPRINT << "w_dim: "; - dprint_tensix_unpack_tile_descriptor_w_dim(tile_descriptor); - DPRINT << "blobs_y_start: "; - dprint_tensix_unpack_tile_descriptor_blobs_y_start(tile_descriptor); - DPRINT << "digest_type: "; - dprint_tensix_unpack_tile_descriptor_digest_type(tile_descriptor); - DPRINT << "digest_size: "; - dprint_tensix_unpack_tile_descriptor_digest_type(tile_descriptor); -} - -inline void dprint_tensix_unpack_tile_descriptor(uint reg_id = 0) { - std::array tile_descriptor_vec; - UNPACK( - tile_descriptor_vec = ckernel::unpacker::read_unpack_tile_descriptor(); - if (reg_id >= 1 && reg_id <= ckernel::unpacker::NUM_UNPACKERS) { - DPRINT << "REG_ID: " << reg_id << ENDL(); - dprint_tensix_unpack_tile_descriptor_helper(tile_descriptor_vec[reg_id - 1]); - } else if (reg_id == 0) { - for (uint i = 1; i <= ckernel::unpacker::NUM_UNPACKERS; i++) { - DPRINT << "REG_ID: " << i << ENDL(); - dprint_tensix_unpack_tile_descriptor_helper(tile_descriptor_vec[i - 1]); - if (i != ckernel::unpacker::NUM_UNPACKERS) { - DPRINT << ENDL(); - } - } - } else { - DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::unpacker::NUM_UNPACKERS << "." 
<< ENDL(); - } - ) -} - -inline void dprint_tensix_unpack_config_helper(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "out_data_format: "; - dprint_tensix_unpack_config_out_data_format(config); - DPRINT << "throttle_mode: "; - dprint_tensix_unpack_config_throttle_mode(config); - DPRINT << "context_count: "; - dprint_tensix_unpack_config_context_count(config); - DPRINT << "haloize_mode: "; - dprint_tensix_unpack_config_haloize_mode(config); - DPRINT << "tileize_mode: "; - dprint_tensix_unpack_config_tileize_mode(config); - DPRINT << "force_shared_exp: "; - dprint_tensix_unpack_config_force_shared_exp(config) DPRINT << "reserved_0: "; - dprint_tensix_unpack_config_reserved_0(config); - DPRINT << "upsample_rate: "; - dprint_tensix_unpack_config_upsample_rate(config); - DPRINT << "upsamle_and_interlave: "; - dprint_tensix_unpack_config_upsample_and_interlave(config); - DPRINT << "shift_amount: "; - dprint_tensix_unpack_config_shift_amount(config); - DPRINT << "uncompress_cntx0_3: "; - dprint_tensix_unpack_config_uncompress_cntx0_3(config); - DPRINT << "reserved_1: "; - dprint_tensix_unpack_config_reserved_1(config); - DPRINT << "uncompress_cntx4_7: "; - dprint_tensix_unpack_config_uncompress_cntx4_7(config); - DPRINT << "reserved_2: "; - dprint_tensix_unpack_config_reserved_2(config); - DPRINT << "limit_addr: "; - dprint_tensix_unpack_config_limit_addr(config); - DPRINT << "fifo_size: "; - dprint_tensix_unpack_config_fifo_size(config); -} - -inline void dprint_tensix_unpack_config(uint reg_id = 0) { - std::array config_vec; - UNPACK( - config_vec = ckernel::unpacker::read_unpack_config(); - if (reg_id >= 1 && reg_id <= ckernel::unpacker::NUM_UNPACKERS) { - DPRINT << "REG_ID: " << reg_id << ENDL(); - dprint_tensix_unpack_config_helper(config_vec[reg_id - 1]); - } else if (reg_id == 0) { - for (uint i = 1; i <= ckernel::unpacker::NUM_UNPACKERS; i++) { - DPRINT << "REG_ID: " << i << ENDL(); - dprint_tensix_unpack_config_helper(config_vec[i - 1]); - if (i != ckernel::unpacker::NUM_UNPACKERS) { - DPRINT << ENDL(); - } - } - } else { - DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::unpacker::NUM_UNPACKERS << "." 
<< ENDL(); - } - ) -} - -#else // ARCH_WORMHOLE or ARCH_BLACKHOLE -inline void dprint_tensix_unpack_tile_descriptor_helper( - const ckernel::unpacker::unpack_tile_descriptor_t& tile_descriptor) { - DPRINT << "in_data_format: "; - dprint_tensix_unpack_tile_descriptor_in_data_format(tile_descriptor); - DPRINT << "uncompressed: "; - dprint_tensix_unpack_tile_descriptor_uncompressed(tile_descriptor); - DPRINT << "reserved_0: "; - dprint_tensix_unpack_tile_descriptor_reserved_0(tile_descriptor); - DPRINT << "blobs_per_xy_plane: "; - dprint_tensix_unpack_tile_descriptor_blobs_per_xy_plane(tile_descriptor); - DPRINT << "reserved_1: "; - dprint_tensix_unpack_tile_descriptor_reserved_1(tile_descriptor); - DPRINT << "x_dim: "; - dprint_tensix_unpack_tile_descriptor_x_dim(tile_descriptor); - DPRINT << "y_dim: "; - dprint_tensix_unpack_tile_descriptor_y_dim(tile_descriptor); - DPRINT << "z_dim: "; - dprint_tensix_unpack_tile_descriptor_z_dim(tile_descriptor); - DPRINT << "w_dim: "; - dprint_tensix_unpack_tile_descriptor_w_dim(tile_descriptor); - DPRINT << "blobs_y_start: "; - dprint_tensix_unpack_tile_descriptor_blobs_y_start(tile_descriptor); - DPRINT << "digest_type: "; - dprint_tensix_unpack_tile_descriptor_digest_type(tile_descriptor); - DPRINT << "digest_size: "; - dprint_tensix_unpack_tile_descriptor_digest_size(tile_descriptor); -} - -// Choose which register you want (1-2). 0 for both. -inline void dprint_tensix_unpack_tile_descriptor(uint reg_id = 0) { - std::array tile_descriptor_vec; - UNPACK( - tile_descriptor_vec = ckernel::unpacker::read_unpack_tile_descriptor(); - if (reg_id >= 1 && reg_id <= ckernel::unpacker::NUM_UNPACKERS) { - DPRINT << "REG_ID: " << reg_id << ENDL(); - dprint_tensix_unpack_tile_descriptor_helper(tile_descriptor_vec[reg_id - 1]); - } else if (reg_id == 0) { - for (uint i = 1; i <= ckernel::unpacker::NUM_UNPACKERS; i++) { - DPRINT << "REG_ID: " << i << ENDL(); - dprint_tensix_unpack_tile_descriptor_helper(tile_descriptor_vec[i - 1]); - if (i != ckernel::unpacker::NUM_UNPACKERS) { - DPRINT << ENDL(); - } - } - } else { - DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::unpacker::NUM_UNPACKERS << "." 
<< ENDL(); - } - ) -} - -inline void dprint_tensix_unpack_config_helper(const ckernel::unpacker::unpack_config_t& config) { - DPRINT << "out_data_format: "; - dprint_tensix_unpack_config_out_data_format(config); - DPRINT << "throttle_mode: "; - dprint_tensix_unpack_config_throttle_mode(config); - DPRINT << "context_count: "; - dprint_tensix_unpack_config_context_count(config); - DPRINT << "haloize_mode: "; - dprint_tensix_unpack_config_haloize_mode(config); - DPRINT << "tileize_mode: "; - dprint_tensix_unpack_config_tileize_mode(config); - DPRINT << "unpack_src_reg_set_update: "; - dprint_tensix_unpack_config_unpack_src_reg_set_update(config); - DPRINT << "unpack_if_sel: "; - dprint_tensix_unpack_config_unpack_if_sel(config); - DPRINT << "upsample_rate: "; - dprint_tensix_unpack_config_upsample_rate(config); - DPRINT << "reserved_1: "; - dprint_tensix_unpack_config_reserved_1(config); - DPRINT << "upsample_and_interlave: "; - dprint_tensix_unpack_config_upsample_and_interlave(config); - DPRINT << "shift_amount: "; - dprint_tensix_unpack_config_shift_amount(config); - DPRINT << "uncompress_cntx0_3: "; - dprint_tensix_unpack_config_uncompress_cntx0_3(config); - DPRINT << "unpack_if_sel_cntx0_3: "; - dprint_tensix_unpack_config_unpack_if_sel_cntx0_3(config); - DPRINT << "force_shared_exp: "; - dprint_tensix_unpack_config_force_shared_exp(config); - DPRINT << "reserved_2: "; - dprint_tensix_unpack_config_reserved_2(config); - DPRINT << "uncompress_cntx4_7: "; - dprint_tensix_unpack_config_uncompress_cntx4_7(config); - DPRINT << "unpack_if_sel_cntx4_7: "; - dprint_tensix_unpack_config_unpack_if_sel_cntx4_7(config); - DPRINT << "reserved_3: "; - dprint_tensix_unpack_config_reserved_3(config); - DPRINT << "limit_addr: "; - dprint_tensix_unpack_config_limit_addr(config); - DPRINT << "reserved_4: "; - dprint_tensix_unpack_config_reserved_4(config); - DPRINT << "fifo_size: "; - dprint_tensix_unpack_config_fifo_size(config); - DPRINT << "reserved_5: "; - dprint_tensix_unpack_config_reserved_5(config); -} - -// Choose which register you want (1-2). 0 for both. -inline void dprint_tensix_unpack_config(uint reg_id = 0) { - std::array config_vec; - UNPACK( - config_vec = ckernel::unpacker::read_unpack_config(); - if (reg_id >= 1 && reg_id <= ckernel::unpacker::NUM_UNPACKERS) { - DPRINT << "REG_ID: " << reg_id << ENDL(); - dprint_tensix_unpack_config_helper(config_vec[reg_id - 1]); - } else if (reg_id == 0) { - for (uint i = 1; i <= ckernel::unpacker::NUM_UNPACKERS; i++) { - DPRINT << "REG_ID: " << i << ENDL(); - dprint_tensix_unpack_config_helper(config_vec[i - 1]); - if (i != ckernel::unpacker::NUM_UNPACKERS) { - DPRINT << ENDL(); - } - } - } else { - DPRINT << "INVALID REGISTER ID! PLEASE CHOOSE A NUMBER BETWEEN 0 AND " << ckernel::unpacker::NUM_UNPACKERS << "." 
<< ENDL(); - } - ) -} - -// ALU CONFIG - -// These functions' argument should be return value of read_alu_config() - -inline void dprint_tensix_alu_config_alu_rounding_mode_fpu_srnd_en(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Fpu_srnd_en << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_rounding_mode_gasket_srnd_en(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Gasket_srnd_en << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_rounding_mode_packer_srnd_en(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Packer_srnd_en << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_rounding_mode_padding(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Padding << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_rounding_mode_gs_lf(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_GS_LF << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_rounding_mode_bfp8_hf(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ROUNDING_MODE_Bfp8_HF << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_format_spec_reg0_srcaunsigned(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_FORMAT_SPEC_REG0_SrcAUnsigned << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_format_spec_reg0_srcbunsigned(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_FORMAT_SPEC_REG0_SrcBUnsigned << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_format_spec_reg0_srca(const ckernel::unpacker::alu_config_t& config) { - dprint_data_format(config.ALU_FORMAT_SPEC_REG0_SrcA); - DPRINT << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_format_spec_reg1_srcb(const ckernel::unpacker::alu_config_t& config) { - dprint_data_format(config.ALU_FORMAT_SPEC_REG1_SrcB); - DPRINT << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_format_spec_reg2_dstacc(const ckernel::unpacker::alu_config_t& config) { - dprint_data_format(config.ALU_FORMAT_SPEC_REG2_Dstacc); - DPRINT << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_acc_ctrl_fp32_enabled(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_Fp32_enabled << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_acc_ctrl_sfpu_fp32_enabled(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_SFPU_Fp32_enabled << ENDL(); -} - -inline void dprint_tensix_alu_config_alu_acc_ctrl_int8_math_enabled(const ckernel::unpacker::alu_config_t& config) { - DPRINT << "0x" << HEX() << config.ALU_ACC_CTRL_INT8_math_enabled << ENDL(); -} - -// Print content of the register field by field. 
-inline void dprint_tensix_alu_config() { - MATH(ckernel::unpacker::alu_config_t config = ckernel::unpacker::read_alu_config(); - - DPRINT << "ALU_ROUNDING_MODE_Fpu_srnd_en: "; - dprint_tensix_alu_config_alu_rounding_mode_fpu_srnd_en(config); - DPRINT << "ALU_ROUNDING_MODE_Gasket_srnd_en: "; - dprint_tensix_alu_config_alu_rounding_mode_gasket_srnd_en(config); - DPRINT << "ALU_ROUNDING_MODE_Packer_srnd_en: "; - dprint_tensix_alu_config_alu_rounding_mode_packer_srnd_en(config); - DPRINT << "ALU_ROUNDING_MODE_Padding: "; - dprint_tensix_alu_config_alu_rounding_mode_padding(config); - DPRINT << "ALU_ROUNDING_MODE_GS_LF: "; - dprint_tensix_alu_config_alu_rounding_mode_gs_lf(config); - DPRINT << "ALU_ROUNDING_MODE_Bfp8_HF: "; - dprint_tensix_alu_config_alu_rounding_mode_bfp8_hf(config); - DPRINT << "ALU_FORMAT_SPEC_REG0_SrcAUnsigned: "; - dprint_tensix_alu_config_alu_format_spec_reg0_srcaunsigned(config); - DPRINT << "ALU_FORMAT_SPEC_REG0_SrcBUnsigned: "; - dprint_tensix_alu_config_alu_format_spec_reg0_srcbunsigned(config); - DPRINT << "ALU_FORMAT_SPEC_REG0_SrcA: "; - dprint_tensix_alu_config_alu_format_spec_reg0_srca(config); - DPRINT << "ALU_FORMAT_SPEC_REG1_SrcB: "; - dprint_tensix_alu_config_alu_format_spec_reg1_srcb(config); - DPRINT << "ALU_FORMAT_SPEC_REG2_Dstacc: "; - dprint_tensix_alu_config_alu_format_spec_reg2_dstacc(config); - DPRINT << "ALU_ACC_CTRL_Fp32_enabled: "; - dprint_tensix_alu_config_alu_acc_ctrl_fp32_enabled(config); - DPRINT << "ALU_ACC_CTRL_SFPU_Fp32_enabled: "; - dprint_tensix_alu_config_alu_acc_ctrl_sfpu_fp32_enabled(config); - DPRINT << "ALU_ACC_CTRL_INT8_math_enabled: "; - dprint_tensix_alu_config_alu_acc_ctrl_int8_math_enabled(config);) -} - -#endif // END OF ELSE diff --git a/tt_metal/third_party/tt_llk_grayskull b/tt_metal/third_party/tt_llk_grayskull index be2b32e22f9..0c04db64275 160000 --- a/tt_metal/third_party/tt_llk_grayskull +++ b/tt_metal/third_party/tt_llk_grayskull @@ -1 +1 @@ -Subproject commit be2b32e22f939526cb2c0bef021f636312c4f1d2 +Subproject commit 0c04db64275a4bd36a7e14d3c533855cb33f6a20 From 62de6a9d1f9f07bf26d0850fd21f419993ef4de8 Mon Sep 17 00:00:00 2001 From: William Ly Date: Fri, 21 Feb 2025 10:55:14 -0500 Subject: [PATCH 215/316] #17878: Update failed test logging to appear in GHA job+workflow annotations (#18106) ### Ticket https://github.com/tenstorrent/tt-metal/issues/17878 ### Problem description Failed unit tests don't show up in GHA annotations. To find out the test that failed you have to dig through the test job logs. 
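For context, a GitHub Actions annotation is just a specially formatted workflow command printed to stdout; GHA turns any such line into a job/workflow annotation automatically. A minimal sketch of the `::error` command that the new tooling emits (the file path, line number, and message below are placeholder values for illustration, not output from a real run):

```python
# Minimal sketch: turn a failing test into a GitHub Actions error annotation.
# The file path, line number, and message are illustrative placeholders only.
file, line, message = "tests/example/test_foo.cpp", 42, "EXPECT_EQ(a, b) failed"
print(f"::error file={file},line={line}::{message}")
```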
### What's changed Expose test failure messages in pytest and gtest in GHA annotations: - pytest: use `pytest-github-actions-annotate-failures` plugin, which handles it for us - requires setting `GITHUB_ACTIONS=true` [for docker containers](https://github.com/pytest-dev/pytest-github-actions-annotate-failures), and exclude warnings with `--exclude-warning-annotations` - example: https://github.com/tenstorrent/tt-metal/actions/runs/13443325356/job/37563108566 - gtest: create a custom action `actions/generate-gtest-failure-message` that calls `python3 .github/scripts/data_analysis/print_gtest_annotations.py` - unfortunately gtest doesn't have an equivalent hook/plugin like pytest - requires `xmltodict` - runs at the end of gtest workflows and prints unit test failures to the GHA log which auto-convert into annotations - example: https://github.com/tenstorrent/tt-metal/actions/runs/13443325356/job/37563095078 - update all-post-commit workflows ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13449043032 - [x] Remove dummy failed tests from PR --- .../generate-gtest-failure-message/action.yml | 17 ++++ .../data_analysis/print_gtest_annotations.py | 89 +++++++++++++++++++ .github/workflows/build-and-unit-tests.yaml | 6 ++ .github/workflows/cpp-post-commit.yaml | 6 ++ .../fabric-build-and-unit-tests.yaml | 6 ++ .../fast-dispatch-build-and-unit-tests.yaml | 17 ++-- .github/workflows/ttnn-post-commit.yaml | 27 +++--- tt_metal/python_env/requirements-dev.txt | 4 + 8 files changed, 151 insertions(+), 21 deletions(-) create mode 100644 .github/actions/generate-gtest-failure-message/action.yml create mode 100644 .github/scripts/data_analysis/print_gtest_annotations.py diff --git a/.github/actions/generate-gtest-failure-message/action.yml b/.github/actions/generate-gtest-failure-message/action.yml new file mode 100644 index 00000000000..e5a0eb1672b --- /dev/null +++ b/.github/actions/generate-gtest-failure-message/action.yml @@ -0,0 +1,17 @@ +name: "Generate gtest failure message" +description: "Generate gtest failure message for Github workflow annotations" + +inputs: + path: + description: "Paths to pass containing gtest XML files" + required: true + +runs: + using: "composite" + steps: + - name: Generate gtest failure messages + id: generate-gtest-message + shell: bash + run: | + set +e + python3 .github/scripts/data_analysis/print_gtest_annotations.py ${{ inputs.path }} diff --git a/.github/scripts/data_analysis/print_gtest_annotations.py b/.github/scripts/data_analysis/print_gtest_annotations.py new file mode 100644 index 00000000000..a599b4e440e --- /dev/null +++ b/.github/scripts/data_analysis/print_gtest_annotations.py @@ -0,0 +1,89 @@ +import argparse +import xmltodict +import glob +import os +from typing import Union + + +def _guaranteed_list(x): + if not x: + return [] + elif isinstance(x, list): + return x + else: + return [x] + + +def _build_workflow_command( + command_name: str, + file: str, + line: int, + end_line: Union[int, None] = None, + column: Union[int, None] = None, + end_column: Union[int, None] = None, + title: Union[str, None] = None, + message: Union[str, None] = None, +): + result = f"::{command_name} " + + entries = [ + ("file", file), + ("line", line), + ("endLine", end_line), + ("col", column), + ("endColumn", end_column), + ("title", title), + ] + + result = result + ",".join(f"{k}={v}" for k, v in entries if v is not None) + + if 
message is not None: + result = result + "::" + _escape(message) + + return result + + +def _escape(s: str) -> str: + return s.replace("%", "%25").replace("\r", "%0D").replace("\n", "%0A") + + +if __name__ == "__main__": + # Get xml dir path from cmdline + parser = argparse.ArgumentParser() + parser.add_argument("directory", type=str, help="Path to the GoogleTest XML directory") + args = parser.parse_args() + + # Path to the directory containing XML files + xml_dir = args.directory + + # Use glob to find all XML files in the directory + xml_files = glob.glob(os.path.join(xml_dir, "*.xml")) + + # Iterate through each XML file + for xml_file in xml_files: + with open(xml_file, "r") as f: + results = xmltodict.parse(f.read()) + + # Check for failed tests + failed_tests = [] + for testsuite in _guaranteed_list(results["testsuites"]["testsuite"]): + for testcase in _guaranteed_list(testsuite["testcase"]): + if "failure" in testcase: + failed_tests.append(testcase) + + # Create error annotations for each failed test + for failed_test in failed_tests: + failure_messages = _guaranteed_list(failed_test["failure"]) + if failure_messages: + # first message is often enough + failure_message = failure_messages[0]["@message"] + else: + failure_message = "unknown_failure_message" + + msg = _build_workflow_command( + command_name="error", + file=failed_test["@file"].lstrip("/work/"), + line=int(failed_test["@line"]), + message=failure_message, + ) + print(msg) diff --git a/.github/workflows/build-and-unit-tests.yaml b/.github/workflows/build-and-unit-tests.yaml index 145fad832af..3cef129926c 100644 --- a/.github/workflows/build-and-unit-tests.yaml +++ b/.github/workflows/build-and-unit-tests.yaml @@ -108,3 +108,9 @@ jobs: - name: Generate system logs on failure uses: ./.github/actions/generate-system-logs if: ${{ failure() }} + - name: Generate gtest annotations on failure + uses: ./.github/actions/generate-gtest-failure-message + if: ${{ failure() }} + with: + path: | + generated/test_reports/ diff --git a/.github/workflows/cpp-post-commit.yaml b/.github/workflows/cpp-post-commit.yaml index f9689deec4e..00a16e01a77 100644 --- a/.github/workflows/cpp-post-commit.yaml +++ b/.github/workflows/cpp-post-commit.yaml @@ -113,3 +113,9 @@ jobs: - name: Generate system logs on failure uses: ./.github/actions/generate-system-logs if: ${{ failure() }} + - name: Generate gtest annotations on failure + uses: ./.github/actions/generate-gtest-failure-message + if: ${{ failure() }} + with: + path: | + generated/test_reports/ diff --git a/.github/workflows/fabric-build-and-unit-tests.yaml b/.github/workflows/fabric-build-and-unit-tests.yaml index 0f0265939e8..03445266d1c 100644 --- a/.github/workflows/fabric-build-and-unit-tests.yaml +++ b/.github/workflows/fabric-build-and-unit-tests.yaml @@ -91,3 +91,9 @@ jobs: - name: Generate system logs on failure uses: ./.github/actions/generate-system-logs if: ${{ failure() }} + - name: Generate gtest annotations on failure + uses: ./.github/actions/generate-gtest-failure-message + if: ${{ failure() }} + with: + path: | + generated/test_reports/ diff --git a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml index 125a0cf4f41..aefef4fa0e2 100644 --- a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml +++ b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml @@ -52,14 +52,14 @@ jobs: matrix: os: ["${{ inputs.os }}"] test-group: [ - {name: eager unit tests 1, cmd: pytest 
tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 1 }, - {name: eager unit tests 2, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 2 }, - {name: eager unit tests 3, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 3 }, - {name: eager unit tests 4, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 4 }, - {name: eager unit tests 5, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 5 }, - {name: eager unit tests 6, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 6 }, - {name: eager unit tests 7, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 7 }, - {name: sweep, cmd: pytest tests/tt_eager/python_api_testing/sweep_tests/pytests/ -xvvv}, + {name: eager unit tests 1, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 1 --exclude-warning-annotations }, + {name: eager unit tests 2, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 2 --exclude-warning-annotations }, + {name: eager unit tests 3, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 3 --exclude-warning-annotations }, + {name: eager unit tests 4, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 4 --exclude-warning-annotations }, + {name: eager unit tests 5, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 5 --exclude-warning-annotations }, + {name: eager unit tests 6, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 6 --exclude-warning-annotations }, + {name: eager unit tests 7, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 7 --exclude-warning-annotations }, + {name: sweep, cmd: pytest tests/tt_eager/python_api_testing/sweep_tests/pytests/ -xvvv --exclude-warning-annotations }, ] name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }} env: @@ -82,6 +82,7 @@ jobs: docker_password: ${{ secrets.GITHUB_TOKEN }} docker_opts: | -e ARCH_NAME=${{ inputs.arch }} + -e GITHUB_ACTIONS=true run_args: | ${{ matrix.test-group.cmd }} - uses: ./.github/actions/slack-report diff --git a/.github/workflows/ttnn-post-commit.yaml b/.github/workflows/ttnn-post-commit.yaml index 2e3f57afe08..5d579306c12 100644 --- a/.github/workflows/ttnn-post-commit.yaml +++ b/.github/workflows/ttnn-post-commit.yaml @@ -52,31 +52,31 @@ jobs: os: ["ubuntu-20.04"] test-group: - name: ttnn group 1 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 1 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 1 -m "not disable_fast_runtime_mode" - name: ttnn group 2 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 2 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 2 -m "not disable_fast_runtime_mode" - name: ttnn group 3 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 3 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 3 -m "not disable_fast_runtime_mode" - name: ttnn group 4 - cmd: pytest 
tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 4 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 4 -m "not disable_fast_runtime_mode" - name: ttnn group 5 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 5 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 5 -m "not disable_fast_runtime_mode" - name: ttnn group 6 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 6 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 6 -m "not disable_fast_runtime_mode" - name: ttnn group 7 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 7 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 7 -m "not disable_fast_runtime_mode" - name: ttnn group 8 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 8 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 8 -m "not disable_fast_runtime_mode" - name: ttnn group 9 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 9 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 9 -m "not disable_fast_runtime_mode" - name: ttnn group 10 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 10 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 10 -m "not disable_fast_runtime_mode" - name: ttnn group 11 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 11 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 11 -m "not disable_fast_runtime_mode" - name: ttnn group 12 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 12 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 12 -m "not disable_fast_runtime_mode" - name: ttnn fast runtime off - cmd: pytest tests/ttnn/unit_tests -xv -m requires_fast_runtime_mode_off + cmd: pytest tests/ttnn/unit_tests -xv --exclude-warning-annotations -m requires_fast_runtime_mode_off fast_runtime_mode_off: true - name: ttnn example tests cmd: ./tests/scripts/run_ttnn_examples.sh @@ -103,6 +103,7 @@ jobs: docker_password: ${{ secrets.GITHUB_TOKEN }} docker_opts: | -e ARCH_NAME=${{ inputs.arch }} + -e GITHUB_ACTIONS=true run_args: | WHEEL_FILENAME=$(ls -1 *.whl) pip3 install --user $WHEEL_FILENAME diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt index a9ed3355d47..808205dc2ce 100644 --- a/tt_metal/python_env/requirements-dev.txt +++ b/tt_metal/python_env/requirements-dev.txt @@ -4,6 +4,10 @@ loguru +# For github workflow unit test failure annotations +xmltodict +pytest-github-actions-annotate-failures==0.3.0 + # During dep resolution, black may install platformdirs >=4.0.0, which is # a breaking 
dependency for virtualenv installed by pre-commit. virtualenv # requires <4.0.0 platformdirs, so we're pinning platformdirs here From 0df803765594f09a70c7cae1d8adb7752339140f Mon Sep 17 00:00:00 2001 From: Mouliraj Elamurugan Date: Fri, 21 Feb 2025 21:28:40 +0530 Subject: [PATCH 216/316] #17687: Add data_type checker (#17828) ### Ticket Link to Github Issue #17687 ### Problem description ttnn.add doesn't work as expected for ttnn.uint8 ### What's changed Updated the code to throw an error for any unsupported data type. ### Checklist - [ ] [All post commit CI](https://github.com/tenstorrent/tt-metal/actions/runs/13370741236) --- .../ttnn/operations/eltwise/binary/binary.cpp | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp index fb6033d77eb..61ec0a4311d 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp @@ -22,6 +22,25 @@ constexpr bool is_associative(BinaryOpType op) { op == BinaryOpType::LOGADDEXP2 || op == BinaryOpType::LOGICAL_XOR; } +constexpr bool is_dtype_supported(BinaryOpType op, DataType dtype) { + switch (op) { + case BinaryOpType::ADD: + case BinaryOpType::SUB: + return ( + dtype == DataType::FLOAT32 || dtype == DataType::BFLOAT16 || dtype == DataType::BFLOAT8_B || + dtype == DataType::BFLOAT4_B || dtype == DataType::INT32); + case BinaryOpType::BITWISE_XOR: + case BinaryOpType::BITWISE_AND: + case BinaryOpType::BITWISE_OR: + case BinaryOpType::LEFT_SHIFT: + case BinaryOpType::RIGHT_SHIFT: return dtype == DataType::INT32; + default: + return ( + dtype == DataType::FLOAT32 || dtype == DataType::BFLOAT16 || dtype == DataType::BFLOAT8_B || + dtype == DataType::BFLOAT4_B); + } +} + // Tensor - Scalar inline Tensor binary_impl( QueueId queue_id, @@ -108,7 +127,10 @@ template auto preprocess_inputs(const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg) { Tensor input_tensor_a = input_tensor_a_arg; Tensor input_tensor_b = input_tensor_b_arg; - + DataType a_dtype = input_tensor_a.get_dtype(); + DataType b_dtype = input_tensor_b.get_dtype(); + TT_FATAL(is_dtype_supported(binary_op_type, a_dtype), "Unsupported data type {}", a_dtype); + TT_FATAL(is_dtype_supported(binary_op_type, b_dtype), "Unsupported data type {}", b_dtype); // TODO: #7731 (Remove calls to repeat ) auto repeat_smaller = [](const auto& first, auto& second) { const auto& first_shape = first.get_logical_shape(); From 01cac26c6a08aef90a8b3948e21c94bbec2a8394 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Fri, 21 Feb 2025 17:10:20 +0100 Subject: [PATCH 217/316] Replace individual llks with tt_llk (#16929) ### Ticket https://github.com/tenstorrent/tt-metal/issues/18134 ### Problem description Code from tt_llk_ is moved to common repo tt_llk. Each architecture has its own subfolder in tt_llk repo. This PR is updating submodules to reflect that. ### What's changed This PR is updating submodules to reflect llk repository merge. 
There should be no other changes - tt_llk is public repo, moving to it from individual tt_llk repos - tt_llk_ repos are archived - all commits from tt_llk_ are merged into tt_llk ### Checklist - [x] Post commit CI passes - [x] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- .gitmodules | 12 ++--- tt_metal/CMakeLists.txt | 60 ++++++++++++------------- tt_metal/hw/CMakeLists.txt | 4 +- tt_metal/jit_build/build.cpp | 6 +-- tt_metal/third_party/tt_llk | 1 + tt_metal/third_party/tt_llk_blackhole | 1 - tt_metal/third_party/tt_llk_grayskull | 1 - tt_metal/third_party/tt_llk_wormhole_b0 | 1 - 8 files changed, 39 insertions(+), 47 deletions(-) create mode 160000 tt_metal/third_party/tt_llk delete mode 160000 tt_metal/third_party/tt_llk_blackhole delete mode 160000 tt_metal/third_party/tt_llk_grayskull delete mode 160000 tt_metal/third_party/tt_llk_wormhole_b0 diff --git a/.gitmodules b/.gitmodules index 4ed1820d85c..0993dd40046 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,18 +4,12 @@ [submodule "tt_metal/third_party/umd"] path = tt_metal/third_party/umd url = https://github.com/tenstorrent/tt-umd.git -[submodule "tt_metal/third_party/tt_llk_grayskull"] - path = tt_metal/third_party/tt_llk_grayskull - url = https://github.com/tenstorrent/tt-llk-gs.git -[submodule "tt_metal/third_party/tt_llk_wormhole_b0"] - path = tt_metal/third_party/tt_llk_wormhole_b0 - url = https://github.com/tenstorrent/tt-llk-wh-b0.git [submodule "models/demos/t3000/llama2_70b/reference/llama"] path = models/demos/t3000/llama2_70b/reference/llama url = https://github.com/tenstorrent-metal/llama.git -[submodule "tt_metal/third_party/tt_llk_blackhole"] - path = tt_metal/third_party/tt_llk_blackhole - url = https://github.com/tenstorrent/tt-llk-bh.git [submodule "3rd_party/wandb-cpp"] path = tt-train/3rd_party/wandb-cpp url = https://github.com/yhisaki/wandb-cpp +[submodule "tt_metal/third_party/tt_llk"] + path = tt_metal/third_party/tt_llk + url = https://github.com/tenstorrent/tt-llk.git diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index 46a372f85a8..7d96a44a239 100644 --- a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -64,36 +64,36 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.23) core_descriptors/grayskull_120_arch.yaml core_descriptors/wormhole_b0_80_arch.yaml core_descriptors/blackhole_140_arch.yaml - third_party/tt_llk_blackhole/common/inc/ckernel.h - third_party/tt_llk_blackhole/common/inc/ckernel_include.h - third_party/tt_llk_blackhole/common/inc/ckernel_defs.h - third_party/tt_llk_blackhole/common/inc/ckernel_instr_params.h - third_party/tt_llk_blackhole/common/inc/ckernel_addrmod.h - third_party/tt_llk_blackhole/common/inc/ckernel_gpr_map.h - third_party/tt_llk_blackhole/common/inc/ckernel_structs.h - third_party/tt_llk_blackhole/common/inc/ckernel_ops.h - third_party/tt_llk_blackhole/common/inc/ckernel_globals.h - third_party/tt_llk_blackhole/llk_lib/llk_defs.h - third_party/tt_llk_wormhole_b0/common/inc/ckernel.h - third_party/tt_llk_wormhole_b0/common/inc/ckernel_include.h - third_party/tt_llk_wormhole_b0/common/inc/ckernel_defs.h - third_party/tt_llk_wormhole_b0/common/inc/ckernel_instr_params.h - third_party/tt_llk_wormhole_b0/common/inc/ckernel_addrmod.h - 
third_party/tt_llk_wormhole_b0/common/inc/ckernel_gpr_map.h - third_party/tt_llk_wormhole_b0/common/inc/ckernel_structs.h - third_party/tt_llk_wormhole_b0/common/inc/ckernel_ops.h - third_party/tt_llk_wormhole_b0/common/inc/ckernel_globals.h - third_party/tt_llk_wormhole_b0/llk_lib/llk_defs.h - third_party/tt_llk_grayskull/common/inc/ckernel.h - third_party/tt_llk_grayskull/common/inc/ckernel_include.h - third_party/tt_llk_grayskull/common/inc/ckernel_defs.h - third_party/tt_llk_grayskull/common/inc/ckernel_instr_params.h - third_party/tt_llk_grayskull/common/inc/ckernel_addrmod.h - third_party/tt_llk_grayskull/common/inc/ckernel_gpr_map.h - third_party/tt_llk_grayskull/common/inc/ckernel_structs.h - third_party/tt_llk_grayskull/common/inc/ckernel_ops.h - third_party/tt_llk_grayskull/common/inc/ckernel_globals.h - third_party/tt_llk_grayskull/llk_lib/llk_defs.h + third_party/tt_llk/tt_llk_blackhole/common/inc/ckernel.h + third_party/tt_llk/tt_llk_blackhole/common/inc/ckernel_include.h + third_party/tt_llk/tt_llk_blackhole/common/inc/ckernel_defs.h + third_party/tt_llk/tt_llk_blackhole/common/inc/ckernel_instr_params.h + third_party/tt_llk/tt_llk_blackhole/common/inc/ckernel_addrmod.h + third_party/tt_llk/tt_llk_blackhole/common/inc/ckernel_gpr_map.h + third_party/tt_llk/tt_llk_blackhole/common/inc/ckernel_structs.h + third_party/tt_llk/tt_llk_blackhole/common/inc/ckernel_ops.h + third_party/tt_llk/tt_llk_blackhole/common/inc/ckernel_globals.h + third_party/tt_llk/tt_llk_blackhole/llk_lib/llk_defs.h + third_party/tt_llk/tt_llk_wormhole_b0/common/inc/ckernel.h + third_party/tt_llk/tt_llk_wormhole_b0/common/inc/ckernel_include.h + third_party/tt_llk/tt_llk_wormhole_b0/common/inc/ckernel_defs.h + third_party/tt_llk/tt_llk_wormhole_b0/common/inc/ckernel_instr_params.h + third_party/tt_llk/tt_llk_wormhole_b0/common/inc/ckernel_addrmod.h + third_party/tt_llk/tt_llk_wormhole_b0/common/inc/ckernel_gpr_map.h + third_party/tt_llk/tt_llk_wormhole_b0/common/inc/ckernel_structs.h + third_party/tt_llk/tt_llk_wormhole_b0/common/inc/ckernel_ops.h + third_party/tt_llk/tt_llk_wormhole_b0/common/inc/ckernel_globals.h + third_party/tt_llk/tt_llk_wormhole_b0/llk_lib/llk_defs.h + third_party/tt_llk/tt_llk_grayskull/common/inc/ckernel.h + third_party/tt_llk/tt_llk_grayskull/common/inc/ckernel_include.h + third_party/tt_llk/tt_llk_grayskull/common/inc/ckernel_defs.h + third_party/tt_llk/tt_llk_grayskull/common/inc/ckernel_instr_params.h + third_party/tt_llk/tt_llk_grayskull/common/inc/ckernel_addrmod.h + third_party/tt_llk/tt_llk_grayskull/common/inc/ckernel_gpr_map.h + third_party/tt_llk/tt_llk_grayskull/common/inc/ckernel_structs.h + third_party/tt_llk/tt_llk_grayskull/common/inc/ckernel_ops.h + third_party/tt_llk/tt_llk_grayskull/common/inc/ckernel_globals.h + third_party/tt_llk/tt_llk_grayskull/llk_lib/llk_defs.h tools/profiler/kernel_profiler.hpp impl/dispatch/kernels/cq_common.hpp impl/dispatch/kernels/cq_helpers.hpp diff --git a/tt_metal/hw/CMakeLists.txt b/tt_metal/hw/CMakeLists.txt index 25387208487..ced61995a75 100644 --- a/tt_metal/hw/CMakeLists.txt +++ b/tt_metal/hw/CMakeLists.txt @@ -167,8 +167,8 @@ foreach(ARCH IN LISTS ARCHS) list(APPEND GPP_INCLUDES -I${PROJECT_SOURCE_DIR}/tt_metal/third_party/umd/device/${ARCH}) list(APPEND GPP_INCLUDES -I${PROJECT_SOURCE_DIR}/tt_metal/hw/ckernels/${ARCH_B0}/metal/common) list(APPEND GPP_INCLUDES -I${PROJECT_SOURCE_DIR}/tt_metal/hw/ckernels/${ARCH_B0}/metal/llk_io) - list(APPEND GPP_INCLUDES 
-I${PROJECT_SOURCE_DIR}/tt_metal/third_party/tt_llk_${ARCH_B0}/common/inc) - list(APPEND GPP_INCLUDES -I${PROJECT_SOURCE_DIR}/tt_metal/third_party/tt_llk_${ARCH_B0}/llk_lib) + list(APPEND GPP_INCLUDES -I${PROJECT_SOURCE_DIR}/tt_metal/third_party/tt_llk/tt_llk_${ARCH_B0}/common/inc) + list(APPEND GPP_INCLUDES -I${PROJECT_SOURCE_DIR}/tt_metal/third_party/tt_llk/tt_llk_${ARCH_B0}/llk_lib) foreach(HWLIB IN LISTS HWLIBS) if("${ARCH}" STREQUAL "blackhole" AND "${HWLIB}" STREQUAL "ncrisc-halt") diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index f6c8f991d05..d5d8b6eaca8 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -170,12 +170,12 @@ void JitBuildEnv::init( this->arch_name_ + "_defines " + "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + "/noc " + "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/metal/common " + "-I" + this->root_ + "tt_metal/hw/ckernels/" + - this->arch_name_ + "/metal/llk_io " + "-I" + this->root_ + "tt_metal/third_party/tt_llk_" + + this->arch_name_ + "/metal/llk_io " + "-I" + this->root_ + "tt_metal/third_party/tt_llk/tt_llk_" + this->arch_name_ + "/common/inc " + // TODO(fixme) datamovement fw shouldn't read this "-I" + this->root_ + "tt_metal/api/" + this->aliased_arch_name_ + " " + "-I" + this->root_ + "tt_metal/api/" + this->aliased_arch_name_ + "/tt-metalium " + "-I" + this->root_ + "tt_metal/api/tt-metalium/ " + "-I" + this->root_ + "tt_metal/api/ " + "-I" + this->root_ + - "tt_metal/third_party/tt_llk_" + this->arch_name_ + "/llk_lib "; + "tt_metal/third_party/tt_llk/tt_llk_" + this->arch_name_ + "/llk_lib "; this->lflags_ = common_flags; this->lflags_ += "-fno-exceptions -Wl,-z,max-page-size=16 -Wl,-z,common-page-size=16 -nostartfiles "; @@ -345,7 +345,7 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, const JitBuiltStateConf "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api " + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api/llk_sfpu " + "-I" + env_.root_ + "runtime/sfpi/include " + "-I" + env_.root_ + "tt_metal/hw/firmware/src " + "-I" + env_.root_ + - "tt_metal/third_party/tt_llk_" + env.arch_name_ + "/llk_lib "; + "tt_metal/third_party/tt_llk/tt_llk_" + env.arch_name_ + "/llk_lib "; if (this->is_fw_) { this->srcs_.push_back("tt_metal/hw/firmware/src/trisc.cc"); diff --git a/tt_metal/third_party/tt_llk b/tt_metal/third_party/tt_llk new file mode 160000 index 00000000000..8dde27a7c3e --- /dev/null +++ b/tt_metal/third_party/tt_llk @@ -0,0 +1 @@ +Subproject commit 8dde27a7c3e1f4ea0b900cdb07509875e9d695d0 diff --git a/tt_metal/third_party/tt_llk_blackhole b/tt_metal/third_party/tt_llk_blackhole deleted file mode 160000 index 8c25441b351..00000000000 --- a/tt_metal/third_party/tt_llk_blackhole +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8c25441b351646046d8de3fd6b8d895b7c87135d diff --git a/tt_metal/third_party/tt_llk_grayskull b/tt_metal/third_party/tt_llk_grayskull deleted file mode 160000 index 0c04db64275..00000000000 --- a/tt_metal/third_party/tt_llk_grayskull +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0c04db64275a4bd36a7e14d3c533855cb33f6a20 diff --git a/tt_metal/third_party/tt_llk_wormhole_b0 b/tt_metal/third_party/tt_llk_wormhole_b0 deleted file mode 160000 index a34e1966683..00000000000 --- a/tt_metal/third_party/tt_llk_wormhole_b0 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a34e1966683c478d575d5ea79413004955c8a57f From 2fae63e77fae11b0de48cde136d41587ef53355c Mon Sep 17 00:00:00 2001 From: Oleg 
Milyutin Date: Fri, 21 Feb 2025 11:25:18 -0500 Subject: [PATCH 218/316] #18050: Delegate to `MeshDeviceView` for mapping / enumerating devices in a mesh (#18127) ### Ticket #18050 ### Problem description "Scoped devices" is used only for keeping lifetimes of opened devices, and for validating that a mesh device is uniformly configured. The ordering and the size of scoped devices won't match what we pass in for submeshes, and won't stay consistent during reshapes. ### What's changed * Delegate to `MeshDeviceView` for mapping / enumerating devices in a mesh (`MeshDevice::get_device` method). * Create `MeshDeviceView` outside of constructor and pass in explicitly as a parameter - instead of setting from outside in `initialize()` method (for root meshes) or via setting `submesh->view_ = ...` for submeshes. * Rename `ScopedDevices::get_devices()` to `ScopedDevices::root_mesh_devices()` to emphasize the scoped devices correspond to the root mesh. * Add a test for submeshes. ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13450729066) - [X] New/Existing tests provide coverage for changes --- tests/ttnn/distributed/test_distributed.cpp | 30 ++- .../distributed/test_distributed_reshape.cpp | 29 +-- tt_metal/api/tt-metalium/mesh_device.hpp | 9 +- tt_metal/distributed/mesh_device.cpp | 179 +++++++++--------- 4 files changed, 125 insertions(+), 122 deletions(-) diff --git a/tests/ttnn/distributed/test_distributed.cpp b/tests/ttnn/distributed/test_distributed.cpp index c96312176f1..ee9d2f83fb4 100644 --- a/tests/ttnn/distributed/test_distributed.cpp +++ b/tests/ttnn/distributed/test_distributed.cpp @@ -3,14 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include #include +#include "ttnn/distributed/types.hpp" namespace ttnn::distributed::test { +using ::testing::IsEmpty; +using ::testing::SizeIs; using ::tt::tt_metal::distributed::MeshContainer; class DistributedTest : public ::testing::Test { @@ -47,7 +51,7 @@ TEST_F(DistributedTest, TestMemoryAllocationStatistics) { TEST_F(DistributedTest, TestNumDramChannels) { auto mesh = ttnn::distributed::open_mesh_device( {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - EXPECT_EQ(mesh->num_dram_channels(), 96); // 8 devices * 12 channels + EXPECT_EQ(mesh->num_dram_channels(), 96); // 8 devices * 12 channels } TEST_F(DistributedTest, ViewIs2D) { @@ -68,4 +72,28 @@ TEST_F(DistributedTest, ViewIs2D) { EXPECT_FALSE(view_3d.is_mesh_2d()); } +TEST_F(DistributedTest, Submesh) { + auto mesh = ttnn::distributed::open_mesh_device( + {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); + + EXPECT_EQ(mesh->shape().num_rows, 2); + EXPECT_EQ(mesh->shape().num_cols, 4); + EXPECT_THAT(mesh->get_devices(), SizeIs(8)); + EXPECT_TRUE(mesh->is_parent_mesh()); + EXPECT_THAT(mesh->get_submeshes(), IsEmpty()); + + auto submesh = mesh->create_submesh(MeshShape{1, 2}, MeshOffset{1, 1}); + EXPECT_THAT(mesh->get_submeshes(), SizeIs(1)); + EXPECT_EQ(submesh->shape().num_rows, 1); + EXPECT_EQ(submesh->shape().num_cols, 2); + EXPECT_THAT(submesh->get_devices(), SizeIs(2)); + EXPECT_FALSE(submesh->is_parent_mesh()); + EXPECT_THAT(submesh->get_submeshes(), IsEmpty()); + + // Verify coordinates are correct. 
+ EXPECT_EQ(mesh->get_device(MeshCoordinate{1, 1})->id(), submesh->get_device(MeshCoordinate{0, 0})->id()); + EXPECT_EQ(mesh->get_device(MeshCoordinate{1, 2})->id(), submesh->get_device(MeshCoordinate{0, 1})->id()); + EXPECT_EQ(submesh->get_device(1, 1), nullptr); + +} // namespace ttnn::distributed::test } // namespace ttnn::distributed::test diff --git a/tests/ttnn/distributed/test_distributed_reshape.cpp b/tests/ttnn/distributed/test_distributed_reshape.cpp index 212368f8d7f..f3a085d0700 100644 --- a/tests/ttnn/distributed/test_distributed_reshape.cpp +++ b/tests/ttnn/distributed/test_distributed_reshape.cpp @@ -82,7 +82,7 @@ TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { if ((old_shape.num_rows * old_shape.num_cols) != (new_shape.num_rows * new_shape.num_cols)) { GTEST_SKIP() << "Device counts don't match; we test this in InvalidReshapeDimensions"; } - if (old_shape.num_rows == 1 or old_shape.num_cols == 1) { + if (old_shape.num_rows == 1 or old_shape.num_cols == 1 or new_shape.num_rows == 1 or new_shape.num_cols == 1) { GTEST_SKIP() << "Old shape is 1xN or Nx1; we test this in From1x4To2x2Invalid"; } @@ -106,7 +106,8 @@ TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { EXPECT_EQ(mesh->num_cols(), new_shape.num_cols); // Verify device ordering is preserved - EXPECT_EQ(mesh->get_device_ids(), original_order); + EXPECT_EQ(mesh->get_device_ids(), original_order) + << "Device ordering is not preserved " << SimpleMeshShape(old_shape) << " -> " << SimpleMeshShape(new_shape); } // Generate all possible combinations of shapes from kMeshShapes @@ -199,30 +200,6 @@ TEST_F(T3000ReshapeTest, InvalidTotalDeviceCount) { EXPECT_EQ(mesh->num_cols(), 8); } -TEST_F(T3000ReshapeTest, RingPreservation) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - - // Store original device positions - std::vector original_layout; - for (size_t i = 0; i < mesh->num_rows(); ++i) { - for (size_t j = 0; j < mesh->num_cols(); ++j) { - original_layout.push_back(mesh->get_device(i, j)->id()); - } - } - - mesh->reshape({2, 4}); - - // Verify devices are still connected in a Ring topology - std::vector new_layout; - for (size_t i = 0; i < mesh->num_rows(); ++i) { - for (size_t j = 0; j < mesh->num_cols(); ++j) { - new_layout.push_back(mesh->get_device(i, j)->id()); - } - } - EXPECT_EQ(new_layout, original_layout); -} - TEST_F(T3000ReshapeTest, From1x4To2x2Invalid) { auto mesh = ttnn::distributed::open_mesh_device( {1, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index 81b1310d527..9b7c6843abd 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -35,7 +35,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this opened_devices_; - MeshContainer devices_; + std::vector devices_; public: // Constructor acquires physical resources @@ -51,8 +51,8 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this& get_devices() const; - IDevice* get_device(const MeshCoordinate& coord) const; + // Returns the list of devices opened by the root mesh device (i.e. not submeshes). 
+ const std::vector& root_devices() const; }; std::shared_ptr scoped_devices_; @@ -74,8 +74,9 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this mesh_handle, + std::shared_ptr scoped_devices, const MeshShape& mesh_shape, + std::unique_ptr mesh_device_view, std::weak_ptr parent_mesh = {}); ~MeshDevice() override; diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 5c731e8bd30..7190e8e3806 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -68,24 +68,15 @@ MeshDevice::ScopedDevices::ScopedDevices( size_t trace_region_size, size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, - const MeshDeviceConfig& config) : - devices_(SimpleMeshShape(config.mesh_shape), /*fill_value=*/nullptr) { + const MeshDeviceConfig& config) { auto& system_mesh = SystemMesh::instance(); auto physical_device_ids = system_mesh.request_available_devices(config); opened_devices_ = tt::tt_metal::detail::CreateDevices( physical_device_ids, num_command_queues, l1_small_size, trace_region_size, dispatch_core_config); - TT_FATAL( - physical_device_ids.size() == devices_.shape().mesh_size(), - "Device size mismatch; expected: {}, actual: {}", - devices_.shape().mesh_size(), - physical_device_ids.size()); - - auto it = devices_.begin(); for (auto physical_device_id : physical_device_ids) { - it->value() = opened_devices_.at(physical_device_id); - ++it; + devices_.push_back(opened_devices_.at(physical_device_id)); } } @@ -95,36 +86,38 @@ MeshDevice::ScopedDevices::~ScopedDevices() { } } -const std::vector& MeshDevice::ScopedDevices::get_devices() const { return devices_.values(); } - -IDevice* MeshDevice::ScopedDevices::get_device(const MeshCoordinate& coord) const { return devices_.at(coord); } +const std::vector& MeshDevice::ScopedDevices::root_devices() const { return devices_; } uint8_t MeshDevice::num_hw_cqs() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->num_hw_cqs(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->num_hw_cqs(); }); } bool MeshDevice::is_initialized() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->is_initialized(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->is_initialized(); }); } uint32_t MeshDevice::l1_size_per_core() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->l1_size_per_core(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->l1_size_per_core(); }); } uint32_t MeshDevice::dram_size_per_channel() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->dram_size_per_channel(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->dram_size_per_channel(); }); } IDevice* MeshDevice::reference_device() const { return this->get_devices().at(0); } MeshDevice::MeshDevice( - std::shared_ptr mesh_handle, const MeshShape& mesh_shape, std::weak_ptr parent_mesh) : + std::shared_ptr mesh_handle, + const MeshShape& mesh_shape, + std::unique_ptr mesh_device_view, + std::weak_ptr parent_mesh) : scoped_devices_(std::move(mesh_handle)), mesh_shape_(mesh_shape), + view_(std::move(mesh_device_view)), mesh_id_(generate_unique_mesh_id()), parent_mesh_(std::move(parent_mesh)) {} @@ 
-138,10 +131,15 @@ std::shared_ptr MeshDevice::create( // TODO: #17477 Extend to ND. TT_FATAL(config.mesh_shape.dims() == 2, "Mesh shape must be 2D"); auto mesh_shape_2d = MeshShape{config.mesh_shape[0], config.mesh_shape[1]}; + + auto scoped_devices = std::make_shared( + l1_small_size, trace_region_size, num_command_queues, dispatch_core_config, config); + MeshContainer devices(config.mesh_shape, scoped_devices->root_devices()); auto mesh_device = std::make_shared( - std::make_shared( - l1_small_size, trace_region_size, num_command_queues, dispatch_core_config, config), - mesh_shape_2d); + std::move(scoped_devices), + mesh_shape_2d, + std::make_unique(devices), + std::weak_ptr()); mesh_device->initialize(num_command_queues, l1_small_size, trace_region_size, l1_bank_remap); return mesh_device; @@ -171,7 +169,6 @@ std::shared_ptr MeshDevice::create_submesh(const MeshShape& submesh_ mesh_shape_.num_cols); } - auto submesh = std::make_shared(scoped_devices_, submesh_shape, shared_from_this()); auto start_coordinate = MeshCoordinate{offset.row, offset.col}; auto end_coordinate = MeshCoordinate{offset.row + submesh_shape.num_rows - 1, offset.col + submesh_shape.num_cols - 1}; @@ -179,7 +176,12 @@ std::shared_ptr MeshDevice::create_submesh(const MeshShape& submesh_ MeshContainer submesh_devices_container( submesh_shape, view_->get_devices(MeshCoordinateRange{start_coordinate, end_coordinate})); - submesh->view_ = std::make_unique(submesh_devices_container); + auto submesh = std::make_shared( + scoped_devices_, + submesh_shape, + std::make_unique(submesh_devices_container), + shared_from_this()); + submeshes_.push_back(submesh); log_trace( LogMetal, @@ -223,7 +225,7 @@ IDevice* MeshDevice::get_device(size_t row_idx, size_t col_idx) const { return get_device(MeshCoordinate{row_idx, col_idx}); } -IDevice* MeshDevice::get_device(const MeshCoordinate& coord) const { return scoped_devices_->get_device(coord); } +IDevice* MeshDevice::get_device(const MeshCoordinate& coord) const { return view_->get_device(coord); } MeshCommandQueue& MeshDevice::mesh_command_queue(std::size_t cq_id) const { TT_FATAL(this->using_fast_dispatch(), "Can only access the MeshCommandQueue when using Fast Dispatch."); @@ -243,12 +245,12 @@ size_t MeshDevice::num_devices() const { return view_->num_devices(); } CoreCoord MeshDevice::compute_with_storage_grid_size() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->compute_with_storage_grid_size(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->compute_with_storage_grid_size(); }); } tt::ARCH MeshDevice::arch() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->arch(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->arch(); }); } size_t MeshDevice::num_rows() const { return mesh_shape_.num_rows; } @@ -281,33 +283,31 @@ std::vector MeshDevice::get_row_major_devices(const MeshShape& new_sha // From an MxN mesh, we can always reduce rank to a 1xM*N Line mesh. // However, going from a Line mesh to an MxN mesh is not always possible. 
- std::vector new_device_order; - if (new_shape.num_rows != 1 and new_shape.num_cols != 1) { - auto new_physical_device_ids = - SystemMesh::instance().request_available_devices( - MeshDeviceConfig{ - .mesh_shape=new_shape - } - ); - - for (size_t i = 0; i < new_physical_device_ids.size(); i++) { - if (physical_device_id_to_linearized_index.find(new_physical_device_ids[i]) == physical_device_id_to_linearized_index.end()) { - TT_THROW( - "User has requested a reshape of the MeshDevice to shape: {}x{}, but it is not possible to form a " - "physically connected mesh of {}x{} grid with the opened devices from the original shape: {}x{}.", - new_shape.num_rows, - new_shape.num_cols, - new_shape.num_rows, - new_shape.num_cols, - this->num_rows(), - this->num_cols()); - } - } - for (size_t i = 0; i < new_physical_device_ids.size(); i++) { - new_device_order.push_back(this->get_device(new_physical_device_ids[i])); + if (new_shape.num_rows == 1 || new_shape.num_cols == 1) { + return view_->get_line_devices(); + } + + auto new_physical_device_ids = + SystemMesh::instance().request_available_devices(MeshDeviceConfig{.mesh_shape = new_shape}); + + for (size_t i = 0; i < new_physical_device_ids.size(); i++) { + if (physical_device_id_to_linearized_index.find(new_physical_device_ids[i]) == + physical_device_id_to_linearized_index.end()) { + TT_THROW( + "User has requested a reshape of the MeshDevice to shape: {}x{}, but it is not possible to form a " + "physically connected mesh of {}x{} grid with the opened devices from the original shape: {}x{}.", + new_shape.num_rows, + new_shape.num_cols, + new_shape.num_rows, + new_shape.num_cols, + this->num_rows(), + this->num_cols()); } - } else { - new_device_order = view_->get_line_devices(); + } + + std::vector new_device_order; + for (size_t i = 0; i < new_physical_device_ids.size(); i++) { + new_device_order.push_back(this->get_device(new_physical_device_ids[i])); } return new_device_order; } @@ -401,66 +401,66 @@ std::tuple MeshDevice::create_sub_device_manage } CoreCoord MeshDevice::dram_grid_size() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->dram_grid_size(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->dram_grid_size(); }); } bool MeshDevice::using_slow_dispatch() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->using_slow_dispatch(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->using_slow_dispatch(); }); } bool MeshDevice::using_fast_dispatch() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->using_fast_dispatch(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->using_fast_dispatch(); }); } // Device property methods that can be delegated to reference device CoreCoord MeshDevice::grid_size() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->grid_size(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->grid_size(); }); } CoreCoord MeshDevice::logical_grid_size() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->logical_grid_size(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->logical_grid_size(); }); } CoreType 
MeshDevice::core_type_from_virtual_core(const CoreCoord& virtual_coord) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [virtual_coord](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [virtual_coord](const auto& device) { return device->core_type_from_virtual_core(virtual_coord); }); } CoreCoord MeshDevice::virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [noc_index, coord](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [noc_index, coord](const auto& device) { return device->virtual_noc_coordinate(noc_index, coord); }); } CoreCoord MeshDevice::virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [noc_index, coord](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [noc_index, coord](const auto& device) { return device->virtual_noc0_coordinate(noc_index, coord); }); } std::vector MeshDevice::worker_cores_from_logical_cores(const std::vector& logical_cores) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [logical_cores](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [logical_cores](const auto& device) { return device->worker_cores_from_logical_cores(logical_cores); }); } std::vector MeshDevice::get_optimal_dram_bank_to_logical_worker_assignment() { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [](const auto& device) { return device->get_optimal_dram_bank_to_logical_worker_assignment(); }); } CoreCoord MeshDevice::virtual_core_from_logical_core(const CoreCoord& logical_coord, const CoreType& core_type) const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [logical_coord, core_type](const auto& device) { + scoped_devices_->root_devices(), [logical_coord, core_type](const auto& device) { return device->virtual_core_from_logical_core(logical_coord, core_type); }); } CoreCoord MeshDevice::worker_core_from_logical_core(const CoreCoord& logical_core) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [logical_core](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [logical_core](const auto& device) { return device->worker_core_from_logical_core(logical_core); }); } CoreCoord MeshDevice::logical_core_from_ethernet_core(const CoreCoord& ethernet_core) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [ethernet_core](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [ethernet_core](const auto& device) { return device->logical_core_from_ethernet_core(ethernet_core); }); } @@ -468,12 +468,12 @@ CoreCoord MeshDevice::logical_core_from_ethernet_core(const CoreCoord& ethernet_ // These methods require some change / or assert out for now std::vector MeshDevice::ethernet_cores_from_logical_cores( const std::vector& logical_cores) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [logical_cores](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [logical_cores](const auto& device) { return 
device->ethernet_cores_from_logical_cores(logical_cores); }); } CoreCoord MeshDevice::ethernet_core_from_logical_core(const CoreCoord& logical_core) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [logical_core](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [logical_core](const auto& device) { return device->ethernet_core_from_logical_core(logical_core); }); } @@ -513,12 +513,12 @@ uint32_t MeshDevice::num_worker_cores(HalProgrammableCoreType core_type, SubDevi int MeshDevice::num_dram_channels() const { return reference_device()->num_dram_channels() * this->num_devices(); } CoreCoord MeshDevice::logical_core_from_dram_channel(uint32_t dram_channel) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [dram_channel](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [dram_channel](const auto& device) { return device->logical_core_from_dram_channel(dram_channel); }); } uint32_t MeshDevice::dram_channel_from_logical_core(const CoreCoord& logical_core) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [logical_core](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [logical_core](const auto& device) { return device->dram_channel_from_logical_core(logical_core); }); } @@ -526,21 +526,21 @@ uint32_t MeshDevice::dram_channel_from_logical_core(const CoreCoord& logical_cor // Core management and network operations const std::set& MeshDevice::ethernet_cores() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), + scoped_devices_->root_devices(), [](const auto& device) -> const std::set& { return device->ethernet_cores(); }); } const std::set& MeshDevice::storage_only_cores() const { return validate_and_get_reference_value( - scoped_devices_->get_devices(), + scoped_devices_->root_devices(), [](const auto& device) -> const std::set& { return device->storage_only_cores(); }); } uint32_t MeshDevice::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [noc_index, core](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [noc_index, core](const auto& device) { return device->get_noc_unicast_encoding(noc_index, core); }); } uint32_t MeshDevice::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [noc_index, cores](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [noc_index, cores](const auto& device) { return device->get_noc_multicast_encoding(noc_index, cores); }); } @@ -558,29 +558,29 @@ CommandQueue& MeshDevice::command_queue(size_t cq_id) { // Trace management void MeshDevice::begin_trace(const uint8_t cq_id, const uint32_t tid) { - for (auto& device : scoped_devices_->get_devices()) { + for (auto& device : scoped_devices_->root_devices()) { device->begin_trace(cq_id, tid); } } void MeshDevice::end_trace(const uint8_t cq_id, const uint32_t tid) { - for (auto& device : scoped_devices_->get_devices()) { + for (auto& device : scoped_devices_->root_devices()) { device->end_trace(cq_id, tid); } } void MeshDevice::replay_trace( const uint8_t cq_id, const uint32_t tid, const bool block_on_device, const bool block_on_worker_thread) { - for (auto& device : 
scoped_devices_->get_devices()) { + for (auto& device : scoped_devices_->root_devices()) { device->replay_trace(cq_id, tid, block_on_device, false /* block_on_worker_thread */); } // If blocking, wait until worker threads have completed if (block_on_worker_thread) { - for (auto& device : scoped_devices_->get_devices()) { + for (auto& device : scoped_devices_->root_devices()) { device->synchronize(); } } } void MeshDevice::release_trace(const uint32_t tid) { - for (auto& device : scoped_devices_->get_devices()) { + for (auto& device : scoped_devices_->root_devices()) { device->release_trace(tid); } } @@ -632,9 +632,6 @@ bool MeshDevice::initialize( size_t trace_region_size, tt::stl::Span l1_bank_remap, bool minimal) { - MeshContainer devices(mesh_shape_, scoped_devices_->get_devices()); - view_ = std::make_unique(devices); - // For MeshDevice, we support uniform sub-devices across all devices and we do not support ethernet subdevices. const auto& compute_grid_size = this->compute_with_storage_grid_size(); auto sub_devices = { @@ -690,7 +687,7 @@ std::vector> MeshDevice::extract_dst_no size_t MeshDevice::get_device_kernel_defines_hash() { return validate_and_get_reference_value( - scoped_devices_->get_devices(), [](const auto& device) { return device->get_device_kernel_defines_hash(); }); + scoped_devices_->root_devices(), [](const auto& device) { return device->get_device_kernel_defines_hash(); }); } // Methods for SubDevice Management @@ -717,7 +714,7 @@ SubDeviceManagerId MeshDevice::get_default_sub_device_manager_id() const { return sub_device_manager_tracker_->get_default_sub_device_manager()->id(); } CoreCoord MeshDevice::virtual_program_dispatch_core(uint8_t cq_id) const { - return validate_and_get_reference_value(scoped_devices_->get_devices(), [cq_id](const auto& device) { + return validate_and_get_reference_value(scoped_devices_->root_devices(), [cq_id](const auto& device) { return device->virtual_program_dispatch_core(cq_id); }); } @@ -767,7 +764,7 @@ const std::unique_ptr& MeshDevice::allocator(SubDeviceId sub_device_i MeshSubDeviceManagerId MeshDevice::mesh_create_sub_device_manager( tt::stl::Span sub_devices, DeviceAddr local_l1_size) { MeshSubDeviceManagerId mesh_sub_device_manager_id(*this); - const auto& devices = scoped_devices_->get_devices(); + const auto& devices = scoped_devices_->root_devices(); for (uint32_t i = 0; i < devices.size(); i++) { auto* device = devices[i]; auto& sub_device_manager_id = mesh_sub_device_manager_id.sub_device_manager_ids[i]; @@ -784,7 +781,7 @@ MeshSubDeviceManagerId MeshDevice::mesh_create_sub_device_manager( std::tuple MeshDevice::mesh_create_sub_device_manager_with_fabric(tt::stl::Span sub_devices, DeviceAddr local_l1_size) { MeshSubDeviceManagerId mesh_sub_device_manager_id(*this); SubDeviceId fabric_sub_device_id; - const auto& devices = scoped_devices_->get_devices(); + const auto& devices = scoped_devices_->root_devices(); for (uint32_t i = 0; i < devices.size(); i++) { auto* device = devices[i]; auto& sub_device_manager_id = mesh_sub_device_manager_id.sub_device_manager_ids[i]; @@ -800,7 +797,7 @@ std::tuple MeshDevice::mesh_create_sub_devi } void MeshDevice::mesh_load_sub_device_manager(MeshSubDeviceManagerId mesh_sub_device_manager_id) { - const auto& devices = scoped_devices_->get_devices(); + const auto& devices = scoped_devices_->root_devices(); for (uint32_t i = 0; i < devices.size(); i++) { auto* device = devices[i]; auto sub_device_manager_id = mesh_sub_device_manager_id.sub_device_manager_ids[i]; @@ -809,12 +806,12 @@ void 
MeshDevice::mesh_load_sub_device_manager(MeshSubDeviceManagerId mesh_sub_de } } void MeshDevice::mesh_clear_loaded_sub_device_manager() { - for (auto* device : scoped_devices_->get_devices()) { + for (auto* device : scoped_devices_->root_devices()) { device->push_work([device]() { device->clear_loaded_sub_device_manager(); }); } } void MeshDevice::mesh_remove_sub_device_manager(MeshSubDeviceManagerId mesh_sub_device_manager_id) { - const auto& devices = scoped_devices_->get_devices(); + const auto& devices = scoped_devices_->root_devices(); for (uint32_t i = 0; i < devices.size(); i++) { auto* device = devices[i]; auto sub_device_manager_id = mesh_sub_device_manager_id.sub_device_manager_ids[i]; @@ -824,13 +821,13 @@ void MeshDevice::mesh_remove_sub_device_manager(MeshSubDeviceManagerId mesh_sub_ } void MeshDevice::mesh_set_sub_device_stall_group(tt::stl::Span sub_device_ids) { - for (auto* device : scoped_devices_->get_devices()) { + for (auto* device : scoped_devices_->root_devices()) { device->push_work([device, sub_device_ids=std::vector(sub_device_ids.begin(), sub_device_ids.end())]() { device->set_sub_device_stall_group(sub_device_ids); }); } } void MeshDevice::mesh_reset_sub_device_stall_group() { - for (auto* device : scoped_devices_->get_devices()) { + for (auto* device : scoped_devices_->root_devices()) { device->push_work([device]() { device->reset_sub_device_stall_group(); }); } } From 3eb506c465e817c2163b6e5dd36aa96a10a72f18 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Fri, 21 Feb 2025 12:07:13 -0500 Subject: [PATCH 219/316] Update the bisect script (#18126) ### Ticket None ### Problem description The bisect script aaaalmost worked. But not quite. ### What's changed * Control the timeout * Provide adequate history to perform a bisect * Suppress uninteresting log messages for sanity * Group log messages for sanity * Don't bail on timeouts; just skip --- .github/workflows/bisect-dispatch.yaml | 15 +++++++-- tests/scripts/tt_bisect.sh | 46 +++++++++++++++----------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/.github/workflows/bisect-dispatch.yaml b/.github/workflows/bisect-dispatch.yaml index 61f373958a1..dce44222ea7 100644 --- a/.github/workflows/bisect-dispatch.yaml +++ b/.github/workflows/bisect-dispatch.yaml @@ -46,6 +46,10 @@ on: command: required: true type: string + timeout: + required: true + type: string + description: "Timeout (eg: 5m, 1h)" description: type: string default: "Git bisect dispatch" @@ -68,7 +72,11 @@ jobs: - ${{ inputs.runner-label }} - ${{ inputs.extra-label }} steps: - - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main + - name: ⬇️ Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 - name: Set up dyanmic env vars for build run: | echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV @@ -78,9 +86,10 @@ jobs: - name: Extract files run: tar -xvf ttm_any.tar - uses: ./.github/actions/install-python-deps - - name: Run pre/post regression tests in a loop + - name: Run Git Bisect + shell: bash run: | source ${{ github.workspace }}/python_env/bin/activate cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME - ./tests/scripts/tt_bisect.sh -f "${{ inputs.command }}" -b ${{ inputs.bad-commit }} -g ${{ inputs.good-commit }} + ./tests/scripts/tt_bisect.sh -t ${{ inputs.timeout }} -f "${{ inputs.command }}" -b ${{ inputs.bad-commit }} -g ${{ inputs.good-commit }} diff --git a/tests/scripts/tt_bisect.sh b/tests/scripts/tt_bisect.sh index 28becf7a83f..5304803d18b 100755 --- 
a/tests/scripts/tt_bisect.sh +++ b/tests/scripts/tt_bisect.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -euo pipefail : << 'END' This script is used to find the commit that broke a test. @@ -53,28 +54,38 @@ found=false git bisect start $bad_commit $good_commit -- while [[ "$found" = "false" ]]; do - build_code=0 - echo "at commit `git rev-parse HEAD`" - echo "building Metal" - ./build_metal.sh --build-tests; build_code+=$? + git submodule update --recursive + echo "::group::Building `git rev-parse HEAD`" + build_rc=0 + ./build_metal.sh --build-tests > /dev/null || build_rc=$? + echo "::endgroup::" - if [[ $build_code -ne 0 ]]; then - echo "Build failed" + if [[ $build_rc -ne 0 ]]; then + echo "Build failed; skipping this commit" git bisect skip continue fi - timeout $timeout_duration bash -c "$test" - timeout_code=${PIPESTATUS[0]} - echo $timeout_code + echo "::group::Testing `git rev-parse HEAD`" + timeout_rc=0 + timeout "$timeout_duration" bash -c "$test" || timeout_rc=$? + echo "Exit code: $timeout_rc" + echo "::endgroup::" - if [ $timeout_code -eq 0 ]; then - first_line=$(git bisect good | head -n 1) - elif [ $timeout_code -eq 124 ]; then - echo `git rev-parse HEAD` > ~/bad_commit.txt - break + if [ $timeout_rc -eq 0 ]; then + echo "Commit is good" + increment=$(git bisect good) + echo "${increment}" + first_line=$(echo "${increment}" | head -n 1) + elif [ $timeout_rc -eq 124 ]; then + echo "Test has timed out, skipping this commit" + git bisect skip + continue else - first_line=$(git bisect bad | head -n 1) + echo "Commit is bad" + increment=$(git bisect bad) + echo "${increment}" + first_line=$(echo "${increment}" | head -n 1) fi if [[ $first_line == *"is the first bad commit"* ]]; then @@ -83,8 +94,3 @@ while [[ "$found" = "false" ]]; do fi done git bisect reset - -if [ $timeout_code -eq 124 ]; then - echo "Test has hung, need to reset the board" - exit 124 -fi From 6b652ce5542100be8e2e98a2414ea59fab654201 Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Thu, 20 Feb 2025 15:12:58 +0000 Subject: [PATCH 220/316] Update perf bounds for eth ubench --- ...thernet_link_write_worker_with_transaction_id_bandwidth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_bandwidth.py b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_bandwidth.py index ddffe910ac1..bdb28fd60af 100644 --- a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_bandwidth.py +++ b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id_bandwidth.py @@ -155,7 +155,7 @@ def test_erisc_write_worker_bw_bi_dir(sample_count, sample_size_expected_bw, cha @pytest.mark.parametrize("disable_trid", [1]) @pytest.mark.parametrize( "sample_size_expected_bw", - [(16, 0.18), (128, 1.46), (256, 2.93), (512, 5.73), (1024, 9.15), (2048, 11.83), (4096, 12.04), (8192, 12.07)], + [(16, 0.18), (128, 1.70), (256, 3.79), (512, 7.72), (1024, 11.3), (2048, 11.83), (4096, 12.04), (8192, 12.07)], ) def test_erisc_write_worker_bw_uni_dir_no_trid(sample_count, sample_size_expected_bw, channel_count, disable_trid): benchmark_type_id = 2 @@ -176,7 +176,7 @@ def test_erisc_write_worker_bw_uni_dir_no_trid(sample_count, sample_size_expecte @pytest.mark.parametrize("disable_trid", [1]) @pytest.mark.parametrize( "sample_size_expected_bw", - [(16, 0.10), (128, 0.87), (256, 1.73), (512, 3.44), (1024, 5.99), (2048, 9.70), (4096, 11.82)], 
+ [(16, 0.10), (128, 0.87), (256, 1.99), (512, 4.47), (1024, 9.43), (2048, 11.00), (4096, 11.82)],
 )
 def test_erisc_write_worker_bw_bi_dir_no_trid(sample_count, sample_size_expected_bw, channel_count, disable_trid):
     benchmark_type_id = 3

From bd1a67ded8763827e9c44e2d490de91b3e420083 Mon Sep 17 00:00:00 2001
From: Dimitri Gnidash <119051828+dimitri-tenstorrent@users.noreply.github.com>
Date: Fri, 21 Feb 2025 13:00:49 -0500
Subject: [PATCH 221/316] Allow the user to select the version of the docs (#17434)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Ticket
#17433

### Problem description
Users would like to see the documentation for several past versions, but we currently only build the documentation for the latest commit on main.

### What's changed
1. Added a UI selector to switch between versions of the documentation (a minimal sketch of the selector logic follows this list).
The versions of the documentation are stored in https://github.com/tenstorrent/tt-metal/blob/dimitri/test-versioned-docs/docs/published_versions.json
The UI selector makes an HTTP request to get the list of available versions and adds them to the list the user can select from. This required a change to the _layout.html for both `ttnn` and `tt-metalium`.
2. The deployment of GitHub Pages (what we currently use to host documentation) needs to change to be done from the `gh-pages` branch. This is needed so that we store previous documentation and only add new versions to the branch as folders. The proposed folder structure (e.g.):
```
v0.55.0/ttnn/index.html
v0.54.0/tt-metalium/index.html
```
We will need to change the settings for GitHub Pages accordingly: [Screenshot 2025-01-31: GitHub Pages settings].
3. We also need to adjust the package workflow to pass the version number to the docs building workflow and use a different GitHub Action to publish the folder.
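As a rough illustration of the selector logic in item 1, the snippet below shows one way a docs page can fetch `published_versions.json` and populate a version dropdown. This is a minimal sketch: the element ID, the fetch URL, and the navigation target are assumptions made for the example, not the actual template added in this change.

```html
<!-- Illustrative sketch only: the ID, URL, and markup below are assumed, not the shipped template. -->
<select id="docs-version-selector"></select>
<script>
  // Assumed location of the version list; the real selector may fetch it from the repository instead.
  const VERSIONS_URL = "/published_versions.json";

  fetch(VERSIONS_URL)
    .then((resp) => resp.json())
    .then((data) => {
      const selector = document.getElementById("docs-version-selector");
      // published_versions.json has the shape {"versions": ["latest", "v0.55.0", ...]}.
      for (const version of data.versions) {
        const option = document.createElement("option");
        option.value = version;
        option.textContent = version;
        selector.appendChild(option);
      }
      // Each published version lives in its own folder, e.g. /v0.55.0/ttnn/index.html.
      selector.addEventListener("change", () => {
        window.location.pathname = "/" + selector.value + "/ttnn/index.html";
      });
    });
</script>
```

Because the list is fetched at page load, previously published doc folders can surface newer versions without being rebuilt, provided the JSON is served from a single live location.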
### Additional Requirements - [x] Display the version of the docs built next to the library name - [x] Add a test to verify all links in the Installing.md and Readme.md - [x] Add a test to make sure that after deploying docs the links to docs.tenstorrent.com remain accessible ### Checklist - [x] Prefill the `gh-pages` branch with 3 versions of documentation: 55, 54, and 53 - [x] Switch the population of latest in the workflow - [x] Flip the setting in the Github pages config (test with @tt-rkim ) - [x] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- .github/workflows/code-analysis.yaml | 1 - .../workflows/docs-latest-public-wrapper.yaml | 2 + .github/workflows/docs-latest-public.yaml | 20 ++++--- .github/workflows/package-and-release.yaml | 2 +- docs/published_versions.json | 7 +++ docs/source/common/_static/tt_theme.css | 8 +++ docs/source/common/_templates/layout.html | 12 +++++ docs/source/common/_templates/versions.html | 54 +++++++++++++++++++ 8 files changed, 97 insertions(+), 9 deletions(-) create mode 100644 docs/published_versions.json create mode 100644 docs/source/common/_templates/versions.html diff --git a/.github/workflows/code-analysis.yaml b/.github/workflows/code-analysis.yaml index b096bb0c5e0..331921254f1 100644 --- a/.github/workflows/code-analysis.yaml +++ b/.github/workflows/code-analysis.yaml @@ -46,7 +46,6 @@ jobs: distro: ${{ inputs.distro }} version: ${{ inputs.version }} architecture: ${{ inputs.architecture }} - clang-tidy: name: 🤖 Clang Tidy needs: build-docker-image diff --git a/.github/workflows/docs-latest-public-wrapper.yaml b/.github/workflows/docs-latest-public-wrapper.yaml index 35c1f016a80..07164ddd381 100644 --- a/.github/workflows/docs-latest-public-wrapper.yaml +++ b/.github/workflows/docs-latest-public-wrapper.yaml @@ -15,3 +15,5 @@ jobs: needs: build-artifact uses: ./.github/workflows/docs-latest-public.yaml secrets: inherit + with: + version: latest diff --git a/.github/workflows/docs-latest-public.yaml b/.github/workflows/docs-latest-public.yaml index d3e918a6dcc..ef671c2f436 100644 --- a/.github/workflows/docs-latest-public.yaml +++ b/.github/workflows/docs-latest-public.yaml @@ -2,6 +2,11 @@ name: "[internal] Docs build and deploy to GitHub pages on main impl" on: workflow_call: + inputs: + version: + required: false + type: string + default: latest concurrency: # Note that people may spam the post-commit pipeline on their branch, and @@ -20,7 +25,6 @@ jobs: matrix: arch: [grayskull] env: - DOCS_VERSION: latest ARCH_NAME: ${{ matrix.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib @@ -57,21 +61,23 @@ jobs: - name: Prepare artifact - move output run: | mkdir gh_pages - mv docs/build/html gh_pages/$DOCS_VERSION + mv docs/build/html gh_pages/${{ inputs.version }} - name: Prepare artifact - create .nojekyll run: | touch gh_pages/.nojekyll - name: Prepare artifact - create root index run: | touch gh_pages/index.html - - name: Upload artifact - uses: actions/upload-pages-artifact@v3.0.1 - with: - path: "gh_pages" - name: Deploy to GitHub Pages if: ${{ github.ref == 'refs/heads/main' }} + uses: JamesIves/github-pages-deploy-action@v4 id: deployment - uses: 
actions/deploy-pages@v4.0.4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + branch: gh-pages + target-folder: ${{ inputs.version }} + folder: ./gh_pages/${{ inputs.version }} + force: false - name: Delete artifact if deployment failed # When the deployment API call fails, the artifacts are not cleaned up correctly # and the next attempt (!) run will cause an error. diff --git a/.github/workflows/package-and-release.yaml b/.github/workflows/package-and-release.yaml index 47d679e81b1..1c186079501 100644 --- a/.github/workflows/package-and-release.yaml +++ b/.github/workflows/package-and-release.yaml @@ -210,7 +210,7 @@ jobs: create-and-upload-draft-release ] if: ${{ needs.get-params.outputs.is-release-candidate !='true' && needs.get-params.outputs.should-create-release == 'true' }} - uses: ./.github/workflows/docs-release.yaml + uses: ./.github/workflows/docs-latest-public.yaml with: version: ${{ needs.create-tag.outputs.version }} secrets: inherit diff --git a/docs/published_versions.json b/docs/published_versions.json new file mode 100644 index 00000000000..978d82a8caf --- /dev/null +++ b/docs/published_versions.json @@ -0,0 +1,7 @@ +{ + "versions": [ + "latest", + "v0.55.0", + "v0.54.0" + ] +} diff --git a/docs/source/common/_static/tt_theme.css b/docs/source/common/_static/tt_theme.css index a4f1176666d..9b81114bea5 100644 --- a/docs/source/common/_static/tt_theme.css +++ b/docs/source/common/_static/tt_theme.css @@ -453,3 +453,11 @@ html.writer-html5 background: var(--color-background-alt2) !important; color: var(--color-foreground) !important; } + +.rst-versions.shift-up { + overflow-y: auto; +} + +.project-versions { + font-size: small; +} diff --git a/docs/source/common/_templates/layout.html b/docs/source/common/_templates/layout.html index e80a0b044a7..34ce35ad1af 100644 --- a/docs/source/common/_templates/layout.html +++ b/docs/source/common/_templates/layout.html @@ -17,6 +17,18 @@ {{ project }} +{%- if theme_display_version %} + {%- set nav_version = version %} + {%- if READTHEDOCS and current_version %} + {%- set nav_version = current_version %} + {%- endif %} + {%- if nav_version %} +
+ {{ nav_version }} +
+ {%- endif %} +{%- endif %} + {%- include "searchbox.html" %} {%- endblock %} diff --git a/docs/source/common/_templates/versions.html b/docs/source/common/_templates/versions.html new file mode 100644 index 00000000000..6e118db8db7 --- /dev/null +++ b/docs/source/common/_templates/versions.html @@ -0,0 +1,54 @@ +
+ [versions.html template body: a "Version: latest" current-version label and a "{{ _('Versions') }}" flyout listing the published versions]
+ + From a416f8beccb4e165a9e2a2191e0177bf7df8a36a Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Wed, 19 Feb 2025 20:03:10 +0000 Subject: [PATCH 222/316] #0: check for region overlaps in cq_prefetch - Add static checks on prefetcher_hd and d for overlapped buffer regions --- .../impl/dispatch/kernels/cq_prefetch.cpp | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index 71a90be2797..ea03c9ab8b8 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -64,6 +64,28 @@ constexpr uint32_t dispatch_s_cb_log_page_size = get_compile_time_arg_val(25); constexpr uint32_t is_d_variant = get_compile_time_arg_val(26); constexpr uint32_t is_h_variant = get_compile_time_arg_val(27); +constexpr uint32_t prefetch_q_end = prefetch_q_base + prefetch_q_size; +constexpr uint32_t cmddat_q_end = cmddat_q_base + cmddat_q_size; +constexpr uint32_t scratch_db_end = scratch_db_base + scratch_db_size; + +// hd and h: fetch_q, cmddat_q, scratch_db +static_assert( + !(is_h_variant) || (prefetch_q_base >= cmddat_q_end || cmddat_q_base >= prefetch_q_end), + "prefetch_q and cmddat_q overlap"); + +static_assert( + !(is_h_variant) || (prefetch_q_base >= scratch_db_end || scratch_db_base >= prefetch_q_end), + "prefetch_q and scratch_db overlap"); + +static_assert( + !(is_h_variant) || (scratch_db_base >= cmddat_q_end || cmddat_q_base >= scratch_db_end), + "cmddat_q and scratch_db overlap"); + +// d: cmddat_q, scratch_db +static_assert( + !(is_d_variant && !is_h_variant) || (scratch_db_base >= cmddat_q_end || cmddat_q_base >= scratch_db_end), + "cmddat_q and scratch_db overlap"); + constexpr uint8_t my_noc_index = NOC_INDEX; constexpr uint32_t my_noc_xy = uint32_t(NOC_XY_ENCODING(MY_NOC_X, MY_NOC_Y)); constexpr uint32_t upstream_noc_xy = uint32_t(NOC_XY_ENCODING(UPSTREAM_NOC_X, UPSTREAM_NOC_Y)); @@ -75,9 +97,7 @@ constexpr uint32_t downstream_cb_page_size = 1 << downstream_cb_log_page_size; constexpr uint32_t dispatch_s_cb_page_size = 1 << dispatch_s_cb_log_page_size; constexpr uint32_t downstream_cb_end = downstream_cb_base + (1 << downstream_cb_log_page_size) * downstream_cb_pages; constexpr uint32_t dispatch_s_buffer_end = dispatch_s_buffer_base + dispatch_s_buffer_size; -constexpr uint32_t prefetch_q_end = prefetch_q_base + prefetch_q_size; constexpr uint32_t cmddat_q_page_size = 1 << cmddat_q_log_page_size; -constexpr uint32_t cmddat_q_end = cmddat_q_base + cmddat_q_size; constexpr uint32_t scratch_db_half_size = scratch_db_size / 2; constexpr uint32_t scratch_db_base0 = scratch_db_base; From 87b193e704808d602032ead3c77ad90a2982f029 Mon Sep 17 00:00:00 2001 From: Pavle Petrovic Date: Fri, 21 Feb 2025 19:54:01 +0100 Subject: [PATCH 223/316] Add Phi-3.5-mini-instruct model support (#17955) --- models/demos/llama3/PERF.md | 6 ++++ .../Phi-3.5-mini-instruct.refpt | Bin 0 -> 50792 bytes models/demos/llama3/tt/load_checkpoints.py | 26 ++++++++++++++++++ models/demos/llama3/tt/model_config.py | 1 + 4 files changed, 33 insertions(+) create mode 100644 models/demos/llama3/tests/reference_outputs/Phi-3.5-mini-instruct.refpt diff --git a/models/demos/llama3/PERF.md b/models/demos/llama3/PERF.md index 2209cbcec87..ce1a72b6685 100644 --- a/models/demos/llama3/PERF.md +++ b/models/demos/llama3/PERF.md @@ -31,6 +31,9 @@ This configuration uses bfp4 MLP FF1+FF3 for all models. 
**Batch_size=1 and pref | Llama3.1-70B | TG | 95 | 100 | 12.7 | | | Qwen2.5-7B | N300 | 80 | 96 | 37.9 | | | Qwen2.5-72B | T3K | 98 | 100 | 12.8 | | +| Phi3.5-mini | N150 | | | 43.2 | 98 | +| Phi3.5-mini | N300 | | | 57.8 | 62 | +| Phi3.5-mini | T3K | | | 48.8 | 51 | ## Accuracy @@ -58,6 +61,9 @@ This configuration uses bfp4 MLP FF1+FF3 only for the 3.1-70B model and the Qwen | Llama3.1-70B | TG | 95 | 100 | 12.7 | | | Qwen2.5-7B | N300 | 80 | 96 | 33.4 | | | Qwen2.5-72B | T3K | 99 | 100 | 12.8 | | +| Phi3.5-mini | N150 | | | 38.8 | 92 | +| Phi3.5-mini | N300 | | | 53.9 | 63 | +| Phi3.5-mini | T3K | | | 48.6 | 53 | ## Long-context (64K Tokens) diff --git a/models/demos/llama3/tests/reference_outputs/Phi-3.5-mini-instruct.refpt b/models/demos/llama3/tests/reference_outputs/Phi-3.5-mini-instruct.refpt new file mode 100644 index 0000000000000000000000000000000000000000..37011e0b05f1203396074976e7c174c1d24da41c GIT binary patch literal 50792 zcmcKD37B3*l{fGXfe?ahA?yQm0%Q>qvY3#BH5~|$u!OJ$kTq$NKo+{w0TeYDWk*z8 z21QhYpnw7@A_|BEL_vW;WpNNxFe3;!!pJ5F({25KJf`M*r(4IbpC=Ue)~Rz&{m-dW zr|QTQ)N58gy!)COt-O=`r8Exy#T)&e~ zd+Ujvl?!LJQg_|Co!ZRpJ-3J()wbT;KKnPV-qE&xrd;;dbt+N@7!=eqtiR$_3|;BJ-Z@`@ctPH7p`0}cLkMyI&Wzo$yk^ovu=|D5{X zQvUh0Ql8(xl=pVov-5AAbyJq2>FzF9==^(ljZ$9SW$(_vm1~u9yUj|ubJ{;Rv0PuA z>-##&^*Xzj@&mcQ$rk1MqU}rhmE6C+U%5VdwNkzsIjg-R_q#m0^Y1@H|HwPbHNKN_ ze_YCa($C@33LW|*yUfw~_i*I=IC>r({@OpgL7_W#pHfbsXXsbtdiRv4r#v*};lf!3N%BJu+W2?ev4v2|FM7IOxqG?)WcUx6Q?B0#-9t#N63SE+}` zIq~1#CzSh}9b3v@ZCuJv&MW0x!FfXH&&fEwHTVD6QRvze4_ku!q?GJ%So~1`wu;`< z(*L%UZ${2@;eSW?v%7JQ5BbLXX=$(inAqj1=%e2Cb?NW8=%d{U!F%QC(!V#NU+;|X zqlT3G-%5jThklpnJ$s*0e`nftq|^`c`Pb07ZW7Zp9#AC!Kd72MGMD*iGc^=qXcdmmNW&k3IR zh?^_Zud8<`bfb?frS|HNh+XGTE%oBvl*9+P=>JC1L;Pfqmgw>1HieJ=T@rfuXtyHw zk4b!7Km8gL`?>$0@s9^LFMP#Y<8NHWf878vJ$&zvpVxB9fgAbeDfii7QsV!`v5))^ z-D*pUenWOH<<#gUPyBG=<`bD$K67HBn->Oko+v-uJn?N#;)y(v+^-ESeC|trNZ)k_ zl>6{UFD^W}N2z!J(&(#w?_6ILyNKI;a(`U%yTQ4BxwY`SE!T&||1PTH3_H~Mf&6#j z{K98+aIW)%y)*vh30FqW!;wq>Ym%QEcRME^cq;f0%k_UG-hU)<;jvwceDmsGqQ?ft zckoy*^0d1v_tC@k%Fv0=t--@M{Ym2XPgCBW^6|_A`^K*P>wBr^R}(Yt=B3o`s`x9v z;!lTU9PO8JH8Syqe*EL!$hm)9!GqpUME_f>xN}YP#<&EOZY=?vGNwy{;?OO~(BR!R?4`ONDij@j!3m z;vMO?b<$RQm3HXptG)FReSQ!b5CAK+)5 z@w4D!eN?ZD&da!2H{(Hj{OMy|VO@molCcGs&Ep^J+`0Yn*FSWl-c_!(w~jJjzCz)~m0~T~&X6guY&{ zx^La8KaJL_=wIJ{RorSk#3k`WJN<+Ye|lH!?ivo(uiC#5`^?R_pO$!Y?2w`lKfFHq z7X0`Z`@KyUJlnI*csOzF+VFoX^Uh_dhmZamNBA$zctfYYPy9hY@HdZ$b1y|7^UR-f z{d&r0Qua<9?UnMisvbXd_~E0h`Qd|atv9>E@AY)(smQ+}rFef!uFZ?`gLM+`&(xG??z#}`{lJv;vXGi+Msm2H0TCvj|gGxVY z|K2wJ-8v=zUL)84C0@oa`8WUHG=4ifc?iF!*FNz-`q7KO4@q1$4*2Jtn-+cbv-W#@ z%&%}}7r623+D@+3!yV4#Evx*&I3u5auIW*aKlkZ}o;>ziHA@TAA^7(uiU=@zc0FBkKU|j9dBr#mUd@1D=_A`<&!^Z5g-r z^Xw;xtL$|{{Pc_46g>`!-#?Ie-I4qJbV=-cLHzgLiKX2c$s670uiqU|>iH?0l=NVK z{sKSttL@AG*ashWV{df)1>G6dJi%|bul$$&=*Pe4=bC=x(p!7?(YX(YMtt0F#83b6 zMW4;Cu<*B@CfB&!EcyR8 zvM<3-HQnBM&Twt?c_=toSL6THHKKbu_|*IstX}wkHGc7nzUBJ9*zNN8q4`W+b8_hIN0|5QN6>Rc=)}jf zVz;w$|IX+&FnQE|(dX02i+&lp19IPUDQ3ZzY)2o zzV;u@dpqq?@DU&3BtCw3P^q^MS;t9nv5uFG z;-q~G`An@}J->@r-Nnf*5=X_!X*(9Z*G*m`PQtNKoLrW8@U7WtpYd78$zIK+o?LNK zTofn&HT=-AgZ?|W!2dSiqwqz?9`>*9&v>z~Rny%%v$S73aYnnZCO?p;X?IuR1v|*o zu1LSXmh%wulZ}IGBm4G_zpyiVlBb{M!RNC7s9Y)@lx{SQA5i}1z1x=3ym(Ri_st{A zefnLPIA&Ztm;RCer^Ka?XFN{}fA!)WJ59Q(e;MfjJcpc_@#i@fx!U0aN6$(0Pe0bnI)T2=wHA3D$tQjj zefLcHU%7vO^zEIz#`z9<$m4d6{+q|n{gW@9o^f&W0Y%=ESr=>nOytP}_2Vn?BXsiQ z<)QaH!@Qs#Jvw<4G|V=wafrT*y0CV7sIjy&tKD}yh)EDv6A zo0xNMTTU)|PkdJ?@iXtpo7qQy(ZL5U;s6}sfu0}H^J|$;FUdT~pW!kj^-sh;wY=I6 zXTu#8dvquh!4}bxhWu`=+!m-Yxq;&AES5_?#3x*y;YtZ>^8n3*N@N{=yBe 
z@@IG(=lmJHxCn2!!V~UrJ3e_IzT_L%_`)CG8fW-^B>PWrhOhGruLm#wy?^k!C#7}u z4>E7f9AEs9pVJ3^4^3OuZ)Z*|*M~=r{mmh>%l*$(c}Yv;$~Wy-;13US*#lne(#Rh0 zVi$PVb~!Hd!6oT0xo{_s9{eAF_7G3l!#EIE*aiNtWL{&RX~Dzu=___9xSg5%`z3yT zKI@aiXO#LUGQLiWKMhZuq3>gfOD*98XSnMxdi~XZxOYbfmzoYAeC_kH3;WpzrU!h* zLHV3GrQUfZ^{(X?b^a&6AV;3W&&}KNbL;85lb^|Bw~U|5Cr&z|*yH~FO8G|Suf7>i ze;iuwk2<=Py))k)8UH&W^D@0gC;$6)>?dv<8oB7;4PSVRBkIwsN6&8TAg;l8X&T;q za?#s(5!cRXDfj7ToZ(MCeaM%0l4HCWPsWjX8*U$sANNgua?bWe{(-?uKi|szi$j0u zs6w}Uwf>f88V}C3^Aq~xEAHr@e%1J)ho64(U-K^<;R;7`;YA+4YC8>h**E;!BNRjoZg#Ke)ls zyeh8qKe&<4&&j7xjUU|lr{7H}&&)a(j`Ees!C^tlFTpqK_w&+zLGrIxgX3k%V;(r9 z;JkI(otwD+-S9gx<<=={yxXgIz%Kk*e#5Wfq5trJ7rWHip#_DbA7 zIM2zi&Hmwkh0hLq6u$K57q1K~_w`r3{)!)sa5gV$_q(hI;0a&2ix=VuIpmWAcYJ$_ zbDg)ySHuf>rTG(oalrglzh~7F{N*S7_R`#^KYno25B+T9r*Jzw^D^AvDBjBZ;7Bh1 z_fE$x>M zFXb7z|Iw_Q@Y9|h#5MfU)%eu@#UAL{L0-Wg@MjnJ%P06V+{qQk@z-ue`eh!h;|6=o z4BZbCZ(d1U{z&dm$T*)Azu6&nIw$2f)9#oh1y6dMlKPWRjJ-yb(*E|w;qU#g;}gH+ ziS_w#_JBYCcsP9ELH_dGhZnol_FxBgVULzwi@x_vD&+^lkDZJg_>%+wTA$h;gA+H+ zr|i?n9@l1md}(yihdq9s`El#4d)Vc+tf%;|{6l`KE^?3CrIcH4 zQ_7<^Ev4r^yKGRd*}*)^e$NC)d1j4MZ8veQr*>m6C3M3bEA%j#>)d4 zud8QXe}DMR8BpZ@GS4TSYZ$OixzGPE2!7}xz4pT_03(VKmZ zH+D8I*^_+ZpM3h%_O0zIPF|k%lf0I_ub*DvDo#d%!7XnJ^fm8O&|7RPq?9nCwsxK zJG+TP?Db;gULU*l%eB0X9QLAzaYmn>+RePeKJ~cj(_HY>Z~Wn6J;9zgW}lS(o~-s^ z_xzW_51n!AJ+>XklzRIU?8vT999QZeiT&2jIAz}rlLw8?{kJmStvl(@p5_<%gS-Z= z=xe*O1N(}P=<9igJ+B#A^gksynP+-BF4?&|`&sEzN`cLHkE3p&&PMclo#Rd2I9l89n zQGT#I{b0Y^ZuqiaZ8vsepWAmTdKs_AC4a%6T>OpK715_rJY~1O@$c5`pRCNf^OfXP zAIbhV{>B?z5xXLOBlfrtKEXTk?=_}B8`L67cmG0%LZx#0h3;<`LZ zfB6BN4#@gpqu}uG!S&NV0UX7JT7Uhbe|Pwa^ZW~L@Pa4YtS`ucSFJ~*c<|2T z-SCn}!E4a)f}`+hcT-CHNA{Dvms#TnFV9KfSnI1l@T>RljYs@z z{NOZxL9t_v=cuf&$sr$}aKyjH?{k^I;NaZX-5F2po^ILw}9nNr+SHl_phtDi}lUvJk?iubi z&c-W0<+s*5@-+F=i`nm*9DKf!e0W*%smoJu-u+SL6L?s^!1158PO&b5r}zgyIC+l2 z|M->nT;K;!`-Ik;>fwn_|L`^M!n+a9^2*6;7CSD9f6t7+Zj(5b?*{xUfH}2Svz1YulN9W{U zN?c(l_G2G<>Zkj)-Pnu$;Hce@?EmxYeG|uimGJ{txDU+p;2LN6-jV%m{P6#K#|Qn4 z6XT=1xMqCA4W4j@yZ0Kb3*ie_?Q6YjoCjxp0B7Un!T8w?iDPRguNjnU&turbJX+5O zp9`M))hG6mhs&#+BbhS3j0<|L$omy9C$8Dg`ChK!Y#i<~u(UTmk4k%Vlk$9J)5PVm z$v4#VgJ&~-#K&iHUE>Mw<<)q@550Qy?4=&Pdhcs%$B*P43wJ2}9J@y;#a%c%Z)HDJ z-fo{ry*ysM^I9Vk=k%L?aCTm+KBwh-drxQlJGb?GuCF_^=zG9+rF3pf9w>gYpMB~X z!S}k%bLLHcBtGc>(^CuoBSSCVe|B!Ue^uh`B?p)5Zzu23Pv?X67Y_Ub9s98xd)fC_ zkG>HdyE@+`PCRyU(QifiFOShrIo)e{zGObBae)`T=?52h;O`s~{moD2AvnQDoFo^2^6)h-@vU+BaQsP}%fqKT{Pd5%!V8Y}d*Nw+j9hZar3XCm=chG(;^M^U>-~tmlV{444rndo75|pd z+dq>J?T|b`JP-%om-Wa6(+ZzE_gIymcmpZs=C@@RhAH~5lcf7^TmC*KA2J&^rlx6Kkiew*uCvVVC^s{K;Pt z{l$&X4lKCxWBx79@>}tipF6*NX87%!eE-L}HlMvSI9j*TN4&LufT#E(KNM%2&zIL) z4;&qNKaPKliynVCy!31R)L)-*vD1ih-*<1ft@0Q4CD% zA5D2s^6qo94|G(|p${6aM^kM*Ky*{!^ZxquVmq*3s)_zWDvSi#_xMy?%6uA9}cnTlTxHOXNBH z6`pV-A70jt^yn#mO#@4R;3)sD@oWuVJEokKIPvx5*B31)eD=(9jOP;{K9;z$QQG}6 z`$=$MkCxQ)TkBZkW6&vuzj3lG>k4+JxBk2seh&>V?Ox9~k?+>|t^8LUrXT;P^)vso z1A6x12kcSDwc7s1mvdjv8@<1|=s$H*DftPx_}B8qy`K8PtMTv8#6BA*?(Z8MKf18U zCkGzl3A>4F^g|C9`ogEDxWEGr_4qJu@FyP*^e~=kT=>8F({o(;y!LAZ5B=9~_4R%D zmx`KPRWZ;BDbBS&TW^k7RP?p3fiHh1x0Xk5_`?~#{8xVGJO`ZN z>wE`%;axvRv<~$=k$fe5$q^6mMfb?yRsCRA;&J^uJL~{|_QCgrdi-iS z_2~cEboz-yI!(zE3_Qc>E!F$deC>9~(#V^;hG6;_zqV*IVTL>)7l&&tJ3n?-lEp z^38)wDUM!|yhGfAXLIy%>0rtNHn!T6g$q2mczszy5x3rz&5_ z^I`r=Z#ep%4}RKNAENhMRJ)V*EB%BY+|ctwIKq$rp(AhG;0G`M#t)ZHD)RUd{P6jG zaMcg~+&}U0!aO%WY_GycyS`J(HT>M?hrXxJztzKmJmZ#s)%c-5H+~FH_`wN3a^MQT z52n5Pj`%Cwl<+dn;CXb$eT^Tv{DEEI^Wn(ZvXX!4(1PEi!T+w5`$W#i5=ZE3{U+b7 z$N3e>v)F?jw5M0yj{a~)r$4nl;D1N-Mb92^Cl`NqP^#zG?1LZt`7t|8&;EyaIV!k* 
zF6Fm#Zc&^+H|w`Xb~An^r9b?9as2u6apv!E7JuOkSK|QQ7iC>s;|yPTe=2!hjkCB+FTdXi-&{iZwO%6)b;K5BdT{W)^Ym-dJM93Tc}9sTpx!uQ0?$M6&1 z;l|&^Dfi(<9{k{@tm)B-`|zv%T|4}1{P?-J)tY=4evcei`X_(uIG|jwII5KJbN$fH zl;9 z=lKHvto7la==dqTpRMBaqgm(Qlznmh_$@!&BI`zRV#K_H??ahCYy9{p{o#o}+~9?- zQC@;hJ9M?5;@{KwY#oX}oQ_L;g%=#*%TLKWa6-WYZg7RyZppWYC0?@Uo4NjO^2!Hu zuI3lnS6MA_`1Z_)aORJ2H4oPLM(vk1&b42Pqx!Wt^SgLyJu5CuNu0VgadTkii(dx6 zOVY28rT=}BPsn?JkbMICgVVMzIO^xi;d@Z{ZXNmqgBv~3yM_at>UlRU6<*85!cw~V8^vW~dph@#Jp*&lW8>a?TFedh<@ z3ff_h(R)njjFZ>4Dfi(FS9mwQ ztJF8AeT}ntcJ!b^H!b;*`|yUdJmKNk_mjbUhvB9DPZB@Y$-c(LGs^vkLkEZZ5})=C zJ)HH+{@0i#r9C^L!^gQ$`&+}J7kkw8+OdOia8Ke}OXgGkpO*cq55>>v!9VarZ~k8%z4!zbYCH#LFe&moZ|7s5o@=Nbi z|04RImU3B2`SMM%3;sJa7rFQ49C`oDx1S5%-tTEnJotXb-(lhV{5GY%b)kJm`zW8z zINm&Y!}`JDzPY8{&*L{s6W`a$xR!UAKjdxmvkpHe@_k3f_ntoxK1)*ngW$@J15yu9 zIE+vJ1@A!_zsBFZwEK_n`*LubdPwQVyoIIw$_AzE9lAee9vqqWe+rKL#CTCpuS=qT zT`%uKcXjlBDg5cr-wq63E27WC;j8`5>lS_K^Ucg3>dAj|w?cPz`uD5Mx9HTnkM8gg zoEpE`EPnrk=%F1xW7#d`(fJ@;GUZ*UV| zUA2Y6odzTo(WjK2>qDDrgr_2#3a%l$dQ0Uvzf z+c$ctpP2TWBoAxP_-9|GI4XV`-}ZCR4N1Iku7h0d(5)9b@40&3rJcOVd?;`797TR= z9`YQD|L`~CZ2zp=TyGWmyG5b9rWO2**ZBvPYxddtuyXxu>_)Er!_Oy9i`V>fPUKvj z`23xH3!nDHl}!@|ZV2C7TTA^(IfwU-=+TjJ?_B$^*!iiUh5nGVADDc&CD(A6kp9De z&-jmiyeIURXWV)pen9`izbWy5LiD+~x!m7(K`FnCRA1;XB zi3jxlZ0PKB>R;Utam)V56^Ucwf$@n>9C@vBM`) z-jVoncJg@o4M`sGspPNX0bKdT&B>E~mho7}5phC%;OEwZ?xTb2smTKtChqFbn;ED2 zZ65KvXX3`K@ze3~hmGSeF9rvAk_S)Em+X5Qzaz&MKRRMuDTgPHtVo>EuX_AG6M6Cl z=kBh_c>6%|2=ebt+_*9Q)Nku!>+mfzj=q%fYTi3%YSHh~*yW&6<(l8f_s@@B`2Q;L zU;Z#5^Tnq#-sBOUmv3@n;k!okI4k~y-=N^NDE_5?_D$qR)&)PyxSKyBd}FU0JIeJ1 z+n4fF>F@E$@8G~6Zf!1fw}jp~eeVl-KXz>V0xtBT=Xt?rLe;N3bNzU0kuxU!dt;|^ zJv{AvcU7MJ#l+j+>{sZ#r{ev}H3ygbM~C00=|4XekIXCdez>yZZ-dJ*M-_g4hpt!F zX+Mo$OiWz5D)xIT^1O%q_0*FCk4w`3S7N7S2NXH$h5pFJ<$B-v@5FxPT0ij9AAIkQ z|2>iUaHGkk-S|PJ+%0~6arE@vx?>{u&EW8S%F}}*Ipne@xtE3C&$eAv&pz{4m214L z9sc}ezr?+VlV{;e9y<5=*)tU%_2NA}(c72%a>ld$YI)A%TNiyV3Et+P{;`AS7~UTl z7Wtbe-t|BQD)SmO%H?>b_T+&~A z=YIJmJf9v|?0}vgt&__AEiw+?pK)<@=C{M*m*W$k$bmn8aAmJ6hZeri2S4-on2ZDa zaGx1m=z4EgN_^)eF8?~~GJMc|C;ql>*75pL$BkMqaRWVmQ{!Ls!QXq+pFgSScVqnH z=9K4T{OXVX>j%HG|L^^fFDxwleiU52CvxVV<$ldao}xeay5{f3!RU+!=it%tYyRvx zgYUQOowzza6yL-kqM`H&MOxq5OAn(}1S^ z=gn$gw0O~p9mA*3oi}{Uh_S;L%v(5b_`HRSJKC4b>ezAi@g2wSIBH7E`b|exfBftA zC50RPrv~?nqt7Ope_l=g>v@>pJu_aM-<_OxepgAo_hffYeSLmN9P_)$elOX6FPxQM zOPre&e)JJ9#7pDx3&EW}mu5WPl)TkC)w*F}1o3<8IP$&ce$6`ye%kdupa&6qKj9l$n5=Y^3OmKjU``!7$@M)!= z>n3j`|M}=CKd2+5M&FMlF2d*bV2nENe^+{E#9gZqct3g2(+Q_8x(jjr3W@96h% z%>zG=zGuZBk4brXX2M4N{@(lgt)H&{uQ=3t;LF}G>|1bxv*%6E1rPRgE_m92LhpNp zzuTx>v;URxgIey|!Ry|%?`~ez{=%#;4$L@yAa++@m(CM;Z__;DeK>iQ_u0I!W`Z)&|RN}QP%etxGHpSsk(w!h!8Wq;=v z{2saAk!{3}9csJpmHi{{cl*vaKJ?MPw!iqdYx?K=!;RY4exUuY5}&`6dC2czo*2H3 z_|^WfWRo(U{SKsa`hLf;5uZl>a8TldJhf5#+Wu!m@3Gnc_8lg9j^EvD#LxF!j2FL? 
zHZy$v?%A+huWT;;Zp6=bVtR+ZJAUGh-#Kf<_nt{bEvv+d3tzwU z(n!y#!I6C$@wGnsd-?mmLan#oxj8I)?v?#1zk}la`$l^Eo&bEm(p>QKeS}8se0P98 z-rw(Od*3O5mwhGkNh7(wb0ClXRQfI6Uvk|4ZhzljqPOoH?3!`z{3CyAq=)ZZz>Qx{ z+P}#6e673o&Joi`p6dJW))jMd-o*D6y5s8{2z>eD%Xv=4US|it?#7}1emVIQ{C!vX z@8#n=)Nkvzd%?|jqUA&4*oAqX<~dFM9cFka*`=O`eRrUqhnKY!`NqHfRNtY*SBbxQ zwVszf&s&-Ow?^&i`Iw!K%Q)_p^|pRD;#1Gd_`IE$WgnnX`+6Ss9ZKK#G*8ud)HpPn zhwlmw@-g#==kC5E*+{>7e*WyhVlUrSY}BrvhmF&{jw$WWjQ`vB;%~kKxPIyz@vrA) z{6}P*;A@@edvuNXHJXo`(qG^GYSg}-Z~1}mdwK5peEf>P>vvE6rJk4P2B%ji7hD_Z zSI@uRZ}a|IqjvTD>bpR%CEw&%_PZPLX*7@4^VWg0|Mz~>^Cq0@I6XJx*!ODAT)pu1 z9U6M}w6s6PUjNs1Wh1%fCHsu}SFbPY_MTsPZbeVut>ACY?b~<4XQTL0-QM}?f3$u5 zT&sWb*q+L-+j|e+d-nXdr~EvJ8qu-uWKq z+?`|BKj+ZvbLQkJodaN3d8u=kmqs7=or}C7_vKeD6`ga3b$jRhoafU|cr>T@{G*}w-sWuttc z{_f4w14~}yoQQKE&UrXD;hctkIcMPR!2B)^RFP^t(;gpsk|Kp%Rb?(6Bn0{RvJox!JhjvZSUH=+uFs^CyZZ#P3 z*P{-LTifRyKY!kPkMHQz=Gb|&r?hO`K%r4%Mvogaa@_cF6UI*%Gk)xtk>f{>9XWF3 z_?>ndGjdedzs{t9#>s((E> zpE2T;ljql5&)W5=&AJlku7AT;Vfpb+Io)wR;XNI#i`r+mw%2U`_F&22RoHIaX@Tz8 z{uSYoHP63wqE&g$UEV{UqiUXi>lRr*N85i}(LK*NW6kt$t(eWAFr9H{_e}ra{xyAC zpFTT;%T~RIzYS@Z-Mapi-adM@Q&QJoXS=^XH|=wGx$ZP-=e;%lO5UflRTq7s{clo< Bq}>1j literal 0 HcmV?d00001 diff --git a/models/demos/llama3/tt/load_checkpoints.py b/models/demos/llama3/tt/load_checkpoints.py index f85788ee1e3..ca36ffe140e 100644 --- a/models/demos/llama3/tt/load_checkpoints.py +++ b/models/demos/llama3/tt/load_checkpoints.py @@ -48,6 +48,7 @@ def standardize_hf_keys(state_dict): def convert_hf_to_meta(state_dict, head_dim): + state_dict = split_hf_keys(state_dict) state_dict = convert_hf_qkv_to_meta_format(state_dict, head_dim) state_dict = map_hf_to_meta_keys(state_dict) return state_dict @@ -184,6 +185,31 @@ def load_sharded_checkpoints(checkpoints, n_layers): return checkpoint +def split_hf_keys(loaded_weights): + converted_weights = {} + for key, tensor in loaded_weights.items(): + if "self_attn.qkv_proj" in key: + # split Q, K and V + q_key = key.replace("self_attn.qkv_proj", "self_attn.q_proj") + k_key = key.replace("self_attn.qkv_proj", "self_attn.k_proj") + v_key = key.replace("self_attn.qkv_proj", "self_attn.v_proj") + q_tensor, k_tensor, v_tensor = torch.split(tensor, tensor.shape[0] // 3, dim=0) + converted_weights[q_key] = q_tensor + converted_weights[k_key] = k_tensor + converted_weights[v_key] = v_tensor + elif "mlp.gate_up_proj" in key: + # Split Gate and Up + gate_key = key.replace("mlp.gate_up_proj", "mlp.gate_proj") + up_key = key.replace("mlp.gate_up_proj", "mlp.up_proj") + gate_tensor, up_tensor = torch.split(tensor, tensor.shape[0] // 2, dim=0) + converted_weights[gate_key] = gate_tensor + converted_weights[up_key] = up_tensor + else: + # Keep all other weights unchanged + converted_weights[key] = tensor + return converted_weights + + def convert_hf_qkv_to_meta_format(loaded_weights, head_dim): """Convert HuggingFace QKV weights to Meta format for RoPE compatibility.""" converted_weights = {} diff --git a/models/demos/llama3/tt/model_config.py b/models/demos/llama3/tt/model_config.py index f278e9d755f..14409115cfa 100644 --- a/models/demos/llama3/tt/model_config.py +++ b/models/demos/llama3/tt/model_config.py @@ -204,6 +204,7 @@ def __init__( "DeepSeek-R1-Distill-Llama-70B": {"N150": None, "N300": None, "T3K": 32, "TG": 128}, "Qwen2.5-7B": {"N150": 4, "N300": 64, "T3K": 128, "TG": 128}, "Qwen2.5-72B": {"N150": None, "N300": None, "T3K": 32, "TG": 128}, + "Phi-3.5-mini-instruct": {"N150": 128, "N300": 128, "T3K": 128, "TG": 128}, } try: max_prefill_chunk_size_div1024 = 
MAX_PREFILL_CHUNK_SIZES_DIV1024[self.base_model_name][self.device_name] From a7fffd259566503e5de2fdbaa335dc4c5ed524ce Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Fri, 21 Feb 2025 11:08:35 -0800 Subject: [PATCH 224/316] [skip ci] Update package-and-release.yaml to generate fewer release candidates (#18155) --- .github/workflows/package-and-release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/package-and-release.yaml b/.github/workflows/package-and-release.yaml index 1c186079501..e6d92cb127e 100644 --- a/.github/workflows/package-and-release.yaml +++ b/.github/workflows/package-and-release.yaml @@ -139,7 +139,7 @@ jobs: path: RELEASE_NOTES.txt # Candidate for breaking up create-and-upload-draft-release: - needs: [create-tag, create-release-notes, build-artifact, test-wheels] + needs: [create-tag, create-release-notes, build-artifact, test-wheels, single-card-demos] # May accidentally create two releases without restricting to 1 job concurrency: create_upload_draft_release runs-on: ubuntu-latest From 1eef336a075be8e0bd20a9f66515f28b01c487db Mon Sep 17 00:00:00 2001 From: Yu Gao <145494740+yugaoTT@users.noreply.github.com> Date: Fri, 21 Feb 2025 14:22:43 -0500 Subject: [PATCH 225/316] #0: fix uneven split on height/width of out tensor in Matmul (#18113) ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/17491#event-16386901268) ### Problem description sometimes the matmul has less out-blocks for the last core on row/col, we need to add some re-calc logic to get the correct last num_blocks_h and num_blocks_w for the last core. ### Checklist - [x] [All post commit] https://github.com/tenstorrent/tt-metal/actions/runs/13460386767/job/37614401925 - [x] blackhole https://github.com/tenstorrent/tt-metal/actions/runs/13461943909 --- .../ttnn/unit_tests/operations/test_matmul.py | 159 ++++++++++++++++++ ...ile_layout_in1_receiver_writer_padding.cpp | 20 ++- ..._tile_layout_in1_sender_writer_padding.cpp | 12 +- ...ti_core_reuse_mcast_1d_program_factory.cpp | 31 ++-- ...ti_core_reuse_mcast_2d_program_factory.cpp | 25 ++- 5 files changed, 211 insertions(+), 36 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_matmul.py b/tests/ttnn/unit_tests/operations/test_matmul.py index d108d8f0aa2..1bb4cb64bf6 100644 --- a/tests/ttnn/unit_tests/operations/test_matmul.py +++ b/tests/ttnn/unit_tests/operations/test_matmul.py @@ -1185,6 +1185,165 @@ def test_matmul_1d_multiple_output_blocks_per_core( assert device.num_program_cache_entries() == 1 +@pytest.mark.parametrize("side", ["height", "width"]) +@pytest.mark.parametrize("tile_count", [1376, 1375]) +def test_padded_2d_matmul(device, side, tile_count): + """ + This test checks that when the program config specifies per_core_M and per_core_N + which would multiply out to be larger than the true shape of the output, matmul + does not clobber memory outside the shape of the output. 
+ """ + compute_grid_size = device.compute_with_storage_grid_size() + grid_size = [compute_grid_size.x, compute_grid_size.y] + if grid_size[1] < 8: + pytest.skip("device does not have 8x8 grid") + + if side == "height": + M = tile_count * 32 + K = 256 + N = 32 + out_block_h = 11 + out_block_w = 1 + per_core_M = 176 + per_core_N = 1 + else: + M = 32 + K = 256 + N = tile_count * 32 + out_block_h = 1 + out_block_w = 11 + per_core_M = 1 + per_core_N = 176 + torch.manual_seed(0) + program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( + compute_with_storage_grid_size=(8, 8), + in0_block_w=1, + out_block_h=out_block_h, + out_block_w=out_block_w, + out_subblock_h=1, + out_subblock_w=1, + per_core_M=per_core_M, + per_core_N=per_core_N, + transpose_mcast=False, + fused_activation=None, + fuse_batch=False, + ) + + torch_act = torch.randn([1, 1, M, K], dtype=torch.bfloat16) + torch_weight = torch.randn([1, 1, K, N], dtype=torch.bfloat16) + # Allocate tensors above and below where the output will be + X = 2**8 + dummy_lower = torch.full([1, 1, X, X], 2) + dummy_out = torch.zeros([1, 1, M, N]) + dummy_upper = torch.full([1, 1, X, X], 4) + + act = ttnn.from_torch(torch_act, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + weight = ttnn.from_torch(torch_weight, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + lower_tt = ttnn.from_torch(dummy_lower, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + out_tt = ttnn.from_torch(dummy_out, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + upper_tt = ttnn.from_torch(dummy_upper, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + # Free up dummy output tensor so matmul will allocate output there + ttnn.deallocate(out_tt) + output_tensor = ttnn.matmul( + act, + weight, + program_config=program_config, + compute_kernel_config=ttnn.WormholeComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.HiFi2, math_approx_mode=False, fp32_dest_acc_en=True, packer_l1_acc=False + ), + ) + lower = ttnn.to_torch(lower_tt).float() + upper = ttnn.to_torch(upper_tt).float() + # Check that the tensors above and below the output are unchanged + torch_output_tensor = torch.matmul(torch_act, torch_weight) + output_tensor = ttnn.to_torch(output_tensor) + pcc = 0.999 + assert_with_pcc(torch_output_tensor, output_tensor, pcc) + assert torch.all(lower == 2) + assert torch.all(upper == 4) + + +@pytest.mark.parametrize("side", ["height", "width"]) +@pytest.mark.parametrize( + "has_program_config", + [True, False], +) +def test_padded_1d_matmul(device, side, has_program_config): + if side == "height": + M = 10069 + K = 96 + N = 1152 + out_block_h = 21 + out_block_w = 9 + out_subblock_h = 3 + out_subblock_w = 1 + per_core_M = 21 + per_core_N = 36 + mcast_in0 = False + else: + M = 1152 + K = 96 + N = 10369 + out_block_h = 9 + out_block_w = 21 + out_subblock_h = 1 + out_subblock_w = 3 + per_core_M = 36 + per_core_N = 21 + mcast_in0 = True + if has_program_config: + program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( + compute_with_storage_grid_size=(4, 4), + in0_block_w=1, + out_block_h=out_block_h, + out_block_w=out_block_w, + out_subblock_h=out_subblock_h, + out_subblock_w=out_subblock_w, + per_core_M=per_core_M, + per_core_N=per_core_N, + mcast_in0=mcast_in0, + fused_activation=None, + fuse_batch=True, + ) + else: + program_config = None + + torch.manual_seed(0) + pcc = 0.999 + torch_act = torch.randn([1, 1, M, K], dtype=torch.float16) + torch_weight = torch.randn([1, 1, K, N], dtype=torch.float16) + # 
Allocate tensors above and below where the output will be + X = 2**8 + dummy_lower = torch.full([1, 1, X, X], 2) + dummy_out = torch.zeros([1, 1, M, N]) + dummy_upper = torch.full([1, 1, X, X], 4) + + act = ttnn.from_torch(torch_act, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + weight = ttnn.from_torch(torch_weight, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + lower_tt = ttnn.from_torch(dummy_lower, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + out_tt = ttnn.from_torch(dummy_out, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + upper_tt = ttnn.from_torch(dummy_upper, layout=ttnn.TILE_LAYOUT, device=device, dtype=ttnn.bfloat16) + # Free up dummy output tensor so linear will allocate output there + ttnn.deallocate(out_tt) + output_tensor = ttnn.matmul( + act, + weight, + core_grid=None if has_program_config else ttnn.CoreGrid(x=4, y=4), + program_config=program_config, + compute_kernel_config=ttnn.WormholeComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.HiFi2, math_approx_mode=False, fp32_dest_acc_en=True, packer_l1_acc=False + ), + ) + lower = ttnn.to_torch(lower_tt).float() + upper = ttnn.to_torch(upper_tt).float() + # Check that the tensors above and below the output are unchanged + torch_output_tensor = torch.matmul(torch_act, torch_weight) + output_tensor = ttnn.to_torch(output_tensor) + assert_with_pcc(torch_output_tensor, output_tensor, pcc) + assert torch.all(lower == 2) + assert torch.all(upper == 4) + + # fmt: off @pytest.mark.skipif(is_wormhole_b0() or is_blackhole(), reason="Unsupported on WH and BH") @pytest.mark.parametrize("m_size,k_size,n_size", [ diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp index 940a8127695..d6f6c48786e 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp @@ -20,6 +20,8 @@ void kernel_main() { uint32_t out_tensor_start_tile_id = get_arg_val(rt_args_idx++); // padding args (WRITER) + const uint32_t last_num_blocks_h_dim = get_arg_val(rt_args_idx++); + const uint32_t last_num_blocks_w_dim = get_arg_val(rt_args_idx++); const uint32_t out_num_nonzero_subblocks_h = get_arg_val(rt_args_idx++); const uint32_t out_last_num_nonzero_subblocks_h = get_arg_val(rt_args_idx++); const uint32_t out_last_subblock_h = get_arg_val(rt_args_idx++); @@ -140,12 +142,14 @@ void kernel_main() { #ifndef OUT_SHARDED // WRITER + uint32_t num_blocks_h_dim_ = bh >= last_num_blocks_h_dim - 1 ? last_num_blocks_h_dim : num_blocks_h_dim; + uint32_t num_blocks_w_dim_ = bw >= last_num_blocks_w_dim - 1 ? 
last_num_blocks_w_dim : num_blocks_w_dim; uint32_t out_num_nonzero_subblocks_h_ = out_num_nonzero_subblocks_h; uint32_t out_num_nonzero_subblocks_w_ = out_num_nonzero_subblocks_w; - if (bh == num_blocks_h_dim - 1) { + if (bh == num_blocks_h_dim_ - 1) { out_num_nonzero_subblocks_h_ = out_last_num_nonzero_subblocks_h; } - if (bw == num_blocks_w_dim - 1) { + if (bw == num_blocks_w_dim_ - 1) { out_num_nonzero_subblocks_w_ = out_last_num_nonzero_subblocks_w; } uint32_t out_tensor_sbh_start_tile_id = out_tensor_current_w_dim_block_tile_id; @@ -157,10 +161,10 @@ void kernel_main() { uint32_t out_subblock_h_ = out_subblock_h; uint32_t out_subblock_w_ = out_subblock_w; uint32_t subblock_tiles_addr_skip = 0; - if (bh == num_blocks_h_dim - 1 && sbh == out_num_nonzero_subblocks_h - 1) { + if (bh == num_blocks_h_dim_ - 1 && sbh == out_num_nonzero_subblocks_h_ - 1) { out_subblock_h_ = out_last_subblock_h; } - if (bw == num_blocks_w_dim - 1 && sbw == out_num_nonzero_subblocks_w - 1) { + if (bw == num_blocks_w_dim_ - 1 && sbw == out_num_nonzero_subblocks_w_ - 1) { out_subblock_w_ = out_last_subblock_w; subblock_tiles_addr_skip = padded_subblock_tiles_addr_skip; } @@ -171,7 +175,9 @@ void kernel_main() { for (uint32_t h = 0; h < out_subblock_h_; ++h) { uint32_t out_tensor_tile_id = out_tensor_sb_row_start_tile_id; for (uint32_t w = 0; w < out_subblock_w_; ++w) { - noc_async_write_tile(out_tensor_tile_id, s, l1_read_addr); + if (bh < num_blocks_h_dim_ && bw < num_blocks_w_dim_) { + noc_async_write_tile(out_tensor_tile_id, s, l1_read_addr); + } l1_read_addr += output_single_tile_size_bytes; @@ -188,14 +194,14 @@ void kernel_main() { out_tensor_sbw_start_tile_id += out_tensor_next_subblock_stride_w; } // Pop fully padded subblocks along the row - if (bw == num_blocks_w_dim - 1) { + if (bw == num_blocks_w_dim_ - 1) { cb_wait_front(cb_id_out0, padded_block_tiles_w_skip); cb_pop_front(cb_id_out0, padded_block_tiles_w_skip); } out_tensor_sbh_start_tile_id += out_tensor_next_subblock_stride_h; } // Pop row(s) of fully padded subblocks - if (bh == num_blocks_h_dim - 1) { + if (bh == num_blocks_h_dim_ - 1) { cb_wait_front(cb_id_out0, padded_block_tiles_h_skip); cb_pop_front(cb_id_out0, padded_block_tiles_h_skip); } diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp index f4216089725..a8c53334a4f 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp @@ -28,6 +28,7 @@ void kernel_main() { // padding args (READER) const uint32_t last_block_w = get_arg_val(rt_args_idx++); // padding args (WRITER) + const uint32_t last_num_blocks_w_dim = get_arg_val(rt_args_idx++); const uint32_t out_num_nonzero_subblocks_h = get_arg_val(rt_args_idx++); const uint32_t out_last_subblock_h = get_arg_val(rt_args_idx++); const uint32_t padded_block_tiles_h_skip = get_arg_val(rt_args_idx++); @@ -420,9 +421,10 @@ void kernel_main() { #ifndef OUT_SHARDED // WRITER + uint32_t num_blocks_w_dim_ = bw >= last_num_blocks_w_dim - 1 ? 
last_num_blocks_w_dim : num_blocks_w_dim; uint32_t out_num_nonzero_subblocks_h_ = out_num_nonzero_subblocks_h; uint32_t out_num_nonzero_subblocks_w_ = out_num_nonzero_subblocks_w; - if (bw == num_blocks_w_dim - 1) { + if (bw == num_blocks_w_dim_ - 1) { out_num_nonzero_subblocks_w_ = out_last_num_nonzero_subblocks_w; } uint32_t out_tensor_sbh_start_tile_id = out_tensor_current_w_dim_block_tile_id; @@ -437,7 +439,7 @@ void kernel_main() { if (bh == num_blocks_h_dim - 1 && sbh == out_num_nonzero_subblocks_h - 1) { out_subblock_h_ = out_last_subblock_h; } - if (bw == num_blocks_w_dim - 1 && sbw == out_num_nonzero_subblocks_w - 1) { + if (bw == num_blocks_w_dim_ - 1 && sbw == out_num_nonzero_subblocks_w_ - 1) { out_subblock_w_ = out_last_subblock_w; subblock_tiles_addr_skip = padded_subblock_tiles_addr_skip; } @@ -448,7 +450,9 @@ void kernel_main() { for (uint32_t h = 0; h < out_subblock_h_; ++h) { uint32_t out_tensor_tile_id = out_tensor_sb_row_start_tile_id; for (uint32_t w = 0; w < out_subblock_w_; ++w) { - noc_async_write_tile(out_tensor_tile_id, s, l1_read_addr); + if (bw < num_blocks_w_dim_) { + noc_async_write_tile(out_tensor_tile_id, s, l1_read_addr); + } l1_read_addr += output_single_tile_size_bytes; @@ -464,7 +468,7 @@ void kernel_main() { out_tensor_sbw_start_tile_id += out_tensor_next_subblock_stride_w; } // Pop fully padded subblocks along the row - if (bw == num_blocks_w_dim - 1) { + if (bw == num_blocks_w_dim_ - 1) { cb_wait_front(cb_id_out0, padded_block_tiles_w_skip); cb_pop_front(cb_id_out0, padded_block_tiles_w_skip); } diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp index 5f75c3780cd..63ce0c232a1 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp @@ -718,23 +718,17 @@ operation::ProgramWithCallbacks create_program_mcast_in0( in3_CB_size); } - // Parameters for last row, col, or block - uint32_t last_per_core_M = M % per_core_M == 0 ? per_core_M : M % per_core_M; + // Parameters for last row, col, or block, no need to re-calc h-dim since there's no split on height uint32_t last_per_core_N = N % per_core_N == 0 ? per_core_N : N % per_core_N; - uint32_t last_out_block_h = last_per_core_M % out_block_h == 0 ? out_block_h : last_per_core_M % out_block_h; uint32_t last_out_block_w = last_per_core_N % out_block_w == 0 ? out_block_w : last_per_core_N % out_block_w; - uint32_t last_block_num_nonzero_subblocks_h = (last_out_block_h - 1) / out_subblock_h + 1; + uint32_t last_out_num_blocks_w = (last_per_core_N - 1) / out_block_w + 1; uint32_t last_block_num_nonzero_subblocks_w = (last_out_block_w - 1) / out_subblock_w + 1; - uint32_t last_subblock_of_last_block_h = - last_out_block_h % out_subblock_h == 0 ? out_subblock_h : last_out_block_h % out_subblock_h; uint32_t last_subblock_of_last_block_w = last_out_block_w % out_subblock_w == 0 ? 
out_subblock_w : last_out_block_w % out_subblock_w; uint32_t last_block_padded_subblock_tiles_addr_skip = output_single_tile_size * (out_subblock_w - last_subblock_of_last_block_w); uint32_t last_block_padded_block_tiles_w_skip = (out_subblock_w * out_subblock_h) * (out_block_w / out_subblock_w - last_block_num_nonzero_subblocks_w); - uint32_t last_block_padded_block_tiles_h_skip = - (out_block_h / out_subblock_h - last_block_num_nonzero_subblocks_h) * (out_block_w * out_subblock_h); CoreCoord start_core_noc = top_left_core_physical; CoreCoord end_core_noc = bottom_right_core_physical; @@ -842,6 +836,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0( mm_in1_sender_writer_args.push_back(last_out_block_w); // padding args (WRITER) + mm_in1_sender_writer_args.push_back(last_out_num_blocks_w); mm_in1_sender_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_sender_writer_args.push_back(out_subblock_h); mm_in1_sender_writer_args.push_back(0); @@ -855,6 +850,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0( mm_in1_sender_writer_args.push_back(out_block_w); // padding args (WRITER) + mm_in1_sender_writer_args.push_back(out_num_blocks_x); mm_in1_sender_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_sender_writer_args.push_back(out_subblock_h); mm_in1_sender_writer_args.push_back(0); @@ -945,7 +941,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0( writer_runtime_args[0] = src_buffer_b->address(); writer_runtime_args[6] = dst_buffer->address(); if (bias_tensor.has_value()) { - writer_runtime_args[17] = (*bias_buffer)->address(); + writer_runtime_args[18] = (*bias_buffer)->address(); } } @@ -1492,19 +1488,11 @@ operation::ProgramWithCallbacks create_program_mcast_in1( // Parameters for last row, col, or block uint32_t last_per_core_M = M % per_core_M == 0 ? per_core_M : M % per_core_M; - uint32_t last_per_core_N = N % per_core_N == 0 ? per_core_N : N % per_core_N; uint32_t last_out_block_h = last_per_core_M % out_block_h == 0 ? out_block_h : last_per_core_M % out_block_h; - uint32_t last_out_block_w = last_per_core_N % out_block_w == 0 ? out_block_w : last_per_core_N % out_block_w; + uint32_t last_out_num_blocks_h = (last_per_core_M - 1) / out_block_h + 1; uint32_t last_block_num_nonzero_subblocks_h = (last_out_block_h - 1) / out_subblock_h + 1; - uint32_t last_block_num_nonzero_subblocks_w = (last_out_block_w - 1) / out_subblock_w + 1; uint32_t last_subblock_of_last_block_h = last_out_block_h % out_subblock_h == 0 ? out_subblock_h : last_out_block_h % out_subblock_h; - uint32_t last_subblock_of_last_block_w = - last_out_block_w % out_subblock_w == 0 ? 
out_subblock_w : last_out_block_w % out_subblock_w; - uint32_t last_block_padded_subblock_tiles_addr_skip = - output_single_tile_size * (out_subblock_w - last_subblock_of_last_block_w); - uint32_t last_block_padded_block_tiles_w_skip = - (out_subblock_w * out_subblock_h) * (out_block_w / out_subblock_w - last_block_num_nonzero_subblocks_w); uint32_t last_block_padded_block_tiles_h_skip = (out_block_h / out_subblock_h - last_block_num_nonzero_subblocks_h) * (out_block_w * out_subblock_h); @@ -1541,6 +1529,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( // padding args (READER) (std::uint32_t)out_block_w, // last_block_w // padding args (WRITER) + (std::uint32_t)out_num_blocks_x, (std::uint32_t)out_block_h / out_subblock_h, (std::uint32_t)out_subblock_h, (std::uint32_t)0, @@ -1575,6 +1564,8 @@ operation::ProgramWithCallbacks create_program_mcast_in1( if (output_idx_y == num_blocks_y - 1) { // padding args (WRITER) + mm_in1_receiver_writer_args.push_back(last_out_num_blocks_h); + mm_in1_receiver_writer_args.push_back(out_num_blocks_x); mm_in1_receiver_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_receiver_writer_args.push_back(last_block_num_nonzero_subblocks_h); mm_in1_receiver_writer_args.push_back(last_subblock_of_last_block_h); @@ -1586,6 +1577,8 @@ operation::ProgramWithCallbacks create_program_mcast_in1( mm_in1_receiver_writer_args.push_back(0); } else { // padding args (WRITER) + mm_in1_receiver_writer_args.push_back(out_num_blocks_y); + mm_in1_receiver_writer_args.push_back(out_num_blocks_x); mm_in1_receiver_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_receiver_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_receiver_writer_args.push_back(out_subblock_h); @@ -1664,7 +1657,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( sender_writer_runtime_args[0] = src_buffer_b->address(); sender_writer_runtime_args[6] = dst_buffer->address(); if (bias_tensor.has_value()) { - sender_writer_runtime_args[17] = (*bias_buffer)->address(); + sender_writer_runtime_args[18] = (*bias_buffer)->address(); } } diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp index 333c82538c8..0b8c289aaf8 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp @@ -845,6 +845,8 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( uint32_t last_per_core_N = N % per_core_N == 0 ? per_core_N : N % per_core_N; uint32_t last_out_block_h = last_per_core_M % out_block_h == 0 ? out_block_h : last_per_core_M % out_block_h; uint32_t last_out_block_w = last_per_core_N % out_block_w == 0 ? 
out_block_w : last_per_core_N % out_block_w; + uint32_t last_out_num_blocks_h = (last_per_core_M - 1) / out_block_h + 1; + uint32_t last_out_num_blocks_w = (last_per_core_N - 1) / out_block_w + 1; uint32_t last_block_num_nonzero_subblocks_h = (last_out_block_h - 1) / out_subblock_h + 1; uint32_t last_block_num_nonzero_subblocks_w = (last_out_block_w - 1) / out_subblock_w + 1; uint32_t last_subblock_of_last_block_h = @@ -1021,11 +1023,12 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( (std::uint32_t)in1_idx * per_core_N + in0_idx * per_core_M * N // out_tensor_start_tile_id }; - if (in1_idx == in1_end_idx) { + if (in1_idx == in1_end_idx) { // right cores when no transpose_mcast // padding args (READER) mm_in1_sender_writer_args.push_back(last_out_block_w); // padding args (WRITER) + mm_in1_sender_writer_args.push_back(last_out_num_blocks_w); mm_in1_sender_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_sender_writer_args.push_back(out_subblock_h); mm_in1_sender_writer_args.push_back(0); @@ -1039,6 +1042,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( mm_in1_sender_writer_args.push_back(out_block_w); // padding args (WRITER) + mm_in1_sender_writer_args.push_back(out_num_blocks_x); mm_in1_sender_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_sender_writer_args.push_back(out_subblock_h); mm_in1_sender_writer_args.push_back(0); @@ -1059,6 +1063,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( } if (in1_is_sharded and in1_is_dram) { // in1 is dram sharded + uint32_t num_iter_index = mm_in1_sender_writer_args.size() + 1; vc = vc == 3 ? 0 : vc + 1; mm_in1_sender_writer_args.push_back(vc); @@ -1117,7 +1122,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( worker_core_stride = stride; } } - mm_in1_sender_writer_args.insert(mm_in1_sender_writer_args.begin() + 20, num_iter); + mm_in1_sender_writer_args.insert(mm_in1_sender_writer_args.begin() + num_iter_index, num_iter); } if (fuse_op) { fused_op_signaler->push_matmul_fused_op_rt_args(mm_in1_sender_writer_args, true); @@ -1139,8 +1144,10 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( (std::uint32_t)in1_idx * per_core_N + in0_idx * per_core_M * N // out_tensor_start_tile_id }; - if (in1_idx == in1_end_idx and in0_idx == in0_end_idx) { + if (in1_idx == in1_end_idx and in0_idx == in0_end_idx) { // bottom-right core when no transpose_mcast // padding args (WRITER) + mm_in1_receiver_writer_args.push_back(last_out_num_blocks_h); + mm_in1_receiver_writer_args.push_back(last_out_num_blocks_w); mm_in1_receiver_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_receiver_writer_args.push_back(last_block_num_nonzero_subblocks_h); mm_in1_receiver_writer_args.push_back(last_subblock_of_last_block_h); @@ -1150,8 +1157,10 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( mm_in1_receiver_writer_args.push_back(last_subblock_of_last_block_w); mm_in1_receiver_writer_args.push_back(last_block_padded_subblock_tiles_addr_skip); mm_in1_receiver_writer_args.push_back(last_block_padded_block_tiles_w_skip); - } else if (in0_idx == in0_end_idx) { + } else if (in0_idx == in0_end_idx) { // bottom cores except bottom-right when no transpose_mcast // padding args (WRITER) + mm_in1_receiver_writer_args.push_back(last_out_num_blocks_h); + mm_in1_receiver_writer_args.push_back(out_num_blocks_x); mm_in1_receiver_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_receiver_writer_args.push_back(last_block_num_nonzero_subblocks_h); 
mm_in1_receiver_writer_args.push_back(last_subblock_of_last_block_h); @@ -1161,8 +1170,10 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( mm_in1_receiver_writer_args.push_back(out_subblock_w); mm_in1_receiver_writer_args.push_back(0); mm_in1_receiver_writer_args.push_back(0); - } else if (in1_idx == in1_end_idx) { + } else if (in1_idx == in1_end_idx) { // right cores except bottom when no transpose_mcast // padding args (WRITER) + mm_in1_receiver_writer_args.push_back(out_num_blocks_y); + mm_in1_receiver_writer_args.push_back(last_out_num_blocks_w); mm_in1_receiver_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_receiver_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_receiver_writer_args.push_back(out_subblock_h); @@ -1174,6 +1185,8 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( mm_in1_receiver_writer_args.push_back(last_block_padded_block_tiles_w_skip); } else { // padding args (WRITER) + mm_in1_receiver_writer_args.push_back(out_num_blocks_y); + mm_in1_receiver_writer_args.push_back(out_num_blocks_x); mm_in1_receiver_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_receiver_writer_args.push_back(out_block_h / out_subblock_h); mm_in1_receiver_writer_args.push_back(out_subblock_h); @@ -1256,7 +1269,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( writer_runtime_args[0] = src_buffer_b->address(); writer_runtime_args[6] = dst_buffer->address(); if (bias_tensor.has_value()) { - writer_runtime_args[17] = (*bias_buffer)->address(); + writer_runtime_args[18] = (*bias_buffer)->address(); } } From 9ada8ab2acff6f9b4789d72b66cfd1e4d5bc6b91 Mon Sep 17 00:00:00 2001 From: Dalar Vartanians <132954887+dvartaniansTT@users.noreply.github.com> Date: Fri, 21 Feb 2025 11:32:24 -0800 Subject: [PATCH 226/316] fix the reverted PR for Optimize the web demo for yolov4 (#15478) (#15838) ### Problem description Have a real-time web demo for yolov4. There was a merged PR for this that got reverted due to some failure. redoing the PR and running more tests for it now. 
### What's changed Enable trace + 2cq Optimize the post processing ### Checklist - [x] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [x ] Model regression CI testing passes (if applicable) - [x] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [x] New/Existing tests provide coverage for changes --------- Co-authored-by: Mohamed Bahnas <116673264+mbahnasTT@users.noreply.github.com> Co-authored-by: Mohamed Bahnas --- .../wormhole/yolov4/test_yolov4_performant.py | 4 +- .../yolov4/test_yolov4_performant_webdemo.py | 44 +-- models/demos/yolov4/README.md | 27 +- models/demos/yolov4/demo/demo.py | 231 ++++++++-------- models/demos/yolov4/tests/test_perf_yolo.py | 17 +- .../yolov4/tests/yolov4_perfomant_webdemo.py | 250 ++--------------- .../demos/yolov4/tests/yolov4_test_infra.py | 63 ++--- models/demos/yolov4/ttnn/common.py | 8 + models/demos/yolov4/ttnn/genboxes.py | 256 ++++++++++++++++++ models/demos/yolov4/ttnn/yolov4.py | 35 ++- models/demos/yolov4/web_demo/README.md | 5 + .../demos/yolov4/web_demo/client/coco.names | 80 ++++++ .../yolov4/web_demo/client/requirements.txt | 1 + models/demos/yolov4/web_demo/client/yolov4.py | 181 ++++--------- .../yolov4/web_demo/server/fast_api_yolov4.py | 166 +++++++++++- .../yolov4/test_ttnn_downsample1.py | 10 +- .../yolov4/test_ttnn_downsample2.py | 10 +- .../yolov4/test_ttnn_downsample3.py | 11 +- .../yolov4/test_ttnn_downsample4.py | 9 +- .../yolov4/test_ttnn_downsample5.py | 9 +- .../yolov4/test_ttnn_head.py | 26 +- .../yolov4/test_ttnn_neck.py | 12 +- .../yolov4/test_ttnn_post_processing.py | 80 ++++++ .../yolov4/test_ttnn_yolov4.py | 88 +++--- 24 files changed, 928 insertions(+), 695 deletions(-) create mode 100644 models/demos/yolov4/ttnn/genboxes.py create mode 100644 models/demos/yolov4/web_demo/client/coco.names mode change 100755 => 100644 models/demos/yolov4/web_demo/server/fast_api_yolov4.py create mode 100644 tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py diff --git a/models/demos/wormhole/yolov4/test_yolov4_performant.py b/models/demos/wormhole/yolov4/test_yolov4_performant.py index ec4819711a9..81357bfdd70 100644 --- a/models/demos/wormhole/yolov4/test_yolov4_performant.py +++ b/models/demos/wormhole/yolov4/test_yolov4_performant.py @@ -24,7 +24,7 @@ def test_run_yolov4_inference(device, use_program_cache, batch_size, act_dtype, @run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 1843200}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 6422528}], indirect=True) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype", ((1, ttnn.bfloat16, ttnn.bfloat16),), @@ -50,7 +50,7 @@ def test_run_yolov4_trace_inference( @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 3686400, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 6397952, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype", diff --git a/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py b/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py index b4940fbd2ab..bf716285a53 100644 --- a/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py +++ 
b/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py @@ -8,52 +8,12 @@ import torch from models.utility_functions import run_for_wormhole_b0 -from models.demos.yolov4.tests.yolov4_perfomant_webdemo import ( - run_yolov4_inference, - run_yolov4_trace_inference, - run_yolov4_trace_2cqs_inference, - Yolov4Trace2CQ, -) - - -@run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) -@pytest.mark.parametrize( - "batch_size, act_dtype, weight_dtype", - ((1, ttnn.bfloat16, ttnn.bfloat16),), -) -def test_run_yolov4_inference(device, use_program_cache, batch_size, act_dtype, weight_dtype, model_location_generator): - run_yolov4_inference(device, batch_size, act_dtype, weight_dtype, model_location_generator) - - -@run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 1617920}], indirect=True) -@pytest.mark.parametrize( - "batch_size, act_dtype, weight_dtype", - ((1, ttnn.bfloat16, ttnn.bfloat16),), -) -@pytest.mark.parametrize("enable_async_mode", (False, True), indirect=True) -def test_run_yolov4_trace_inference( - device, - use_program_cache, - batch_size, - act_dtype, - weight_dtype, - enable_async_mode, - model_location_generator, -): - run_yolov4_trace_inference( - device, - batch_size, - act_dtype, - weight_dtype, - model_location_generator, - ) +from models.demos.yolov4.tests.yolov4_perfomant_webdemo import Yolov4Trace2CQ @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 1617920, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 3211264, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype", diff --git a/models/demos/yolov4/README.md b/models/demos/yolov4/README.md index 6e6f560379c..006e1eaacf9 100644 --- a/models/demos/yolov4/README.md +++ b/models/demos/yolov4/README.md @@ -2,24 +2,31 @@ ## How to run yolov4 -- Use the following command to run the yolov4 performant impelementation (95 FPS): +### Model code running with Trace+2CQ +- Use the following command to run the yolov4 performant implementation (71 FPS): + ```bash + pytest models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py::test_run_yolov4_trace_2cqs_inference[True-1-act_dtype0-weight_dtype0-device_params0] ``` - pytest models/demos/wormhole/yolov4/test_yolov4_performant.py::test_run_yolov4_trace_2cqs_inference[True-1-act_dtype0-weight_dtype0-device_params0] - ``` - -- You may try the interactive web demo following the instructions here: models/demos/yolov4/web_demo/README.md (25-30 FPS). NOTE: The post-processing is currently running on host. It will be moved to device soon which should significantly improve the end to end FPS. - -- Use the following command to run a single-image demo for visualization. NOTE: the following demos are intented for visualization. It is not the performant implementation yet. And, the post processing is currently done on host which we will be moving to device soon. +### Single Image Demo - Use the following command to run the yolov4 with a giraffe image: - ``` + ```bash pytest models/demos/yolov4/demo/demo.py ``` +- The output file `ttnn_yolov4_320_prediction_demo.jpg` will be generated. 
- Use the following command to run the yolov4 with different input image: - ``` + ```bash pytest --disable-warnings --input-path= models/demos/yolov4/demo/demo.py ``` -Once you run the command, The output file named `ttnn_prediction_demo.jpg` will be generated. + +### mAP Accuracy Test +- To be added soon + +### Web Demo +- You may try the interactive web demo (35 FPS end-2-end) following the instructions: +``` +models/demos/yolov4/web_demo/README.md +``` diff --git a/models/demos/yolov4/demo/demo.py b/models/demos/yolov4/demo/demo.py index 277e28deab0..987f0c7b509 100644 --- a/models/demos/yolov4/demo/demo.py +++ b/models/demos/yolov4/demo/demo.py @@ -140,10 +140,10 @@ def yolo_forward_dynamic( by_bh /= output.size(2) # Shape: [batch, num_anchors * H * W, 1] - bx = bx_bw[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) - by = by_bh[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) - bw = bx_bw[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) - bh = by_bh[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bx = bx_bw[:, :num_anchors].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + by = by_bh[:, :num_anchors].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bw = bx_bw[:, num_anchors:].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bh = by_bh[:, num_anchors:].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) bx1 = bx - bw * 0.5 by1 = by - bh * 0.5 @@ -324,12 +324,6 @@ def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): def post_processing(img, conf_thresh, nms_thresh, output): - # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] - # num_anchors = 9 - # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] - # strides = [8, 16, 32] - # anchor_step = len(anchors) // num_anchors - # [batch, num, 1, 4] box_array = output[0] # [batch, num, num_classes] @@ -464,34 +458,7 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class output_tensor3 = output_tensor3.reshape(1, 10, 10, 255) output_tensor3 = torch.permute(output_tensor3, (0, 3, 1, 2)) - yolo1 = YoloLayer( - anchor_mask=[0, 1, 2], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=8, - ) - - yolo2 = YoloLayer( - anchor_mask=[3, 4, 5], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=16, - ) - - yolo3 = YoloLayer( - anchor_mask=[6, 7, 8], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=32, - ) - - y1 = yolo1(output_tensor1) - y2 = yolo2(output_tensor2) - y3 = yolo3(output_tensor3) - + y1, y2, y3 = gen_yolov4_boxes_confs([output_tensor1, output_tensor2, output_tensor3]) output = get_region_boxes([y1, y2, y3]) t2 = time.time() @@ -511,37 +478,8 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class else: t1 = time.time() output = model(img) - - yolo1 = YoloLayer( - anchor_mask=[0, 1, 2], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=8, - ) - - yolo2 = YoloLayer( - anchor_mask=[3, 4, 5], - num_classes=n_classes, - 
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=16, - ) - - yolo3 = YoloLayer( - anchor_mask=[6, 7, 8], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=32, - ) - - y1 = yolo1(output[0]) - y2 = yolo2(output[1]) - y3 = yolo3(output[2]) - + y1, y2, y3 = gen_yolov4_boxes_confs(output) output = get_region_boxes([y1, y2, y3]) - t2 = time.time() print("-----------------------------------") @@ -556,66 +494,117 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class plot_boxes_cv2(img, boxes[0], "torch_prediction_demo.jpg", class_names) +def gen_yolov4_boxes_confs(output): + n_classes = 80 + anchors_array = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] + num_anchors = 9 + anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] + strides = [8, 16, 32] + + yolo1 = YoloLayer( + anchor_mask=anchor_masks[0], + num_classes=n_classes, + anchors=anchors_array, + num_anchors=num_anchors, + stride=strides[0], + ) + + yolo2 = YoloLayer( + anchor_mask=anchor_masks[1], + num_classes=n_classes, + anchors=anchors_array, + num_anchors=num_anchors, + stride=strides[1], + ) + + yolo3 = YoloLayer( + anchor_mask=anchor_masks[2], + num_classes=n_classes, + anchors=anchors_array, + num_anchors=num_anchors, + stride=strides[2], + ) + + y1 = yolo1(output[0]) + y2 = yolo2(output[1]) + y3 = yolo3(output[2]) + + return y1, y2, y3 + + @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) -@pytest.mark.parametrize( - "use_pretrained_weight", - [True, False], - ids=[ - "pretrained_weight_true", - "pretrained_weight_false", - ], -) -def test_yolov4_model(device, model_location_generator, reset_seeds, input_path, use_pretrained_weight): +def test_yolov4(device, reset_seeds, model_location_generator): + torch.manual_seed(0) model_path = model_location_generator("models", model_subdir="Yolo") - if use_pretrained_weight: - if model_path == "models": - if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"): # check if yolov4.th is availble - os.system( - "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh" - ) # execute the yolov4_weights_download.sh file - - weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" - else: - weights_pth = str(model_path / "yolov4.pth") - - ttnn_model = TtYOLOv4(device, weights_pth) - torch_model = Yolov4() - new_state_dict = {} - ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] + if model_path == "models": + if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"): # check if yolov4.th is availble + os.system( + "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh" + ) # execute the yolov4_weights_download.sh file - torch_model.load_state_dict(new_state_dict) - torch_model.eval() + weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" else: - torch_model = Yolov4.from_random_weights() - ttnn_weights = update_weight_parameters(OrderedDict(torch_model.state_dict())) - ttnn_model = TtYOLOv4(device, ttnn_weights) + weights_pth = str(model_path / "yolov4.pth") - n_classes = 80 - namesfile = "models/demos/yolov4/demo/coco.names" - if input_path == "": 
- imgfile = "models/demos/yolov4/demo/giraffe_320.jpg" - else: - imgfile = input_path + ttnn_model = TtYOLOv4(weights_pth, device) + + imgfile = "models/demos/yolov4/demo/giraffe_320.jpg" width = 320 height = 320 - img = cv2.imread(imgfile) - - # Inference input size is 416*416 does not mean training size is the same - # Training size could be 608*608 or even other sizes - # Optional inference sizes: - # Hight in {320, 416, 512, 608, ... 320 + 96 * n} - # Width in {320, 416, 512, 608, ... 320 + 96 * m} - sized = cv2.resize(img, (width, height)) - sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) - - for i in range(2): # This 'for' loop is for speed check - # Because the first iteration is usually longer - do_detect(ttnn_model, sized, 0.3, 0.4, n_classes, device, class_name=namesfile, imgfile=imgfile) + img = cv2.resize(img, (width, height)) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image + img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + elif type(img) == np.ndarray and len(img.shape) == 4: + img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) + else: + exit() + torch_input = torch.autograd.Variable(img) + + input_tensor = torch.permute(torch_input, (0, 2, 3, 1)) + ttnn_input = ttnn.from_torch(input_tensor, ttnn.bfloat16) + + torch_model = Yolov4() + new_state_dict = dict(zip(torch_model.state_dict().keys(), ttnn_model.torch_model.values())) + torch_model.load_state_dict(new_state_dict) + torch_model.eval() + + torch_output_tensor = torch_model(torch_input) + + ref1, ref2, ref3 = gen_yolov4_boxes_confs(torch_output_tensor) + ref_boxes, ref_confs = get_region_boxes([ref1, ref2, ref3]) + + ttnn_output_tensor = ttnn_model(ttnn_input) + result_boxes_padded = ttnn.to_torch(ttnn_output_tensor[0]) + result_confs = ttnn.to_torch(ttnn_output_tensor[1]) + + result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) + result_boxes_list = [] + # Unpadding + # That ttnn tensor is the concat output of 3 padded tensors + # As a perf workaround I'm doing the unpadding on the torch output here. 
+ # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized + box_1_start_i = 0 + box_1_end_i = 6100 + box_2_start_i = 6128 + box_2_end_i = 6228 + box_3_start_i = 6256 + box_3_end_i = 6356 + result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) + result_boxes = torch.cat(result_boxes_list, dim=1) + + ## Giraffe image detection + conf_thresh = 0.3 + nms_thresh = 0.4 + output = [result_boxes.to(torch.float16), result_confs.to(torch.float16)] + + boxes = post_processing(img, conf_thresh, nms_thresh, output) + namesfile = "models/demos/yolov4/demo/coco.names" + class_names = load_class_names(namesfile) + img = cv2.imread(imgfile) + plot_boxes_cv2(img, boxes[0], "ttnn_yolov4_320_prediction_demo.jpg", class_names) diff --git a/models/demos/yolov4/tests/test_perf_yolo.py b/models/demos/yolov4/tests/test_perf_yolo.py index 1b07addbbfe..e5f299b7519 100644 --- a/models/demos/yolov4/tests/test_perf_yolo.py +++ b/models/demos/yolov4/tests/test_perf_yolo.py @@ -26,11 +26,11 @@ def get_expected_compile_time_sec(): - return 60 + return 75 def get_expected_inference_time_sec(): - return 0.237 + return 0.35 @pytest.mark.models_performance_bare_metal @@ -60,14 +60,15 @@ def test_yolov4( weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" else: weights_pth = str(model_path / "yolov4.pth") - ttnn_model = TtYOLOv4(device, weights_pth) + ttnn_model = TtYOLOv4(weights_pth, device) torch_input_tensor = torch.rand(input_shape, dtype=torch.bfloat16) ttnn_input = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) logger.info(f"Compiling model with warmup run") profiler.start(f"inference_and_compile_time") - out1, out2, out3 = ttnn_model(ttnn_input) + ttnn_output_tensor = ttnn_model(ttnn_input) + profiler.end(f"inference_and_compile_time") inference_and_compile_time = profiler.get("inference_and_compile_time") @@ -79,10 +80,8 @@ def test_yolov4( for idx in range(iterations): profiler.start("inference_time") profiler.start(f"inference_time_{idx}") - out1, out2, out3 = ttnn_model(ttnn_input) - outputs.append(ttnn.from_device(out1, blocking=False)) - outputs.append(ttnn.from_device(out2, blocking=False)) - outputs.append(ttnn.from_device(out3, blocking=False)) + ttnn_output_tensor = ttnn_model(ttnn_input) + profiler.end(f"inference_time_{idx}") profiler.end("inference_time") @@ -126,7 +125,7 @@ def test_perf_device_bare_metal_yolov4(batch_size, model_name): num_iterations = 1 margin = 0.03 - expected_perf = 234 + expected_perf = 102 command = f"pytest tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py" cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] diff --git a/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py b/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py index 0968152e3ce..f8b5486060c 100644 --- a/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py +++ b/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py @@ -9,8 +9,6 @@ is_wormhole_b0, ) from models.demos.yolov4.tests.yolov4_test_infra import create_test_infra -from models.demos.yolov4.demo.demo import YoloLayer - try: from tracy import signpost @@ -31,175 +29,6 @@ def buffer_address(tensor): ttnn.buffer_address = buffer_address -def run_yolov4_inference( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator, -): - test_infra = create_test_infra( - device, - device_batch_size, - act_dtype, - weight_dtype, 
- model_location_generator=model_location_generator, - ) - - tt_inputs_host, self.input_mem_config = test_infra.setup_l1_sharded_input(device) - - # # First run configures convs JIT - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.run() - test_infra.validate() - test_infra.dealloc_output() - - # Optimized run - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.run() - test_infra.validate() - test_infra.dealloc_output() - - # More optimized run with caching - if use_signpost: - signpost(header="start") - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.run() - if use_signpost: - signpost(header="stop") - test_infra.validate() - test_infra.dealloc_output() - - -def run_yolov4_trace_inference( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator, -): - test_infra = create_test_infra( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator=model_location_generator, - ) - tt_inputs_host, self.input_mem_config = test_infra.setup_l1_sharded_input(device) - - # First run configures convs JIT - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - spec = test_infra.input_tensor.spec - test_infra.run() - test_infra.validate() - test_infra.dealloc_output() - - # Optimized run - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.run() - test_infra.validate() - - # Capture - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.dealloc_output() - trace_input_addr = ttnn.buffer_address(test_infra.input_tensor) - self.tid = ttnn.begin_trace_capture(device, cq_id=0) - test_infra.run() - tt_image_res = ttnn.allocate_tensor_on_device(spec, device) - ttnn.end_trace_capture(device, self.tid, cq_id=0) - assert trace_input_addr == ttnn.buffer_address(tt_image_res) - - # More optimized run with caching - if use_signpost: - signpost(header="start") - ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 0) - ttnn.execute_trace(device, self.tid, cq_id=0, blocking=True) - if use_signpost: - signpost(header="stop") - test_infra.validate() - - ttnn.release_trace(device, self.tid) - test_infra.dealloc_output() - - -def run_yolov4_trace_2cqs_inference( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator, -): - test_infra = create_test_infra( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator=model_location_generator, - ) - tt_inputs_host, sharded_mem_config_DRAM, self.input_mem_config = test_infra.setup_dram_sharded_input(device) - tt_image_res = tt_inputs_host.to(device, sharded_mem_config_DRAM) - op_event = ttnn.create_event(device) - write_event = ttnn.create_event(device) - # Initialize the op event so we can write - ttnn.record_event(0, op_event) - - # First run configures convs JIT - ttnn.wait_for_event(1, op_event) - ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) - ttnn.record_event(1, write_event) - ttnn.wait_for_event(0, write_event) - test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, self.input_mem_config) - spec = test_infra.input_tensor.spec - ttnn.record_event(0, op_event) - test_infra.run() - test_infra.validate() - test_infra.dealloc_output() - - # Optimized run - ttnn.wait_for_event(1, op_event) - ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) - ttnn.record_event(1, write_event) - ttnn.wait_for_event(0, 
write_event) - test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, self.input_mem_config) - ttnn.record_event(0, op_event) - test_infra.run() - test_infra.validate() - - # Capture - ttnn.wait_for_event(1, op_event) - ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) - ttnn.record_event(1, write_event) - ttnn.wait_for_event(0, write_event) - test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, self.input_mem_config) - ttnn.record_event(0, op_event) - test_infra.dealloc_output() - trace_input_addr = ttnn.buffer_address(test_infra.input_tensor) - self.tid = ttnn.begin_trace_capture(device, cq_id=0) - test_infra.run() - self.input_tensor = ttnn.allocate_tensor_on_device(spec, device) - ttnn.end_trace_capture(device, self.tid, cq_id=0) - assert trace_input_addr == ttnn.buffer_address(self.input_tensor) - - # More optimized run with caching - if use_signpost: - signpost(header="start") - for iter in range(0, 2): - ttnn.wait_for_event(1, op_event) - ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) - ttnn.record_event(1, write_event) - ttnn.wait_for_event(0, write_event) - # TODO: Add in place support to ttnn to_memory_config - self.input_tensor = ttnn.reshard(tt_image_res, self.input_mem_config, self.input_tensor) - ttnn.record_event(0, op_event) - ttnn.execute_trace(device, self.tid, cq_id=0, blocking=False) - ttnn.synchronize_devices(device) - - if use_signpost: - signpost(header="stop") - - ttnn.release_trace(device, self.tid) - - class Yolov4Trace2CQ: def __init__(self): ... @@ -267,12 +96,7 @@ def initialize_yolov4_trace_2cqs_inference( self.device = device - # More optimized run with caching - # if use_signpost: - # signpost(header="start") - def get_region_boxes(self, boxes_and_confs): - print("Getting boxes from boxes and confs ...") boxes_list = [] confs_list = [] @@ -280,8 +104,6 @@ def get_region_boxes(self, boxes_and_confs): boxes_list.append(item[0]) confs_list.append(item[1]) - # boxes: [batch, num1 + num2 + num3, 1, 4] - # confs: [batch, num1 + num2 + num3, num_classes] boxes = torch.cat(boxes_list, dim=1) confs = torch.cat(confs_list, dim=1) @@ -298,57 +120,29 @@ def execute_yolov4_trace_2cqs_inference(self, tt_inputs_host=None): ttnn.record_event(0, self.op_event) ttnn.execute_trace(self.device, self.tid, cq_id=0, blocking=False) ttnn.synchronize_devices(self.device) - output = self.test_infra.output_tensor - - output_tensor1 = ttnn.to_torch(output[0]) - output_tensor1 = output_tensor1.reshape(1, 40, 40, 255) - output_tensor1 = torch.permute(output_tensor1, (0, 3, 1, 2)) - - output_tensor2 = ttnn.to_torch(output[1]) - output_tensor2 = output_tensor2.reshape(1, 20, 20, 255) - output_tensor2 = torch.permute(output_tensor2, (0, 3, 1, 2)) - - output_tensor3 = ttnn.to_torch(output[2]) - output_tensor3 = output_tensor3.reshape(1, 10, 10, 255) - output_tensor3 = torch.permute(output_tensor3, (0, 3, 1, 2)) - - n_classes = 80 - - yolo1 = YoloLayer( - anchor_mask=[0, 1, 2], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=8, - ) - - yolo2 = YoloLayer( - anchor_mask=[3, 4, 5], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=16, - ) - - yolo3 = YoloLayer( - anchor_mask=[6, 7, 8], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=32, - ) - - y1 = yolo1(output_tensor1) - 
y2 = yolo2(output_tensor2) - y3 = yolo3(output_tensor3) - - output = self.get_region_boxes([y1, y2, y3]) - - return output - # return self.test_infra.output_tensor - # if use_signpost: - # signpost(header="stop") + ttnn_output_tensor = self.test_infra.output_tensor + + result_boxes_padded = ttnn.to_torch(ttnn_output_tensor[0]) + result_confs = ttnn.to_torch(ttnn_output_tensor[1]) + + result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) + result_boxes_list = [] + # That ttnn tensor is the concat output of 3 padded tensors + # As a perf workaround I'm doing the unpadding on the torch output here. + # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized + box_1_start_i = 0 + box_1_end_i = 6100 + box_2_start_i = 6128 + box_2_end_i = 6228 + box_3_start_i = 6256 + box_3_end_i = 6356 + result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) + result_boxes = torch.cat(result_boxes_list, dim=1) + + return [result_boxes, result_confs] def release_yolov4_trace_2cqs_inference(self): ttnn.release_trace(self.device, self.tid) diff --git a/models/demos/yolov4/tests/yolov4_test_infra.py b/models/demos/yolov4/tests/yolov4_test_infra.py index 1c82369c476..474e2f2e87e 100644 --- a/models/demos/yolov4/tests/yolov4_test_infra.py +++ b/models/demos/yolov4/tests/yolov4_test_infra.py @@ -11,6 +11,8 @@ import ttnn from models.demos.yolov4.reference.yolov4 import Yolov4 from models.demos.yolov4.ttnn.yolov4 import TtYOLOv4 +from models.demos.yolov4.demo.demo import YoloLayer, get_region_boxes, gen_yolov4_boxes_confs + from models.utility_functions import ( is_wormhole_b0, @@ -40,15 +42,7 @@ def load_yolov4_weight(model_location_generator=None): def load_yolov4_model(ttnn_model): torch_model = Yolov4() - new_state_dict = {} - ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ttnn_model.torch_model.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() return torch_model @@ -72,13 +66,16 @@ def __init__( self.act_dtype = act_dtype self.weight_dtype = weight_dtype self.model_location_generator = model_location_generator - self.ttnn_yolov4_model = TtYOLOv4(device, load_yolov4_weight(self.model_location_generator)) + self.ttnn_yolov4_model = TtYOLOv4(load_yolov4_weight(self.model_location_generator), device) + torch_model = load_yolov4_model(self.ttnn_yolov4_model) input_shape = (1, 320, 320, 3) torch_input_tensor = torch.randn(input_shape, dtype=torch.float32) self.input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) self.torch_input_tensor = torch_input_tensor.permute(0, 3, 1, 2) self.torch_output_tensor = torch_model(self.torch_input_tensor) + ref1, ref2, ref3 = gen_yolov4_boxes_confs(self.torch_output_tensor) + self.ref_boxes, self.ref_confs = get_region_boxes([ref1, ref2, ref3]) def run(self): self.output_tensor = self.ttnn_yolov4_model(self.input_tensor) @@ -130,38 +127,42 @@ def setup_dram_sharded_input(self, device, torch_input_tensor=None, mesh_mapper= def validate(self, output_tensor=None): output_tensor = self.output_tensor if output_tensor is None else output_tensor - output_tensor = 
ttnn.to_torch(self.output_tensor[0]) - output_tensor = output_tensor.reshape(1, 40, 40, 255) - output_tensor = torch.permute(output_tensor, (0, 3, 1, 2)) - - valid_pcc = 0.985 - self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor[0], output_tensor, pcc=valid_pcc) + result_boxes_padded = ttnn.to_torch(self.output_tensor[0]) + result_confs = ttnn.to_torch(self.output_tensor[1]) + + result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) + result_boxes_list = [] + # That ttnn tensor is the concat output of 3 padded tensors + # As a perf workaround I'm doing the unpadding on the torch output here. + # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized + box_1_start_i = 0 + box_1_end_i = 6100 + box_2_start_i = 6128 + box_2_end_i = 6228 + box_3_start_i = 6256 + box_3_end_i = 6356 + result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) + result_boxes = torch.cat(result_boxes_list, dim=1) + + valid_pcc = 0.99 + self.pcc_passed, self.pcc_message = assert_with_pcc(self.ref_boxes, result_boxes, pcc=valid_pcc) logger.info( - f"Yolov4 batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" + f"Yolov4 - Bboxes. batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" ) - output_tensor = ttnn.to_torch(self.output_tensor[1]) - output_tensor = torch.reshape(output_tensor, (self.batch_size, 20, 20, 255)) - output_tensor = torch.permute(output_tensor, (0, 3, 1, 2)) - self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor[1], output_tensor, pcc=valid_pcc) - - logger.info( - f"Yolov4 batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" - ) + valid_pcc = 0.71 + self.pcc_passed, self.pcc_message = assert_with_pcc(self.ref_confs, result_confs, pcc=valid_pcc) - output_tensor = ttnn.to_torch(self.output_tensor[2]) - output_tensor = torch.reshape(output_tensor, (self.batch_size, 10, 10, 255)) - output_tensor = torch.permute(output_tensor, (0, 3, 1, 2)) - self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor[2], output_tensor, pcc=valid_pcc) logger.info( - f"Yolov4 batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" + f"Yolov4 - Confs. 
batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" ) def dealloc_output(self): ttnn.deallocate(self.output_tensor[0]) ttnn.deallocate(self.output_tensor[1]) - ttnn.deallocate(self.output_tensor[2]) def create_test_infra( diff --git a/models/demos/yolov4/ttnn/common.py b/models/demos/yolov4/ttnn/common.py index 70ead902094..e20814a3a73 100644 --- a/models/demos/yolov4/ttnn/common.py +++ b/models/demos/yolov4/ttnn/common.py @@ -52,9 +52,17 @@ def __init__( else: weight = model[path + ".conv.0.weight"] bias = model[path + ".conv.0.bias"] + # padding the channel dim in the last conv in the head module from 255 to 256 + # to avoid additional padding in the model graph + if weight.shape[0] == 255: + weight = torch.nn.functional.pad(weight, (0, 0, 0, 0, 0, 0, 0, 1)) self.weights = ttnn.from_torch(weight) bias = bias.reshape(1, 1, 1, -1) + # padding the channel dim in the last conv in the head module from 255 to 256 + if bias.shape[-1] == 255: + bias = torch.nn.functional.pad(bias, (0, 1, 0, 0, 0, 0, 0, 0)) self.bias = ttnn.from_torch(bias) + self.input_params = input_params self.kernel_size = (self.weights.shape[2], self.weights.shape[3]) self.conv_params = conv_params diff --git a/models/demos/yolov4/ttnn/genboxes.py b/models/demos/yolov4/ttnn/genboxes.py new file mode 100644 index 00000000000..fb8bb49867d --- /dev/null +++ b/models/demos/yolov4/ttnn/genboxes.py @@ -0,0 +1,256 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import math +import numpy as np +import ttnn +from models.utility_functions import _nearest_32 + + +def create_conv_bias_tensor(torch_tensor, N, K, pad=0): + bias_shape = [1, 1, N, K] + bias_padded_shape = [1, 1, _nearest_32(N), _nearest_32(K)] + tt_tensor = ttnn.Tensor(torch.flatten(torch_tensor).tolist(), bias_shape, ttnn.bfloat16, ttnn.ROW_MAJOR_LAYOUT).pad( + bias_shape, (0, 0, 0, 0), 0.0 + ) + tt_tensor = tt_tensor.pad_to_tile(pad).to(ttnn.TILE_LAYOUT) + return tt_tensor + + +class TtGenBoxes: + def __init__(self, device) -> None: + self.thresh = 0.6 + self.num_classes = 80 + self.num_anchors = 3 + + self.grid_x = [] + self.grid_y = [] + for H in (40, 20, 10): + grid_x_i = torch.reshape( + torch.flatten( + torch.from_numpy( + np.expand_dims( + np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=0).repeat(H, 0), axis=0), + axis=0, + ) + ) + ), + (1, 1, 1, H * H), + ) + + grid_y_i = torch.reshape( + torch.flatten( + torch.from_numpy( + np.expand_dims( + np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(H, 1), axis=0), + axis=0, + ) + ) + ), + (1, 1, 1, H * H), + ) + self.grid_x.append( + ttnn.from_torch(grid_x_i, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + ) # , 1, H*H)) + self.grid_y.append( + ttnn.from_torch(grid_y_i, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + ) # , 1, H*H)) + + def __call__(self, device, input_tensor): + B, __, HW, dim = input_tensor.shape + H = W = int(math.sqrt(HW)) + AHW = self.num_anchors * HW + A = self.num_anchors + + if HW == 1600: + group = 0 + elif HW == 400: + group = 1 + elif HW == 100: + group = 2 + + # Pre-derived from the torch function + if group == 0: + anchor_w_a = 1.5 + anchor_w_b = 2.375 + anchor_w_c = 5.0 + anchor_h_a = 2.0 + anchor_h_b = 4.5 + anchor_h_c = 3.5 + elif group == 1: + anchor_w_a = 2.25 + anchor_w_b = 4.75 + anchor_w_c = 4.5 + anchor_h_a = 4.6875 + anchor_h_b = 3.4375 + anchor_h_c = 9.125 + elif group == 2: + anchor_w_a = 
4.4375 + anchor_w_b = 6.0 + anchor_w_c = 14.34375 + anchor_h_a = 3.4375 + anchor_h_b = 7.59375 + anchor_h_c = 12.53125 + + input_tensor_i = ttnn.to_memory_config(input_tensor, ttnn.L1_MEMORY_CONFIG) + input_tensor_i = ttnn.to_layout(input_tensor_i, ttnn.ROW_MAJOR_LAYOUT) + input_tensor_i = ttnn.permute(input_tensor_i, (0, 1, 3, 2)) + + # first anchor + bx_a = ttnn.slice(input_tensor_i, [0, 0, 0, 0], [1, 1, 1, HW]) + by_a = ttnn.slice(input_tensor_i, [0, 0, 1, 0], [1, 1, 2, HW]) + bw_a = ttnn.slice(input_tensor_i, [0, 0, 2, 0], [1, 1, 3, HW]) + bh_a = ttnn.slice(input_tensor_i, [0, 0, 3, 0], [1, 1, 4, HW]) + det_confs_a = ttnn.slice(input_tensor_i, [0, 0, 4, 0], [1, 1, 5, HW]) + cls_confs_a = ttnn.slice(input_tensor_i, [0, 0, 5, 0], [1, 1, 85, HW]) + # second anchor + bx_b = ttnn.slice(input_tensor_i, [0, 0, 85, 0], [1, 1, 86, HW]) + by_b = ttnn.slice(input_tensor_i, [0, 0, 86, 0], [1, 1, 87, HW]) + bw_b = ttnn.slice(input_tensor_i, [0, 0, 87, 0], [1, 1, 88, HW]) + bh_b = ttnn.slice(input_tensor_i, [0, 0, 88, 0], [1, 1, 89, HW]) + det_confs_b = ttnn.slice(input_tensor_i, [0, 0, 89, 0], [1, 1, 90, HW]) + cls_confs_b = ttnn.slice(input_tensor_i, [0, 0, 90, 0], [1, 1, 170, HW]) + # third anchor + bx_c = ttnn.slice(input_tensor_i, [0, 0, 170, 0], [1, 1, 171, HW]) + by_c = ttnn.slice(input_tensor_i, [0, 0, 171, 0], [1, 1, 172, HW]) + bw_c = ttnn.slice(input_tensor_i, [0, 0, 172, 0], [1, 1, 173, HW]) + bh_c = ttnn.slice(input_tensor_i, [0, 0, 173, 0], [1, 1, 174, HW]) + det_confs_c = ttnn.slice(input_tensor_i, [0, 0, 174, 0], [1, 1, 175, HW]) + cls_confs_c = ttnn.slice(input_tensor_i, [0, 0, 175, 0], [1, 1, 255, HW]) + + ############# + # Confs + ############# + + det_confs_a = ttnn.to_layout(det_confs_a, ttnn.TILE_LAYOUT) + det_confs_b = ttnn.to_layout(det_confs_b, ttnn.TILE_LAYOUT) + det_confs_c = ttnn.to_layout(det_confs_c, ttnn.TILE_LAYOUT) + cls_confs_a = ttnn.to_layout(cls_confs_a, ttnn.TILE_LAYOUT) + cls_confs_b = ttnn.to_layout(cls_confs_b, ttnn.TILE_LAYOUT) + cls_confs_c = ttnn.to_layout(cls_confs_c, ttnn.TILE_LAYOUT) + + det_confs_a = ttnn.sigmoid(det_confs_a) + det_confs_b = ttnn.sigmoid(det_confs_b) + det_confs_c = ttnn.sigmoid(det_confs_c) + cls_confs_a = ttnn.sigmoid(cls_confs_a) + cls_confs_b = ttnn.sigmoid(cls_confs_b) + cls_confs_c = ttnn.sigmoid(cls_confs_c) + + confs_a = ttnn.multiply(det_confs_a, cls_confs_a) + confs_b = ttnn.multiply(det_confs_b, cls_confs_b) + confs_c = ttnn.multiply(det_confs_c, cls_confs_c) + + confs = ttnn.concat([confs_a, confs_b, confs_c], dim=1) + confs = ttnn.permute(confs, (0, 1, 3, 2)) + confs = ttnn.reshape(confs, (B, AHW, self.num_classes)) + + ################# + ## Boxes + ################# + + # expensive TilizeWithValPadding + bx_a = ttnn.to_layout(bx_a, ttnn.TILE_LAYOUT) + by_a = ttnn.to_layout(by_a, ttnn.TILE_LAYOUT) + bw_a = ttnn.to_layout(bw_a, ttnn.TILE_LAYOUT) + bh_a = ttnn.to_layout(bh_a, ttnn.TILE_LAYOUT) + bx_a = ttnn.sigmoid(bx_a) + by_a = ttnn.sigmoid(by_a) + bw_a = ttnn.exp(bw_a) + bh_a = ttnn.exp(bh_a) + + bx_b = ttnn.to_layout(bx_b, ttnn.TILE_LAYOUT) + by_b = ttnn.to_layout(by_b, ttnn.TILE_LAYOUT) + bw_b = ttnn.to_layout(bw_b, ttnn.TILE_LAYOUT) + bh_b = ttnn.to_layout(bh_b, ttnn.TILE_LAYOUT) + bx_b = ttnn.sigmoid(bx_b) + by_b = ttnn.sigmoid(by_b) + bw_b = ttnn.exp(bw_b) + bh_b = ttnn.exp(bh_b) + + bx_c = ttnn.to_layout(bx_c, ttnn.TILE_LAYOUT) + by_c = ttnn.to_layout(by_c, ttnn.TILE_LAYOUT) + bw_c = ttnn.to_layout(bw_c, ttnn.TILE_LAYOUT) + bh_c = ttnn.to_layout(bh_c, ttnn.TILE_LAYOUT) + bx_c = ttnn.sigmoid(bx_c) + by_c = 
ttnn.sigmoid(by_c) + bw_c = ttnn.exp(bw_c) + bh_c = ttnn.exp(bh_c) + + #### + ## Grid tensor derivation + #### + + grid_x = self.grid_x[group] # .to(device, mem_config=ttnn.L1_MEMORY_CONFIG) + grid_y = self.grid_y[group] # .to(device, mem_config=ttnn.L1_MEMORY_CONFIG) + + bx_a = ttnn.add(bx_a, grid_x) + by_a = ttnn.add(by_a, grid_y) + bx_b = ttnn.add(bx_b, grid_x) + by_b = ttnn.add(by_b, grid_y) + bx_c = ttnn.add(bx_c, grid_x) + by_c = ttnn.add(by_c, grid_y) + + bx_a = ttnn.multiply(bx_a, 1 / W) + by_a = ttnn.multiply(by_a, 1 / H) + bx_b = ttnn.multiply(bx_b, 1 / W) + by_b = ttnn.multiply(by_b, 1 / H) + bx_c = ttnn.multiply(bx_c, 1 / W) + by_c = ttnn.multiply(by_c, 1 / H) + + bw_a = bw_a * (anchor_w_a / W) + bw_b = bw_b * (anchor_w_b / W) + bw_c = bw_c * (anchor_w_c / W) + + bh_a = bh_a * (anchor_h_a / H) + bh_b = bh_b * (anchor_h_b / H) + bh_c = bh_c * (anchor_h_c / H) + + bw_a_half = bw_a * (0.5) + bw_b_half = bw_b * (0.5) + bw_c_half = bw_c * (0.5) + + bh_a_half = bh_a * (0.5) + bh_b_half = bh_b * (0.5) + bh_c_half = bh_c * (0.5) + + bx1_a = bx_a - bw_a_half + by1_a = by_a - bh_a_half + bx2_a = bx1_a + bw_a + by2_a = by1_a + bh_a + + bx1_b = bx_b - bw_b_half + by1_b = by_b - bh_b_half + bx2_b = bx1_b + bw_b + by2_b = by1_b + bh_b + + bx1_c = bx_c - bw_c_half + by1_c = by_c - bh_c_half + bx2_c = bx1_c + bw_c + by2_c = by1_c + bh_c + + bx1_a = ttnn.to_layout(bx1_a, ttnn.ROW_MAJOR_LAYOUT) + bx2_a = ttnn.to_layout(bx2_a, ttnn.ROW_MAJOR_LAYOUT) + by1_a = ttnn.to_layout(by1_a, ttnn.ROW_MAJOR_LAYOUT) + by2_a = ttnn.to_layout(by2_a, ttnn.ROW_MAJOR_LAYOUT) + + bx1_b = ttnn.to_layout(bx1_b, ttnn.ROW_MAJOR_LAYOUT) + bx2_b = ttnn.to_layout(bx2_b, ttnn.ROW_MAJOR_LAYOUT) + by1_b = ttnn.to_layout(by1_b, ttnn.ROW_MAJOR_LAYOUT) + by2_b = ttnn.to_layout(by2_b, ttnn.ROW_MAJOR_LAYOUT) + + bx1_c = ttnn.to_layout(bx1_c, ttnn.ROW_MAJOR_LAYOUT) + bx2_c = ttnn.to_layout(bx2_c, ttnn.ROW_MAJOR_LAYOUT) + by1_c = ttnn.to_layout(by1_c, ttnn.ROW_MAJOR_LAYOUT) + by2_c = ttnn.to_layout(by2_c, ttnn.ROW_MAJOR_LAYOUT) + + bx1 = ttnn.concat([bx1_a, bx1_b, bx1_c], dim=2) + by1 = ttnn.concat([by1_a, by1_b, by1_c], dim=2) + bx2 = ttnn.concat([bx2_a, bx2_b, bx2_c], dim=2) + by2 = ttnn.concat([by2_a, by2_b, by2_c], dim=2) + + # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4] + boxes = ttnn.concat((bx1, by1, bx2, by2), dim=1) + + return boxes, confs diff --git a/models/demos/yolov4/ttnn/yolov4.py b/models/demos/yolov4/ttnn/yolov4.py index 42f1a9cd7fe..307e0fc55ca 100644 --- a/models/demos/yolov4/ttnn/yolov4.py +++ b/models/demos/yolov4/ttnn/yolov4.py @@ -21,10 +21,11 @@ from models.demos.yolov4.ttnn.downsample5 import Down5 from models.demos.yolov4.ttnn.neck import TtNeck from models.demos.yolov4.ttnn.head import TtHead +from models.demos.yolov4.ttnn.genboxes import TtGenBoxes class TtYOLOv4: - def __init__(self, device, path) -> None: + def __init__(self, path, device) -> None: if type(path) is str: self.torch_model = torch.load(path) else: @@ -39,7 +40,12 @@ def __init__(self, device, path) -> None: self.neck = TtNeck(device, self) self.head = TtHead(device, self) + self.boxes_confs_0 = TtGenBoxes(device) + self.boxes_confs_1 = TtGenBoxes(device) + self.boxes_confs_2 = TtGenBoxes(device) + self.downs = [] # [self.down1] + self.device = device def __call__(self, input_tensor): d1 = self.down1(input_tensor) @@ -52,7 +58,32 @@ def __call__(self, input_tensor): x20, x13, x6 = self.neck([d5, d4, d3]) x4, x5, x6 = self.head([x20, x13, x6]) - return x4, x5, x6 + orig = 0 + if orig: + return x4, x5, x6 + 
else: + x4_boxes_confs = self.boxes_confs_0(self.device, x4) + x5_boxes_confs = self.boxes_confs_1(self.device, x5) + x6_boxes_confs = self.boxes_confs_2(self.device, x6) + + confs_1 = ttnn.to_layout(x4_boxes_confs[1], ttnn.ROW_MAJOR_LAYOUT) + confs_2 = ttnn.to_layout(x5_boxes_confs[1], ttnn.ROW_MAJOR_LAYOUT) + confs_3 = ttnn.to_layout(x6_boxes_confs[1], ttnn.ROW_MAJOR_LAYOUT) + confs = ttnn.concat([confs_1, confs_2, confs_3], dim=1) + + boxes_1 = ttnn.to_layout(x4_boxes_confs[0], ttnn.ROW_MAJOR_LAYOUT) + boxes_2 = ttnn.to_layout(x5_boxes_confs[0], ttnn.ROW_MAJOR_LAYOUT) + boxes_3 = ttnn.to_layout(x6_boxes_confs[0], ttnn.ROW_MAJOR_LAYOUT) + boxes_1 = ttnn.reshape(boxes_1, (1, 4, 1, 4800)) + boxes_2 = ttnn.reshape(boxes_2, (1, 4, 1, 1200)) + boxes_3 = ttnn.pad(boxes_3, ((0, 0), (0, 0), (0, 0), (0, 28)), 0) + boxes_3 = ttnn.reshape(boxes_3, (1, 4, 1, 384)) + boxes_1 = ttnn.permute(boxes_1, (0, 2, 3, 1)) + boxes_2 = ttnn.permute(boxes_2, (0, 2, 3, 1)) + boxes_3 = ttnn.permute(boxes_3, (0, 2, 3, 1)) + boxes = ttnn.concat([boxes_1, boxes_2, boxes_3], dim=2) + + return boxes, confs def __str__(self) -> str: this_str = "" diff --git a/models/demos/yolov4/web_demo/README.md b/models/demos/yolov4/web_demo/README.md index d35bb31c518..5b112cadaa6 100644 --- a/models/demos/yolov4/web_demo/README.md +++ b/models/demos/yolov4/web_demo/README.md @@ -12,6 +12,11 @@ pip install -r models/demos/yolov4/web_demo/server/requirements.txt ``` +- After installing the server side requirments, ONLY if you are running the demo on an N300 card,run the following to export the approprite envirement variable for N300. + ``` + export WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml + ``` + - From the server run: ``` source models/demos/yolov4/web_demo/server/run_uvicorn.sh diff --git a/models/demos/yolov4/web_demo/client/coco.names b/models/demos/yolov4/web_demo/client/coco.names new file mode 100644 index 00000000000..ca76c80b5b2 --- /dev/null +++ b/models/demos/yolov4/web_demo/client/coco.names @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/models/demos/yolov4/web_demo/client/requirements.txt b/models/demos/yolov4/web_demo/client/requirements.txt index 282195275da..be5f168cc74 100644 --- a/models/demos/yolov4/web_demo/client/requirements.txt +++ b/models/demos/yolov4/web_demo/client/requirements.txt @@ -1,3 +1,4 @@ opencv-python==4.6.0.66 streamlit==1.26.0 streamlit-webrtc==0.47.0 +orjson==3.10.12 diff --git a/models/demos/yolov4/web_demo/client/yolov4.py b/models/demos/yolov4/web_demo/client/yolov4.py index 5fc4ea6c692..ada420cbdad 100644 --- a/models/demos/yolov4/web_demo/client/yolov4.py +++ b/models/demos/yolov4/web_demo/client/yolov4.py @@ -11,7 +11,9 @@ import cv2 import requests import torch +import orjson import av +import logging import streamlit as st import numpy as np @@ -20,78 +22,16 @@ from streamlit_webrtc import 
VideoProcessorBase, webrtc_streamer +# Configure the logger +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()] +) + + class VideoProcessor(VideoProcessorBase): def __init__(self): self.frame_count = 0 - def post_processing(self, img, conf_thresh, nms_thresh, output): - box_array = output[0] - confs = output[1].float() - - t1 = time.time() - - if type(box_array).__name__ != "ndarray": - box_array = box_array.cpu().detach().numpy() - confs = confs.cpu().detach().numpy() - - num_classes = confs.shape[2] - - # [batch, num, 4] - box_array = box_array[:, :, 0] - - # [batch, num, num_classes] --> [batch, num] - max_conf = np.max(confs, axis=2) - max_id = np.argmax(confs, axis=2) - - t2 = time.time() - - bboxes_batch = [] - for i in range(box_array.shape[0]): - argwhere = max_conf[i] > conf_thresh - l_box_array = box_array[i, argwhere, :] - l_max_conf = max_conf[i, argwhere] - l_max_id = max_id[i, argwhere] - - bboxes = [] - # nms for each class - for j in range(num_classes): - cls_argwhere = l_max_id == j - ll_box_array = l_box_array[cls_argwhere, :] - ll_max_conf = l_max_conf[cls_argwhere] - ll_max_id = l_max_id[cls_argwhere] - - keep = self.nms_cpu(ll_box_array, ll_max_conf, nms_thresh) - - if keep.size > 0: - ll_box_array = ll_box_array[keep, :] - ll_max_conf = ll_max_conf[keep] - ll_max_id = ll_max_id[keep] - - for k in range(ll_box_array.shape[0]): - bboxes.append( - [ - ll_box_array[k, 0], - ll_box_array[k, 1], - ll_box_array[k, 2], - ll_box_array[k, 3], - ll_max_conf[k], - ll_max_conf[k], - ll_max_id[k], - ] - ) - - bboxes_batch.append(bboxes) - - t3 = time.time() - - print("-----------------------------------") - print(" max and argmax : %f" % (t2 - t1)) - print(" nms : %f" % (t3 - t2)) - print("Post processing total : %f" % (t3 - t1)) - print("-----------------------------------") - - return bboxes_batch - def load_class_names(self, namesfile): class_names = [] with open(namesfile, "r") as fp: @@ -101,41 +41,6 @@ def load_class_names(self, namesfile): class_names.append(line) return class_names - def nms_cpu(self, boxes, confs, nms_thresh=0.5, min_mode=False): - x1 = boxes[:, 0] - y1 = boxes[:, 1] - x2 = boxes[:, 2] - y2 = boxes[:, 3] - - areas = (x2 - x1) * (y2 - y1) - order = confs.argsort()[::-1] - - keep = [] - while order.size > 0: - idx_self = order[0] - idx_other = order[1:] - - keep.append(idx_self) - - xx1 = np.maximum(x1[idx_self], x1[idx_other]) - yy1 = np.maximum(y1[idx_self], y1[idx_other]) - xx2 = np.minimum(x2[idx_self], x2[idx_other]) - yy2 = np.minimum(y2[idx_self], y2[idx_other]) - - w = np.maximum(0.0, xx2 - xx1) - h = np.maximum(0.0, yy2 - yy1) - inter = w * h - - if min_mode: - over = inter / np.minimum(areas[order[0]], areas[order[1:]]) - else: - over = inter / (areas[order[0]] + areas[order[1:]] - inter) - - inds = np.where(over <= nms_thresh)[0] - order = order[inds + 1] - - return np.array(keep) - def plot_boxes_cv2(self, bgr_img, boxes, savename=None, class_names=None, color=None): img = np.copy(bgr_img) colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) @@ -196,52 +101,60 @@ def get_color(c, x, max_val): def recv(self, frame): t0 = time.time() + + # Convert frame to PIL image and resize pil_image = frame.to_image() - # resize on the client side - new_size = (320, 320) - pil_image = pil_image.resize(new_size) + pil_image = pil_image.resize((320, 320)) # Resize to target dimensions t1 = time.time() + + # Save image as JPEG in-memory 
with optimized settings buf = io.BytesIO() - pil_image.save(buf, format="JPEG") + pil_image.save(buf, format="JPEG", quality=85, optimize=True) byte_im = buf.getvalue() file = {"file": byte_im} - # Argument Parser to grab namespace_id of server pod from user - parser = argparse.ArgumentParser(description="YOLOv4 script") - parser.add_argument("--api-url", type=str, help="URL for the object detection API", required=True) - args = parser.parse_args() - apiurl = args.api_url - url = f"{apiurl}/objdetection_v2" - r = requests.post(url, files=file) - if r.status_code == 200: - try: - # Get the JSON response as a dictionary - response_dict = r.json() - output = [torch.tensor(tensor_data) for tensor_data in response_dict["output"]] - except ValueError: - st.error("Failed to parse JSON. The response is not in JSON format.") - else: - st.error(f"Request failed with status code {r.status_code}") + # Parse API URL once at the class level for efficiency + if not hasattr(self, "api_url"): + parser = argparse.ArgumentParser(description="YOLOv4 script") + parser.add_argument("--api-url", type=str, required=True, help="URL for the object detection API") + args = parser.parse_args() + self.api_url = args.api_url + + url = f"{self.api_url}/objdetection_v2" + + try: + # Use a persistent session for multiple requests + with requests.Session() as session: + # Post request with a timeout + response = session.post(url, files=file, timeout=5) + + # Check if response is successful + if response.status_code == 200: + # Parse JSON response + output = orjson.loads(response.content) + else: + print(f"Request failed with status code {response.status_code}") + # return None + except requests.exceptions.RequestException as e: + print(f"Request failed: {e}") + return None t3 = time.time() + # Convert frame to ndarray and perform post-processing bgr_image = frame.to_ndarray(format="bgr24") conf_thresh = 0.6 nms_thresh = 0.5 - boxes = self.post_processing(bgr_image, conf_thresh, nms_thresh, output) + + # Load class names and plot bounding boxes namesfile = "coco.names" class_names = self.load_class_names(namesfile) + image_final = self.plot_boxes_cv2(bgr_image, output, None, class_names) - # random_number = random.randint(1, 100) - # save_name = "ttnn_prediction_demo" + str(random_number) + ".jpg" - save_name = None - - image_final = self.plot_boxes_cv2(bgr_image, boxes[0], save_name, class_names) t4 = time.time() - print() - print(f" IMG-IN | WH | Post | Total time: ") - print(f" {(t1-t0):.3f} | {(t3-t1):.3f} | {(t4-t3):.3f} || {(t4-t0):.3f} ") + logging.info( + f" IMG-IN | WH | Post | Total time: {(t1-t0):.3f} | {(t3-t1):.3f} | {(t4-t3):.3f} || {(t4-t0):.3f} " + ) - # return image_final return av.VideoFrame.from_ndarray(image_final, format="bgr24") @@ -254,10 +167,8 @@ def recv(self, frame): media_stream_constraints={ "video": { "width": {"min": 320, "ideal": 400, "max": 960}, - # "height": {"min": 180, "ideal": 225, "max": 450}, "height": {"min": 320, "ideal": 400, "max": 960}, "frameRate": {"min": 1, "ideal": 50, "max": 60}, } }, - # async_processing=True # Use asynchronous processing for long tasks ) diff --git a/models/demos/yolov4/web_demo/server/fast_api_yolov4.py b/models/demos/yolov4/web_demo/server/fast_api_yolov4.py old mode 100755 new mode 100644 index 19732cbc074..83af1d6e14b --- a/models/demos/yolov4/web_demo/server/fast_api_yolov4.py +++ b/models/demos/yolov4/web_demo/server/fast_api_yolov4.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import json +import os +import logging from fastapi import 
FastAPI, File, UploadFile from io import BytesIO from PIL import Image @@ -25,14 +27,43 @@ async def root(): return {"message": "Hello World"} +# Configure the logger +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()] +) + + +def get_dispatch_core_type(): + # TODO: 11059 move dispatch_core_type to device_params when all tests are updated to not use WH_ARCH_YAML env flag + dispatch_core_type = ttnn.device.DispatchCoreType.WORKER + # if ("WH_ARCH_YAML" in os.environ) and os.environ["WH_ARCH_YAML"] == "wormhole_b0_80_arch_eth_dispatch.yaml": + if os.environ["WH_ARCH_YAML"] == "wormhole_b0_80_arch_eth_dispatch.yaml": + dispatch_core_type = ttnn.device.DispatchCoreType.ETH + return dispatch_core_type + + @app.on_event("startup") async def startup(): - device_id = 0 - device = ttnn.CreateDevice(device_id, l1_small_size=24576, trace_region_size=1617920, num_command_queues=2) - ttnn.enable_program_cache(device) global model - model = Yolov4Trace2CQ() - model.initialize_yolov4_trace_2cqs_inference(device) + if ("WH_ARCH_YAML" in os.environ) and os.environ["WH_ARCH_YAML"] == "wormhole_b0_80_arch_eth_dispatch.yaml": + print("WH_ARCH_YAML:", os.environ.get("WH_ARCH_YAML")) + device_id = 0 + device = ttnn.CreateDevice( + device_id, + dispatch_core_type=get_dispatch_core_type(), + l1_small_size=24576, + trace_region_size=3211264, + num_command_queues=2, + ) + ttnn.enable_program_cache(device) + model = Yolov4Trace2CQ() + model.initialize_yolov4_trace_2cqs_inference(device) + else: + device_id = 0 + device = ttnn.CreateDevice(device_id, l1_small_size=24576, trace_region_size=3211264, num_command_queues=2) + ttnn.enable_program_cache(device) + model = Yolov4Trace2CQ() + model.initialize_yolov4_trace_2cqs_inference(device) @app.on_event("shutdown") @@ -40,16 +71,112 @@ async def shutdown(): model.release_yolov4_trace_2cqs_inference() -def process_request(output): - # Convert all tensors to lists for JSON serialization - output_serializable = {"output": [tensor.tolist() for tensor in output]} - return output_serializable +def process_output(output): + outs = [] + output = output + cnt = 0 + for item in output: + cnt = cnt + 1 + output_i = [element.item() for element in item] + outs.append(output_i) + return outs + + +def post_processing(img, conf_thresh, nms_thresh, output): + box_array = output[0] + confs = output[1] + + box_array = np.array(box_array.to(torch.float32)) + confs = np.array(confs.to(torch.float32)) + + num_classes = confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + bboxes_batch = [] + for i in range(box_array.shape[0]): + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for each class + for j in range(num_classes): + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh) + + if keep.size > 0: + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + + for k in range(ll_box_array.shape[0]): + bboxes.append( + [ + ll_box_array[k, 0], + ll_box_array[k, 1], + ll_box_array[k, 2], + ll_box_array[k, 3], + ll_max_conf[k], + ll_max_conf[k], + ll_max_id[k], + ] 
+ ) + + bboxes_batch.append(bboxes) + + return bboxes_batch + + +def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1) * (y2 - y1) + order = confs.argsort()[::-1] + + keep = [] + while order.size > 0: + idx_self = order[0] + idx_other = order[1:] + + keep.append(idx_self) + + xx1 = np.maximum(x1[idx_self], x1[idx_other]) + yy1 = np.maximum(y1[idx_self], y1[idx_other]) + xx2 = np.minimum(x2[idx_self], x2[idx_other]) + yy2 = np.minimum(y2[idx_self], y2[idx_other]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + + if min_mode: + over = inter / np.minimum(areas[order[0]], areas[order[1:]]) + else: + over = inter / (areas[order[0]] + areas[order[1:]] - inter) + + inds = np.where(over <= nms_thresh)[0] + order = order[inds + 1] + + return np.array(keep) @app.post("/objdetection_v2") async def objdetection_v2(file: UploadFile = File(...)): contents = await file.read() - # Load and convert the image to RGB image = Image.open(BytesIO(contents)).convert("RGB") image = np.array(image) @@ -60,11 +187,24 @@ async def objdetection_v2(file: UploadFile = File(...)): else: print("unknow image type") exit(-1) + t1 = time.time() response = model.run_traced_inference(image) t2 = time.time() - print("the inference on the sever side took: ", t2 - t1) + logging.info("The inference on the sever side took: %.3f seconds", t2 - t1) + conf_thresh = 0.6 + nms_thresh = 0.5 + + boxes = post_processing(image, conf_thresh, nms_thresh, response) + output = boxes[0] + # output = boxes + try: + output = process_output(output) + except Exception as E: + print("the Exception is: ", E) + print("No objects detected!") + return [] + t3 = time.time() + logging.info("The post-processing to get the boxes took: %.3f seconds", t3 - t2) - # Convert response tensors to JSON-serializable format - output = process_request(response) return output diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py index 3ae46d4970c..9dd13940717 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py @@ -36,16 +36,8 @@ def test_down1(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample1() - - new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down1."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py index 5efc12af3f1..ba7da86ee8c 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py @@ -35,16 +35,10 @@ def test_down2(device, reset_seeds, model_location_generator): torch_input = torch.randn((1, 160, 160, 64), dtype=torch.bfloat16) ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 
2).float() - torch_model = DownSample2() - new_state_dict = {} + torch_model = DownSample2() ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down2."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py index 23c015fbb5b..8ae58e41470 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py @@ -36,15 +36,8 @@ def test_down3(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample3() - - new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down3."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() @@ -58,4 +51,4 @@ def test_down3(device, reset_seeds, model_location_generator): ref = torch_model(torch_input) ref = ref.permute(0, 2, 3, 1) result = result.reshape(ref.shape) - assert_with_pcc(result, ref, 0.95) # PCC 0.95 - The PCC will improve once #3612 is resolved. + assert_with_pcc(result, ref, 0.96) # PCC 0.96 - The PCC will improve once #3612 is resolved. 
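The test updates above, and the matching changes to the down4, down5, head, and neck tests that follow, all rebuild the reference module's state dict the same way: filter the checkpoint down to one submodule's entries, then zip the reference model's parameter names with the filtered values. A minimal sketch of the pattern (the helper name `remap_state_dict` is illustrative only, not part of the patch), assuming both mappings iterate over the same layers in the same order:

```
# Illustrative sketch of the state-dict remapping used in these tests.
# Assumption: the reference model's state_dict() and the filtered checkpoint
# hold the same parameters in the same insertion order, so pairing them
# positionally with zip() is equivalent to the index loop it replaces.
import torch


def remap_state_dict(reference_model: torch.nn.Module, checkpoint: dict, prefix: str) -> dict:
    # Keep only this submodule's entries from the full checkpoint, e.g. "down3." weights.
    filtered = {k: v for k, v in checkpoint.items() if k.startswith(prefix)}
    # Pair the reference parameter names with the checkpoint tensors positionally.
    return dict(zip(reference_model.state_dict().keys(), filtered.values()))


# Hypothetical usage mirroring the tests:
# torch_model.load_state_dict(remap_state_dict(torch_model, ttnn_model.torch_model, "down4."))
```

If the two orderings ever diverged, parameters would be silently assigned to the wrong layers, which is why the filter prefix must select exactly one submodule.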
diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py index 35579f14664..b791e9fc813 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py @@ -36,15 +36,8 @@ def test_down4(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample4() - - new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down4."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py index 8809d4d8275..d53eab4825e 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py @@ -36,15 +36,8 @@ def test_down5(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample5() - - new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down5."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py index 126e3713645..155885f2cb3 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py @@ -6,6 +6,7 @@ import ttnn from models.demos.yolov4.reference.head import Head from tests.ttnn.utils_for_testing import assert_with_pcc +from models.utility_functions import skip_for_grayskull import pytest import time from models.demos.yolov4.ttnn.head import TtHead @@ -13,6 +14,7 @@ import os +@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_head(device, reset_seeds, model_location_generator): torch.manual_seed(0) @@ -56,15 +58,8 @@ def test_head(device, reset_seeds, model_location_generator): torch_input_tensor = [torch_input_tensor1, torch_input_tensor2, torch_input_tensor3] torch_model = Head() - - new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("head."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() @@ -79,19 +74,22 @@ def test_head(device, reset_seeds, model_location_generator): result_3 = 
ttnn.to_torch(result_ttnn[2]) ref1, ref2, ref3 = torch_model(torch_input_tensor[0], torch_input_tensor[1], torch_input_tensor[2]) - result_1 = result_1.reshape(1, ref1.shape[2], ref1.shape[3], 255) + num_channels = ref1.shape[1] # 255 + num_channels_padded = num_channels + 1 + + result_1 = result_1.reshape(1, ref1.shape[2], ref1.shape[3], num_channels_padded) result_1 = result_1.permute(0, 3, 1, 2) - result_2 = result_2.reshape(1, ref2.shape[2], ref2.shape[3], 255) + result_2 = result_2.reshape(1, ref2.shape[2], ref2.shape[3], num_channels_padded) result_2 = result_2.permute(0, 3, 1, 2) - result_3 = result_3.reshape(1, ref3.shape[2], ref3.shape[3], 255) + result_3 = result_3.reshape(1, ref3.shape[2], ref3.shape[3], num_channels_padded) result_3 = result_3.permute(0, 3, 1, 2) # Output is sliced because ttnn.conv returns 256 channels instead of 255. - result_1 = result_1[:, :255, :, :] - result_2 = result_2[:, :255, :, :] - result_3 = result_3[:, :255, :, :] + result_1 = result_1[:, :num_channels, :, :] + result_2 = result_2[:, :num_channels, :, :] + result_3 = result_3[:, :num_channels, :, :] pcc_passed, pcc_message = assert_with_pcc(result_1, ref1, 0.99) logger.info(pcc_message) diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py index 41ac8781fc1..02c9d81f75d 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py @@ -6,6 +6,7 @@ import ttnn from models.demos.yolov4.ttnn.neck import TtNeck from models.demos.yolov4.reference.neck import Neck +from models.utility_functions import skip_for_grayskull from tests.ttnn.utils_for_testing import assert_with_pcc import pytest import time @@ -13,6 +14,7 @@ import os +@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_neck(device, reset_seeds, model_location_generator): torch.manual_seed(0) @@ -50,16 +52,10 @@ def test_neck(device, reset_seeds, model_location_generator): torch_input_tensor2 = torch_input_tensor2.permute(0, 3, 1, 2).float() torch_input_tensor3 = torch_input_tensor3.permute(0, 3, 1, 2).float() torch_input_tensor = [torch_input_tensor1, torch_input_tensor2, torch_input_tensor3] - torch_model = Neck() - new_state_dict = {} + torch_model = Neck() ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("neek."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py new file mode 100644 index 00000000000..128a0c93f43 --- /dev/null +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import torch +import ttnn +from models.utility_functions import skip_for_grayskull +from tests.ttnn.utils_for_testing import assert_with_pcc +from models.demos.yolov4.ttnn.genboxes import TtGenBoxes +from models.demos.yolov4.demo.demo import YoloLayer, get_region_boxes, gen_yolov4_boxes_confs + +import pytest +import os + + +@skip_for_grayskull() +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +def test_yolov4_post_processing(device, reset_seeds, model_location_generator): + torch.manual_seed(0) + + torch_input_1 = torch.randn((1, 1, 1600, 256), dtype=torch.bfloat16) + ttnn_input_1 = ttnn.from_torch( + torch_input_1, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG + ) + torch_input_2 = torch.randn((1, 1, 400, 256), dtype=torch.bfloat16) + ttnn_input_2 = ttnn.from_torch( + torch_input_2, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG + ) + torch_input_3 = torch.randn((1, 1, 100, 256), dtype=torch.bfloat16) + ttnn_input_3 = ttnn.from_torch( + torch_input_3, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG + ) + + torch_input_1 = torch_input_1[:, :, :, :255] + torch_input_1 = torch_input_1.reshape(1, 40, 40, 255) + torch_input_1 = torch.permute(torch_input_1, (0, 3, 1, 2)) + torch_input_2 = torch_input_2[:, :, :, :255] + torch_input_2 = torch_input_2.reshape(1, 20, 20, 255) + torch_input_2 = torch.permute(torch_input_2, (0, 3, 1, 2)) + torch_input_3 = torch_input_3[:, :, :, :255] + torch_input_3 = torch_input_3.reshape(1, 10, 10, 255) + torch_input_3 = torch.permute(torch_input_3, (0, 3, 1, 2)) + + ref1, ref2, ref3 = gen_yolov4_boxes_confs([torch_input_1, torch_input_2, torch_input_3]) + + boxes_confs_1 = TtGenBoxes(device) + boxes_confs_2 = TtGenBoxes(device) + boxes_confs_3 = TtGenBoxes(device) + + result_1 = boxes_confs_1(device, ttnn_input_1) + result_2 = boxes_confs_2(device, ttnn_input_2) + result_3 = boxes_confs_3(device, ttnn_input_3) + + result_1_bb = ttnn.to_torch(result_1[0]) + result_2_bb = ttnn.to_torch(result_2[0]) + result_3_bb = ttnn.to_torch(result_3[0]) + + result_1_bb = result_1_bb.permute(0, 2, 3, 1) + result_2_bb = result_2_bb.permute(0, 2, 3, 1) + result_3_bb = result_3_bb.permute(0, 2, 3, 1) + + result_1_bb = result_1_bb.reshape(1, 4800, 1, 4) + result_2_bb = result_2_bb.reshape(1, 1200, 1, 4) + result_3_bb = result_3_bb.reshape(1, 300, 1, 4) + + result_1_conf = ttnn.to_torch(result_1[1]) + result_2_conf = ttnn.to_torch(result_2[1]) + result_3_conf = ttnn.to_torch(result_3[1]) + + assert_with_pcc(ref1[0], result_1_bb, 0.99) + assert_with_pcc(ref2[0], result_2_bb, 0.99) + assert_with_pcc(ref3[0], result_3_bb, 0.99) + + assert_with_pcc(ref1[1], result_1_conf, 0.99) + assert_with_pcc(ref2[1], result_2_conf, 0.99) + assert_with_pcc(ref3[1], result_3_conf, 0.99) + + output = get_region_boxes( + [(result_1_bb, result_1_conf), (result_2_bb, result_2_conf), (result_3_bb, result_3_conf)] + ) diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py index ff9a9d4c1dc..2a338bf6438 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py @@ -4,10 +4,15 @@ import torch import ttnn -from models.utility_functions import skip_for_grayskull from models.demos.yolov4.reference.yolov4 import Yolov4 from 
tests.ttnn.utils_for_testing import assert_with_pcc +from models.utility_functions import skip_for_grayskull from models.demos.yolov4.ttnn.yolov4 import TtYOLOv4 +from models.demos.yolov4.demo.demo import YoloLayer, get_region_boxes, gen_yolov4_boxes_confs + +import cv2 +import numpy as np + import pytest import os @@ -28,46 +33,53 @@ def test_yolov4(device, reset_seeds, model_location_generator): else: weights_pth = str(model_path / "yolov4.pth") - ttnn_model = TtYOLOv4(device, weights_pth) - - torch_input = torch.randn((1, 320, 320, 3), dtype=torch.bfloat16) - ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) - torch_input = torch_input.permute(0, 3, 1, 2).float() - torch_model = Yolov4() - - new_state_dict = {} - ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} + ttnn_model = TtYOLOv4(weights_pth, device) - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] + imgfile = "models/demos/yolov4/demo/giraffe_320.jpg" + width = 320 + height = 320 + img = cv2.imread(imgfile) + img = cv2.resize(img, (width, height)) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image + img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + elif type(img) == np.ndarray and len(img.shape) == 4: + img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) + torch_input = torch.autograd.Variable(img) - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] + input_tensor = torch.permute(torch_input, (0, 2, 3, 1)) + ttnn_input = ttnn.from_torch(input_tensor, ttnn.bfloat16) + torch_model = Yolov4() + new_state_dict = dict(zip(torch_model.state_dict().keys(), ttnn_model.torch_model.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() - result_1, result_2, result_3 = ttnn_model(ttnn_input) - result_1 = ttnn.to_torch(result_1) - result_2 = ttnn.to_torch(result_2) - result_3 = ttnn.to_torch(result_3) - - ref1, ref2, ref3 = torch_model(torch_input) - - result_1 = result_1.reshape(1, ref1.shape[2], ref1.shape[3], 255) - result_1 = result_1.permute(0, 3, 1, 2) - - result_2 = result_2.reshape(1, ref2.shape[2], ref2.shape[3], 255) - result_2 = result_2.permute(0, 3, 1, 2) - - result_3 = result_3.reshape(1, ref3.shape[2], ref3.shape[3], 255) - result_3 = result_3.permute(0, 3, 1, 2) - - # Output is sliced because ttnn.conv returns 256 channels instead of 255. - result_1 = result_1[:, :255, :, :] - result_2 = result_2[:, :255, :, :] - result_3 = result_3[:, :255, :, :] - - assert_with_pcc(result_1, ref1, 0.99) - assert_with_pcc(result_2, ref2, 0.99) - assert_with_pcc(result_3, ref3, 0.98) + torch_output_tensor = torch_model(torch_input) + + ref1, ref2, ref3 = gen_yolov4_boxes_confs(torch_output_tensor) + ref_boxes, ref_confs = get_region_boxes([ref1, ref2, ref3]) + + ttnn_output_tensor = ttnn_model(ttnn_input) + result_boxes_padded = ttnn.to_torch(ttnn_output_tensor[0]) + result_confs = ttnn.to_torch(ttnn_output_tensor[1]) + + result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) + result_boxes_list = [] + # Unpadding + # That ttnn tensor is the concat output of 3 padded tensors + # As a perf workaround I'm doing the unpadding on the torch output here. 
+ # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized + box_1_start_i = 0 + box_1_end_i = 6100 + box_2_start_i = 6128 + box_2_end_i = 6228 + box_3_start_i = 6256 + box_3_end_i = 6356 + result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) + result_boxes = torch.cat(result_boxes_list, dim=1) + + assert_with_pcc(ref_boxes, result_boxes, 0.99) + assert_with_pcc(ref_confs, result_confs, 0.71) From c1b88f2fcd61dd76bfd06916b854e87754a1082e Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Tue, 18 Feb 2025 05:38:39 +0000 Subject: [PATCH 227/316] #0: add tensix l1 base&size --- tt_metal/api/tt-metalium/hal_exp.hpp | 16 ++++++++++++++++ tt_metal/experimental/hal.cpp | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/tt_metal/api/tt-metalium/hal_exp.hpp b/tt_metal/api/tt-metalium/hal_exp.hpp index 5e14b0a5353..2b769aac65f 100644 --- a/tt_metal/api/tt-metalium/hal_exp.hpp +++ b/tt_metal/api/tt-metalium/hal_exp.hpp @@ -68,6 +68,22 @@ uint32_t get_erisc_l1_unreserved_base(); */ uint32_t get_erisc_l1_unreserved_size(); +/** + * @brief Uses the hardware abstraction layer to inform client of architecture specific address. + * this address corresponds to the beginning of free space in the TENSIX core's L1 SRAM + * + * @return address + */ +uint32_t get_tensix_l1_unreserved_base(); + +/** + * @brief Uses the hardware abstraction layer to inform client of architecture specific size. + * this size corresponds to the total free space in the TENSIX core's L1 SRAM for host usage + * + * @return size in bytes + */ +uint32_t get_tensix_l1_unreserved_size(); + /** * @brief Uses the hardware abstraction layer to fetch the representable epsilon value. 
* diff --git a/tt_metal/experimental/hal.cpp b/tt_metal/experimental/hal.cpp index d67c8d87e9c..7fe4108e31b 100644 --- a/tt_metal/experimental/hal.cpp +++ b/tt_metal/experimental/hal.cpp @@ -50,6 +50,22 @@ uint32_t get_erisc_l1_unreserved_size() { return 0; } +uint32_t get_tensix_l1_unreserved_base() { + auto& hal = HalSingleton::getInstance(); + if (hal.get_arch() != tt::ARCH::GRAYSKULL) { + return hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + } + return 0; +} + +uint32_t get_tensix_l1_unreserved_size() { + auto& hal = HalSingleton::getInstance(); + if (hal.get_arch() != tt::ARCH::GRAYSKULL) { + return hal.get_dev_size(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + } + return 0; +} + float get_eps() { return HalSingleton::getInstance().get_eps(); } float get_nan() { return HalSingleton::getInstance().get_nan(); } From 3c3cfe7b5767a4c43cf03db0567367801b0ca630 Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Tue, 18 Feb 2025 07:22:53 +0000 Subject: [PATCH 228/316] #0: add MB and GB to literals --- tt_metal/api/tt-metalium/helpers.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tt_metal/api/tt-metalium/helpers.hpp b/tt_metal/api/tt-metalium/helpers.hpp index 0bcc9b25d9c..aebf3f3f69a 100644 --- a/tt_metal/api/tt-metalium/helpers.hpp +++ b/tt_metal/api/tt-metalium/helpers.hpp @@ -8,9 +8,15 @@ namespace tt::tt_metal { -// Si KB Prefix +// KiB Prefix literal constexpr auto operator""_KB(const unsigned long long v) -> uint32_t { return 1024 * v; } +// MiB prefix literal +constexpr auto operator""_MB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * v; } + +// GiB prefix literal +constexpr auto operator""_GB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * 1024 * v; } + // Returns the size rounded up to the given alignment inline uint32_t round_size(uint32_t sz, uint32_t alignment) { return ((sz + alignment - 1) / alignment * alignment); From 532dd26223ae0ac824945fd32827ad8595f32fe2 Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Tue, 18 Feb 2025 09:29:47 +0000 Subject: [PATCH 229/316] #0: comprehensive mem benchmark tool - Benchmark various host copy and device pcie pull patterns --- .../tt-metalium/command_queue_interface.hpp | 7 +- .../impl/dispatch/util/dispatch_settings.cpp | 2 +- .../dispatch/util/size_literals.hpp} | 6 +- tt_metal/tools/CMakeLists.txt | 2 + tt_metal/tools/mem_bench/CMakeLists.txt | 40 ++ tt_metal/tools/mem_bench/README.md | 42 ++ tt_metal/tools/mem_bench/context.hpp | 78 +++ tt_metal/tools/mem_bench/device_utils.cpp | 92 +++ tt_metal/tools/mem_bench/device_utils.hpp | 26 + tt_metal/tools/mem_bench/host_utils.cpp | 87 +++ tt_metal/tools/mem_bench/host_utils.hpp | 85 +++ .../mem_bench/kernels/mem_bench_kernel.cpp | 99 ++++ tt_metal/tools/mem_bench/mem_bench.cpp | 545 ++++++++++++++++++ tt_metal/tools/mem_bench/work_thread.hpp | 77 +++ 14 files changed, 1178 insertions(+), 10 deletions(-) rename tt_metal/{api/tt-metalium/helpers.hpp => impl/dispatch/util/size_literals.hpp} (75%) create mode 100644 tt_metal/tools/mem_bench/CMakeLists.txt create mode 100644 tt_metal/tools/mem_bench/README.md create mode 100644 tt_metal/tools/mem_bench/context.hpp create mode 100644 tt_metal/tools/mem_bench/device_utils.cpp create mode 100644 tt_metal/tools/mem_bench/device_utils.hpp create mode 100644 tt_metal/tools/mem_bench/host_utils.cpp create mode 100644 tt_metal/tools/mem_bench/host_utils.hpp create mode 100644 tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp create mode 
100644 tt_metal/tools/mem_bench/mem_bench.cpp create mode 100644 tt_metal/tools/mem_bench/work_thread.hpp diff --git a/tt_metal/api/tt-metalium/command_queue_interface.hpp b/tt_metal/api/tt-metalium/command_queue_interface.hpp index 30de4f2e631..53f6eb068ea 100644 --- a/tt_metal/api/tt-metalium/command_queue_interface.hpp +++ b/tt_metal/api/tt-metalium/command_queue_interface.hpp @@ -3,11 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include #include #include #include -#include #include "cq_commands.hpp" #include "dispatch_core_manager.hpp" @@ -15,7 +13,6 @@ #include "memcpy.hpp" #include "hal.hpp" #include "dispatch_settings.hpp" -#include "helpers.hpp" #include "buffer.hpp" #include "umd/device/tt_core_coordinates.h" @@ -193,8 +190,8 @@ class DispatchMemMap { uint32_t prefetch_dispatch_unreserved_base = device_cq_addrs_[tt::utils::underlying_type( CommandQueueDeviceAddrType::UNRESERVED)]; - cmddat_q_base_ = prefetch_dispatch_unreserved_base + round_size(settings.prefetch_q_size_, pcie_alignment); - scratch_db_base_ = cmddat_q_base_ + round_size(settings.prefetch_cmddat_q_size_, pcie_alignment); + cmddat_q_base_ = align(prefetch_dispatch_unreserved_base + settings.prefetch_q_size_, pcie_alignment); + scratch_db_base_ = align(cmddat_q_base_ + settings.prefetch_cmddat_q_size_, pcie_alignment); dispatch_buffer_base_ = align(prefetch_dispatch_unreserved_base, 1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE); dispatch_buffer_block_size_pages_ = settings.dispatch_pages_ / DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS; const uint32_t dispatch_cb_end = dispatch_buffer_base_ + settings.dispatch_size_; diff --git a/tt_metal/impl/dispatch/util/dispatch_settings.cpp b/tt_metal/impl/dispatch/util/dispatch_settings.cpp index 7912a1f825d..a6003177a96 100644 --- a/tt_metal/impl/dispatch/util/dispatch_settings.cpp +++ b/tt_metal/impl/dispatch/util/dispatch_settings.cpp @@ -8,7 +8,7 @@ #include "magic_enum/magic_enum.hpp" #include "umd/device/tt_core_coordinates.h" #include -#include +#include "size_literals.hpp" namespace tt::tt_metal { diff --git a/tt_metal/api/tt-metalium/helpers.hpp b/tt_metal/impl/dispatch/util/size_literals.hpp similarity index 75% rename from tt_metal/api/tt-metalium/helpers.hpp rename to tt_metal/impl/dispatch/util/size_literals.hpp index aebf3f3f69a..061d9880904 100644 --- a/tt_metal/api/tt-metalium/helpers.hpp +++ b/tt_metal/impl/dispatch/util/size_literals.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -18,8 +18,6 @@ constexpr auto operator""_MB(const unsigned long long v) -> uint32_t { return 10 constexpr auto operator""_GB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * 1024 * v; } // Returns the size rounded up to the given alignment -inline uint32_t round_size(uint32_t sz, uint32_t alignment) { - return ((sz + alignment - 1) / alignment * alignment); -} +inline uint32_t round_size(uint32_t sz, uint32_t alignment) { return ((sz + alignment - 1) / alignment * alignment); } } // namespace tt::tt_metal diff --git a/tt_metal/tools/CMakeLists.txt b/tt_metal/tools/CMakeLists.txt index 3509710519a..186c1ea86c7 100644 --- a/tt_metal/tools/CMakeLists.txt +++ b/tt_metal/tools/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/profiler) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/watcher_dump) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/lightmetal_runner) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/mem_bench) set(TOOLS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/memset.cpp) @@ -10,6 +11,7 @@ target_link_libraries( PUBLIC profiler Metalium::Metal::LLRT + Metalium::Metal PRIVATE TT::Metalium::HostDevCommon ) diff --git a/tt_metal/tools/mem_bench/CMakeLists.txt b/tt_metal/tools/mem_bench/CMakeLists.txt new file mode 100644 index 00000000000..72127b9bb1c --- /dev/null +++ b/tt_metal/tools/mem_bench/CMakeLists.txt @@ -0,0 +1,40 @@ +set(IMPL_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/mem_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/host_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_utils.cpp +) + +set(HEADERS_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/host_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/work_thread.hpp +) + +add_executable( + mem_bench + ${IMPL_SRC} + ${HEADERS_SRC} +) +target_link_libraries( + mem_bench + PRIVATE + tt_metal + test_metal_common_libs + numa + benchmark::benchmark +) +target_include_directories( + mem_bench + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${CMAKE_CURRENT_SOURCE_DIR} +) +set_target_properties( + mem_bench + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/tools +) diff --git a/tt_metal/tools/mem_bench/README.md b/tt_metal/tools/mem_bench/README.md new file mode 100644 index 00000000000..b10a228789d --- /dev/null +++ b/tt_metal/tools/mem_bench/README.md @@ -0,0 +1,42 @@ +# tt mem_bench + +Utility to measure host and device bandwidth on Tenstorrent devices. + +## Build + +Tools are included in `tt_metal` builds. Using a release build is required for accurate perf measurements. + +## Usage + +By default, each test is run for 5 iterations and only basic tests are executed. All test patterns can be executed by specifying `--full`. Additional run parameters are listed below. + +Tests will report host bandwidth and/or device bandwidth. If device bandwidth is reported, then the average of all cores is reported as well as bandwidth for just a single core. + +> [!NOTE] +Reducing the `tt_metal` library log level by exporting `TT_METAL_LOGGER_LEVEL=fatal` will increase the readability of the output. + +> [!NOTE] +On NUMA systems, the host page for the device's command queue data is pinned on the memory node closest to where the device is located. If `tt_metal` is run on a different node then bandwidth will degrade because it'll need to cross sockets. Therefore, it's important to run `tt_metal` on the closest node. On Linux, the execution policy can be set using `numactl`. 
E.g., if the device is located on node 0, then `numactl --cpubind=0 --membind=0 ` will allocate resources closer to the device. + +``` +./build/tools/mem_bench --help +benchmark [--benchmark_list_tests={true|false}] + [--benchmark_filter=] + [--benchmark_min_time=`x` OR `s` ] + [--benchmark_min_warmup_time=] + [--benchmark_repetitions=] + [--benchmark_dry_run={true|false}] + [--benchmark_enable_random_interleaving={true|false}] + [--benchmark_report_aggregates_only={true|false}] + [--benchmark_display_aggregates_only={true|false}] + [--benchmark_format=] + [--benchmark_out=] + [--benchmark_out_format=] + [--benchmark_color={auto|true|false}] + [--benchmark_counters_tabular={true|false}] + [--benchmark_context==,...] + [--benchmark_time_unit={ns|us|ms|s}] + [--v=] + [--help] Shows this help message + [--full] Run all tests +``` diff --git a/tt_metal/tools/mem_bench/context.hpp b/tt_metal/tools/mem_bench/context.hpp new file mode 100644 index 00000000000..4bf8d8ff450 --- /dev/null +++ b/tt_metal/tools/mem_bench/context.hpp @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +namespace tt::tt_metal::tools::mem_bench { + +struct TestResult { + double host_bytes_processed{0}; + double host_time_elapsed{0}; + double host_wait_for_kernel_time_elapsed{0}; + + double total_cores_cycles{0}; + double total_cores_time{0}; + double total_cores_bytes_rd{0}; + double total_cores_bytes_wr{0}; + + double kernel_0_cycles{0}; + double kernel_0_time{0}; + double kernel_0_bytes_rd{0}; + double kernel_0_bytes_wr{0}; + + // Any additional values to be included in benchmark reports + std::map arb_counters; +}; + +struct L1MemoryMap { + uint32_t cycles; + uint32_t rd_bytes; + uint32_t wr_bytes; + uint32_t unreserved; +}; + +struct Context { + std::map devices; + L1MemoryMap device_address; + uint32_t total_size{0}; + uint32_t page_size{0}; + int threads{0}; + int number_reader_kernels{0}; + int number_writer_kernels{0}; + bool enable_host_copy_with_kernels{0}; + int iterations{0}; + + Context( + const std::map& devices_, + uint32_t total_size_, + uint32_t page_size_, + int threads_, + int readers_, + int writers_, + bool enable_host_copy_with_kernels_, + int iterations_) { + auto l1_alignment = experimental::hal::get_l1_alignment(); + auto l1_base = experimental::hal::get_tensix_l1_unreserved_base(); + device_address.cycles = l1_base; + device_address.rd_bytes = align(device_address.cycles + sizeof(uint32_t), l1_alignment); + device_address.wr_bytes = align(device_address.rd_bytes + sizeof(uint32_t), l1_alignment); + device_address.unreserved = align(device_address.wr_bytes + sizeof(uint32_t), l1_alignment); + devices = devices_; + total_size = total_size_; + page_size = page_size_; + threads = threads_; + number_reader_kernels = readers_; + number_writer_kernels = writers_; + enable_host_copy_with_kernels = enable_host_copy_with_kernels_; + iterations = iterations_; + } +}; + +} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/device_utils.cpp b/tt_metal/tools/mem_bench/device_utils.cpp new file mode 100644 index 00000000000..bd650a3c052 --- /dev/null +++ b/tt_metal/tools/mem_bench/device_utils.cpp @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include "device_utils.hpp" +#include "context.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +std::vector read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr) { + std::vector data; + for (int xi = cores.start_coord.x; xi <= cores.end_coord.x; ++xi) { + for (int yi = cores.start_coord.y; yi <= cores.end_coord.y; ++yi) { + std::vector single_data; + tt::tt_metal::detail::ReadFromDeviceL1(device, CoreCoord{xi, yi}, addr, sizeof(uint32_t), single_data); + data.push_back(single_data[0]); + } + } + return data; +} + +std::optional configure_kernels( + tt::tt_metal::IDevice* device, + tt::tt_metal::Program& program, + const Context& context, + uint32_t start_y, + uint32_t num_kernels, + bool is_writer, + uint32_t pcie_size, + uint32_t pcie_offset) { + constexpr std::string_view k_PcieBenchKernel = "tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp"; + const auto grid_size = device->logical_grid_size(); + const auto max_x = grid_size.x; + const auto max_y = grid_size.y; + uint32_t total_kernel_transfer = context.total_size; + uint32_t kernel_transfer_size = context.page_size; + + if (!kernel_transfer_size) { + kernel_transfer_size = total_kernel_transfer; + } else if (!num_kernels) { + return {}; + } + + // Number readers either less than one row + // or a multiple of the rows + CoreCoord start_coord{0, start_y}; + CoreCoord end_coord; + if (num_kernels <= max_x) { + end_coord.x = start_coord.x + num_kernels - 1; + end_coord.y = start_coord.y; + } else { + const auto number_of_rows = num_kernels / max_x; + const auto last_row_width = (num_kernels % max_x) ? num_kernels % max_x : max_x; + end_coord.x = start_coord.x + last_row_width - 1; + end_coord.y = number_of_rows - 1; + } + CoreRange core_range{start_coord, end_coord}; + + std::vector pcie_bench_compile_args(12, 0); + if (is_writer) { + pcie_bench_compile_args[5] = 0; // reserved_0 + pcie_bench_compile_args[6] = pcie_offset; // pcie_wr_base + pcie_bench_compile_args[7] = pcie_size; // pcie_wr_size + pcie_bench_compile_args[8] = kernel_transfer_size; // pcie_wr_transfer_size + } else { + pcie_bench_compile_args[0] = context.device_address.unreserved; // my_rd_dst_addr + pcie_bench_compile_args[1] = pcie_offset; // pcie_rd_base + pcie_bench_compile_args[2] = pcie_size; // pcie_rd_size + pcie_bench_compile_args[3] = kernel_transfer_size; // pcie_rd_transfer_size + } + pcie_bench_compile_args[4] = context.device_address.rd_bytes; // my_bytes_rd_addr + pcie_bench_compile_args[9] = context.device_address.wr_bytes; // my_bytes_wr_addr + pcie_bench_compile_args[10] = total_kernel_transfer; + pcie_bench_compile_args[11] = context.device_address.cycles; + + [[maybe_unused]] auto kernel = tt::tt_metal::CreateKernel( + program, + std::string{k_PcieBenchKernel}, + core_range, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, + .noc = tt::tt_metal::NOC_0, + .compile_args = pcie_bench_compile_args, + .defines = {}, + }); + + return core_range; +} + +} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/device_utils.hpp b/tt_metal/tools/mem_bench/device_utils.hpp new file mode 100644 index 00000000000..ab20ebfc3cc --- /dev/null +++ b/tt_metal/tools/mem_bench/device_utils.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include "context.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +std::vector read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr); + +std::optional configure_kernels( + tt::tt_metal::IDevice* device, + tt::tt_metal::Program& program, + const Context& context, + uint32_t start_y, + uint32_t num_kernels, + bool is_writer, + uint32_t pcie_size, + uint32_t pcie_offset = 0); + +} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/host_utils.cpp b/tt_metal/tools/mem_bench/host_utils.cpp new file mode 100644 index 00000000000..9aad3fe59fa --- /dev/null +++ b/tt_metal/tools/mem_bench/host_utils.cpp @@ -0,0 +1,87 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "host_utils.hpp" +#include +#include +#include +#include +#include +#include + +namespace tt::tt_metal::tools::mem_bench { + +void* get_hugepage(int device_id, uint32_t base_offset) { + auto& cluster = tt::Cluster::instance(); + auto mmio_device_id = cluster.get_associated_mmio_device(device_id); + auto channel = cluster.get_assigned_channel_for_device(device_id); + return (void*)(cluster.host_dma_address(base_offset, mmio_device_id, channel)); +} + +uint32_t get_hugepage_size(int device_id) { + auto& cluster = tt::Cluster::instance(); + auto mmio_device_id = cluster.get_associated_mmio_device(device_id); + auto channel = cluster.get_assigned_channel_for_device(device_id); + return cluster.get_host_channel_size(mmio_device_id, channel); +} + +tt::tt_metal::vector_memcpy_aligned generate_random_src_data(uint32_t num_bytes) { + std::uniform_int_distribution distribution( + std::numeric_limits::min(), std::numeric_limits::max()); + std::default_random_engine generator; + + tt::tt_metal::vector_memcpy_aligned vec(num_bytes / sizeof(uint32_t)); + std::generate(vec.begin(), vec.end(), [&]() { return distribution(generator); }); + + return vec; +} + +double get_current_time_seconds() { + return std::chrono::duration(std::chrono::high_resolution_clock::now().time_since_epoch()).count(); +} + +std::vector get_mmio_device_ids(int number_of_devices, int numa_node) { + auto& cluster = tt::Cluster::instance(); + const auto pcie_devices = cluster.number_of_pci_devices(); + std::vector device_ids; + + // Assumes PCIe device IDs are iterated first + for (int device_id = 0; device_id < pcie_devices && device_ids.size() < number_of_devices; ++device_id) { + // Not an MMIO device + if (cluster.get_associated_mmio_device(device_id) != device_id) { + continue; + } + + auto associated_node = cluster.get_numa_node_for_device(device_id); + if (numa_node == -1 || associated_node == numa_node) { + device_ids.push_back(device_id); + } + } + + return device_ids; +} + +std::vector get_mmio_device_ids_unique_nodes(int number_of_devices) { + auto& cluster = tt::Cluster::instance(); + const auto pcie_devices = cluster.number_of_pci_devices(); + std::vector device_ids; + std::unordered_set numa_nodes; + + for (int device_id = 0; device_id < pcie_devices && device_ids.size() < number_of_devices; ++device_id) { + auto associated_node = cluster.get_numa_node_for_device(device_id); + if (!numa_nodes.contains(associated_node)) { + device_ids.push_back(device_id); + numa_nodes.insert(associated_node); + } + } + + return device_ids; +} + +int get_number_of_mmio_devices() { + auto& cluster = tt::Cluster::instance(); + return cluster.number_of_pci_devices(); +} + +} 
// namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/host_utils.hpp b/tt_metal/tools/mem_bench/host_utils.hpp new file mode 100644 index 00000000000..c00d3e40ac3 --- /dev/null +++ b/tt_metal/tools/mem_bench/host_utils.hpp @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +// #include "work_thread.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +// Generate random data aligned for memcpy_to_device. +tt::tt_metal::vector_memcpy_aligned generate_random_src_data(uint32_t num_bytes); + +// Get current host time, in seconds. +double get_current_time_seconds(); + +// Return device ids. If numa_node is specified then only device ids on that +// node will be returned. If numa_node == -1, then the node is not taken into +// consideration. Note: Less than number_of_devices may be returned. +std::vector get_mmio_device_ids(int number_of_devices, int numa_node); + +// Returns device ids. All devices are on different nodes. Note: Less than +// number_of_devices may be returned. +std::vector get_mmio_device_ids_unique_nodes(int number_of_devices); + +// Returns the number of MMIO connected chips. +int get_number_of_mmio_devices(); + +// Returns the hugepage pointer assigned to a device. +void* get_hugepage(int device_id, uint32_t base_offset); + +// Returns the size of the hugepage assigned to a device. +uint32_t get_hugepage_size(int device_id); + +// Copy data to hugepage. Returns the duration. +// repeating_src_vector: Keep copying the same elements to hugepage. This should force the source data in stay in the +// caches. fence: Memory barrier at the end of each copy. Returns the time in seconds +template +double copy_to_hugepage( + void* hugepage_base, + uint32_t hugepage_size, + std::span src_data, + size_t total_size, + size_t page_size, + bool repeating_src_vector) { + uint64_t hugepage_addr = reinterpret_cast(hugepage_base); + uint64_t hugepage_end = hugepage_addr + hugepage_size; + uint64_t src_addr = reinterpret_cast(src_data.data()); + size_t num_pages; + if (!page_size) { + num_pages = 1; + page_size = total_size; + } else { + num_pages = total_size / page_size; + } + + auto start = get_current_time_seconds(); + for (int i = 0; i < num_pages; ++i) { + tt::tt_metal::memcpy_to_device((void*)(hugepage_addr), (void*)(src_addr), page_size); + + // 64 bit host address alignment + hugepage_addr = ((hugepage_addr + page_size - 1) | (tt::tt_metal::MEMCPY_ALIGNMENT - 1)) + 1; + + if (!repeating_src_vector) { + src_addr += page_size; + } + + // Wrap back to the beginning of hugepage + if (hugepage_addr + page_size >= hugepage_end) { + hugepage_addr = reinterpret_cast(hugepage_base); + } + } + auto end = get_current_time_seconds(); + + return end - start; +} + +}; // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp b/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp new file mode 100644 index 00000000000..e04b02013de --- /dev/null +++ b/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" +#include +#include +#include "noc_overlay_parameters.h" + +// +// Test Kernel for mem_bench +// +// Performs PCIe reads and/or writes +// + +// reader kernel +constexpr uint32_t my_rd_dst_addr = get_compile_time_arg_val(0); // L1 +constexpr uint32_t pcie_rd_base = get_compile_time_arg_val(1); +constexpr uint32_t pcie_rd_size = get_compile_time_arg_val(2); +constexpr uint32_t pcie_rd_end = pcie_rd_base + pcie_rd_size; +constexpr uint32_t pcie_rd_transfer_size = get_compile_time_arg_val(3); +constexpr uint32_t my_bytes_rd_addr = get_compile_time_arg_val(4); + +// writer kernel +constexpr uint32_t reserved_0 = get_compile_time_arg_val(5); +constexpr uint32_t pcie_wr_base = get_compile_time_arg_val(6); +constexpr uint32_t pcie_wr_size = get_compile_time_arg_val(7); +constexpr uint32_t pcie_wr_end = pcie_wr_base + pcie_wr_size; +constexpr uint32_t pcie_wr_transfer_size = get_compile_time_arg_val(8); +constexpr uint32_t my_bytes_wr_addr = get_compile_time_arg_val(9); + +// common to both +constexpr uint32_t my_total_work = get_compile_time_arg_val(10); // Total bytes to read+write +constexpr uint32_t my_cycles_addr = get_compile_time_arg_val(11); + +static_assert(my_bytes_rd_addr && my_bytes_wr_addr, "Must provide addresses for my_bytes_rd/wr_addr"); +static_assert(my_cycles_addr, "Must provide L1 address for cycles elapsed"); + +uint64_t get_cycles() { + uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + return (((uint64_t)timestamp_high) << 32) | timestamp_low; +} + +void kernel_main() { + auto my_cycles = reinterpret_cast(my_cycles_addr); + auto my_bytes_read = reinterpret_cast(my_bytes_rd_addr); + auto my_bytes_written = reinterpret_cast(my_bytes_wr_addr); + + my_bytes_read[0] = 0; + my_bytes_written[0] = 0; + my_cycles[0] = 0; + + uint64_t pcie_noc_xy_encoding = (uint64_t)NOC_XY_PCIE_ENCODING(PCIE_NOC_X, PCIE_NOC_Y); + uint32_t rd_ptr = pcie_rd_base; + uint32_t wr_ptr = pcie_wr_base; + + const auto start = get_cycles(); + + uint32_t total_bytes_read = 0; + uint32_t total_bytes_written = 0; + while (total_bytes_read + total_bytes_written < my_total_work) { + if constexpr (my_rd_dst_addr) { + uint64_t host_src_addr = pcie_noc_xy_encoding | rd_ptr; + noc_async_read(host_src_addr, my_rd_dst_addr, pcie_rd_transfer_size); + rd_ptr += pcie_rd_transfer_size; + total_bytes_read += pcie_rd_transfer_size; + if (rd_ptr >= pcie_rd_end) { + rd_ptr = pcie_rd_base; + } + } + if constexpr (pcie_wr_size) { + uint64_t host_dst_addr = pcie_noc_xy_encoding | wr_ptr; + noc_async_write( + wr_ptr, // Any data + host_dst_addr, + pcie_wr_transfer_size); + wr_ptr += pcie_wr_transfer_size; + total_bytes_written += pcie_wr_transfer_size; + if (wr_ptr >= pcie_wr_end) { + wr_ptr = pcie_wr_base; + } + } + } + + if constexpr (my_rd_dst_addr) { + noc_async_read_barrier(); + } + if constexpr (pcie_wr_size) { + noc_async_write_barrier(); + } + + auto end = get_cycles(); + my_cycles[0] = end - start; + my_bytes_read[0] = total_bytes_read; + my_bytes_written[0] = total_bytes_written; +} diff --git a/tt_metal/tools/mem_bench/mem_bench.cpp b/tt_metal/tools/mem_bench/mem_bench.cpp new file mode 100644 index 00000000000..da0b2a8a8af --- /dev/null +++ b/tt_metal/tools/mem_bench/mem_bench.cpp @@ -0,0 +1,545 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "context.hpp" +#include "host_utils.hpp" +#include "device_utils.hpp" +#include "work_thread.hpp" +#include "tt_metal/impl/dispatch/util/size_literals.hpp" + +using namespace tt; +using namespace tt::tt_metal; +using namespace tt::tt_metal::tools::mem_bench; + +// Read L1 counters (cycles, bytes rd, bytes wr) and increment test_results +void read_inc_data_from_cores(const Context& ctx, IDevice* device, const CoreRange& cores, TestResult& test_results) { + auto dev_cycles = read_cores(device, cores, ctx.device_address.cycles); + auto dev_bytes_read = read_cores(device, cores, ctx.device_address.rd_bytes); + auto dev_bytes_written = read_cores(device, cores, ctx.device_address.wr_bytes); + auto dev_clk = tt::Cluster::instance().get_device_aiclk(device->id()) * 1e6; // Hz + + double total_cycles = std::reduce(dev_cycles.begin(), dev_cycles.end(), 0ULL); + + test_results.total_cores_cycles += total_cycles; + test_results.total_cores_time += total_cycles / dev_clk; + // Reduce with 64 bits to prevent overflow as values read from device is 32 bits + test_results.total_cores_bytes_rd += std::reduce(dev_bytes_read.begin(), dev_bytes_read.end(), 0ULL); + test_results.total_cores_bytes_wr += std::reduce(dev_bytes_written.begin(), dev_bytes_written.end(), 0ULL); + + test_results.kernel_0_cycles += dev_cycles[0]; + test_results.kernel_0_time += dev_cycles[0] / dev_clk; + test_results.kernel_0_bytes_rd += dev_bytes_read[0]; + test_results.kernel_0_bytes_wr += dev_bytes_written[0]; +} + +// Report device bandwidth to the benchmark state +// Average bw will be reported as "dev_bw" as well as the bw for the +// first core will also be reported by itself as "kernel_0_bw". +void report_device_bw(benchmark::State& state, const TestResult& test_results) { + state.counters["dev_bw"] = + (test_results.total_cores_bytes_rd + test_results.total_cores_bytes_wr) / test_results.total_cores_time; + state.counters["dev_rd_bytes"] = test_results.total_cores_bytes_rd; + state.counters["dev_wr_bytes"] = test_results.total_cores_bytes_wr; + state.counters["dev_rd_bw"] = test_results.total_cores_bytes_rd / test_results.total_cores_time; + state.counters["dev_wr_bw"] = test_results.total_cores_bytes_wr / test_results.total_cores_time; + state.counters["dev_cycles"] = test_results.total_cores_cycles; + + state.counters["kernel_0_bw"] = + (test_results.kernel_0_bytes_rd + test_results.kernel_0_bytes_wr) / test_results.kernel_0_time; + state.counters["kernel_0_rd_bw"] = test_results.kernel_0_bytes_rd / test_results.kernel_0_time; + state.counters["kernel_0_wr_bw"] = test_results.kernel_0_bytes_wr / test_results.kernel_0_time; + state.counters["kernel_0_cycles"] = test_results.kernel_0_cycles; +} + +// Benchmark various memcpy_to_device transfer sizes. +// Reports host bw. +TestResult mem_bench_page_sizing(benchmark::State& state) { + constexpr uint32_t k_DeviceId = 0; + TestResult results; + Context ctx{ + {}, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + 0, // Readers + 0, // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(k_DeviceId, 0); + auto hugepage_size = get_hugepage_size(k_DeviceId); + bool cached = state.range(2); + + for (auto _ : state) { + const double iteration_time = + cached ? 
copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, true) + : copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, false); + results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += iteration_time; + + state.SetIterationTime(iteration_time); + } + state.SetBytesProcessed(ctx.total_size * state.iterations()); + return results; +} + +// Benchmark memcpy_to_device on multiple threads to try saturating host bandwidth. +// Reports host bw. +TestResult mem_bench_copy_multithread(benchmark::State& state) { + static_assert((MEMCPY_ALIGNMENT & ((MEMCPY_ALIGNMENT)-1)) == 0); + constexpr uint32_t k_DeviceId = 0; + TestResult results; + Context ctx{ + {}, + state.range(0), // Total size + state.range(1), // Page size + state.range(2), // Threads + 0, // Readers + 0, // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(0, 0); + const auto hugepage_size = get_hugepage_size(0); + const auto bytes_per_thread = ((ctx.total_size / ctx.threads) + (MEMCPY_ALIGNMENT)-1) & -(MEMCPY_ALIGNMENT); + const auto last_thread_bytes = ctx.total_size - (bytes_per_thread * (ctx.threads - 1)); + + for (auto _ : state) { + auto iteration_time = execute_work_synced_start( + ctx.threads, + [&](int thread_idx) { + uint64_t thread_dst = (uint64_t)hugepage + (thread_idx * bytes_per_thread); + uint64_t thread_bytes = (thread_idx == ctx.threads - 1) ? last_thread_bytes : bytes_per_thread; + std::span thread_src{src_data}; + thread_src = thread_src.subspan( + (thread_idx * bytes_per_thread) / sizeof(uint32_t), thread_bytes / sizeof(uint32_t)); + copy_to_hugepage( + (void*)thread_dst, hugepage_size, thread_src, thread_bytes, ctx.page_size, false); + }, + []() {}); + + results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += iteration_time; + + state.SetIterationTime(iteration_time); + } + + state.SetBytesProcessed(ctx.total_size * state.iterations()); + return results; +} + +// Benchmark memcpy_to_device while the device is reading the hugepage. +// Reports host bw and device bw. 
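+// Benchmark args: range(0) = total bytes to transfer, range(1) = page size,
+// range(2) = number of reader kernels, range(3) = whether the host also copies
+// into the hugepage while the kernels run (bytes processed is reported as 0 otherwise).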
+TestResult mem_bench_copy_with_active_kernel(benchmark::State& state) { + TestResult results; + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); + IDevice* device = (*(devices.begin())).second; + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers + 0, // Writers + state.range(3), // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(device->id(), 0); + auto hugepage_size = get_hugepage_size(device->id()); + + for (auto _ : state) { + auto pgm = CreateProgram(); + auto configured_cores = configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, hugepage_size); + double host_copy_time = 1; // Set to 1 so it doesn't divide by 0 if host copy is disabled + + double wait_for_kernel_time = execute_work_synced_start( + 1, + [device, &pgm](int thread_idx) { + // Program + tt::tt_metal::detail::LaunchProgram(device, pgm, true); + }, + [&]() { + if (ctx.enable_host_copy_with_kernels) { + // Host copy while waiting for program + host_copy_time = + copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, false); + results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += host_copy_time; + } + }); + + results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; + + read_inc_data_from_cores(ctx, device, configured_cores.value(), results); + + state.SetIterationTime(host_copy_time); + } + if (ctx.enable_host_copy_with_kernels) { + state.SetBytesProcessed(ctx.total_size * state.iterations()); + } else { + state.SetBytesProcessed(0); + } + + report_device_bw(state, results); + tt::tt_metal::detail::CloseDevices(devices); + return results; +} + +// Host writing to a hugepage while the device pulls from another hugepage. +// Reports host bw and device bw. 
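+// Benchmark args: range(0) = total bytes, range(1) = page size, range(2) = number of reader kernels.
+// The kernels pull from this device's hugepage while the host writes into the hugepage
+// assigned to the next device id, so host and device traffic target different pages.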
+TestResult mem_bench_copy_active_kernel_different_page(benchmark::State& state) { + TestResult results; + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); + IDevice* device = (*(devices.begin())).second; + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers + 0, // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto device_hugepage_size = get_hugepage_size(device->id()); + + // 2nd open device is not required + auto host_hugepage = get_hugepage(device->id() + 1, 0); + auto host_hugepage_size = get_hugepage_size(device->id() + 1); + + for (auto _ : state) { + auto pgm = CreateProgram(); + auto configured_cores = + configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, device_hugepage_size).value(); + double host_copy_time = 0; + + double wait_for_kernel_time = execute_work_synced_start( + 1, + [device, &pgm](int thread_idx) { + // Program + tt::tt_metal::detail::LaunchProgram(device, pgm, true); + }, + [&]() { + // Host copy while waiting for program + host_copy_time = + copy_to_hugepage(host_hugepage, host_hugepage_size, src_data, ctx.total_size, ctx.page_size, false); + results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += host_copy_time; + }); + + results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; + + read_inc_data_from_cores(ctx, device, configured_cores, results); + + state.SetIterationTime(host_copy_time); + } + + state.SetBytesProcessed(ctx.total_size * state.iterations()); + + report_device_bw(state, results); + tt::tt_metal::detail::CloseDevices(devices); + return results; +} + +// Common Multi MMIO device test. +TestResult mem_bench_multi_mmio_devices( + benchmark::State& state, std::map& devices, const Context& ctx) { + TestResult results; + + // One thread to wait for program on each device + int num_threads = devices.size(); + + for (auto _ : state) { + std::map programs; // device : programs + std::map configured_core_ranges; // device : cores + for (auto [device_id, device] : devices) { + programs[device_id] = CreateProgram(); + Program& pgm = programs[device_id]; + auto device_hugepage = get_hugepage(device_id, 0); + auto device_hugepage_size = get_hugepage_size(device_id); + configured_core_ranges.insert( + {device_id, + configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, device_hugepage_size) + .value()}); + } + + double host_copy_time = 0; + execute_work_synced_start( + 1, + [devices, &programs](int thread_idx) { + // Program + for (auto& [device_id, pgm] : programs) { + tt::tt_metal::detail::LaunchProgram(devices.at(device_id), pgm, false); + } + }, + []() {}); + + // Wait all programs to complete + for (auto& [device_id, pgm] : programs) { + tt::tt_metal::detail::WaitProgramDone(devices.at(device_id), pgm); + } + + // Read counters from each core + for (auto& [device_id, core_range] : configured_core_ranges) { + read_inc_data_from_cores(ctx, devices.at(device_id), core_range, results); + } + + // This test does not report host bw + state.SetIterationTime(1); + } + + state.SetBytesProcessed(0); + report_device_bw(state, results); + state.counters["num_mmio_devices"] = devices.size(); + + return results; +} + +// Multi MMIO devices reading on the same NUMA node. 
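+// Benchmark args: range(0) = total bytes, range(1) = page size, range(2) = reader kernels per device.
+// Opens every MMIO device on NUMA node 0 and delegates to mem_bench_multi_mmio_devices above.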
+TestResult mem_bench_multi_mmio_devices_reading_same_node(benchmark::State& state) { + // Node 0 + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(get_number_of_mmio_devices(), 0)); + + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers on each device + 0, // Writers + false, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + TestResult results = mem_bench_multi_mmio_devices(state, devices, ctx); + tt::tt_metal::detail::CloseDevices(devices); + + return results; +} + +// Multi MMIO devices reading on different NUMA nodes. +TestResult mem_bench_multi_mmio_devices_reading_different_node(benchmark::State& state) { + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids_unique_nodes(get_number_of_mmio_devices())); + + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers on each device + 0, // Writers + false, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + TestResult results = mem_bench_multi_mmio_devices(state, devices, ctx); + tt::tt_metal::detail::CloseDevices(devices); + + return results; +} + +// Benchmark memcpy_to_device while device is reading (prefetching) and writing (dispatching data back to host) +// First half of hugepage will be written to by host +// Second half will be written to by device +TestResult mem_bench_copy_with_read_and_write_kernel(benchmark::State& state) { + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); + IDevice* device = (*(devices.begin())).second; + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers + state.range(3), // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(device->id(), 0); + auto hugepage_size = get_hugepage_size(device->id()); + + // Don't need to separate device results + // Readers will have 0 bytes written + // Writers will have 0 bytes read. Will not mix.
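+    // Reader kernels run on row 0 and pull from the first half of the hugepage; writer kernels
+    // run on row 1 and push to the second half, so a single TestResult can accumulate both.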
+ TestResult results; + + for (auto _ : state) { + auto pgm = CreateProgram(); + auto configured_read_cores = + configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, hugepage_size / 2).value(); + // Offset write cores to second half of PCIe + // Use second row + auto configured_write_cores = + configure_kernels( + device, pgm, ctx, 1, ctx.number_writer_kernels, true, hugepage_size / 2, hugepage_size / 2) + .value(); + double host_copy_time = 0; + + double wait_for_kernel_time = execute_work_synced_start( + 1, + [device, &pgm](int thread_idx) { + // Program + tt::tt_metal::detail::LaunchProgram(device, pgm, true); + }, + [&]() { + // Host copy while waiting for program + host_copy_time = + copy_to_hugepage(hugepage, hugepage_size / 2, src_data, ctx.total_size, ctx.page_size, false); + results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += host_copy_time; + }); + + results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; + + read_inc_data_from_cores(ctx, device, configured_read_cores, results); + read_inc_data_from_cores(ctx, device, configured_write_cores, results); + + state.SetIterationTime(host_copy_time); + } + + state.SetBytesProcessed(ctx.total_size * state.iterations()); + report_device_bw(state, results); + tt::tt_metal::detail::CloseDevices(devices); + return results; +} + +void global_bench_args(benchmark::internal::Benchmark* b) { b->UseManualTime()->Iterations(5); } + +void register_basic_benchmark_suite() { + ::benchmark::RegisterBenchmark("Host Copy Page Sizing", mem_bench_page_sizing) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {16, 8_KB, 16_KB, 32_KB}, + {false}, + }); + ::benchmark::RegisterBenchmark("Host Copy (Cached)", mem_bench_page_sizing) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {16, 8_KB, 16_KB, 32_KB}, + {true}, + }); + ::benchmark::RegisterBenchmark("Host Copy Saturation", mem_bench_copy_multithread) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4, 5, 6, 7, 8}, + }); + ::benchmark::RegisterBenchmark("Device Reading Host", mem_bench_copy_with_active_kernel) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4}, + {false}, + }); +} + +void register_full_benchmark_suite() { + ::benchmark::RegisterBenchmark("Host Copy with Active Kernel", mem_bench_copy_with_active_kernel) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4}, + {false}, + }); + ::benchmark::RegisterBenchmark( + "Host Copy with Active Kernel on Different Hugepages", mem_bench_copy_active_kernel_different_page) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4}, + }); + ::benchmark::RegisterBenchmark( + "Host Copy with Active Kernel Reading and Writing", mem_bench_copy_with_read_and_write_kernel) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2}, + {1, 2}, + }); + ::benchmark::RegisterBenchmark( + "Multiple MMIO Devices Reading (Same NUMA node)", mem_bench_multi_mmio_devices_reading_same_node) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2}, + }); + ::benchmark::RegisterBenchmark( + "Multiple MMIO Devices Reading (Different NUMA node)", mem_bench_multi_mmio_devices_reading_different_node) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2}, + }); +} + +void print_help() { + ::benchmark::PrintDefaultHelp(); + std::cout << " [--help] Shows this help message\n"; + std::cout << " [--full] Run all tests\n"; + std::cout << 
"\nCounters\n"; + std::cout << " bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured.\n"; + std::cout << " dev_bw: Average device core PCIe pull bandwidth. 0 if not measured.\n"; +} + +int main(int argc, char* argv[]) { + std::vector input_args(argv, argv + argc); + if (test_args::has_command_option(input_args, "--help")) { + print_help(); + return 0; + } + + // Force TT_METAL options + setenv("TT_METAL_SLOW_DISPATCH_MODE", "true", true); + setenv("TT_METAL_CLEAR_L1", "1", true); + // May be overridden by the user + setenv("TT_METAL_LOGGER_LEVEL", "FATAL", false); + + char arg0_default[] = "benchmark"; + char* args_default = arg0_default; + if (!argv) { + argc = 1; + argv = &args_default; + } + + // Run basic benchmarks + register_basic_benchmark_suite(); + + // Run all benchmarks + if (test_args::has_command_option(input_args, "--full")) { + register_full_benchmark_suite(); + } + + ::benchmark::Initialize(&argc, argv); + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + return 0; +} diff --git a/tt_metal/tools/mem_bench/work_thread.hpp b/tt_metal/tools/mem_bench/work_thread.hpp new file mode 100644 index 00000000000..53a89f6010b --- /dev/null +++ b/tt_metal/tools/mem_bench/work_thread.hpp @@ -0,0 +1,77 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "host_utils.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +// Execute work_fn on num_threads threads and also do intermediate_fn on the side. +// Returns time taken in seconds for all work_fn to complete. Time is calculated by latest thread end - earliest thread +// start. +template +double execute_work_synced_start(int num_threads, F&& work_fn, IntermediateF&& intermediate_fn, Args&&... args) { + std::mutex m; + int threads_ready{0}; + std::condition_variable go_cv; // Signal to all threads to go + auto total_threads = num_threads + 1; // Including intermediate + std::vector thread_start_times(num_threads); + std::vector thread_end_times(num_threads); + std::vector threads(total_threads); + + for (int i = 0; i < num_threads; ++i) { + threads[i] = std::thread([i, + &m, + &go_cv, + &threads_ready, + &thread_start_times, + &thread_end_times, + total_threads, + work_fn = std::forward(work_fn), + ... 
args = std::forward(args)]() mutable { + { + std::unique_lock lk{m}; + threads_ready++; + if (threads_ready == total_threads) { + go_cv.notify_all(); + } + go_cv.wait(lk, [&] { return threads_ready == total_threads; }); + } + + thread_start_times[i] = get_current_time_seconds(); + work_fn(i, std::forward(args)...); + thread_end_times[i] = get_current_time_seconds(); + }); + } + + threads[num_threads] = std::thread([&]() mutable { + std::unique_lock lk{m}; + threads_ready++; + if (threads_ready == total_threads) { + go_cv.notify_all(); + } + go_cv.wait(lk, [&] { return threads_ready == total_threads; }); + + intermediate_fn(); + }); + + for (auto& thread : threads) { + thread.join(); + } + + // Calculate work time based on earliest start and latest end + double earliest_start = *std::min_element(thread_start_times.begin(), thread_start_times.end()); + double latest_end = *std::max_element(thread_end_times.begin(), thread_end_times.end()); + + return latest_end - earliest_start; +} + +}; // namespace tt::tt_metal::tools::mem_bench From 4a0a20b4fdc1676e7ffb0edc142f2b1b5bec32ca Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Fri, 21 Feb 2025 07:27:01 +0000 Subject: [PATCH 230/316] #0: update README.md --- tt_metal/tools/mem_bench/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tt_metal/tools/mem_bench/README.md b/tt_metal/tools/mem_bench/README.md index b10a228789d..03f2731d0d0 100644 --- a/tt_metal/tools/mem_bench/README.md +++ b/tt_metal/tools/mem_bench/README.md @@ -13,7 +13,7 @@ By default, each test is run for 5 iterations and only basic tests are executed. Tests will report host bandwidth and/or device bandwidth. If device bandwidth is reported, then the average of all cores is reported as well as bandwidth for just a single core. > [!NOTE] -Reducing the `tt_metal` library log level by exporting `TT_METAL_LOGGER_LEVEL=fatal` will increase the readability of the output. +The `tt_metal` library log level can be adjusted by exporting `TT_METAL_LOGGER_LEVEL=fatal|info|error|debug`. > [!NOTE] On NUMA systems, the host page for the device's command queue data is pinned on the memory node closest to where the device is located. If `tt_metal` is run on a different node then bandwidth will degrade because it'll need to cross sockets. Therefore, it's important to run `tt_metal` on the closest node. On Linux, the execution policy can be set using `numactl`. E.g., if the device is located on node 0, then `numactl --cpubind=0 --membind=0 ` will allocate resources closer to the device. @@ -39,4 +39,8 @@ benchmark [--benchmark_list_tests={true|false}] [--v=] [--help] Shows this help message [--full] Run all tests + +Counters + bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured. + dev_bw: Average device core PCIe pull bandwidth. 0 if not measured. 
``` From 5c8cbd2150ac7cf2a0f468da42d6599b4e4a1f45 Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Fri, 21 Feb 2025 22:06:17 +0000 Subject: [PATCH 231/316] Remove autoformat argument from get_workers_for_op_output (#18163) ### Ticket ### Problem description We're trying to simplify and remove autoformat ### What's changed Removed enable_autoformat_device from get_workers_for_op_output in run_operation ### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13446219871) - [x] New/Existing tests provide coverage for changes --- ttnn/cpp/ttnn/decorators.hpp | 9 +++------ ttnn/cpp/ttnn/run_operation.cpp | 16 +--------------- ttnn/cpp/ttnn/run_operation.hpp | 4 +--- 3 files changed, 5 insertions(+), 24 deletions(-) diff --git a/ttnn/cpp/ttnn/decorators.hpp b/ttnn/cpp/ttnn/decorators.hpp index f571ed9c86e..7a08ad5d57c 100644 --- a/ttnn/cpp/ttnn/decorators.hpp +++ b/ttnn/cpp/ttnn/decorators.hpp @@ -54,8 +54,6 @@ auto extract_args_to_vector(args_t&&... args) { template inline auto create_async_output_tensors( const Tensors& inputs, const OptionalConstTensors& optional_inputs, args_t&&... args) { - bool enable_autoformat_device = false; - constexpr bool custom_create_async_outputs = requires(const operation_t& t) { t.create_async_output_tensors(inputs, optional_inputs); }; @@ -72,15 +70,14 @@ inline auto create_async_output_tensors( return operation_t::create_async_optional_output_tensors(std::forward(args)...); } else if constexpr (std::is_same_v, Tensor>) { - return std::vector{Tensor( - tt::tt_metal::operation::get_workers_for_op_output(inputs, optional_inputs, enable_autoformat_device))}; + return std::vector{Tensor(tt::tt_metal::operation::get_workers_for_op_output(inputs, optional_inputs))}; } else if constexpr (detail::is_homogenous_tuple()) { Tensors output_tensors; output_tensors.reserve(std::tuple_size_v); for (auto index = 0; index < std::tuple_size_v; index++) { - output_tensors.emplace_back(Tensor( - tt::tt_metal::operation::get_workers_for_op_output(inputs, optional_inputs, enable_autoformat_device))); + output_tensors.emplace_back( + Tensor(tt::tt_metal::operation::get_workers_for_op_output(inputs, optional_inputs))); } return output_tensors; } else { diff --git a/ttnn/cpp/ttnn/run_operation.cpp b/ttnn/cpp/ttnn/run_operation.cpp index 022ac257070..3e317d67a22 100644 --- a/ttnn/cpp/ttnn/run_operation.cpp +++ b/ttnn/cpp/ttnn/run_operation.cpp @@ -571,9 +571,7 @@ void validate_workers_and_storage( } std::vector get_workers_for_op_output( - const std::vector& inputs, - const std::vector>& optional_inputs, - bool enable_autoformat_device) { + const std::vector& inputs, const std::vector>& optional_inputs) { using ttnn::operations::experimental::auto_format::AutoFormat; std::vector workers_for_op = {}; // Infer output workers from inputs. For multi-device tensors the number @@ -600,18 +598,6 @@ std::vector get_workers_for_op_output( } } } - if (enable_autoformat_device) { - validate_workers_and_storage(inputs, optional_inputs, workers_for_op); - // Workers not specified - inputs are on host and not multi-device. - // Use the default device from autoformat. 
- if (not workers_for_op.size()) { - TT_FATAL( - AutoFormat::GetDefaultDevice(), - "Default device must be specified using AutoFormat::SetDefaultDevice, if workers are not specified for " - "inputs to op."); - workers_for_op = {AutoFormat::GetDefaultDevice()}; - } - } return workers_for_op; } diff --git a/ttnn/cpp/ttnn/run_operation.hpp b/ttnn/cpp/ttnn/run_operation.hpp index aa1a44367c0..f83319dd02f 100644 --- a/ttnn/cpp/ttnn/run_operation.hpp +++ b/ttnn/cpp/ttnn/run_operation.hpp @@ -157,9 +157,7 @@ void launch_with_autoformat( const OptionalTensors& optional_output_tensors = {}); std::vector get_workers_for_op_output( - const std::vector& inputs, - const std::vector>& optional_inputs = {}, - bool enable_autoformat_device = true); + const std::vector& inputs, const std::vector>& optional_inputs = {}); namespace detail { IDevice* get_device(const Tensors& input_tensors, const OptionalConstTensors& optional_input_tensors = {}); From 36ea77910c71b85d13dd69c58e05e6aa0b95fde1 Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Fri, 21 Feb 2025 23:58:05 +0000 Subject: [PATCH 232/316] Revert "#0: update README.md" This reverts commit 4a0a20b4fdc1676e7ffb0edc142f2b1b5bec32ca. --- tt_metal/tools/mem_bench/README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tt_metal/tools/mem_bench/README.md b/tt_metal/tools/mem_bench/README.md index 03f2731d0d0..b10a228789d 100644 --- a/tt_metal/tools/mem_bench/README.md +++ b/tt_metal/tools/mem_bench/README.md @@ -13,7 +13,7 @@ By default, each test is run for 5 iterations and only basic tests are executed. Tests will report host bandwidth and/or device bandwidth. If device bandwidth is reported, then the average of all cores is reported as well as bandwidth for just a single core. > [!NOTE] -The `tt_metal` library log level can be adjusted by exporting `TT_METAL_LOGGER_LEVEL=fatal|info|error|debug`. +Reducing the `tt_metal` library log level by exporting `TT_METAL_LOGGER_LEVEL=fatal` will increase the readability of the output. > [!NOTE] On NUMA systems, the host page for the device's command queue data is pinned on the memory node closest to where the device is located. If `tt_metal` is run on a different node then bandwidth will degrade because it'll need to cross sockets. Therefore, it's important to run `tt_metal` on the closest node. On Linux, the execution policy can be set using `numactl`. E.g., if the device is located on node 0, then `numactl --cpubind=0 --membind=0 ` will allocate resources closer to the device. @@ -39,8 +39,4 @@ benchmark [--benchmark_list_tests={true|false}] [--v=] [--help] Shows this help message [--full] Run all tests - -Counters - bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured. - dev_bw: Average device core PCIe pull bandwidth. 0 if not measured. ``` From 785d4544cd18705b9b20b1602d1e6377cf30694b Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Fri, 21 Feb 2025 23:58:11 +0000 Subject: [PATCH 233/316] Revert "#0: comprehensive mem benchmark tool" This reverts commit 532dd26223ae0ac824945fd32827ad8595f32fe2. 
--- .../tt-metalium/command_queue_interface.hpp | 7 +- .../tt-metalium/helpers.hpp} | 6 +- .../impl/dispatch/util/dispatch_settings.cpp | 2 +- tt_metal/tools/CMakeLists.txt | 2 - tt_metal/tools/mem_bench/CMakeLists.txt | 40 -- tt_metal/tools/mem_bench/README.md | 42 -- tt_metal/tools/mem_bench/context.hpp | 78 --- tt_metal/tools/mem_bench/device_utils.cpp | 92 --- tt_metal/tools/mem_bench/device_utils.hpp | 26 - tt_metal/tools/mem_bench/host_utils.cpp | 87 --- tt_metal/tools/mem_bench/host_utils.hpp | 85 --- .../mem_bench/kernels/mem_bench_kernel.cpp | 99 ---- tt_metal/tools/mem_bench/mem_bench.cpp | 545 ------------------ tt_metal/tools/mem_bench/work_thread.hpp | 77 --- 14 files changed, 10 insertions(+), 1178 deletions(-) rename tt_metal/{impl/dispatch/util/size_literals.hpp => api/tt-metalium/helpers.hpp} (75%) delete mode 100644 tt_metal/tools/mem_bench/CMakeLists.txt delete mode 100644 tt_metal/tools/mem_bench/README.md delete mode 100644 tt_metal/tools/mem_bench/context.hpp delete mode 100644 tt_metal/tools/mem_bench/device_utils.cpp delete mode 100644 tt_metal/tools/mem_bench/device_utils.hpp delete mode 100644 tt_metal/tools/mem_bench/host_utils.cpp delete mode 100644 tt_metal/tools/mem_bench/host_utils.hpp delete mode 100644 tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp delete mode 100644 tt_metal/tools/mem_bench/mem_bench.cpp delete mode 100644 tt_metal/tools/mem_bench/work_thread.hpp diff --git a/tt_metal/api/tt-metalium/command_queue_interface.hpp b/tt_metal/api/tt-metalium/command_queue_interface.hpp index 53f6eb068ea..30de4f2e631 100644 --- a/tt_metal/api/tt-metalium/command_queue_interface.hpp +++ b/tt_metal/api/tt-metalium/command_queue_interface.hpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once +#include #include #include #include +#include #include "cq_commands.hpp" #include "dispatch_core_manager.hpp" @@ -13,6 +15,7 @@ #include "memcpy.hpp" #include "hal.hpp" #include "dispatch_settings.hpp" +#include "helpers.hpp" #include "buffer.hpp" #include "umd/device/tt_core_coordinates.h" @@ -190,8 +193,8 @@ class DispatchMemMap { uint32_t prefetch_dispatch_unreserved_base = device_cq_addrs_[tt::utils::underlying_type( CommandQueueDeviceAddrType::UNRESERVED)]; - cmddat_q_base_ = align(prefetch_dispatch_unreserved_base + settings.prefetch_q_size_, pcie_alignment); - scratch_db_base_ = align(cmddat_q_base_ + settings.prefetch_cmddat_q_size_, pcie_alignment); + cmddat_q_base_ = prefetch_dispatch_unreserved_base + round_size(settings.prefetch_q_size_, pcie_alignment); + scratch_db_base_ = cmddat_q_base_ + round_size(settings.prefetch_cmddat_q_size_, pcie_alignment); dispatch_buffer_base_ = align(prefetch_dispatch_unreserved_base, 1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE); dispatch_buffer_block_size_pages_ = settings.dispatch_pages_ / DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS; const uint32_t dispatch_cb_end = dispatch_buffer_base_ + settings.dispatch_size_; diff --git a/tt_metal/impl/dispatch/util/size_literals.hpp b/tt_metal/api/tt-metalium/helpers.hpp similarity index 75% rename from tt_metal/impl/dispatch/util/size_literals.hpp rename to tt_metal/api/tt-metalium/helpers.hpp index 061d9880904..aebf3f3f69a 100644 --- a/tt_metal/impl/dispatch/util/size_literals.hpp +++ b/tt_metal/api/tt-metalium/helpers.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -18,6 +18,8 @@ constexpr auto operator""_MB(const unsigned long long v) -> uint32_t { return 10 constexpr auto operator""_GB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * 1024 * v; } // Returns the size rounded up to the given alignment -inline uint32_t round_size(uint32_t sz, uint32_t alignment) { return ((sz + alignment - 1) / alignment * alignment); } +inline uint32_t round_size(uint32_t sz, uint32_t alignment) { + return ((sz + alignment - 1) / alignment * alignment); +} } // namespace tt::tt_metal diff --git a/tt_metal/impl/dispatch/util/dispatch_settings.cpp b/tt_metal/impl/dispatch/util/dispatch_settings.cpp index a6003177a96..7912a1f825d 100644 --- a/tt_metal/impl/dispatch/util/dispatch_settings.cpp +++ b/tt_metal/impl/dispatch/util/dispatch_settings.cpp @@ -8,7 +8,7 @@ #include "magic_enum/magic_enum.hpp" #include "umd/device/tt_core_coordinates.h" #include -#include "size_literals.hpp" +#include namespace tt::tt_metal { diff --git a/tt_metal/tools/CMakeLists.txt b/tt_metal/tools/CMakeLists.txt index 186c1ea86c7..3509710519a 100644 --- a/tt_metal/tools/CMakeLists.txt +++ b/tt_metal/tools/CMakeLists.txt @@ -1,7 +1,6 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/profiler) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/watcher_dump) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/lightmetal_runner) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/mem_bench) set(TOOLS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/memset.cpp) @@ -11,7 +10,6 @@ target_link_libraries( PUBLIC profiler Metalium::Metal::LLRT - Metalium::Metal PRIVATE TT::Metalium::HostDevCommon ) diff --git a/tt_metal/tools/mem_bench/CMakeLists.txt b/tt_metal/tools/mem_bench/CMakeLists.txt deleted file mode 100644 index 72127b9bb1c..00000000000 --- a/tt_metal/tools/mem_bench/CMakeLists.txt +++ /dev/null @@ -1,40 +0,0 @@ -set(IMPL_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/mem_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/host_utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_utils.cpp -) - -set(HEADERS_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/host_utils.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_utils.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/work_thread.hpp -) - -add_executable( - mem_bench - ${IMPL_SRC} - ${HEADERS_SRC} -) -target_link_libraries( - mem_bench - PRIVATE - tt_metal - test_metal_common_libs - numa - benchmark::benchmark -) -target_include_directories( - mem_bench - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tt_metal/common - ${PROJECT_SOURCE_DIR}/tests - ${CMAKE_CURRENT_SOURCE_DIR} -) -set_target_properties( - mem_bench - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/tools -) diff --git a/tt_metal/tools/mem_bench/README.md b/tt_metal/tools/mem_bench/README.md deleted file mode 100644 index b10a228789d..00000000000 --- a/tt_metal/tools/mem_bench/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# tt mem_bench - -Utility to measure host and device bandwidth on Tenstorrent devices. - -## Build - -Tools are included in `tt_metal` builds. Using a release build is required for accurate perf measurements. - -## Usage - -By default, each test is run for 5 iterations and only basic tests are executed. All test patterns can be executed by specifying `--full`. Additional run parameters are listed below. - -Tests will report host bandwidth and/or device bandwidth. If device bandwidth is reported, then the average of all cores is reported as well as bandwidth for just a single core. 
- -> [!NOTE] -Reducing the `tt_metal` library log level by exporting `TT_METAL_LOGGER_LEVEL=fatal` will increase the readability of the output. - -> [!NOTE] -On NUMA systems, the host page for the device's command queue data is pinned on the memory node closest to where the device is located. If `tt_metal` is run on a different node then bandwidth will degrade because it'll need to cross sockets. Therefore, it's important to run `tt_metal` on the closest node. On Linux, the execution policy can be set using `numactl`. E.g., if the device is located on node 0, then `numactl --cpubind=0 --membind=0 ` will allocate resources closer to the device. - -``` -./build/tools/mem_bench --help -benchmark [--benchmark_list_tests={true|false}] - [--benchmark_filter=] - [--benchmark_min_time=`x` OR `s` ] - [--benchmark_min_warmup_time=] - [--benchmark_repetitions=] - [--benchmark_dry_run={true|false}] - [--benchmark_enable_random_interleaving={true|false}] - [--benchmark_report_aggregates_only={true|false}] - [--benchmark_display_aggregates_only={true|false}] - [--benchmark_format=] - [--benchmark_out=] - [--benchmark_out_format=] - [--benchmark_color={auto|true|false}] - [--benchmark_counters_tabular={true|false}] - [--benchmark_context==,...] - [--benchmark_time_unit={ns|us|ms|s}] - [--v=] - [--help] Shows this help message - [--full] Run all tests -``` diff --git a/tt_metal/tools/mem_bench/context.hpp b/tt_metal/tools/mem_bench/context.hpp deleted file mode 100644 index 4bf8d8ff450..00000000000 --- a/tt_metal/tools/mem_bench/context.hpp +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include -#include - -namespace tt::tt_metal::tools::mem_bench { - -struct TestResult { - double host_bytes_processed{0}; - double host_time_elapsed{0}; - double host_wait_for_kernel_time_elapsed{0}; - - double total_cores_cycles{0}; - double total_cores_time{0}; - double total_cores_bytes_rd{0}; - double total_cores_bytes_wr{0}; - - double kernel_0_cycles{0}; - double kernel_0_time{0}; - double kernel_0_bytes_rd{0}; - double kernel_0_bytes_wr{0}; - - // Any additional values to be included in benchmark reports - std::map arb_counters; -}; - -struct L1MemoryMap { - uint32_t cycles; - uint32_t rd_bytes; - uint32_t wr_bytes; - uint32_t unreserved; -}; - -struct Context { - std::map devices; - L1MemoryMap device_address; - uint32_t total_size{0}; - uint32_t page_size{0}; - int threads{0}; - int number_reader_kernels{0}; - int number_writer_kernels{0}; - bool enable_host_copy_with_kernels{0}; - int iterations{0}; - - Context( - const std::map& devices_, - uint32_t total_size_, - uint32_t page_size_, - int threads_, - int readers_, - int writers_, - bool enable_host_copy_with_kernels_, - int iterations_) { - auto l1_alignment = experimental::hal::get_l1_alignment(); - auto l1_base = experimental::hal::get_tensix_l1_unreserved_base(); - device_address.cycles = l1_base; - device_address.rd_bytes = align(device_address.cycles + sizeof(uint32_t), l1_alignment); - device_address.wr_bytes = align(device_address.rd_bytes + sizeof(uint32_t), l1_alignment); - device_address.unreserved = align(device_address.wr_bytes + sizeof(uint32_t), l1_alignment); - devices = devices_; - total_size = total_size_; - page_size = page_size_; - threads = threads_; - number_reader_kernels = readers_; - number_writer_kernels = writers_; - enable_host_copy_with_kernels = enable_host_copy_with_kernels_; - iterations = iterations_; 
- } -}; - -} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/device_utils.cpp b/tt_metal/tools/mem_bench/device_utils.cpp deleted file mode 100644 index bd650a3c052..00000000000 --- a/tt_metal/tools/mem_bench/device_utils.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include "device_utils.hpp" -#include "context.hpp" - -namespace tt::tt_metal::tools::mem_bench { - -std::vector read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr) { - std::vector data; - for (int xi = cores.start_coord.x; xi <= cores.end_coord.x; ++xi) { - for (int yi = cores.start_coord.y; yi <= cores.end_coord.y; ++yi) { - std::vector single_data; - tt::tt_metal::detail::ReadFromDeviceL1(device, CoreCoord{xi, yi}, addr, sizeof(uint32_t), single_data); - data.push_back(single_data[0]); - } - } - return data; -} - -std::optional configure_kernels( - tt::tt_metal::IDevice* device, - tt::tt_metal::Program& program, - const Context& context, - uint32_t start_y, - uint32_t num_kernels, - bool is_writer, - uint32_t pcie_size, - uint32_t pcie_offset) { - constexpr std::string_view k_PcieBenchKernel = "tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp"; - const auto grid_size = device->logical_grid_size(); - const auto max_x = grid_size.x; - const auto max_y = grid_size.y; - uint32_t total_kernel_transfer = context.total_size; - uint32_t kernel_transfer_size = context.page_size; - - if (!kernel_transfer_size) { - kernel_transfer_size = total_kernel_transfer; - } else if (!num_kernels) { - return {}; - } - - // Number readers either less than one row - // or a multiple of the rows - CoreCoord start_coord{0, start_y}; - CoreCoord end_coord; - if (num_kernels <= max_x) { - end_coord.x = start_coord.x + num_kernels - 1; - end_coord.y = start_coord.y; - } else { - const auto number_of_rows = num_kernels / max_x; - const auto last_row_width = (num_kernels % max_x) ? 
num_kernels % max_x : max_x; - end_coord.x = start_coord.x + last_row_width - 1; - end_coord.y = number_of_rows - 1; - } - CoreRange core_range{start_coord, end_coord}; - - std::vector pcie_bench_compile_args(12, 0); - if (is_writer) { - pcie_bench_compile_args[5] = 0; // reserved_0 - pcie_bench_compile_args[6] = pcie_offset; // pcie_wr_base - pcie_bench_compile_args[7] = pcie_size; // pcie_wr_size - pcie_bench_compile_args[8] = kernel_transfer_size; // pcie_wr_transfer_size - } else { - pcie_bench_compile_args[0] = context.device_address.unreserved; // my_rd_dst_addr - pcie_bench_compile_args[1] = pcie_offset; // pcie_rd_base - pcie_bench_compile_args[2] = pcie_size; // pcie_rd_size - pcie_bench_compile_args[3] = kernel_transfer_size; // pcie_rd_transfer_size - } - pcie_bench_compile_args[4] = context.device_address.rd_bytes; // my_bytes_rd_addr - pcie_bench_compile_args[9] = context.device_address.wr_bytes; // my_bytes_wr_addr - pcie_bench_compile_args[10] = total_kernel_transfer; - pcie_bench_compile_args[11] = context.device_address.cycles; - - [[maybe_unused]] auto kernel = tt::tt_metal::CreateKernel( - program, - std::string{k_PcieBenchKernel}, - core_range, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = tt::tt_metal::NOC_0, - .compile_args = pcie_bench_compile_args, - .defines = {}, - }); - - return core_range; -} - -} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/device_utils.hpp b/tt_metal/tools/mem_bench/device_utils.hpp deleted file mode 100644 index ab20ebfc3cc..00000000000 --- a/tt_metal/tools/mem_bench/device_utils.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include "context.hpp" - -namespace tt::tt_metal::tools::mem_bench { - -std::vector read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr); - -std::optional configure_kernels( - tt::tt_metal::IDevice* device, - tt::tt_metal::Program& program, - const Context& context, - uint32_t start_y, - uint32_t num_kernels, - bool is_writer, - uint32_t pcie_size, - uint32_t pcie_offset = 0); - -} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/host_utils.cpp b/tt_metal/tools/mem_bench/host_utils.cpp deleted file mode 100644 index 9aad3fe59fa..00000000000 --- a/tt_metal/tools/mem_bench/host_utils.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "host_utils.hpp" -#include -#include -#include -#include -#include -#include - -namespace tt::tt_metal::tools::mem_bench { - -void* get_hugepage(int device_id, uint32_t base_offset) { - auto& cluster = tt::Cluster::instance(); - auto mmio_device_id = cluster.get_associated_mmio_device(device_id); - auto channel = cluster.get_assigned_channel_for_device(device_id); - return (void*)(cluster.host_dma_address(base_offset, mmio_device_id, channel)); -} - -uint32_t get_hugepage_size(int device_id) { - auto& cluster = tt::Cluster::instance(); - auto mmio_device_id = cluster.get_associated_mmio_device(device_id); - auto channel = cluster.get_assigned_channel_for_device(device_id); - return cluster.get_host_channel_size(mmio_device_id, channel); -} - -tt::tt_metal::vector_memcpy_aligned generate_random_src_data(uint32_t num_bytes) { - std::uniform_int_distribution distribution( - std::numeric_limits::min(), std::numeric_limits::max()); - std::default_random_engine generator; - - tt::tt_metal::vector_memcpy_aligned vec(num_bytes / sizeof(uint32_t)); - std::generate(vec.begin(), vec.end(), [&]() { return distribution(generator); }); - - return vec; -} - -double get_current_time_seconds() { - return std::chrono::duration(std::chrono::high_resolution_clock::now().time_since_epoch()).count(); -} - -std::vector get_mmio_device_ids(int number_of_devices, int numa_node) { - auto& cluster = tt::Cluster::instance(); - const auto pcie_devices = cluster.number_of_pci_devices(); - std::vector device_ids; - - // Assumes PCIe device IDs are iterated first - for (int device_id = 0; device_id < pcie_devices && device_ids.size() < number_of_devices; ++device_id) { - // Not an MMIO device - if (cluster.get_associated_mmio_device(device_id) != device_id) { - continue; - } - - auto associated_node = cluster.get_numa_node_for_device(device_id); - if (numa_node == -1 || associated_node == numa_node) { - device_ids.push_back(device_id); - } - } - - return device_ids; -} - -std::vector get_mmio_device_ids_unique_nodes(int number_of_devices) { - auto& cluster = tt::Cluster::instance(); - const auto pcie_devices = cluster.number_of_pci_devices(); - std::vector device_ids; - std::unordered_set numa_nodes; - - for (int device_id = 0; device_id < pcie_devices && device_ids.size() < number_of_devices; ++device_id) { - auto associated_node = cluster.get_numa_node_for_device(device_id); - if (!numa_nodes.contains(associated_node)) { - device_ids.push_back(device_id); - numa_nodes.insert(associated_node); - } - } - - return device_ids; -} - -int get_number_of_mmio_devices() { - auto& cluster = tt::Cluster::instance(); - return cluster.number_of_pci_devices(); -} - -} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/host_utils.hpp b/tt_metal/tools/mem_bench/host_utils.hpp deleted file mode 100644 index c00d3e40ac3..00000000000 --- a/tt_metal/tools/mem_bench/host_utils.hpp +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -// #include "work_thread.hpp" - -namespace tt::tt_metal::tools::mem_bench { - -// Generate random data aligned for memcpy_to_device. -tt::tt_metal::vector_memcpy_aligned generate_random_src_data(uint32_t num_bytes); - -// Get current host time, in seconds. -double get_current_time_seconds(); - -// Return device ids. 
If numa_node is specified then only device ids on that -// node will be returned. If numa_node == -1, then the node is not taken into -// consideration. Note: Less than number_of_devices may be returned. -std::vector get_mmio_device_ids(int number_of_devices, int numa_node); - -// Returns device ids. All devices are on different nodes. Note: Less than -// number_of_devices may be returned. -std::vector get_mmio_device_ids_unique_nodes(int number_of_devices); - -// Returns the number of MMIO connected chips. -int get_number_of_mmio_devices(); - -// Returns the hugepage pointer assigned to a device. -void* get_hugepage(int device_id, uint32_t base_offset); - -// Returns the size of the hugepage assigned to a device. -uint32_t get_hugepage_size(int device_id); - -// Copy data to hugepage. Returns the duration. -// repeating_src_vector: Keep copying the same elements to hugepage. This should force the source data in stay in the -// caches. fence: Memory barrier at the end of each copy. Returns the time in seconds -template -double copy_to_hugepage( - void* hugepage_base, - uint32_t hugepage_size, - std::span src_data, - size_t total_size, - size_t page_size, - bool repeating_src_vector) { - uint64_t hugepage_addr = reinterpret_cast(hugepage_base); - uint64_t hugepage_end = hugepage_addr + hugepage_size; - uint64_t src_addr = reinterpret_cast(src_data.data()); - size_t num_pages; - if (!page_size) { - num_pages = 1; - page_size = total_size; - } else { - num_pages = total_size / page_size; - } - - auto start = get_current_time_seconds(); - for (int i = 0; i < num_pages; ++i) { - tt::tt_metal::memcpy_to_device((void*)(hugepage_addr), (void*)(src_addr), page_size); - - // 64 bit host address alignment - hugepage_addr = ((hugepage_addr + page_size - 1) | (tt::tt_metal::MEMCPY_ALIGNMENT - 1)) + 1; - - if (!repeating_src_vector) { - src_addr += page_size; - } - - // Wrap back to the beginning of hugepage - if (hugepage_addr + page_size >= hugepage_end) { - hugepage_addr = reinterpret_cast(hugepage_base); - } - } - auto end = get_current_time_seconds(); - - return end - start; -} - -}; // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp b/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp deleted file mode 100644 index e04b02013de..00000000000 --- a/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "dataflow_api.h" -#include -#include -#include "noc_overlay_parameters.h" - -// -// Test Kernel for mem_bench -// -// Performs PCIe reads and/or writes -// - -// reader kernel -constexpr uint32_t my_rd_dst_addr = get_compile_time_arg_val(0); // L1 -constexpr uint32_t pcie_rd_base = get_compile_time_arg_val(1); -constexpr uint32_t pcie_rd_size = get_compile_time_arg_val(2); -constexpr uint32_t pcie_rd_end = pcie_rd_base + pcie_rd_size; -constexpr uint32_t pcie_rd_transfer_size = get_compile_time_arg_val(3); -constexpr uint32_t my_bytes_rd_addr = get_compile_time_arg_val(4); - -// writer kernel -constexpr uint32_t reserved_0 = get_compile_time_arg_val(5); -constexpr uint32_t pcie_wr_base = get_compile_time_arg_val(6); -constexpr uint32_t pcie_wr_size = get_compile_time_arg_val(7); -constexpr uint32_t pcie_wr_end = pcie_wr_base + pcie_wr_size; -constexpr uint32_t pcie_wr_transfer_size = get_compile_time_arg_val(8); -constexpr uint32_t my_bytes_wr_addr = get_compile_time_arg_val(9); - -// common to both -constexpr uint32_t my_total_work = get_compile_time_arg_val(10); // Total bytes to read+write -constexpr uint32_t my_cycles_addr = get_compile_time_arg_val(11); - -static_assert(my_bytes_rd_addr && my_bytes_wr_addr, "Must provide addresses for my_bytes_rd/wr_addr"); -static_assert(my_cycles_addr, "Must provide L1 address for cycles elapsed"); - -uint64_t get_cycles() { - uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); - uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); - return (((uint64_t)timestamp_high) << 32) | timestamp_low; -} - -void kernel_main() { - auto my_cycles = reinterpret_cast(my_cycles_addr); - auto my_bytes_read = reinterpret_cast(my_bytes_rd_addr); - auto my_bytes_written = reinterpret_cast(my_bytes_wr_addr); - - my_bytes_read[0] = 0; - my_bytes_written[0] = 0; - my_cycles[0] = 0; - - uint64_t pcie_noc_xy_encoding = (uint64_t)NOC_XY_PCIE_ENCODING(PCIE_NOC_X, PCIE_NOC_Y); - uint32_t rd_ptr = pcie_rd_base; - uint32_t wr_ptr = pcie_wr_base; - - const auto start = get_cycles(); - - uint32_t total_bytes_read = 0; - uint32_t total_bytes_written = 0; - while (total_bytes_read + total_bytes_written < my_total_work) { - if constexpr (my_rd_dst_addr) { - uint64_t host_src_addr = pcie_noc_xy_encoding | rd_ptr; - noc_async_read(host_src_addr, my_rd_dst_addr, pcie_rd_transfer_size); - rd_ptr += pcie_rd_transfer_size; - total_bytes_read += pcie_rd_transfer_size; - if (rd_ptr >= pcie_rd_end) { - rd_ptr = pcie_rd_base; - } - } - if constexpr (pcie_wr_size) { - uint64_t host_dst_addr = pcie_noc_xy_encoding | wr_ptr; - noc_async_write( - wr_ptr, // Any data - host_dst_addr, - pcie_wr_transfer_size); - wr_ptr += pcie_wr_transfer_size; - total_bytes_written += pcie_wr_transfer_size; - if (wr_ptr >= pcie_wr_end) { - wr_ptr = pcie_wr_base; - } - } - } - - if constexpr (my_rd_dst_addr) { - noc_async_read_barrier(); - } - if constexpr (pcie_wr_size) { - noc_async_write_barrier(); - } - - auto end = get_cycles(); - my_cycles[0] = end - start; - my_bytes_read[0] = total_bytes_read; - my_bytes_written[0] = total_bytes_written; -} diff --git a/tt_metal/tools/mem_bench/mem_bench.cpp b/tt_metal/tools/mem_bench/mem_bench.cpp deleted file mode 100644 index da0b2a8a8af..00000000000 --- a/tt_metal/tools/mem_bench/mem_bench.cpp +++ /dev/null @@ -1,545 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include "context.hpp" -#include "host_utils.hpp" -#include "device_utils.hpp" -#include "work_thread.hpp" -#include "tt_metal/impl/dispatch/util/size_literals.hpp" - -using namespace tt; -using namespace tt::tt_metal; -using namespace tt::tt_metal::tools::mem_bench; - -// Read L1 counters (cycles, bytes rd, bytes wr) and increment test_results -void read_inc_data_from_cores(const Context& ctx, IDevice* device, const CoreRange& cores, TestResult& test_results) { - auto dev_cycles = read_cores(device, cores, ctx.device_address.cycles); - auto dev_bytes_read = read_cores(device, cores, ctx.device_address.rd_bytes); - auto dev_bytes_written = read_cores(device, cores, ctx.device_address.wr_bytes); - auto dev_clk = tt::Cluster::instance().get_device_aiclk(device->id()) * 1e6; // Hz - - double total_cycles = std::reduce(dev_cycles.begin(), dev_cycles.end(), 0ULL); - - test_results.total_cores_cycles += total_cycles; - test_results.total_cores_time += total_cycles / dev_clk; - // Reduce with 64 bits to prevent overflow as values read from device is 32 bits - test_results.total_cores_bytes_rd += std::reduce(dev_bytes_read.begin(), dev_bytes_read.end(), 0ULL); - test_results.total_cores_bytes_wr += std::reduce(dev_bytes_written.begin(), dev_bytes_written.end(), 0ULL); - - test_results.kernel_0_cycles += dev_cycles[0]; - test_results.kernel_0_time += dev_cycles[0] / dev_clk; - test_results.kernel_0_bytes_rd += dev_bytes_read[0]; - test_results.kernel_0_bytes_wr += dev_bytes_written[0]; -} - -// Report device bandwidth to the benchmark state -// Average bw will be reported as "dev_bw" as well as the bw for the -// first core will also be reported by itself as "kernel_0_bw". -void report_device_bw(benchmark::State& state, const TestResult& test_results) { - state.counters["dev_bw"] = - (test_results.total_cores_bytes_rd + test_results.total_cores_bytes_wr) / test_results.total_cores_time; - state.counters["dev_rd_bytes"] = test_results.total_cores_bytes_rd; - state.counters["dev_wr_bytes"] = test_results.total_cores_bytes_wr; - state.counters["dev_rd_bw"] = test_results.total_cores_bytes_rd / test_results.total_cores_time; - state.counters["dev_wr_bw"] = test_results.total_cores_bytes_wr / test_results.total_cores_time; - state.counters["dev_cycles"] = test_results.total_cores_cycles; - - state.counters["kernel_0_bw"] = - (test_results.kernel_0_bytes_rd + test_results.kernel_0_bytes_wr) / test_results.kernel_0_time; - state.counters["kernel_0_rd_bw"] = test_results.kernel_0_bytes_rd / test_results.kernel_0_time; - state.counters["kernel_0_wr_bw"] = test_results.kernel_0_bytes_wr / test_results.kernel_0_time; - state.counters["kernel_0_cycles"] = test_results.kernel_0_cycles; -} - -// Benchmark various memcpy_to_device transfer sizes. -// Reports host bw. -TestResult mem_bench_page_sizing(benchmark::State& state) { - constexpr uint32_t k_DeviceId = 0; - TestResult results; - Context ctx{ - {}, - state.range(0), // Total size - state.range(1), // Page size - 0, // Threads - 0, // Readers - 0, // Writers - true, // Enable host copy - 0, // Iterations is managed by the benchmark framework - }; - - auto src_data = generate_random_src_data(ctx.total_size); - auto hugepage = get_hugepage(k_DeviceId, 0); - auto hugepage_size = get_hugepage_size(k_DeviceId); - bool cached = state.range(2); - - for (auto _ : state) { - const double iteration_time = - cached ? 
copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, true) - : copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, false); - results.host_bytes_processed += ctx.total_size; - results.host_time_elapsed += iteration_time; - - state.SetIterationTime(iteration_time); - } - state.SetBytesProcessed(ctx.total_size * state.iterations()); - return results; -} - -// Benchmark memcpy_to_device on multiple threads to try saturating host bandwidth. -// Reports host bw. -TestResult mem_bench_copy_multithread(benchmark::State& state) { - static_assert((MEMCPY_ALIGNMENT & ((MEMCPY_ALIGNMENT)-1)) == 0); - constexpr uint32_t k_DeviceId = 0; - TestResult results; - Context ctx{ - {}, - state.range(0), // Total size - state.range(1), // Page size - state.range(2), // Threads - 0, // Readers - 0, // Writers - true, // Enable host copy - 0, // Iterations is managed by the benchmark framework - }; - auto src_data = generate_random_src_data(ctx.total_size); - auto hugepage = get_hugepage(0, 0); - const auto hugepage_size = get_hugepage_size(0); - const auto bytes_per_thread = ((ctx.total_size / ctx.threads) + (MEMCPY_ALIGNMENT)-1) & -(MEMCPY_ALIGNMENT); - const auto last_thread_bytes = ctx.total_size - (bytes_per_thread * (ctx.threads - 1)); - - for (auto _ : state) { - auto iteration_time = execute_work_synced_start( - ctx.threads, - [&](int thread_idx) { - uint64_t thread_dst = (uint64_t)hugepage + (thread_idx * bytes_per_thread); - uint64_t thread_bytes = (thread_idx == ctx.threads - 1) ? last_thread_bytes : bytes_per_thread; - std::span thread_src{src_data}; - thread_src = thread_src.subspan( - (thread_idx * bytes_per_thread) / sizeof(uint32_t), thread_bytes / sizeof(uint32_t)); - copy_to_hugepage( - (void*)thread_dst, hugepage_size, thread_src, thread_bytes, ctx.page_size, false); - }, - []() {}); - - results.host_bytes_processed += ctx.total_size; - results.host_time_elapsed += iteration_time; - - state.SetIterationTime(iteration_time); - } - - state.SetBytesProcessed(ctx.total_size * state.iterations()); - return results; -} - -// Benchmark memcpy_to_device while the device is reading the hugepage. -// Reports host bw and device bw. 
-TestResult mem_bench_copy_with_active_kernel(benchmark::State& state) { - TestResult results; - auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); - IDevice* device = (*(devices.begin())).second; - Context ctx{ - devices, - state.range(0), // Total size - state.range(1), // Page size - 0, // Threads - state.range(2), // Readers - 0, // Writers - state.range(3), // Enable host copy - 0, // Iterations is managed by the benchmark framework - }; - - auto src_data = generate_random_src_data(ctx.total_size); - auto hugepage = get_hugepage(device->id(), 0); - auto hugepage_size = get_hugepage_size(device->id()); - - for (auto _ : state) { - auto pgm = CreateProgram(); - auto configured_cores = configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, hugepage_size); - double host_copy_time = 1; // Set to 1 so it doesn't divide by 0 if host copy is disabled - - double wait_for_kernel_time = execute_work_synced_start( - 1, - [device, &pgm](int thread_idx) { - // Program - tt::tt_metal::detail::LaunchProgram(device, pgm, true); - }, - [&]() { - if (ctx.enable_host_copy_with_kernels) { - // Host copy while waiting for program - host_copy_time = - copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, false); - results.host_bytes_processed += ctx.total_size; - results.host_time_elapsed += host_copy_time; - } - }); - - results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; - - read_inc_data_from_cores(ctx, device, configured_cores.value(), results); - - state.SetIterationTime(host_copy_time); - } - if (ctx.enable_host_copy_with_kernels) { - state.SetBytesProcessed(ctx.total_size * state.iterations()); - } else { - state.SetBytesProcessed(0); - } - - report_device_bw(state, results); - tt::tt_metal::detail::CloseDevices(devices); - return results; -} - -// Host writing to a hugepage while the device pulls from another hugepage. -// Reports host bw and device bw. 
-TestResult mem_bench_copy_active_kernel_different_page(benchmark::State& state) { - TestResult results; - auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); - IDevice* device = (*(devices.begin())).second; - Context ctx{ - devices, - state.range(0), // Total size - state.range(1), // Page size - 0, // Threads - state.range(2), // Readers - 0, // Writers - true, // Enable host copy - 0, // Iterations is managed by the benchmark framework - }; - - auto src_data = generate_random_src_data(ctx.total_size); - auto device_hugepage_size = get_hugepage_size(device->id()); - - // 2nd open device is not required - auto host_hugepage = get_hugepage(device->id() + 1, 0); - auto host_hugepage_size = get_hugepage_size(device->id() + 1); - - for (auto _ : state) { - auto pgm = CreateProgram(); - auto configured_cores = - configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, device_hugepage_size).value(); - double host_copy_time = 0; - - double wait_for_kernel_time = execute_work_synced_start( - 1, - [device, &pgm](int thread_idx) { - // Program - tt::tt_metal::detail::LaunchProgram(device, pgm, true); - }, - [&]() { - // Host copy while waiting for program - host_copy_time = - copy_to_hugepage(host_hugepage, host_hugepage_size, src_data, ctx.total_size, ctx.page_size, false); - results.host_bytes_processed += ctx.total_size; - results.host_time_elapsed += host_copy_time; - }); - - results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; - - read_inc_data_from_cores(ctx, device, configured_cores, results); - - state.SetIterationTime(host_copy_time); - } - - state.SetBytesProcessed(ctx.total_size * state.iterations()); - - report_device_bw(state, results); - tt::tt_metal::detail::CloseDevices(devices); - return results; -} - -// Common Multi MMIO device test. -TestResult mem_bench_multi_mmio_devices( - benchmark::State& state, std::map& devices, const Context& ctx) { - TestResult results; - - // One thread to wait for program on each device - int num_threads = devices.size(); - - for (auto _ : state) { - std::map programs; // device : programs - std::map configured_core_ranges; // device : cores - for (auto [device_id, device] : devices) { - programs[device_id] = CreateProgram(); - Program& pgm = programs[device_id]; - auto device_hugepage = get_hugepage(device_id, 0); - auto device_hugepage_size = get_hugepage_size(device_id); - configured_core_ranges.insert( - {device_id, - configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, device_hugepage_size) - .value()}); - } - - double host_copy_time = 0; - execute_work_synced_start( - 1, - [devices, &programs](int thread_idx) { - // Program - for (auto& [device_id, pgm] : programs) { - tt::tt_metal::detail::LaunchProgram(devices.at(device_id), pgm, false); - } - }, - []() {}); - - // Wait all programs to complete - for (auto& [device_id, pgm] : programs) { - tt::tt_metal::detail::WaitProgramDone(devices.at(device_id), pgm); - } - - // Read counters from each core - for (auto& [device_id, core_range] : configured_core_ranges) { - read_inc_data_from_cores(ctx, devices.at(device_id), core_range, results); - } - - // This test does not report host bw - state.SetIterationTime(1); - } - - state.SetBytesProcessed(0); - report_device_bw(state, results); - state.counters["num_mmio_devices"] = devices.size(); - - return results; -} - -// Multi MMIO devices reading on the same NUMA node. 
-TestResult mem_bench_multi_mmio_devices_reading_same_node(benchmark::State& state) { - // Node 0 - auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(get_number_of_mmio_devices(), 0)); - - Context ctx{ - devices, - state.range(0), // Total size - state.range(1), // Page size - 0, // Threads - state.range(2), // Readers on each device - 0, // Writers - false, // Enable host copy - 0, // Iterations is managed by the benchmark framework - }; - - TestResult results = mem_bench_multi_mmio_devices(state, devices, ctx); - tt::tt_metal::detail::CloseDevices(devices); - - return results; -} - -// Multi MMIO devices reading on different NUMA nodes. -TestResult mem_bench_multi_mmio_devices_reading_different_node(benchmark::State& state) { - auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids_unique_nodes(get_number_of_mmio_devices())); - - Context ctx{ - devices, - state.range(0), // Total size - state.range(1), // Page size - 0, // Threads - state.range(2), // Readers on each device - 0, // Writers - false, // Enable host copy - 0, // Iterations is managed by the benchmark framework - }; - - TestResult results = mem_bench_multi_mmio_devices(state, devices, ctx); - tt::tt_metal::detail::CloseDevices(devices); - - return results; -} - -// Benchmark memcpy_to_device while device is reading (prefetching) and writing (dispatching data back to host) -// First half of hugepage will be written to by host -// Second half will be written to by device -TestResult mem_bench_copy_with_read_and_write_kernel(benchmark::State& state) { - auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); - IDevice* device = (*(devices.begin())).second; - Context ctx{ - devices, - state.range(0), // Total size - state.range(1), // Page size - 0, // Threads - state.range(2), // Readers - state.range(3), // Writers - true, // Enable host copy - 0, // Iterations is managed by the benchmark framework - }; - - auto src_data = generate_random_src_data(ctx.total_size); - auto hugepage = get_hugepage(device->id(), 0); - auto hugepage_size = get_hugepage_size(device->id()); - - // Don't need to seperate device results - // Readers will have 0 bytes written - // Writers will have 0 bytes read. Will not mix. 
- TestResult results; - - for (auto _ : state) { - auto pgm = CreateProgram(); - auto configured_read_cores = - configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, hugepage_size / 2).value(); - // Offset write cores to second half of PCIe - // Use second row - auto configured_write_cores = - configure_kernels( - device, pgm, ctx, 1, ctx.number_writer_kernels, true, hugepage_size / 2, hugepage_size / 2) - .value(); - double host_copy_time = 0; - - double wait_for_kernel_time = execute_work_synced_start( - 1, - [device, &pgm](int thread_idx) { - // Program - tt::tt_metal::detail::LaunchProgram(device, pgm, true); - }, - [&]() { - // Host copy while waiting for program - host_copy_time = - copy_to_hugepage(hugepage, hugepage_size / 2, src_data, ctx.total_size, ctx.page_size, false); - results.host_bytes_processed += ctx.total_size; - results.host_time_elapsed += host_copy_time; - }); - - results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; - - read_inc_data_from_cores(ctx, device, configured_read_cores, results); - read_inc_data_from_cores(ctx, device, configured_write_cores, results); - - state.SetIterationTime(host_copy_time); - } - - state.SetBytesProcessed(ctx.total_size * state.iterations()); - report_device_bw(state, results); - tt::tt_metal::detail::CloseDevices(devices); - return results; -} - -void global_bench_args(benchmark::internal::Benchmark* b) { b->UseManualTime()->Iterations(5); } - -void register_basic_benchmark_suite() { - ::benchmark::RegisterBenchmark("Host Copy Page Sizing", mem_bench_page_sizing) - ->Apply(global_bench_args) - ->ArgsProduct({ - {1_GB}, - {16, 8_KB, 16_KB, 32_KB}, - {false}, - }); - ::benchmark::RegisterBenchmark("Host Copy (Cached)", mem_bench_page_sizing) - ->Apply(global_bench_args) - ->ArgsProduct({ - {1_GB}, - {16, 8_KB, 16_KB, 32_KB}, - {true}, - }); - ::benchmark::RegisterBenchmark("Host Copy Saturation", mem_bench_copy_multithread) - ->Apply(global_bench_args) - ->ArgsProduct({ - {1_GB}, - {32_KB}, - {1, 2, 3, 4, 5, 6, 7, 8}, - }); - ::benchmark::RegisterBenchmark("Device Reading Host", mem_bench_copy_with_active_kernel) - ->Apply(global_bench_args) - ->ArgsProduct({ - {1_GB}, - {32_KB}, - {1, 2, 3, 4}, - {false}, - }); -} - -void register_full_benchmark_suite() { - ::benchmark::RegisterBenchmark("Host Copy with Active Kernel", mem_bench_copy_with_active_kernel) - ->Apply(global_bench_args) - ->ArgsProduct({ - {1_GB}, - {32_KB}, - {1, 2, 3, 4}, - {false}, - }); - ::benchmark::RegisterBenchmark( - "Host Copy with Active Kernel on Different Hugepages", mem_bench_copy_active_kernel_different_page) - ->Apply(global_bench_args) - ->ArgsProduct({ - {1_GB}, - {32_KB}, - {1, 2, 3, 4}, - }); - ::benchmark::RegisterBenchmark( - "Host Copy with Active Kernel Reading and Writing", mem_bench_copy_with_read_and_write_kernel) - ->Apply(global_bench_args) - ->ArgsProduct({ - {1_GB}, - {32_KB}, - {1, 2}, - {1, 2}, - }); - ::benchmark::RegisterBenchmark( - "Multiple MMIO Devices Reading (Same NUMA node)", mem_bench_multi_mmio_devices_reading_same_node) - ->Apply(global_bench_args) - ->ArgsProduct({ - {1_GB}, - {32_KB}, - {1, 2}, - }); - ::benchmark::RegisterBenchmark( - "Multiple MMIO Devices Reading (Different NUMA node)", mem_bench_multi_mmio_devices_reading_different_node) - ->Apply(global_bench_args) - ->ArgsProduct({ - {1_GB}, - {32_KB}, - {1, 2}, - }); -} - -void print_help() { - ::benchmark::PrintDefaultHelp(); - std::cout << " [--help] Shows this help message\n"; - std::cout << " [--full] Run all tests\n"; - std::cout << 
"\nCounters\n"; - std::cout << " bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured.\n"; - std::cout << " dev_bw: Average device core PCIe pull bandwidth. 0 if not measured.\n"; -} - -int main(int argc, char* argv[]) { - std::vector input_args(argv, argv + argc); - if (test_args::has_command_option(input_args, "--help")) { - print_help(); - return 0; - } - - // Force TT_METAL options - setenv("TT_METAL_SLOW_DISPATCH_MODE", "true", true); - setenv("TT_METAL_CLEAR_L1", "1", true); - // May be overridden by the user - setenv("TT_METAL_LOGGER_LEVEL", "FATAL", false); - - char arg0_default[] = "benchmark"; - char* args_default = arg0_default; - if (!argv) { - argc = 1; - argv = &args_default; - } - - // Run basic benchmarks - register_basic_benchmark_suite(); - - // Run all benchmarks - if (test_args::has_command_option(input_args, "--full")) { - register_full_benchmark_suite(); - } - - ::benchmark::Initialize(&argc, argv); - ::benchmark::RunSpecifiedBenchmarks(); - ::benchmark::Shutdown(); - return 0; -} diff --git a/tt_metal/tools/mem_bench/work_thread.hpp b/tt_metal/tools/mem_bench/work_thread.hpp deleted file mode 100644 index 53a89f6010b..00000000000 --- a/tt_metal/tools/mem_bench/work_thread.hpp +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "host_utils.hpp" - -namespace tt::tt_metal::tools::mem_bench { - -// Execute work_fn on num_threads threads and also do intermediate_fn on the side. -// Returns time taken in seconds for all work_fn to complete. Time is calculated by latest thread end - earliest thread -// start. -template -double execute_work_synced_start(int num_threads, F&& work_fn, IntermediateF&& intermediate_fn, Args&&... args) { - std::mutex m; - int threads_ready{0}; - std::condition_variable go_cv; // Signal to all threads to go - auto total_threads = num_threads + 1; // Including intermediate - std::vector thread_start_times(num_threads); - std::vector thread_end_times(num_threads); - std::vector threads(total_threads); - - for (int i = 0; i < num_threads; ++i) { - threads[i] = std::thread([i, - &m, - &go_cv, - &threads_ready, - &thread_start_times, - &thread_end_times, - total_threads, - work_fn = std::forward(work_fn), - ... 
args = std::forward(args)]() mutable { - { - std::unique_lock lk{m}; - threads_ready++; - if (threads_ready == total_threads) { - go_cv.notify_all(); - } - go_cv.wait(lk, [&] { return threads_ready == total_threads; }); - } - - thread_start_times[i] = get_current_time_seconds(); - work_fn(i, std::forward(args)...); - thread_end_times[i] = get_current_time_seconds(); - }); - } - - threads[num_threads] = std::thread([&]() mutable { - std::unique_lock lk{m}; - threads_ready++; - if (threads_ready == total_threads) { - go_cv.notify_all(); - } - go_cv.wait(lk, [&] { return threads_ready == total_threads; }); - - intermediate_fn(); - }); - - for (auto& thread : threads) { - thread.join(); - } - - // Calculate work time based on earliest start and latest end - double earliest_start = *std::min_element(thread_start_times.begin(), thread_start_times.end()); - double latest_end = *std::max_element(thread_end_times.begin(), thread_end_times.end()); - - return latest_end - earliest_start; -} - -}; // namespace tt::tt_metal::tools::mem_bench From c9feb5ddd96b3a8b169e3455342a5e7e349a0d60 Mon Sep 17 00:00:00 2001 From: Michael Chiou <156848643+ttmchiou@users.noreply.github.com> Date: Fri, 21 Feb 2025 16:21:17 -0800 Subject: [PATCH 234/316] Revert "fix the reverted PR for Optimize the web demo for yolov4 (#15478)" (#18170) Reverts tenstorrent/tt-metal#15838 This PR is failing on Wormhole N150/N300 tests deterministically on Post-Commit. Reverting Sample Workflows https://github.com/tenstorrent/tt-metal/actions/runs/13466018463/job/37632422542 https://github.com/tenstorrent/tt-metal/actions/runs/13465881356/job/37631749227 https://github.com/tenstorrent/tt-metal/actions/runs/13463906253/job/37625792691 --- .../wormhole/yolov4/test_yolov4_performant.py | 4 +- .../yolov4/test_yolov4_performant_webdemo.py | 44 ++- models/demos/yolov4/README.md | 27 +- models/demos/yolov4/demo/demo.py | 231 ++++++++-------- models/demos/yolov4/tests/test_perf_yolo.py | 17 +- .../yolov4/tests/yolov4_perfomant_webdemo.py | 250 +++++++++++++++-- .../demos/yolov4/tests/yolov4_test_infra.py | 63 +++-- models/demos/yolov4/ttnn/common.py | 8 - models/demos/yolov4/ttnn/genboxes.py | 256 ------------------ models/demos/yolov4/ttnn/yolov4.py | 35 +-- models/demos/yolov4/web_demo/README.md | 5 - .../demos/yolov4/web_demo/client/coco.names | 80 ------ .../yolov4/web_demo/client/requirements.txt | 1 - models/demos/yolov4/web_demo/client/yolov4.py | 181 +++++++++---- .../yolov4/web_demo/server/fast_api_yolov4.py | 166 +----------- .../yolov4/test_ttnn_downsample1.py | 10 +- .../yolov4/test_ttnn_downsample2.py | 10 +- .../yolov4/test_ttnn_downsample3.py | 11 +- .../yolov4/test_ttnn_downsample4.py | 9 +- .../yolov4/test_ttnn_downsample5.py | 9 +- .../yolov4/test_ttnn_head.py | 26 +- .../yolov4/test_ttnn_neck.py | 12 +- .../yolov4/test_ttnn_post_processing.py | 80 ------ .../yolov4/test_ttnn_yolov4.py | 88 +++--- 24 files changed, 695 insertions(+), 928 deletions(-) delete mode 100644 models/demos/yolov4/ttnn/genboxes.py delete mode 100644 models/demos/yolov4/web_demo/client/coco.names mode change 100644 => 100755 models/demos/yolov4/web_demo/server/fast_api_yolov4.py delete mode 100644 tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py diff --git a/models/demos/wormhole/yolov4/test_yolov4_performant.py b/models/demos/wormhole/yolov4/test_yolov4_performant.py index 81357bfdd70..ec4819711a9 100644 --- a/models/demos/wormhole/yolov4/test_yolov4_performant.py +++ b/models/demos/wormhole/yolov4/test_yolov4_performant.py @@ -24,7 
+24,7 @@ def test_run_yolov4_inference(device, use_program_cache, batch_size, act_dtype, @run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 6422528}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 1843200}], indirect=True) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype", ((1, ttnn.bfloat16, ttnn.bfloat16),), @@ -50,7 +50,7 @@ def test_run_yolov4_trace_inference( @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 6397952, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 3686400, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype", diff --git a/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py b/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py index bf716285a53..b4940fbd2ab 100644 --- a/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py +++ b/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py @@ -8,12 +8,52 @@ import torch from models.utility_functions import run_for_wormhole_b0 -from models.demos.yolov4.tests.yolov4_perfomant_webdemo import Yolov4Trace2CQ +from models.demos.yolov4.tests.yolov4_perfomant_webdemo import ( + run_yolov4_inference, + run_yolov4_trace_inference, + run_yolov4_trace_2cqs_inference, + Yolov4Trace2CQ, +) + + +@run_for_wormhole_b0() +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) +@pytest.mark.parametrize( + "batch_size, act_dtype, weight_dtype", + ((1, ttnn.bfloat16, ttnn.bfloat16),), +) +def test_run_yolov4_inference(device, use_program_cache, batch_size, act_dtype, weight_dtype, model_location_generator): + run_yolov4_inference(device, batch_size, act_dtype, weight_dtype, model_location_generator) + + +@run_for_wormhole_b0() +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 1617920}], indirect=True) +@pytest.mark.parametrize( + "batch_size, act_dtype, weight_dtype", + ((1, ttnn.bfloat16, ttnn.bfloat16),), +) +@pytest.mark.parametrize("enable_async_mode", (False, True), indirect=True) +def test_run_yolov4_trace_inference( + device, + use_program_cache, + batch_size, + act_dtype, + weight_dtype, + enable_async_mode, + model_location_generator, +): + run_yolov4_trace_inference( + device, + batch_size, + act_dtype, + weight_dtype, + model_location_generator, + ) @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 3211264, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 1617920, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype", diff --git a/models/demos/yolov4/README.md b/models/demos/yolov4/README.md index 006e1eaacf9..6e6f560379c 100644 --- a/models/demos/yolov4/README.md +++ b/models/demos/yolov4/README.md @@ -2,31 +2,24 @@ ## How to run yolov4 -### Model code running with Trace+2CQ -- Use the following command to run the yolov4 performant implementation (71 FPS): - ```bash - pytest models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py::test_run_yolov4_trace_2cqs_inference[True-1-act_dtype0-weight_dtype0-device_params0] +- Use the following command to run the yolov4 performant impelementation (95 FPS): ``` + pytest 
models/demos/wormhole/yolov4/test_yolov4_performant.py::test_run_yolov4_trace_2cqs_inference[True-1-act_dtype0-weight_dtype0-device_params0] + ``` + +- You may try the interactive web demo following the instructions here: models/demos/yolov4/web_demo/README.md (25-30 FPS). NOTE: The post-processing is currently running on host. It will be moved to device soon which should significantly improve the end to end FPS. -### Single Image Demo + +- Use the following command to run a single-image demo for visualization. NOTE: the following demos are intented for visualization. It is not the performant implementation yet. And, the post processing is currently done on host which we will be moving to device soon. - Use the following command to run the yolov4 with a giraffe image: - ```bash + ``` pytest models/demos/yolov4/demo/demo.py ``` -- The output file `ttnn_yolov4_320_prediction_demo.jpg` will be generated. - Use the following command to run the yolov4 with different input image: - ```bash + ``` pytest --disable-warnings --input-path= models/demos/yolov4/demo/demo.py ``` - -### mAP Accuracy Test -- To be added soon - -### Web Demo -- You may try the interactive web demo (35 FPS end-2-end) following the instructions: -``` -models/demos/yolov4/web_demo/README.md -``` +Once you run the command, The output file named `ttnn_prediction_demo.jpg` will be generated. diff --git a/models/demos/yolov4/demo/demo.py b/models/demos/yolov4/demo/demo.py index 987f0c7b509..277e28deab0 100644 --- a/models/demos/yolov4/demo/demo.py +++ b/models/demos/yolov4/demo/demo.py @@ -140,10 +140,10 @@ def yolo_forward_dynamic( by_bh /= output.size(2) # Shape: [batch, num_anchors * H * W, 1] - bx = bx_bw[:, :num_anchors].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) - by = by_bh[:, :num_anchors].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) - bw = bx_bw[:, num_anchors:].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) - bh = by_bh[:, num_anchors:].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bx = bx_bw[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + by = by_bh[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bw = bx_bw[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bh = by_bh[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) bx1 = bx - bw * 0.5 by1 = by - bh * 0.5 @@ -324,6 +324,12 @@ def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): def post_processing(img, conf_thresh, nms_thresh, output): + # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] + # num_anchors = 9 + # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] + # strides = [8, 16, 32] + # anchor_step = len(anchors) // num_anchors + # [batch, num, 1, 4] box_array = output[0] # [batch, num, num_classes] @@ -458,7 +464,34 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class output_tensor3 = output_tensor3.reshape(1, 10, 10, 255) output_tensor3 = torch.permute(output_tensor3, (0, 3, 1, 2)) - y1, y2, y3 = gen_yolov4_boxes_confs([output_tensor1, output_tensor2, output_tensor3]) + yolo1 = YoloLayer( + anchor_mask=[0, 1, 2], + num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, + stride=8, + ) + + yolo2 = YoloLayer( + anchor_mask=[3, 4, 5], + 
num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, + stride=16, + ) + + yolo3 = YoloLayer( + anchor_mask=[6, 7, 8], + num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, + stride=32, + ) + + y1 = yolo1(output_tensor1) + y2 = yolo2(output_tensor2) + y3 = yolo3(output_tensor3) + output = get_region_boxes([y1, y2, y3]) t2 = time.time() @@ -478,8 +511,37 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class else: t1 = time.time() output = model(img) - y1, y2, y3 = gen_yolov4_boxes_confs(output) + + yolo1 = YoloLayer( + anchor_mask=[0, 1, 2], + num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, + stride=8, + ) + + yolo2 = YoloLayer( + anchor_mask=[3, 4, 5], + num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, + stride=16, + ) + + yolo3 = YoloLayer( + anchor_mask=[6, 7, 8], + num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, + stride=32, + ) + + y1 = yolo1(output[0]) + y2 = yolo2(output[1]) + y3 = yolo3(output[2]) + output = get_region_boxes([y1, y2, y3]) + t2 = time.time() print("-----------------------------------") @@ -494,117 +556,66 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class plot_boxes_cv2(img, boxes[0], "torch_prediction_demo.jpg", class_names) -def gen_yolov4_boxes_confs(output): - n_classes = 80 - anchors_array = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] - num_anchors = 9 - anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] - strides = [8, 16, 32] - - yolo1 = YoloLayer( - anchor_mask=anchor_masks[0], - num_classes=n_classes, - anchors=anchors_array, - num_anchors=num_anchors, - stride=strides[0], - ) - - yolo2 = YoloLayer( - anchor_mask=anchor_masks[1], - num_classes=n_classes, - anchors=anchors_array, - num_anchors=num_anchors, - stride=strides[1], - ) - - yolo3 = YoloLayer( - anchor_mask=anchor_masks[2], - num_classes=n_classes, - anchors=anchors_array, - num_anchors=num_anchors, - stride=strides[2], - ) - - y1 = yolo1(output[0]) - y2 = yolo2(output[1]) - y3 = yolo3(output[2]) - - return y1, y2, y3 - - @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) -def test_yolov4(device, reset_seeds, model_location_generator): - torch.manual_seed(0) +@pytest.mark.parametrize( + "use_pretrained_weight", + [True, False], + ids=[ + "pretrained_weight_true", + "pretrained_weight_false", + ], +) +def test_yolov4_model(device, model_location_generator, reset_seeds, input_path, use_pretrained_weight): model_path = model_location_generator("models", model_subdir="Yolo") + if use_pretrained_weight: + if model_path == "models": + if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"): # check if yolov4.th is availble + os.system( + "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh" + ) # execute the yolov4_weights_download.sh file + + weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" + else: + weights_pth = str(model_path / "yolov4.pth") - if model_path == "models": - if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"): # check if yolov4.th is availble - os.system( - 
"tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh" - ) # execute the yolov4_weights_download.sh file + ttnn_model = TtYOLOv4(device, weights_pth) + torch_model = Yolov4() + new_state_dict = {} + ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} - weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" - else: - weights_pth = str(model_path / "yolov4.pth") + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] - ttnn_model = TtYOLOv4(weights_pth, device) + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] - imgfile = "models/demos/yolov4/demo/giraffe_320.jpg" - width = 320 - height = 320 - img = cv2.imread(imgfile) - img = cv2.resize(img, (width, height)) - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image - img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) - elif type(img) == np.ndarray and len(img.shape) == 4: - img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) + torch_model.load_state_dict(new_state_dict) + torch_model.eval() else: - exit() - torch_input = torch.autograd.Variable(img) - - input_tensor = torch.permute(torch_input, (0, 2, 3, 1)) - ttnn_input = ttnn.from_torch(input_tensor, ttnn.bfloat16) - - torch_model = Yolov4() - new_state_dict = dict(zip(torch_model.state_dict().keys(), ttnn_model.torch_model.values())) - torch_model.load_state_dict(new_state_dict) - torch_model.eval() - - torch_output_tensor = torch_model(torch_input) - - ref1, ref2, ref3 = gen_yolov4_boxes_confs(torch_output_tensor) - ref_boxes, ref_confs = get_region_boxes([ref1, ref2, ref3]) - - ttnn_output_tensor = ttnn_model(ttnn_input) - result_boxes_padded = ttnn.to_torch(ttnn_output_tensor[0]) - result_confs = ttnn.to_torch(ttnn_output_tensor[1]) - - result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) - result_boxes_list = [] - # Unpadding - # That ttnn tensor is the concat output of 3 padded tensors - # As a perf workaround I'm doing the unpadding on the torch output here. 
- # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized - box_1_start_i = 0 - box_1_end_i = 6100 - box_2_start_i = 6128 - box_2_end_i = 6228 - box_3_start_i = 6256 - box_3_end_i = 6356 - result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) - result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) - result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) - result_boxes = torch.cat(result_boxes_list, dim=1) - - ## Giraffe image detection - conf_thresh = 0.3 - nms_thresh = 0.4 - output = [result_boxes.to(torch.float16), result_confs.to(torch.float16)] - - boxes = post_processing(img, conf_thresh, nms_thresh, output) + torch_model = Yolov4.from_random_weights() + ttnn_weights = update_weight_parameters(OrderedDict(torch_model.state_dict())) + ttnn_model = TtYOLOv4(device, ttnn_weights) + + n_classes = 80 namesfile = "models/demos/yolov4/demo/coco.names" - class_names = load_class_names(namesfile) + if input_path == "": + imgfile = "models/demos/yolov4/demo/giraffe_320.jpg" + else: + imgfile = input_path + width = 320 + height = 320 + img = cv2.imread(imgfile) - plot_boxes_cv2(img, boxes[0], "ttnn_yolov4_320_prediction_demo.jpg", class_names) + + # Inference input size is 416*416 does not mean training size is the same + # Training size could be 608*608 or even other sizes + # Optional inference sizes: + # Hight in {320, 416, 512, 608, ... 320 + 96 * n} + # Width in {320, 416, 512, 608, ... 320 + 96 * m} + sized = cv2.resize(img, (width, height)) + sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) + + for i in range(2): # This 'for' loop is for speed check + # Because the first iteration is usually longer + do_detect(ttnn_model, sized, 0.3, 0.4, n_classes, device, class_name=namesfile, imgfile=imgfile) diff --git a/models/demos/yolov4/tests/test_perf_yolo.py b/models/demos/yolov4/tests/test_perf_yolo.py index e5f299b7519..1b07addbbfe 100644 --- a/models/demos/yolov4/tests/test_perf_yolo.py +++ b/models/demos/yolov4/tests/test_perf_yolo.py @@ -26,11 +26,11 @@ def get_expected_compile_time_sec(): - return 75 + return 60 def get_expected_inference_time_sec(): - return 0.35 + return 0.237 @pytest.mark.models_performance_bare_metal @@ -60,15 +60,14 @@ def test_yolov4( weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" else: weights_pth = str(model_path / "yolov4.pth") - ttnn_model = TtYOLOv4(weights_pth, device) + ttnn_model = TtYOLOv4(device, weights_pth) torch_input_tensor = torch.rand(input_shape, dtype=torch.bfloat16) ttnn_input = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) logger.info(f"Compiling model with warmup run") profiler.start(f"inference_and_compile_time") - ttnn_output_tensor = ttnn_model(ttnn_input) - + out1, out2, out3 = ttnn_model(ttnn_input) profiler.end(f"inference_and_compile_time") inference_and_compile_time = profiler.get("inference_and_compile_time") @@ -80,8 +79,10 @@ def test_yolov4( for idx in range(iterations): profiler.start("inference_time") profiler.start(f"inference_time_{idx}") - ttnn_output_tensor = ttnn_model(ttnn_input) - + out1, out2, out3 = ttnn_model(ttnn_input) + outputs.append(ttnn.from_device(out1, blocking=False)) + outputs.append(ttnn.from_device(out2, blocking=False)) + outputs.append(ttnn.from_device(out3, blocking=False)) profiler.end(f"inference_time_{idx}") profiler.end("inference_time") @@ -125,7 +126,7 @@ def test_perf_device_bare_metal_yolov4(batch_size, model_name): num_iterations = 1 margin = 0.03 - expected_perf = 102 + expected_perf = 234 command = 
f"pytest tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py" cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] diff --git a/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py b/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py index f8b5486060c..0968152e3ce 100644 --- a/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py +++ b/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py @@ -9,6 +9,8 @@ is_wormhole_b0, ) from models.demos.yolov4.tests.yolov4_test_infra import create_test_infra +from models.demos.yolov4.demo.demo import YoloLayer + try: from tracy import signpost @@ -29,6 +31,175 @@ def buffer_address(tensor): ttnn.buffer_address = buffer_address +def run_yolov4_inference( + device, + device_batch_size, + act_dtype, + weight_dtype, + model_location_generator, +): + test_infra = create_test_infra( + device, + device_batch_size, + act_dtype, + weight_dtype, + model_location_generator=model_location_generator, + ) + + tt_inputs_host, self.input_mem_config = test_infra.setup_l1_sharded_input(device) + + # # First run configures convs JIT + test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) + test_infra.run() + test_infra.validate() + test_infra.dealloc_output() + + # Optimized run + test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) + test_infra.run() + test_infra.validate() + test_infra.dealloc_output() + + # More optimized run with caching + if use_signpost: + signpost(header="start") + test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) + test_infra.run() + if use_signpost: + signpost(header="stop") + test_infra.validate() + test_infra.dealloc_output() + + +def run_yolov4_trace_inference( + device, + device_batch_size, + act_dtype, + weight_dtype, + model_location_generator, +): + test_infra = create_test_infra( + device, + device_batch_size, + act_dtype, + weight_dtype, + model_location_generator=model_location_generator, + ) + tt_inputs_host, self.input_mem_config = test_infra.setup_l1_sharded_input(device) + + # First run configures convs JIT + test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) + spec = test_infra.input_tensor.spec + test_infra.run() + test_infra.validate() + test_infra.dealloc_output() + + # Optimized run + test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) + test_infra.run() + test_infra.validate() + + # Capture + test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) + test_infra.dealloc_output() + trace_input_addr = ttnn.buffer_address(test_infra.input_tensor) + self.tid = ttnn.begin_trace_capture(device, cq_id=0) + test_infra.run() + tt_image_res = ttnn.allocate_tensor_on_device(spec, device) + ttnn.end_trace_capture(device, self.tid, cq_id=0) + assert trace_input_addr == ttnn.buffer_address(tt_image_res) + + # More optimized run with caching + if use_signpost: + signpost(header="start") + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 0) + ttnn.execute_trace(device, self.tid, cq_id=0, blocking=True) + if use_signpost: + signpost(header="stop") + test_infra.validate() + + ttnn.release_trace(device, self.tid) + test_infra.dealloc_output() + + +def run_yolov4_trace_2cqs_inference( + device, + device_batch_size, + act_dtype, + weight_dtype, + model_location_generator, +): + test_infra = create_test_infra( + device, + device_batch_size, + act_dtype, + weight_dtype, + model_location_generator=model_location_generator, + ) + tt_inputs_host, sharded_mem_config_DRAM, self.input_mem_config = 
test_infra.setup_dram_sharded_input(device) + tt_image_res = tt_inputs_host.to(device, sharded_mem_config_DRAM) + op_event = ttnn.create_event(device) + write_event = ttnn.create_event(device) + # Initialize the op event so we can write + ttnn.record_event(0, op_event) + + # First run configures convs JIT + ttnn.wait_for_event(1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(1, write_event) + ttnn.wait_for_event(0, write_event) + test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, self.input_mem_config) + spec = test_infra.input_tensor.spec + ttnn.record_event(0, op_event) + test_infra.run() + test_infra.validate() + test_infra.dealloc_output() + + # Optimized run + ttnn.wait_for_event(1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(1, write_event) + ttnn.wait_for_event(0, write_event) + test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, self.input_mem_config) + ttnn.record_event(0, op_event) + test_infra.run() + test_infra.validate() + + # Capture + ttnn.wait_for_event(1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(1, write_event) + ttnn.wait_for_event(0, write_event) + test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, self.input_mem_config) + ttnn.record_event(0, op_event) + test_infra.dealloc_output() + trace_input_addr = ttnn.buffer_address(test_infra.input_tensor) + self.tid = ttnn.begin_trace_capture(device, cq_id=0) + test_infra.run() + self.input_tensor = ttnn.allocate_tensor_on_device(spec, device) + ttnn.end_trace_capture(device, self.tid, cq_id=0) + assert trace_input_addr == ttnn.buffer_address(self.input_tensor) + + # More optimized run with caching + if use_signpost: + signpost(header="start") + for iter in range(0, 2): + ttnn.wait_for_event(1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(1, write_event) + ttnn.wait_for_event(0, write_event) + # TODO: Add in place support to ttnn to_memory_config + self.input_tensor = ttnn.reshard(tt_image_res, self.input_mem_config, self.input_tensor) + ttnn.record_event(0, op_event) + ttnn.execute_trace(device, self.tid, cq_id=0, blocking=False) + ttnn.synchronize_devices(device) + + if use_signpost: + signpost(header="stop") + + ttnn.release_trace(device, self.tid) + + class Yolov4Trace2CQ: def __init__(self): ... 
@@ -96,7 +267,12 @@ def initialize_yolov4_trace_2cqs_inference( self.device = device + # More optimized run with caching + # if use_signpost: + # signpost(header="start") + def get_region_boxes(self, boxes_and_confs): + print("Getting boxes from boxes and confs ...") boxes_list = [] confs_list = [] @@ -104,6 +280,8 @@ def get_region_boxes(self, boxes_and_confs): boxes_list.append(item[0]) confs_list.append(item[1]) + # boxes: [batch, num1 + num2 + num3, 1, 4] + # confs: [batch, num1 + num2 + num3, num_classes] boxes = torch.cat(boxes_list, dim=1) confs = torch.cat(confs_list, dim=1) @@ -120,29 +298,57 @@ def execute_yolov4_trace_2cqs_inference(self, tt_inputs_host=None): ttnn.record_event(0, self.op_event) ttnn.execute_trace(self.device, self.tid, cq_id=0, blocking=False) ttnn.synchronize_devices(self.device) + output = self.test_infra.output_tensor + + output_tensor1 = ttnn.to_torch(output[0]) + output_tensor1 = output_tensor1.reshape(1, 40, 40, 255) + output_tensor1 = torch.permute(output_tensor1, (0, 3, 1, 2)) + + output_tensor2 = ttnn.to_torch(output[1]) + output_tensor2 = output_tensor2.reshape(1, 20, 20, 255) + output_tensor2 = torch.permute(output_tensor2, (0, 3, 1, 2)) + + output_tensor3 = ttnn.to_torch(output[2]) + output_tensor3 = output_tensor3.reshape(1, 10, 10, 255) + output_tensor3 = torch.permute(output_tensor3, (0, 3, 1, 2)) + + n_classes = 80 + + yolo1 = YoloLayer( + anchor_mask=[0, 1, 2], + num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, + stride=8, + ) + + yolo2 = YoloLayer( + anchor_mask=[3, 4, 5], + num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, + stride=16, + ) + + yolo3 = YoloLayer( + anchor_mask=[6, 7, 8], + num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, + stride=32, + ) + + y1 = yolo1(output_tensor1) + y2 = yolo2(output_tensor2) + y3 = yolo3(output_tensor3) + + output = self.get_region_boxes([y1, y2, y3]) + + return output + # return self.test_infra.output_tensor - ttnn_output_tensor = self.test_infra.output_tensor - - result_boxes_padded = ttnn.to_torch(ttnn_output_tensor[0]) - result_confs = ttnn.to_torch(ttnn_output_tensor[1]) - - result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) - result_boxes_list = [] - # That ttnn tensor is the concat output of 3 padded tensors - # As a perf workaround I'm doing the unpadding on the torch output here. 
- # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized - box_1_start_i = 0 - box_1_end_i = 6100 - box_2_start_i = 6128 - box_2_end_i = 6228 - box_3_start_i = 6256 - box_3_end_i = 6356 - result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) - result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) - result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) - result_boxes = torch.cat(result_boxes_list, dim=1) - - return [result_boxes, result_confs] + # if use_signpost: + # signpost(header="stop") def release_yolov4_trace_2cqs_inference(self): ttnn.release_trace(self.device, self.tid) diff --git a/models/demos/yolov4/tests/yolov4_test_infra.py b/models/demos/yolov4/tests/yolov4_test_infra.py index 474e2f2e87e..1c82369c476 100644 --- a/models/demos/yolov4/tests/yolov4_test_infra.py +++ b/models/demos/yolov4/tests/yolov4_test_infra.py @@ -11,8 +11,6 @@ import ttnn from models.demos.yolov4.reference.yolov4 import Yolov4 from models.demos.yolov4.ttnn.yolov4 import TtYOLOv4 -from models.demos.yolov4.demo.demo import YoloLayer, get_region_boxes, gen_yolov4_boxes_confs - from models.utility_functions import ( is_wormhole_b0, @@ -42,7 +40,15 @@ def load_yolov4_weight(model_location_generator=None): def load_yolov4_model(ttnn_model): torch_model = Yolov4() - new_state_dict = dict(zip(torch_model.state_dict().keys(), ttnn_model.torch_model.values())) + new_state_dict = {} + ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} + + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] + torch_model.load_state_dict(new_state_dict) torch_model.eval() return torch_model @@ -66,16 +72,13 @@ def __init__( self.act_dtype = act_dtype self.weight_dtype = weight_dtype self.model_location_generator = model_location_generator - self.ttnn_yolov4_model = TtYOLOv4(load_yolov4_weight(self.model_location_generator), device) - + self.ttnn_yolov4_model = TtYOLOv4(device, load_yolov4_weight(self.model_location_generator)) torch_model = load_yolov4_model(self.ttnn_yolov4_model) input_shape = (1, 320, 320, 3) torch_input_tensor = torch.randn(input_shape, dtype=torch.float32) self.input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) self.torch_input_tensor = torch_input_tensor.permute(0, 3, 1, 2) self.torch_output_tensor = torch_model(self.torch_input_tensor) - ref1, ref2, ref3 = gen_yolov4_boxes_confs(self.torch_output_tensor) - self.ref_boxes, self.ref_confs = get_region_boxes([ref1, ref2, ref3]) def run(self): self.output_tensor = self.ttnn_yolov4_model(self.input_tensor) @@ -127,42 +130,38 @@ def setup_dram_sharded_input(self, device, torch_input_tensor=None, mesh_mapper= def validate(self, output_tensor=None): output_tensor = self.output_tensor if output_tensor is None else output_tensor - result_boxes_padded = ttnn.to_torch(self.output_tensor[0]) - result_confs = ttnn.to_torch(self.output_tensor[1]) - - result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) - result_boxes_list = [] - # That ttnn tensor is the concat output of 3 padded tensors - # As a perf workaround I'm doing the unpadding on the torch output here. 
- # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized - box_1_start_i = 0 - box_1_end_i = 6100 - box_2_start_i = 6128 - box_2_end_i = 6228 - box_3_start_i = 6256 - box_3_end_i = 6356 - result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) - result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) - result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) - result_boxes = torch.cat(result_boxes_list, dim=1) - - valid_pcc = 0.99 - self.pcc_passed, self.pcc_message = assert_with_pcc(self.ref_boxes, result_boxes, pcc=valid_pcc) + output_tensor = ttnn.to_torch(self.output_tensor[0]) + output_tensor = output_tensor.reshape(1, 40, 40, 255) + output_tensor = torch.permute(output_tensor, (0, 3, 1, 2)) + + valid_pcc = 0.985 + self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor[0], output_tensor, pcc=valid_pcc) logger.info( - f"Yolov4 - Bboxes. batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" + f"Yolov4 batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" ) - valid_pcc = 0.71 - self.pcc_passed, self.pcc_message = assert_with_pcc(self.ref_confs, result_confs, pcc=valid_pcc) + output_tensor = ttnn.to_torch(self.output_tensor[1]) + output_tensor = torch.reshape(output_tensor, (self.batch_size, 20, 20, 255)) + output_tensor = torch.permute(output_tensor, (0, 3, 1, 2)) + self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor[1], output_tensor, pcc=valid_pcc) + + logger.info( + f"Yolov4 batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" + ) + output_tensor = ttnn.to_torch(self.output_tensor[2]) + output_tensor = torch.reshape(output_tensor, (self.batch_size, 10, 10, 255)) + output_tensor = torch.permute(output_tensor, (0, 3, 1, 2)) + self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor[2], output_tensor, pcc=valid_pcc) logger.info( - f"Yolov4 - Confs. 
batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" + f"Yolov4 batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" ) def dealloc_output(self): ttnn.deallocate(self.output_tensor[0]) ttnn.deallocate(self.output_tensor[1]) + ttnn.deallocate(self.output_tensor[2]) def create_test_infra( diff --git a/models/demos/yolov4/ttnn/common.py b/models/demos/yolov4/ttnn/common.py index e20814a3a73..70ead902094 100644 --- a/models/demos/yolov4/ttnn/common.py +++ b/models/demos/yolov4/ttnn/common.py @@ -52,17 +52,9 @@ def __init__( else: weight = model[path + ".conv.0.weight"] bias = model[path + ".conv.0.bias"] - # padding the channel dim in the last conv in the head module from 255 to 256 - # to avoid additional padding in the model graph - if weight.shape[0] == 255: - weight = torch.nn.functional.pad(weight, (0, 0, 0, 0, 0, 0, 0, 1)) self.weights = ttnn.from_torch(weight) bias = bias.reshape(1, 1, 1, -1) - # padding the channel dim in the last conv in the head module from 255 to 256 - if bias.shape[-1] == 255: - bias = torch.nn.functional.pad(bias, (0, 1, 0, 0, 0, 0, 0, 0)) self.bias = ttnn.from_torch(bias) - self.input_params = input_params self.kernel_size = (self.weights.shape[2], self.weights.shape[3]) self.conv_params = conv_params diff --git a/models/demos/yolov4/ttnn/genboxes.py b/models/demos/yolov4/ttnn/genboxes.py deleted file mode 100644 index fb8bb49867d..00000000000 --- a/models/demos/yolov4/ttnn/genboxes.py +++ /dev/null @@ -1,256 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import torch -import math -import numpy as np -import ttnn -from models.utility_functions import _nearest_32 - - -def create_conv_bias_tensor(torch_tensor, N, K, pad=0): - bias_shape = [1, 1, N, K] - bias_padded_shape = [1, 1, _nearest_32(N), _nearest_32(K)] - tt_tensor = ttnn.Tensor(torch.flatten(torch_tensor).tolist(), bias_shape, ttnn.bfloat16, ttnn.ROW_MAJOR_LAYOUT).pad( - bias_shape, (0, 0, 0, 0), 0.0 - ) - tt_tensor = tt_tensor.pad_to_tile(pad).to(ttnn.TILE_LAYOUT) - return tt_tensor - - -class TtGenBoxes: - def __init__(self, device) -> None: - self.thresh = 0.6 - self.num_classes = 80 - self.num_anchors = 3 - - self.grid_x = [] - self.grid_y = [] - for H in (40, 20, 10): - grid_x_i = torch.reshape( - torch.flatten( - torch.from_numpy( - np.expand_dims( - np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=0).repeat(H, 0), axis=0), - axis=0, - ) - ) - ), - (1, 1, 1, H * H), - ) - - grid_y_i = torch.reshape( - torch.flatten( - torch.from_numpy( - np.expand_dims( - np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(H, 1), axis=0), - axis=0, - ) - ) - ), - (1, 1, 1, H * H), - ) - self.grid_x.append( - ttnn.from_torch(grid_x_i, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) - ) # , 1, H*H)) - self.grid_y.append( - ttnn.from_torch(grid_y_i, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) - ) # , 1, H*H)) - - def __call__(self, device, input_tensor): - B, __, HW, dim = input_tensor.shape - H = W = int(math.sqrt(HW)) - AHW = self.num_anchors * HW - A = self.num_anchors - - if HW == 1600: - group = 0 - elif HW == 400: - group = 1 - elif HW == 100: - group = 2 - - # Pre-derived from the torch function - if group == 0: - anchor_w_a = 1.5 - anchor_w_b = 2.375 - anchor_w_c = 5.0 - anchor_h_a = 2.0 - anchor_h_b = 4.5 - anchor_h_c = 3.5 - elif group == 1: - anchor_w_a = 2.25 - 
anchor_w_b = 4.75 - anchor_w_c = 4.5 - anchor_h_a = 4.6875 - anchor_h_b = 3.4375 - anchor_h_c = 9.125 - elif group == 2: - anchor_w_a = 4.4375 - anchor_w_b = 6.0 - anchor_w_c = 14.34375 - anchor_h_a = 3.4375 - anchor_h_b = 7.59375 - anchor_h_c = 12.53125 - - input_tensor_i = ttnn.to_memory_config(input_tensor, ttnn.L1_MEMORY_CONFIG) - input_tensor_i = ttnn.to_layout(input_tensor_i, ttnn.ROW_MAJOR_LAYOUT) - input_tensor_i = ttnn.permute(input_tensor_i, (0, 1, 3, 2)) - - # first anchor - bx_a = ttnn.slice(input_tensor_i, [0, 0, 0, 0], [1, 1, 1, HW]) - by_a = ttnn.slice(input_tensor_i, [0, 0, 1, 0], [1, 1, 2, HW]) - bw_a = ttnn.slice(input_tensor_i, [0, 0, 2, 0], [1, 1, 3, HW]) - bh_a = ttnn.slice(input_tensor_i, [0, 0, 3, 0], [1, 1, 4, HW]) - det_confs_a = ttnn.slice(input_tensor_i, [0, 0, 4, 0], [1, 1, 5, HW]) - cls_confs_a = ttnn.slice(input_tensor_i, [0, 0, 5, 0], [1, 1, 85, HW]) - # second anchor - bx_b = ttnn.slice(input_tensor_i, [0, 0, 85, 0], [1, 1, 86, HW]) - by_b = ttnn.slice(input_tensor_i, [0, 0, 86, 0], [1, 1, 87, HW]) - bw_b = ttnn.slice(input_tensor_i, [0, 0, 87, 0], [1, 1, 88, HW]) - bh_b = ttnn.slice(input_tensor_i, [0, 0, 88, 0], [1, 1, 89, HW]) - det_confs_b = ttnn.slice(input_tensor_i, [0, 0, 89, 0], [1, 1, 90, HW]) - cls_confs_b = ttnn.slice(input_tensor_i, [0, 0, 90, 0], [1, 1, 170, HW]) - # third anchor - bx_c = ttnn.slice(input_tensor_i, [0, 0, 170, 0], [1, 1, 171, HW]) - by_c = ttnn.slice(input_tensor_i, [0, 0, 171, 0], [1, 1, 172, HW]) - bw_c = ttnn.slice(input_tensor_i, [0, 0, 172, 0], [1, 1, 173, HW]) - bh_c = ttnn.slice(input_tensor_i, [0, 0, 173, 0], [1, 1, 174, HW]) - det_confs_c = ttnn.slice(input_tensor_i, [0, 0, 174, 0], [1, 1, 175, HW]) - cls_confs_c = ttnn.slice(input_tensor_i, [0, 0, 175, 0], [1, 1, 255, HW]) - - ############# - # Confs - ############# - - det_confs_a = ttnn.to_layout(det_confs_a, ttnn.TILE_LAYOUT) - det_confs_b = ttnn.to_layout(det_confs_b, ttnn.TILE_LAYOUT) - det_confs_c = ttnn.to_layout(det_confs_c, ttnn.TILE_LAYOUT) - cls_confs_a = ttnn.to_layout(cls_confs_a, ttnn.TILE_LAYOUT) - cls_confs_b = ttnn.to_layout(cls_confs_b, ttnn.TILE_LAYOUT) - cls_confs_c = ttnn.to_layout(cls_confs_c, ttnn.TILE_LAYOUT) - - det_confs_a = ttnn.sigmoid(det_confs_a) - det_confs_b = ttnn.sigmoid(det_confs_b) - det_confs_c = ttnn.sigmoid(det_confs_c) - cls_confs_a = ttnn.sigmoid(cls_confs_a) - cls_confs_b = ttnn.sigmoid(cls_confs_b) - cls_confs_c = ttnn.sigmoid(cls_confs_c) - - confs_a = ttnn.multiply(det_confs_a, cls_confs_a) - confs_b = ttnn.multiply(det_confs_b, cls_confs_b) - confs_c = ttnn.multiply(det_confs_c, cls_confs_c) - - confs = ttnn.concat([confs_a, confs_b, confs_c], dim=1) - confs = ttnn.permute(confs, (0, 1, 3, 2)) - confs = ttnn.reshape(confs, (B, AHW, self.num_classes)) - - ################# - ## Boxes - ################# - - # expensive TilizeWithValPadding - bx_a = ttnn.to_layout(bx_a, ttnn.TILE_LAYOUT) - by_a = ttnn.to_layout(by_a, ttnn.TILE_LAYOUT) - bw_a = ttnn.to_layout(bw_a, ttnn.TILE_LAYOUT) - bh_a = ttnn.to_layout(bh_a, ttnn.TILE_LAYOUT) - bx_a = ttnn.sigmoid(bx_a) - by_a = ttnn.sigmoid(by_a) - bw_a = ttnn.exp(bw_a) - bh_a = ttnn.exp(bh_a) - - bx_b = ttnn.to_layout(bx_b, ttnn.TILE_LAYOUT) - by_b = ttnn.to_layout(by_b, ttnn.TILE_LAYOUT) - bw_b = ttnn.to_layout(bw_b, ttnn.TILE_LAYOUT) - bh_b = ttnn.to_layout(bh_b, ttnn.TILE_LAYOUT) - bx_b = ttnn.sigmoid(bx_b) - by_b = ttnn.sigmoid(by_b) - bw_b = ttnn.exp(bw_b) - bh_b = ttnn.exp(bh_b) - - bx_c = ttnn.to_layout(bx_c, ttnn.TILE_LAYOUT) - by_c = ttnn.to_layout(by_c, ttnn.TILE_LAYOUT) - bw_c 
= ttnn.to_layout(bw_c, ttnn.TILE_LAYOUT) - bh_c = ttnn.to_layout(bh_c, ttnn.TILE_LAYOUT) - bx_c = ttnn.sigmoid(bx_c) - by_c = ttnn.sigmoid(by_c) - bw_c = ttnn.exp(bw_c) - bh_c = ttnn.exp(bh_c) - - #### - ## Grid tensor derivation - #### - - grid_x = self.grid_x[group] # .to(device, mem_config=ttnn.L1_MEMORY_CONFIG) - grid_y = self.grid_y[group] # .to(device, mem_config=ttnn.L1_MEMORY_CONFIG) - - bx_a = ttnn.add(bx_a, grid_x) - by_a = ttnn.add(by_a, grid_y) - bx_b = ttnn.add(bx_b, grid_x) - by_b = ttnn.add(by_b, grid_y) - bx_c = ttnn.add(bx_c, grid_x) - by_c = ttnn.add(by_c, grid_y) - - bx_a = ttnn.multiply(bx_a, 1 / W) - by_a = ttnn.multiply(by_a, 1 / H) - bx_b = ttnn.multiply(bx_b, 1 / W) - by_b = ttnn.multiply(by_b, 1 / H) - bx_c = ttnn.multiply(bx_c, 1 / W) - by_c = ttnn.multiply(by_c, 1 / H) - - bw_a = bw_a * (anchor_w_a / W) - bw_b = bw_b * (anchor_w_b / W) - bw_c = bw_c * (anchor_w_c / W) - - bh_a = bh_a * (anchor_h_a / H) - bh_b = bh_b * (anchor_h_b / H) - bh_c = bh_c * (anchor_h_c / H) - - bw_a_half = bw_a * (0.5) - bw_b_half = bw_b * (0.5) - bw_c_half = bw_c * (0.5) - - bh_a_half = bh_a * (0.5) - bh_b_half = bh_b * (0.5) - bh_c_half = bh_c * (0.5) - - bx1_a = bx_a - bw_a_half - by1_a = by_a - bh_a_half - bx2_a = bx1_a + bw_a - by2_a = by1_a + bh_a - - bx1_b = bx_b - bw_b_half - by1_b = by_b - bh_b_half - bx2_b = bx1_b + bw_b - by2_b = by1_b + bh_b - - bx1_c = bx_c - bw_c_half - by1_c = by_c - bh_c_half - bx2_c = bx1_c + bw_c - by2_c = by1_c + bh_c - - bx1_a = ttnn.to_layout(bx1_a, ttnn.ROW_MAJOR_LAYOUT) - bx2_a = ttnn.to_layout(bx2_a, ttnn.ROW_MAJOR_LAYOUT) - by1_a = ttnn.to_layout(by1_a, ttnn.ROW_MAJOR_LAYOUT) - by2_a = ttnn.to_layout(by2_a, ttnn.ROW_MAJOR_LAYOUT) - - bx1_b = ttnn.to_layout(bx1_b, ttnn.ROW_MAJOR_LAYOUT) - bx2_b = ttnn.to_layout(bx2_b, ttnn.ROW_MAJOR_LAYOUT) - by1_b = ttnn.to_layout(by1_b, ttnn.ROW_MAJOR_LAYOUT) - by2_b = ttnn.to_layout(by2_b, ttnn.ROW_MAJOR_LAYOUT) - - bx1_c = ttnn.to_layout(bx1_c, ttnn.ROW_MAJOR_LAYOUT) - bx2_c = ttnn.to_layout(bx2_c, ttnn.ROW_MAJOR_LAYOUT) - by1_c = ttnn.to_layout(by1_c, ttnn.ROW_MAJOR_LAYOUT) - by2_c = ttnn.to_layout(by2_c, ttnn.ROW_MAJOR_LAYOUT) - - bx1 = ttnn.concat([bx1_a, bx1_b, bx1_c], dim=2) - by1 = ttnn.concat([by1_a, by1_b, by1_c], dim=2) - bx2 = ttnn.concat([bx2_a, bx2_b, bx2_c], dim=2) - by2 = ttnn.concat([by2_a, by2_b, by2_c], dim=2) - - # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4] - boxes = ttnn.concat((bx1, by1, bx2, by2), dim=1) - - return boxes, confs diff --git a/models/demos/yolov4/ttnn/yolov4.py b/models/demos/yolov4/ttnn/yolov4.py index 307e0fc55ca..42f1a9cd7fe 100644 --- a/models/demos/yolov4/ttnn/yolov4.py +++ b/models/demos/yolov4/ttnn/yolov4.py @@ -21,11 +21,10 @@ from models.demos.yolov4.ttnn.downsample5 import Down5 from models.demos.yolov4.ttnn.neck import TtNeck from models.demos.yolov4.ttnn.head import TtHead -from models.demos.yolov4.ttnn.genboxes import TtGenBoxes class TtYOLOv4: - def __init__(self, path, device) -> None: + def __init__(self, device, path) -> None: if type(path) is str: self.torch_model = torch.load(path) else: @@ -40,12 +39,7 @@ def __init__(self, path, device) -> None: self.neck = TtNeck(device, self) self.head = TtHead(device, self) - self.boxes_confs_0 = TtGenBoxes(device) - self.boxes_confs_1 = TtGenBoxes(device) - self.boxes_confs_2 = TtGenBoxes(device) - self.downs = [] # [self.down1] - self.device = device def __call__(self, input_tensor): d1 = self.down1(input_tensor) @@ -58,32 +52,7 @@ def __call__(self, input_tensor): x20, x13, x6 = 
self.neck([d5, d4, d3]) x4, x5, x6 = self.head([x20, x13, x6]) - orig = 0 - if orig: - return x4, x5, x6 - else: - x4_boxes_confs = self.boxes_confs_0(self.device, x4) - x5_boxes_confs = self.boxes_confs_1(self.device, x5) - x6_boxes_confs = self.boxes_confs_2(self.device, x6) - - confs_1 = ttnn.to_layout(x4_boxes_confs[1], ttnn.ROW_MAJOR_LAYOUT) - confs_2 = ttnn.to_layout(x5_boxes_confs[1], ttnn.ROW_MAJOR_LAYOUT) - confs_3 = ttnn.to_layout(x6_boxes_confs[1], ttnn.ROW_MAJOR_LAYOUT) - confs = ttnn.concat([confs_1, confs_2, confs_3], dim=1) - - boxes_1 = ttnn.to_layout(x4_boxes_confs[0], ttnn.ROW_MAJOR_LAYOUT) - boxes_2 = ttnn.to_layout(x5_boxes_confs[0], ttnn.ROW_MAJOR_LAYOUT) - boxes_3 = ttnn.to_layout(x6_boxes_confs[0], ttnn.ROW_MAJOR_LAYOUT) - boxes_1 = ttnn.reshape(boxes_1, (1, 4, 1, 4800)) - boxes_2 = ttnn.reshape(boxes_2, (1, 4, 1, 1200)) - boxes_3 = ttnn.pad(boxes_3, ((0, 0), (0, 0), (0, 0), (0, 28)), 0) - boxes_3 = ttnn.reshape(boxes_3, (1, 4, 1, 384)) - boxes_1 = ttnn.permute(boxes_1, (0, 2, 3, 1)) - boxes_2 = ttnn.permute(boxes_2, (0, 2, 3, 1)) - boxes_3 = ttnn.permute(boxes_3, (0, 2, 3, 1)) - boxes = ttnn.concat([boxes_1, boxes_2, boxes_3], dim=2) - - return boxes, confs + return x4, x5, x6 def __str__(self) -> str: this_str = "" diff --git a/models/demos/yolov4/web_demo/README.md b/models/demos/yolov4/web_demo/README.md index 5b112cadaa6..d35bb31c518 100644 --- a/models/demos/yolov4/web_demo/README.md +++ b/models/demos/yolov4/web_demo/README.md @@ -12,11 +12,6 @@ pip install -r models/demos/yolov4/web_demo/server/requirements.txt ``` -- After installing the server side requirments, ONLY if you are running the demo on an N300 card,run the following to export the approprite envirement variable for N300. - ``` - export WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml - ``` - - From the server run: ``` source models/demos/yolov4/web_demo/server/run_uvicorn.sh diff --git a/models/demos/yolov4/web_demo/client/coco.names b/models/demos/yolov4/web_demo/client/coco.names deleted file mode 100644 index ca76c80b5b2..00000000000 --- a/models/demos/yolov4/web_demo/client/coco.names +++ /dev/null @@ -1,80 +0,0 @@ -person -bicycle -car -motorbike -aeroplane -bus -train -truck -boat -traffic light -fire hydrant -stop sign -parking meter -bench -bird -cat -dog -horse -sheep -cow -elephant -bear -zebra -giraffe -backpack -umbrella -handbag -tie -suitcase -frisbee -skis -snowboard -sports ball -kite -baseball bat -baseball glove -skateboard -surfboard -tennis racket -bottle -wine glass -cup -fork -knife -spoon -bowl -banana -apple -sandwich -orange -broccoli -carrot -hot dog -pizza -donut -cake -chair -sofa -pottedplant -bed -diningtable -toilet -tvmonitor -laptop -mouse -remote -keyboard -cell phone -microwave -oven -toaster -sink -refrigerator -book -clock -vase -scissors -teddy bear -hair drier -toothbrush diff --git a/models/demos/yolov4/web_demo/client/requirements.txt b/models/demos/yolov4/web_demo/client/requirements.txt index be5f168cc74..282195275da 100644 --- a/models/demos/yolov4/web_demo/client/requirements.txt +++ b/models/demos/yolov4/web_demo/client/requirements.txt @@ -1,4 +1,3 @@ opencv-python==4.6.0.66 streamlit==1.26.0 streamlit-webrtc==0.47.0 -orjson==3.10.12 diff --git a/models/demos/yolov4/web_demo/client/yolov4.py b/models/demos/yolov4/web_demo/client/yolov4.py index ada420cbdad..5fc4ea6c692 100644 --- a/models/demos/yolov4/web_demo/client/yolov4.py +++ b/models/demos/yolov4/web_demo/client/yolov4.py @@ -11,9 +11,7 @@ import cv2 import requests import torch -import orjson 
import av -import logging import streamlit as st import numpy as np @@ -22,16 +20,78 @@ from streamlit_webrtc import VideoProcessorBase, webrtc_streamer -# Configure the logger -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()] -) - - class VideoProcessor(VideoProcessorBase): def __init__(self): self.frame_count = 0 + def post_processing(self, img, conf_thresh, nms_thresh, output): + box_array = output[0] + confs = output[1].float() + + t1 = time.time() + + if type(box_array).__name__ != "ndarray": + box_array = box_array.cpu().detach().numpy() + confs = confs.cpu().detach().numpy() + + num_classes = confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + t2 = time.time() + + bboxes_batch = [] + for i in range(box_array.shape[0]): + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for each class + for j in range(num_classes): + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = self.nms_cpu(ll_box_array, ll_max_conf, nms_thresh) + + if keep.size > 0: + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + + for k in range(ll_box_array.shape[0]): + bboxes.append( + [ + ll_box_array[k, 0], + ll_box_array[k, 1], + ll_box_array[k, 2], + ll_box_array[k, 3], + ll_max_conf[k], + ll_max_conf[k], + ll_max_id[k], + ] + ) + + bboxes_batch.append(bboxes) + + t3 = time.time() + + print("-----------------------------------") + print(" max and argmax : %f" % (t2 - t1)) + print(" nms : %f" % (t3 - t2)) + print("Post processing total : %f" % (t3 - t1)) + print("-----------------------------------") + + return bboxes_batch + def load_class_names(self, namesfile): class_names = [] with open(namesfile, "r") as fp: @@ -41,6 +101,41 @@ def load_class_names(self, namesfile): class_names.append(line) return class_names + def nms_cpu(self, boxes, confs, nms_thresh=0.5, min_mode=False): + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1) * (y2 - y1) + order = confs.argsort()[::-1] + + keep = [] + while order.size > 0: + idx_self = order[0] + idx_other = order[1:] + + keep.append(idx_self) + + xx1 = np.maximum(x1[idx_self], x1[idx_other]) + yy1 = np.maximum(y1[idx_self], y1[idx_other]) + xx2 = np.minimum(x2[idx_self], x2[idx_other]) + yy2 = np.minimum(y2[idx_self], y2[idx_other]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + + if min_mode: + over = inter / np.minimum(areas[order[0]], areas[order[1:]]) + else: + over = inter / (areas[order[0]] + areas[order[1:]] - inter) + + inds = np.where(over <= nms_thresh)[0] + order = order[inds + 1] + + return np.array(keep) + def plot_boxes_cv2(self, bgr_img, boxes, savename=None, class_names=None, color=None): img = np.copy(bgr_img) colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) @@ -101,60 +196,52 @@ def get_color(c, x, max_val): def recv(self, frame): t0 = time.time() - - # Convert frame to PIL image and resize pil_image = frame.to_image() - pil_image = pil_image.resize((320, 320)) # Resize to target dimensions + # resize on the client side 
+ new_size = (320, 320) + pil_image = pil_image.resize(new_size) t1 = time.time() - - # Save image as JPEG in-memory with optimized settings buf = io.BytesIO() - pil_image.save(buf, format="JPEG", quality=85, optimize=True) + pil_image.save(buf, format="JPEG") byte_im = buf.getvalue() file = {"file": byte_im} + # Argument Parser to grab namespace_id of server pod from user + parser = argparse.ArgumentParser(description="YOLOv4 script") + parser.add_argument("--api-url", type=str, help="URL for the object detection API", required=True) + args = parser.parse_args() + apiurl = args.api_url + url = f"{apiurl}/objdetection_v2" + r = requests.post(url, files=file) - # Parse API URL once at the class level for efficiency - if not hasattr(self, "api_url"): - parser = argparse.ArgumentParser(description="YOLOv4 script") - parser.add_argument("--api-url", type=str, required=True, help="URL for the object detection API") - args = parser.parse_args() - self.api_url = args.api_url - - url = f"{self.api_url}/objdetection_v2" - - try: - # Use a persistent session for multiple requests - with requests.Session() as session: - # Post request with a timeout - response = session.post(url, files=file, timeout=5) - - # Check if response is successful - if response.status_code == 200: - # Parse JSON response - output = orjson.loads(response.content) - else: - print(f"Request failed with status code {response.status_code}") - # return None - except requests.exceptions.RequestException as e: - print(f"Request failed: {e}") - return None + if r.status_code == 200: + try: + # Get the JSON response as a dictionary + response_dict = r.json() + output = [torch.tensor(tensor_data) for tensor_data in response_dict["output"]] + except ValueError: + st.error("Failed to parse JSON. The response is not in JSON format.") + else: + st.error(f"Request failed with status code {r.status_code}") t3 = time.time() - # Convert frame to ndarray and perform post-processing bgr_image = frame.to_ndarray(format="bgr24") conf_thresh = 0.6 nms_thresh = 0.5 - - # Load class names and plot bounding boxes + boxes = self.post_processing(bgr_image, conf_thresh, nms_thresh, output) namesfile = "coco.names" class_names = self.load_class_names(namesfile) - image_final = self.plot_boxes_cv2(bgr_image, output, None, class_names) + # random_number = random.randint(1, 100) + # save_name = "ttnn_prediction_demo" + str(random_number) + ".jpg" + save_name = None + + image_final = self.plot_boxes_cv2(bgr_image, boxes[0], save_name, class_names) t4 = time.time() - logging.info( - f" IMG-IN | WH | Post | Total time: {(t1-t0):.3f} | {(t3-t1):.3f} | {(t4-t3):.3f} || {(t4-t0):.3f} " - ) + print() + print(f" IMG-IN | WH | Post | Total time: ") + print(f" {(t1-t0):.3f} | {(t3-t1):.3f} | {(t4-t3):.3f} || {(t4-t0):.3f} ") + # return image_final return av.VideoFrame.from_ndarray(image_final, format="bgr24") @@ -167,8 +254,10 @@ def recv(self, frame): media_stream_constraints={ "video": { "width": {"min": 320, "ideal": 400, "max": 960}, + # "height": {"min": 180, "ideal": 225, "max": 450}, "height": {"min": 320, "ideal": 400, "max": 960}, "frameRate": {"min": 1, "ideal": 50, "max": 60}, } }, + # async_processing=True # Use asynchronous processing for long tasks ) diff --git a/models/demos/yolov4/web_demo/server/fast_api_yolov4.py b/models/demos/yolov4/web_demo/server/fast_api_yolov4.py old mode 100644 new mode 100755 index 83af1d6e14b..19732cbc074 --- a/models/demos/yolov4/web_demo/server/fast_api_yolov4.py +++ b/models/demos/yolov4/web_demo/server/fast_api_yolov4.py 
@@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import json -import os -import logging from fastapi import FastAPI, File, UploadFile from io import BytesIO from PIL import Image @@ -27,43 +25,14 @@ async def root(): return {"message": "Hello World"} -# Configure the logger -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()] -) - - -def get_dispatch_core_type(): - # TODO: 11059 move dispatch_core_type to device_params when all tests are updated to not use WH_ARCH_YAML env flag - dispatch_core_type = ttnn.device.DispatchCoreType.WORKER - # if ("WH_ARCH_YAML" in os.environ) and os.environ["WH_ARCH_YAML"] == "wormhole_b0_80_arch_eth_dispatch.yaml": - if os.environ["WH_ARCH_YAML"] == "wormhole_b0_80_arch_eth_dispatch.yaml": - dispatch_core_type = ttnn.device.DispatchCoreType.ETH - return dispatch_core_type - - @app.on_event("startup") async def startup(): + device_id = 0 + device = ttnn.CreateDevice(device_id, l1_small_size=24576, trace_region_size=1617920, num_command_queues=2) + ttnn.enable_program_cache(device) global model - if ("WH_ARCH_YAML" in os.environ) and os.environ["WH_ARCH_YAML"] == "wormhole_b0_80_arch_eth_dispatch.yaml": - print("WH_ARCH_YAML:", os.environ.get("WH_ARCH_YAML")) - device_id = 0 - device = ttnn.CreateDevice( - device_id, - dispatch_core_type=get_dispatch_core_type(), - l1_small_size=24576, - trace_region_size=3211264, - num_command_queues=2, - ) - ttnn.enable_program_cache(device) - model = Yolov4Trace2CQ() - model.initialize_yolov4_trace_2cqs_inference(device) - else: - device_id = 0 - device = ttnn.CreateDevice(device_id, l1_small_size=24576, trace_region_size=3211264, num_command_queues=2) - ttnn.enable_program_cache(device) - model = Yolov4Trace2CQ() - model.initialize_yolov4_trace_2cqs_inference(device) + model = Yolov4Trace2CQ() + model.initialize_yolov4_trace_2cqs_inference(device) @app.on_event("shutdown") @@ -71,112 +40,16 @@ async def shutdown(): model.release_yolov4_trace_2cqs_inference() -def process_output(output): - outs = [] - output = output - cnt = 0 - for item in output: - cnt = cnt + 1 - output_i = [element.item() for element in item] - outs.append(output_i) - return outs - - -def post_processing(img, conf_thresh, nms_thresh, output): - box_array = output[0] - confs = output[1] - - box_array = np.array(box_array.to(torch.float32)) - confs = np.array(confs.to(torch.float32)) - - num_classes = confs.shape[2] - - # [batch, num, 4] - box_array = box_array[:, :, 0] - - # [batch, num, num_classes] --> [batch, num] - max_conf = np.max(confs, axis=2) - max_id = np.argmax(confs, axis=2) - - bboxes_batch = [] - for i in range(box_array.shape[0]): - argwhere = max_conf[i] > conf_thresh - l_box_array = box_array[i, argwhere, :] - l_max_conf = max_conf[i, argwhere] - l_max_id = max_id[i, argwhere] - - bboxes = [] - # nms for each class - for j in range(num_classes): - cls_argwhere = l_max_id == j - ll_box_array = l_box_array[cls_argwhere, :] - ll_max_conf = l_max_conf[cls_argwhere] - ll_max_id = l_max_id[cls_argwhere] - - keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh) - - if keep.size > 0: - ll_box_array = ll_box_array[keep, :] - ll_max_conf = ll_max_conf[keep] - ll_max_id = ll_max_id[keep] - - for k in range(ll_box_array.shape[0]): - bboxes.append( - [ - ll_box_array[k, 0], - ll_box_array[k, 1], - ll_box_array[k, 2], - ll_box_array[k, 3], - ll_max_conf[k], - ll_max_conf[k], - ll_max_id[k], - ] - ) - - bboxes_batch.append(bboxes) - - return bboxes_batch - - -def 
nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): - x1 = boxes[:, 0] - y1 = boxes[:, 1] - x2 = boxes[:, 2] - y2 = boxes[:, 3] - - areas = (x2 - x1) * (y2 - y1) - order = confs.argsort()[::-1] - - keep = [] - while order.size > 0: - idx_self = order[0] - idx_other = order[1:] - - keep.append(idx_self) - - xx1 = np.maximum(x1[idx_self], x1[idx_other]) - yy1 = np.maximum(y1[idx_self], y1[idx_other]) - xx2 = np.minimum(x2[idx_self], x2[idx_other]) - yy2 = np.minimum(y2[idx_self], y2[idx_other]) - - w = np.maximum(0.0, xx2 - xx1) - h = np.maximum(0.0, yy2 - yy1) - inter = w * h - - if min_mode: - over = inter / np.minimum(areas[order[0]], areas[order[1:]]) - else: - over = inter / (areas[order[0]] + areas[order[1:]] - inter) - - inds = np.where(over <= nms_thresh)[0] - order = order[inds + 1] - - return np.array(keep) +def process_request(output): + # Convert all tensors to lists for JSON serialization + output_serializable = {"output": [tensor.tolist() for tensor in output]} + return output_serializable @app.post("/objdetection_v2") async def objdetection_v2(file: UploadFile = File(...)): contents = await file.read() + # Load and convert the image to RGB image = Image.open(BytesIO(contents)).convert("RGB") image = np.array(image) @@ -187,24 +60,11 @@ async def objdetection_v2(file: UploadFile = File(...)): else: print("unknow image type") exit(-1) - t1 = time.time() response = model.run_traced_inference(image) t2 = time.time() - logging.info("The inference on the sever side took: %.3f seconds", t2 - t1) - conf_thresh = 0.6 - nms_thresh = 0.5 - - boxes = post_processing(image, conf_thresh, nms_thresh, response) - output = boxes[0] - # output = boxes - try: - output = process_output(output) - except Exception as E: - print("the Exception is: ", E) - print("No objects detected!") - return [] - t3 = time.time() - logging.info("The post-processing to get the boxes took: %.3f seconds", t3 - t2) + print("the inference on the sever side took: ", t2 - t1) + # Convert response tensors to JSON-serializable format + output = process_request(response) return output diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py index 9dd13940717..3ae46d4970c 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py @@ -36,8 +36,16 @@ def test_down1(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample1() + + new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down1."))} - new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) + + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] + torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py index ba7da86ee8c..5efc12af3f1 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py @@ -35,10 +35,16 @@ def test_down2(device, reset_seeds, model_location_generator): torch_input = torch.randn((1, 160, 160, 64), 
dtype=torch.bfloat16) ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() - torch_model = DownSample2() + + new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down2."))} - new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) + + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] + torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py index 8ae58e41470..23c015fbb5b 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py @@ -36,8 +36,15 @@ def test_down3(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample3() + + new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down3."))} - new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) + + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] + torch_model.load_state_dict(new_state_dict) torch_model.eval() @@ -51,4 +58,4 @@ def test_down3(device, reset_seeds, model_location_generator): ref = torch_model(torch_input) ref = ref.permute(0, 2, 3, 1) result = result.reshape(ref.shape) - assert_with_pcc(result, ref, 0.96) # PCC 0.96 - The PCC will improve once #3612 is resolved. + assert_with_pcc(result, ref, 0.95) # PCC 0.95 - The PCC will improve once #3612 is resolved. 
diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py index b791e9fc813..35579f14664 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py @@ -36,8 +36,15 @@ def test_down4(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample4() + + new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down4."))} - new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) + + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] + torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py index d53eab4825e..8809d4d8275 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py @@ -36,8 +36,15 @@ def test_down5(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample5() + + new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down5."))} - new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) + + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] + torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py index 155885f2cb3..126e3713645 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py @@ -6,7 +6,6 @@ import ttnn from models.demos.yolov4.reference.head import Head from tests.ttnn.utils_for_testing import assert_with_pcc -from models.utility_functions import skip_for_grayskull import pytest import time from models.demos.yolov4.ttnn.head import TtHead @@ -14,7 +13,6 @@ import os -@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_head(device, reset_seeds, model_location_generator): torch.manual_seed(0) @@ -58,8 +56,15 @@ def test_head(device, reset_seeds, model_location_generator): torch_input_tensor = [torch_input_tensor1, torch_input_tensor2, torch_input_tensor3] torch_model = Head() + + new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("head."))} - new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) + + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] + torch_model.load_state_dict(new_state_dict) torch_model.eval() @@ -74,22 +79,19 @@ def test_head(device, reset_seeds, model_location_generator): result_3 = 
ttnn.to_torch(result_ttnn[2]) ref1, ref2, ref3 = torch_model(torch_input_tensor[0], torch_input_tensor[1], torch_input_tensor[2]) - num_channels = ref1.shape[1] # 255 - num_channels_padded = num_channels + 1 - - result_1 = result_1.reshape(1, ref1.shape[2], ref1.shape[3], num_channels_padded) + result_1 = result_1.reshape(1, ref1.shape[2], ref1.shape[3], 255) result_1 = result_1.permute(0, 3, 1, 2) - result_2 = result_2.reshape(1, ref2.shape[2], ref2.shape[3], num_channels_padded) + result_2 = result_2.reshape(1, ref2.shape[2], ref2.shape[3], 255) result_2 = result_2.permute(0, 3, 1, 2) - result_3 = result_3.reshape(1, ref3.shape[2], ref3.shape[3], num_channels_padded) + result_3 = result_3.reshape(1, ref3.shape[2], ref3.shape[3], 255) result_3 = result_3.permute(0, 3, 1, 2) # Output is sliced because ttnn.conv returns 256 channels instead of 255. - result_1 = result_1[:, :num_channels, :, :] - result_2 = result_2[:, :num_channels, :, :] - result_3 = result_3[:, :num_channels, :, :] + result_1 = result_1[:, :255, :, :] + result_2 = result_2[:, :255, :, :] + result_3 = result_3[:, :255, :, :] pcc_passed, pcc_message = assert_with_pcc(result_1, ref1, 0.99) logger.info(pcc_message) diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py index 02c9d81f75d..41ac8781fc1 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py @@ -6,7 +6,6 @@ import ttnn from models.demos.yolov4.ttnn.neck import TtNeck from models.demos.yolov4.reference.neck import Neck -from models.utility_functions import skip_for_grayskull from tests.ttnn.utils_for_testing import assert_with_pcc import pytest import time @@ -14,7 +13,6 @@ import os -@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_neck(device, reset_seeds, model_location_generator): torch.manual_seed(0) @@ -52,10 +50,16 @@ def test_neck(device, reset_seeds, model_location_generator): torch_input_tensor2 = torch_input_tensor2.permute(0, 3, 1, 2).float() torch_input_tensor3 = torch_input_tensor3.permute(0, 3, 1, 2).float() torch_input_tensor = [torch_input_tensor1, torch_input_tensor2, torch_input_tensor3] - torch_model = Neck() + + new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("neek."))} - new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) + + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] + torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py deleted file mode 100644 index 128a0c93f43..00000000000 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
- -# SPDX-License-Identifier: Apache-2.0 - -import torch -import ttnn -from models.utility_functions import skip_for_grayskull -from tests.ttnn.utils_for_testing import assert_with_pcc -from models.demos.yolov4.ttnn.genboxes import TtGenBoxes -from models.demos.yolov4.demo.demo import YoloLayer, get_region_boxes, gen_yolov4_boxes_confs - -import pytest -import os - - -@skip_for_grayskull() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) -def test_yolov4_post_processing(device, reset_seeds, model_location_generator): - torch.manual_seed(0) - - torch_input_1 = torch.randn((1, 1, 1600, 256), dtype=torch.bfloat16) - ttnn_input_1 = ttnn.from_torch( - torch_input_1, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG - ) - torch_input_2 = torch.randn((1, 1, 400, 256), dtype=torch.bfloat16) - ttnn_input_2 = ttnn.from_torch( - torch_input_2, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG - ) - torch_input_3 = torch.randn((1, 1, 100, 256), dtype=torch.bfloat16) - ttnn_input_3 = ttnn.from_torch( - torch_input_3, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG - ) - - torch_input_1 = torch_input_1[:, :, :, :255] - torch_input_1 = torch_input_1.reshape(1, 40, 40, 255) - torch_input_1 = torch.permute(torch_input_1, (0, 3, 1, 2)) - torch_input_2 = torch_input_2[:, :, :, :255] - torch_input_2 = torch_input_2.reshape(1, 20, 20, 255) - torch_input_2 = torch.permute(torch_input_2, (0, 3, 1, 2)) - torch_input_3 = torch_input_3[:, :, :, :255] - torch_input_3 = torch_input_3.reshape(1, 10, 10, 255) - torch_input_3 = torch.permute(torch_input_3, (0, 3, 1, 2)) - - ref1, ref2, ref3 = gen_yolov4_boxes_confs([torch_input_1, torch_input_2, torch_input_3]) - - boxes_confs_1 = TtGenBoxes(device) - boxes_confs_2 = TtGenBoxes(device) - boxes_confs_3 = TtGenBoxes(device) - - result_1 = boxes_confs_1(device, ttnn_input_1) - result_2 = boxes_confs_2(device, ttnn_input_2) - result_3 = boxes_confs_3(device, ttnn_input_3) - - result_1_bb = ttnn.to_torch(result_1[0]) - result_2_bb = ttnn.to_torch(result_2[0]) - result_3_bb = ttnn.to_torch(result_3[0]) - - result_1_bb = result_1_bb.permute(0, 2, 3, 1) - result_2_bb = result_2_bb.permute(0, 2, 3, 1) - result_3_bb = result_3_bb.permute(0, 2, 3, 1) - - result_1_bb = result_1_bb.reshape(1, 4800, 1, 4) - result_2_bb = result_2_bb.reshape(1, 1200, 1, 4) - result_3_bb = result_3_bb.reshape(1, 300, 1, 4) - - result_1_conf = ttnn.to_torch(result_1[1]) - result_2_conf = ttnn.to_torch(result_2[1]) - result_3_conf = ttnn.to_torch(result_3[1]) - - assert_with_pcc(ref1[0], result_1_bb, 0.99) - assert_with_pcc(ref2[0], result_2_bb, 0.99) - assert_with_pcc(ref3[0], result_3_bb, 0.99) - - assert_with_pcc(ref1[1], result_1_conf, 0.99) - assert_with_pcc(ref2[1], result_2_conf, 0.99) - assert_with_pcc(ref3[1], result_3_conf, 0.99) - - output = get_region_boxes( - [(result_1_bb, result_1_conf), (result_2_bb, result_2_conf), (result_3_bb, result_3_conf)] - ) diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py index 2a338bf6438..ff9a9d4c1dc 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py @@ -4,15 +4,10 @@ import torch import ttnn +from models.utility_functions import skip_for_grayskull from models.demos.yolov4.reference.yolov4 import Yolov4 from 
tests.ttnn.utils_for_testing import assert_with_pcc -from models.utility_functions import skip_for_grayskull from models.demos.yolov4.ttnn.yolov4 import TtYOLOv4 -from models.demos.yolov4.demo.demo import YoloLayer, get_region_boxes, gen_yolov4_boxes_confs - -import cv2 -import numpy as np - import pytest import os @@ -33,53 +28,46 @@ def test_yolov4(device, reset_seeds, model_location_generator): else: weights_pth = str(model_path / "yolov4.pth") - ttnn_model = TtYOLOv4(weights_pth, device) + ttnn_model = TtYOLOv4(device, weights_pth) - imgfile = "models/demos/yolov4/demo/giraffe_320.jpg" - width = 320 - height = 320 - img = cv2.imread(imgfile) - img = cv2.resize(img, (width, height)) - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image - img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) - elif type(img) == np.ndarray and len(img.shape) == 4: - img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) - torch_input = torch.autograd.Variable(img) + torch_input = torch.randn((1, 320, 320, 3), dtype=torch.bfloat16) + ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) + torch_input = torch_input.permute(0, 3, 1, 2).float() + torch_model = Yolov4() - input_tensor = torch.permute(torch_input, (0, 2, 3, 1)) - ttnn_input = ttnn.from_torch(input_tensor, ttnn.bfloat16) + new_state_dict = {} + ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} + + keys = [name for name, parameter in torch_model.state_dict().items()] + values = [parameter for name, parameter in ds_state_dict.items()] + + for i in range(len(keys)): + new_state_dict[keys[i]] = values[i] - torch_model = Yolov4() - new_state_dict = dict(zip(torch_model.state_dict().keys(), ttnn_model.torch_model.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() - torch_output_tensor = torch_model(torch_input) - - ref1, ref2, ref3 = gen_yolov4_boxes_confs(torch_output_tensor) - ref_boxes, ref_confs = get_region_boxes([ref1, ref2, ref3]) - - ttnn_output_tensor = ttnn_model(ttnn_input) - result_boxes_padded = ttnn.to_torch(ttnn_output_tensor[0]) - result_confs = ttnn.to_torch(ttnn_output_tensor[1]) - - result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) - result_boxes_list = [] - # Unpadding - # That ttnn tensor is the concat output of 3 padded tensors - # As a perf workaround I'm doing the unpadding on the torch output here. 
- # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized - box_1_start_i = 0 - box_1_end_i = 6100 - box_2_start_i = 6128 - box_2_end_i = 6228 - box_3_start_i = 6256 - box_3_end_i = 6356 - result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) - result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) - result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) - result_boxes = torch.cat(result_boxes_list, dim=1) - - assert_with_pcc(ref_boxes, result_boxes, 0.99) - assert_with_pcc(ref_confs, result_confs, 0.71) + result_1, result_2, result_3 = ttnn_model(ttnn_input) + result_1 = ttnn.to_torch(result_1) + result_2 = ttnn.to_torch(result_2) + result_3 = ttnn.to_torch(result_3) + + ref1, ref2, ref3 = torch_model(torch_input) + + result_1 = result_1.reshape(1, ref1.shape[2], ref1.shape[3], 255) + result_1 = result_1.permute(0, 3, 1, 2) + + result_2 = result_2.reshape(1, ref2.shape[2], ref2.shape[3], 255) + result_2 = result_2.permute(0, 3, 1, 2) + + result_3 = result_3.reshape(1, ref3.shape[2], ref3.shape[3], 255) + result_3 = result_3.permute(0, 3, 1, 2) + + # Output is sliced because ttnn.conv returns 256 channels instead of 255. + result_1 = result_1[:, :255, :, :] + result_2 = result_2[:, :255, :, :] + result_3 = result_3[:, :255, :, :] + + assert_with_pcc(result_1, ref1, 0.99) + assert_with_pcc(result_2, ref2, 0.99) + assert_with_pcc(result_3, ref3, 0.98) From 4eb7c33e2d43944289ba5aece475fc1f17becd73 Mon Sep 17 00:00:00 2001 From: Debin Chen Date: Fri, 21 Feb 2025 17:11:13 -0800 Subject: [PATCH 235/316] #17682 Improve eltwise binary ng test coverage (#17684) ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/17682) ### Problem description Improve test coverage, and negative testing. ### What's changed Fixed bug to support sharding col_major, more than one CoreRange for core grid, 5D/ND sad path checking, and various test cases for binary and sharding. 
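For illustration only (not part of the diff), a minimal sketch of the kind of configuration the new tests exercise: a height-sharded `ttnn.experimental.add` over a core grid built from two disjoint `CoreRange`s, mirroring `test_binary_sharded_core_grid` below. The shapes, shard shape, PCC threshold, and `device_id=0` are illustrative assumptions; the new tests also cover `COL_MAJOR` shard orientation and int32/float32 dtypes.

```
import torch
import ttnn

device = ttnn.open_device(device_id=0)  # assumption: a single available device

# 14 cores from two disjoint CoreRanges -> 14 height shards of [160, 128] for a (5, 7, 64, 128) tensor
sharded_config = ttnn.create_sharded_memory_config(
    [160, 128],
    core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((1, 0), (1, 6)), ttnn.CoreRange((3, 0), (3, 6))}),
    strategy=ttnn.ShardStrategy.HEIGHT,
    orientation=ttnn.ShardOrientation.ROW_MAJOR,
    use_height_and_width_as_shard_shape=True,
)

a_pt = torch.randn((5, 7, 64, 128), dtype=torch.bfloat16)
b_pt = torch.randn((5, 7, 64, 128), dtype=torch.bfloat16)
a_tt = ttnn.from_torch(a_pt, dtype=ttnn.bfloat16, device=device, layout=ttnn.TILE_LAYOUT, memory_config=sharded_config)
b_tt = ttnn.from_torch(b_pt, dtype=ttnn.bfloat16, device=device, layout=ttnn.TILE_LAYOUT, memory_config=sharded_config)

# Elementwise add on the binary-ng path, keeping the sharded layout for the output
out_tt = ttnn.experimental.add(a_tt, b_tt, memory_config=sharded_config)
assert ttnn.pearson_correlation_coefficient(ttnn.to_torch(out_tt), a_pt + b_pt) >= 0.9998

ttnn.close_device(device)
```

The 5D/ND "sad path" side of the change is covered separately by `test_binary_invalid_rank` below, which expects a `RuntimeError` for rank-5 inputs.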
### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../operations/eltwise/test_binary_bcast.py | 372 +++++++++++++++++- .../device/binary_ng_device_operation.cpp | 30 +- .../device/binary_ng_program_factory.cpp | 10 +- 3 files changed, 389 insertions(+), 23 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_bcast.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_bcast.py index cb1248efbd0..a7c179efc53 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_bcast.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_bcast.py @@ -257,6 +257,7 @@ def test_binary_scalar_ops_invalid_bcast(a_shape, b_shape, ttnn_fn, device): @pytest.mark.parametrize( "a_shape, b_shape", [ + [[1, 71, 7, 7], [1]], [[1, 71, 7, 7], [7, 7]], [[920, 1, 256], [256]], [[4, 12, 64, 64], [12, 1, 1]], @@ -295,39 +296,86 @@ def test_unequal_ranks(a_shape, b_shape, device): ([1, 2], [3, 4], [4, 6]), ], ) -@pytest.mark.parametrize("memory_config", [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG]) -def test_01_volume_tensors(device, a, b, c_golden, memory_config): +@pytest.mark.parametrize( + "memory_config_a, memory_config_b", + [ + (ttnn.DRAM_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG), + (ttnn.L1_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG), + (ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG), + (ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG), + ], +) +def test_01_volume_tensors(device, a, b, c_golden, memory_config_a, memory_config_b): a = torch.BFloat16Tensor(a) b = torch.BFloat16Tensor(b) assert torch.add(a, b).tolist() == c_golden - ttnn_a = ttnn.from_torch(a, layout=ttnn.TILE_LAYOUT, device=device, memory_config=memory_config) - ttnn_b = ttnn.from_torch(b, layout=ttnn.TILE_LAYOUT, device=device, memory_config=memory_config) + ttnn_a = ttnn.from_torch(a, layout=ttnn.TILE_LAYOUT, device=device, memory_config=memory_config_a) + ttnn_b = ttnn.from_torch(b, layout=ttnn.TILE_LAYOUT, device=device, memory_config=memory_config_b) ttnn_c = ttnn.experimental.add(ttnn_a, ttnn_b) c = ttnn.to_torch(ttnn_c).reshape((-1)) assert c.tolist() == c_golden +@pytest.mark.parametrize( + "a_shape, b_shape", + [ + [[2, 4, 12, 64, 64], [12, 1, 1]], + [[12, 1, 1], [2, 4, 12, 64, 64]], + [[2, 4, 12, 64, 64], [2, 4, 12, 64, 64]], + ], +) +def test_binary_invalid_rank(device, a_shape, b_shape): + torch.manual_seed(0) + pt_a, tt_a = rand_bf16_gen(a_shape, device) + pt_b, tt_b = rand_bf16_gen(b_shape, device) + + with pytest.raises(RuntimeError): + tt_c = ttnn.experimental.add(tt_a, tt_b) + + height_sharded_memory_config = ttnn.create_sharded_memory_config( - [320, 128], - core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (6, 0))}), + # [320, 128], # 7 cores + # core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 6))}), + # [160, 128], # 14 cores + [128, 160], + # config 1 single 
rectangle start from 0, 0 + # core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (1, 6))}), + # config 2 single rectangle not start from 0, 0 + # core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((1, 0), (2, 6))}), + # config 3 two grids any + core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((1, 0), (1, 6)), ttnn.CoreRange((3, 0), (3, 6))}), + # [32, 128] should work with 70 cores + # [64, 128], # 35 cores + # core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (4, 6))}), strategy=ttnn.ShardStrategy.HEIGHT, - orientation=ttnn.ShardOrientation.ROW_MAJOR, + orientation=ttnn.ShardOrientation.COL_MAJOR, use_height_and_width_as_shard_shape=True, ) +# width sharding is not good for large and tall (w is small) tensors +# because each core may ends up with a large tensor as well, then out of L1 space width_sharded_memory_config = ttnn.create_sharded_memory_config( - [2240, 64], - core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 1))}), + # [2240, 64], + # core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 1))}), + [2240, 32], + # core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 3))}), + # core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((1, 0), (1, 3))}), + core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 1)), ttnn.CoreRange((2, 2), (2, 3))}), strategy=ttnn.ShardStrategy.WIDTH, orientation=ttnn.ShardOrientation.ROW_MAJOR, use_height_and_width_as_shard_shape=True, ) block_sharded_memory_config = ttnn.create_sharded_memory_config( - [320, 64], - core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (1, 6))}), + # [320, 64], # 128 / 64 = 2, core grid is 2x6 + # core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (1, 6))}), + # following is better, more cores + [320, 32], # 128 / 32 = 4, core grid is 4x6 + # core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (3, 6))}), + core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((1, 0), (4, 6))}), + # [160, 32] will not work, because it needs core grid 4x14 strategy=ttnn.ShardStrategy.BLOCK, orientation=ttnn.ShardOrientation.ROW_MAJOR, use_height_and_width_as_shard_shape=True, @@ -346,16 +394,40 @@ def test_01_volume_tensors(device, a, b, c_golden, memory_config): block_sharded_memory_config, ], ) -def test_binary_sharded(a_shape, b_shape, sharded_config, device): +@pytest.mark.parametrize( + "dtype_pt, dtype_tt", + ( + [torch.bfloat16, ttnn.bfloat16], + [torch.int32, ttnn.int32], + [torch.float32, ttnn.float32], + ), +) +def test_binary_sharded(a_shape, b_shape, sharded_config, dtype_pt, dtype_tt, device): input_combinations = ( (ttnn.DRAM_MEMORY_CONFIG, sharded_config), (sharded_config, ttnn.DRAM_MEMORY_CONFIG), (sharded_config, sharded_config), + (ttnn.DRAM_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG), ) for src_config, dst_config in input_combinations: - a_pt, a_tt = rand_bf16_gen(a_shape, device, memory_config=src_config) - b_pt, b_tt = rand_bf16_gen(b_shape, device, memory_config=dst_config) + a_pt = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=dtype_pt), dtype_tt)(a_shape) + b_pt = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=dtype_pt), dtype_tt)(b_shape) + + a_tt = ttnn.from_torch( + a_pt, + dtype=dtype_tt, + device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=src_config, + ) + b_tt = ttnn.from_torch( + b_pt, + dtype=dtype_tt, + device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=dst_config, + ) out_pt = torch.add(a_pt, b_pt) out_tt_interleaved = ttnn.experimental.add(a_tt, b_tt, memory_config=ttnn.DRAM_MEMORY_CONFIG) @@ -367,6 +439,56 @@ def test_binary_sharded(a_shape, b_shape, 
sharded_config, device): assert ttnn.pearson_correlation_coefficient(out_tt_sharded, out_pt) >= 0.99988 +@pytest.mark.parametrize( + "a_shape, b_shape", + ((torch.Size([5, 7, 64, 128]), torch.Size([5, 7, 64, 128])),), +) +@pytest.mark.parametrize( + "sharded_core_grid", + ( + ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (1, 6))}), + ttnn.CoreRangeSet({ttnn.CoreRange((1, 0), (2, 6))}), + ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 6)), ttnn.CoreRange((1, 0), (1, 6))}), + ttnn.CoreRangeSet({ttnn.CoreRange((1, 0), (1, 6)), ttnn.CoreRange((3, 0), (3, 6))}), + ), +) +def test_binary_sharded_core_grid(device, a_shape, b_shape, sharded_core_grid): + sharded_config = ttnn.create_sharded_memory_config( + [160, 128], # 14 cores + core_grid=sharded_core_grid, + strategy=ttnn.ShardStrategy.HEIGHT, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + a_pt = gen_func_with_cast_tt(partial(torch_random, low=-50, high=50, dtype=torch.bfloat16), ttnn.bfloat16)(a_shape) + b_pt = gen_func_with_cast_tt(partial(torch_random, low=-50, high=50, dtype=torch.bfloat16), ttnn.bfloat16)(b_shape) + + a_tt = ttnn.from_torch( + a_pt, + dtype=ttnn.bfloat16, + device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=sharded_config, + ) + b_tt = ttnn.from_torch( + b_pt, + dtype=ttnn.bfloat16, + device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=sharded_config, + ) + + out_pt = torch.add(a_pt, b_pt) + + out_tt_interleaved = ttnn.experimental.add(a_tt, b_tt, memory_config=ttnn.DRAM_MEMORY_CONFIG) + out_tt_interleaved = ttnn.to_torch(out_tt_interleaved) + assert ttnn.pearson_correlation_coefficient(out_tt_interleaved, out_pt) >= 0.99988 + + out_tt_sharded = ttnn.experimental.add(a_tt, b_tt, memory_config=sharded_config) + out_tt_sharded = ttnn.to_torch(out_tt_sharded) + assert ttnn.pearson_correlation_coefficient(out_tt_sharded, out_pt) >= 0.99988 + + @skip_for_grayskull("Requires wormhole_b0 to run") @pytest.mark.parametrize( "input_shapes", @@ -862,6 +984,7 @@ def test_inplace_binary_ops_fp32(input_shapes, ttnn_fn, device): (torch.Size([1, 1, 31, 32]), torch.Size([5, 3, 32, 32])), (torch.Size([5, 2, 64, 1]), torch.Size([1, 3, 1, 128])), (torch.Size([5, 1, 1, 64]), torch.Size([2, 3, 128, 1])), + (torch.Size([2, 2, 3, 128, 1]), torch.Size([2, 3, 128, 1])), ), ) @pytest.mark.parametrize( @@ -959,7 +1082,16 @@ def test_binary_opt_output_invalid_bcast(a_shape, b_shape, out_shape, ttnn_fn, d ttnn_op(input_tensor_a, input_tensor_b, queue_id=cq_id, output_tensor=out_tt) -def test_binary_sharded_bcast_w(device): +@skip_for_grayskull() +@pytest.mark.parametrize( + "dtype_pt, dtype_tt", + ( + [torch.bfloat16, ttnn.bfloat16], + [torch.int32, ttnn.int32], + [torch.float32, ttnn.float32], + ), +) +def test_binary_sharded_bcast_w(device, dtype_pt, dtype_tt): a_shape = torch.Size([5, 7, 2 * 32, 4 * 32]) b_shape = torch.Size([5, 7, 2 * 32, 1]) @@ -986,8 +1118,23 @@ def test_binary_sharded_bcast_w(device): ) for src_config, dst_config in input_combinations: - a_pt, a_tt = rand_bf16_gen(a_shape, device, memory_config=src_config) - b_pt, b_tt = rand_bf16_gen(b_shape, device, memory_config=dst_config) + a_pt = gen_func_with_cast_tt(partial(torch_random, low=-50, high=50, dtype=dtype_pt), dtype_tt)(a_shape) + b_pt = gen_func_with_cast_tt(partial(torch_random, low=-50, high=50, dtype=dtype_pt), dtype_tt)(b_shape) + + a_tt = ttnn.from_torch( + a_pt, + dtype=dtype_tt, + device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=src_config, + ) + b_tt = ttnn.from_torch( + b_pt, + dtype=dtype_tt, + 
device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=dst_config, + ) out_pt = torch.add(a_pt, b_pt) out_tt_sharded = ttnn.experimental.add(a_tt, b_tt, memory_config=ttnn.DRAM_MEMORY_CONFIG) @@ -997,3 +1144,194 @@ def test_binary_sharded_bcast_w(device): out_tt_sharded = ttnn.experimental.add(a_tt, b_tt, memory_config=a_sharded_config) out_tt_sharded = ttnn.to_torch(out_tt_sharded) torch.testing.assert_close(out_tt_sharded, out_pt) + + +def test_binary_sharded_invalid_bcast(device): + a_shape = torch.Size([5, 1, 2 * 32, 4 * 32]) + b_shape = torch.Size([5, 7, 2 * 32, 1]) + + a_sharded_config = ttnn.create_sharded_memory_config( + [10 * 32, 4 * 32], + core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 6))}), + strategy=ttnn.ShardStrategy.HEIGHT, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + + b_sharded_config = ttnn.create_sharded_memory_config( + [10 * 32, 32], + core_grid=ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 6))}), + strategy=ttnn.ShardStrategy.HEIGHT, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + + a_pt, a_tt = rand_bf16_gen(a_shape, device, memory_config=a_sharded_config) + b_pt, b_tt = rand_bf16_gen(b_shape, device, memory_config=b_sharded_config) + + with pytest.raises(RuntimeError): + out_tt_sharded = ttnn.experimental.add(a_tt, b_tt, memory_config=a_sharded_config) + + +@pytest.mark.parametrize( + "a_shape, b_shape", + ((torch.Size([1, 5, 7, 2, 35]), torch.Size([1, 5, 7, 2, 35])),), +) +@pytest.mark.parametrize( + "shard_type, shard_size, core_range", + ( + [ttnn.ShardStrategy.HEIGHT, [32, 64], ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (4, 6))})], + [ttnn.ShardStrategy.WIDTH, [35 * 32, 32], ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 1))})], + [ttnn.ShardStrategy.BLOCK, [32 * 5, 32], ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (1, 6))})], + ), +) +def test_binary_sharded_small_tile(a_shape, b_shape, shard_type, shard_size, core_range, device): + a_pt = gen_func_with_cast_tt(partial(torch_random, low=-50, high=50, dtype=torch.bfloat16), ttnn.bfloat16)(a_shape) + b_pt = gen_func_with_cast_tt(partial(torch_random, low=-50, high=50, dtype=torch.bfloat16), ttnn.bfloat16)(b_shape) + + shard_config = ttnn.create_sharded_memory_config( + shard_size, + core_grid=core_range, + strategy=shard_type, + orientation=ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + + a_tt = ttnn.from_torch( + a_pt, + dtype=ttnn.bfloat16, + device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=shard_config, + ) + b_tt = ttnn.from_torch( + b_pt, + dtype=ttnn.bfloat16, + device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=shard_config, + ) + + out_pt = torch.add(a_pt, b_pt) + out_tt_sharded = ttnn.experimental.add(a_tt, b_tt, memory_config=shard_config) + out_tt_sharded = ttnn.to_torch(out_tt_sharded) + assert ttnn.pearson_correlation_coefficient(out_tt_sharded, out_pt) >= 0.99988 + + +@pytest.mark.parametrize( + "ttnn_fn", + [ + ttnn.experimental.add, + ttnn.experimental.sub, + ttnn.experimental.mul, + # ttnn.experimental.div, + # ttnn.experimental.rsub, + ttnn.experimental.eq, + ttnn.experimental.ne, + ttnn.experimental.gt, + ttnn.experimental.gte, + ttnn.experimental.lt, + # ttnn.experimental.lte, + ttnn.experimental.logical_or, + # ttnn.experimental.logical_xor, + ttnn.experimental.logical_and, + # ttnn.experimental.ldexp, + # ttnn.experimental.logaddexp, + # ttnn.experimental.logaddexp2, + # ttnn.experimental.squared_difference, + # 
ttnn.experimental.bias_gelu, + ], +) +@pytest.mark.parametrize( + "a_shape, b_shape, shard_type, shard_size, core_range", + ( + [ + torch.Size([5, 7, 2, 35]), + torch.Size([5, 7, 2, 35]), + ttnn.ShardStrategy.HEIGHT, + [64, 32], + ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (4, 6))}), + ], + [ + torch.Size([5, 7, 2, 35]), + torch.Size([5, 7, 2, 35]), + ttnn.ShardStrategy.WIDTH, + [32, 35 * 32], + ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (0, 1))}), + ], + [ + torch.Size([5, 7, 2, 35]), + torch.Size([5, 7, 2, 35]), + ttnn.ShardStrategy.BLOCK, + [32, 32 * 5], + ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (6, 1))}), + ], + [ + torch.Size([1, 1, 1024, 1024]), + torch.Size([1, 1, 1024, 1024]), + ttnn.ShardStrategy.HEIGHT, + [1024, 128], + ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (7, 0))}), + ], + [ + torch.Size([1, 1, 1024, 1024]), + torch.Size([1, 1, 1024, 1024]), + ttnn.ShardStrategy.WIDTH, + [128, 1024], + ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (7, 0))}), + ], + [ + torch.Size([1, 1, 1024, 1024]), + torch.Size([1, 1, 1024, 1024]), + ttnn.ShardStrategy.BLOCK, + [256, 256], + ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (3, 3))}), + ], + ), +) +def test_binary_sharded_col_major(a_shape, b_shape, shard_type, shard_size, core_range, ttnn_fn, device): + golden_function = ttnn.get_golden_function(ttnn_fn) + + a_pt = gen_func_with_cast_tt(partial(torch_random, low=-50, high=50, dtype=torch.bfloat16), ttnn.bfloat16)(a_shape) + b_pt = gen_func_with_cast_tt(partial(torch_random, low=-50, high=50, dtype=torch.bfloat16), ttnn.bfloat16)(b_shape) + + shard_config = ttnn.create_sharded_memory_config( + shard_size, + core_grid=core_range, + strategy=shard_type, + orientation=ttnn.ShardOrientation.COL_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + + input_combinations = ( + (ttnn.DRAM_MEMORY_CONFIG, shard_config), + (shard_config, ttnn.DRAM_MEMORY_CONFIG), + (shard_config, shard_config), + (ttnn.DRAM_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG), + ) + + for src_config, dst_config in input_combinations: + a_tt = ttnn.from_torch( + a_pt, + dtype=ttnn.bfloat16, + device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=src_config, + ) + b_tt = ttnn.from_torch( + b_pt, + dtype=ttnn.bfloat16, + device=device, + layout=ttnn.TILE_LAYOUT, + memory_config=dst_config, + ) + + out_pt = golden_function(a_pt, b_pt) + + out_tt_sharded = ttnn_fn(a_tt, b_tt, memory_config=shard_config) + out_tt_sharded = ttnn.to_torch(out_tt_sharded) + assert ttnn.pearson_correlation_coefficient(out_tt_sharded, out_pt) >= 0.99988 + + out_tt_interleaved = ttnn_fn(a_tt, b_tt, memory_config=ttnn.DRAM_MEMORY_CONFIG) + out_tt_interleaved = ttnn.to_torch(out_tt_interleaved) + assert ttnn.pearson_correlation_coefficient(out_tt_interleaved, out_pt) >= 0.99988 diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_device_operation.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_device_operation.cpp index 4c65a5473f3..59219b000f5 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_device_operation.cpp @@ -155,6 +155,23 @@ void BinaryNgDeviceOperation::validate_on_program_cache_miss( const auto& input_tensor_b = tensor_args.input_tensor_b; const auto& output_tensor = tensor_args.output_tensor; + auto nd_support = [](const auto& shape) { + bool valid = true; + for (int i = -5; i >= -shape.rank(); --i) { + if (shape[i] != 1) { + valid = false; + break; + } + } + return valid; + }; + + 
TT_FATAL(nd_support(input_tensor_a.get_logical_shape()), "Tensor a does not support 5D or more"); + + if (input_tensor_b.has_value()) { + TT_FATAL(nd_support(input_tensor_b->get_logical_shape()), "Tensor b does not support 5D or more"); + } + TT_FATAL( input_tensor_b.has_value() != attributes.scalar.has_value(), "Either the tensor b or scalar should be set"); @@ -246,6 +263,7 @@ void BinaryNgDeviceOperation::validate_on_program_cache_hit( const int rank_a = input_shape_a.rank(); const int rank_b = input_shape_b.rank(); const int larger_rank = std::max(rank_a, rank_b); + for (int i = -1; i >= -larger_rank; --i) { auto a_dim = (i >= -rank_a) ? input_shape_a[i] : 1; auto b_dim = (i >= -rank_b) ? input_shape_b[i] : 1; @@ -256,10 +274,20 @@ void BinaryNgDeviceOperation::validate_on_program_cache_hit( a_dim, b_dim); + if (i <= -5) { + TT_FATAL( + a_dim == 1 && b_dim == 1, + "Broadcasting rule violation for 5D {}, dim a: {}, dim b: {}", + i, + a_dim, + b_dim); + } + if (has_shard_spec and i != -1) { TT_FATAL( a_dim == b_dim, - "Cannot broadcast sharded tensors on dims other than W, violation for rank {}, dim a: {}, dim b: {}", + "Cannot broadcast sharded tensors on dims other than W, violation for rank {}, dim a: {}, dim b: " + "{}", i, a_dim, b_dim); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp index 6c886ef4733..5b805d5f46a 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp @@ -45,8 +45,7 @@ struct AllShardSpecs { ShardSpec c_shard_spec; }; -ShardSpec adjust_to_shape( - const ShardSpec& shard_spec, const ttnn::Shape& from_shape, const ttnn::Shape& to_shape) { +ShardSpec adjust_to_shape(const ShardSpec& shard_spec, const ttnn::Shape& from_shape, const ttnn::Shape& to_shape) { auto ret = shard_spec; ret.shape[0] = (ret.shape[0] * to_shape[-2]) / from_shape[-2]; @@ -168,11 +167,13 @@ void set_or_update_runtime_arguments( const auto [cN, cC, cHt, cWt] = get_shape_dims(c); const uint32_t cHt_unrolled = cN * cC * cHt; - bool row_major = true; const auto shard_specs = get_shard_specs(a, b, c); const bool has_sharding = shard_specs.has_value(); auto grid = has_sharding ? shard_specs->a_shard_spec.grid : CoreRangeSet{}; + bool row_major = + has_sharding ? shard_specs->a_shard_spec.orientation == ShardOrientation::ROW_MAJOR ? 
true : false : true; + + // zero_start_grid is a flag to indicate that we are using a single rectangular grid that starts at (0, 0) // as well as having the sharded tensors (if any) start at (0, 0) // This will run the original work/core distribution algorithms that are specifically for this setup, as these @@ -180,7 +181,7 @@ void set_or_update_runtime_arguments( bool zero_start_grid = false; CoreCoord compute_with_storage_grid; const auto& all_device_cores = operation_attributes.worker_grid; - if (all_device_cores.size() == 1) { + if (grid.size() == 1) { const auto& cr = *all_device_cores.ranges().begin(); if (cr.start_coord.x == 0 && cr.start_coord.y == 0) { if (has_sharding) { @@ -384,7 +385,6 @@ BinaryNgDeviceOperation::ProgramFactory::cached_program_t BinaryNgDeviceOperatio uint32_t c_single_tile_size = tt_metal::detail::TileSize(c_data_format); // we parallelize the computation across the output tiles - constexpr bool row_major = true; const auto& all_device_cores = operation_attributes.worker_grid; Buffer* a_buffer = a.buffer(); From 5aab19f90956a3780511cfea06818758b3cff43e Mon Sep 17 00:00:00 2001 From: Denys Makoviichuk Date: Fri, 21 Feb 2025 18:21:06 -0800 Subject: [PATCH 236/316] [TT-Train] Clip norm fix for ddp (#17628) ### Problem description clip grad norm takes std::vector, but the infra doesn't support it for sharding in the multidevice case. ### What's changed Add multidevice support for std::vector inputs, plus additional checks. Previously, any op with std::vector as input didn't work as expected on n300. We found 3 affected ops: 1) concat - the fix triggered another issue: we were accidentally deallocating the input tensors in the op. 2) moreh_clip_grad - didn't require any fixes because it only takes std::vector. 3) moreh_get_item - the current infra cannot tell the difference between a tensor and a vector of tensors because it puts everything into one vector, and it is hard to take it back out in the right order without huge changes to the decorators. That's why we updated the first parameter to be optional.
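As a rough sketch of the user-visible effect of the concat part of this fix (not code from the patch; the real coverage is the new `tt-train/tests/ttnn_fixed/concat_op_test.cpp` below): the op no longer deallocates the tensors passed to it, so the inputs stay usable after the call. Python is used here only for brevity; the open `device`, `device_id=0`, and the shapes are illustrative assumptions.

```
import torch
import ttnn

device = ttnn.open_device(device_id=0)  # assumption: a single available device

a = ttnn.from_torch(torch.randn((1, 1, 12, 50), dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device)
b = ttnn.from_torch(torch.randn((1, 1, 12, 50), dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device)

c = ttnn.concat([a, b], dim=3)  # concat along the last dim, as in the new test

# The patch removes the explicit deallocation of inputs inside the op,
# so `a` and `b` remain readable after the call:
assert ttnn.to_torch(c).shape == (1, 1, 12, 100)
assert ttnn.to_torch(a).shape == (1, 1, 12, 50)

ttnn.close_device(device)
```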
### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/13444739840 - [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [x] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [x] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [x] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [x] New/Existing tests provide coverage for changes --------- Co-authored-by: Jay Kruer --- .../workflows/all-post-commit-workflows.yaml | 3 +- tt-train/tests/core/n300_utils_test.cpp | 27 ++++++++++++ tt-train/tests/ttnn_fixed/concat_op_test.cpp | 44 +++++++++++++++++++ ttnn/cpp/ttnn/decorators.hpp | 20 ++++++++- .../data_movement/concat/concat.cpp | 6 --- .../moreh/moreh_getitem/moreh_getitem.cpp | 11 ++++- .../moreh/moreh_getitem/moreh_getitem.hpp | 2 +- 7 files changed, 101 insertions(+), 12 deletions(-) create mode 100644 tt-train/tests/ttnn_fixed/concat_op_test.cpp diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml index b39ceed6881..5b1c59fdf69 100644 --- a/.github/workflows/all-post-commit-workflows.yaml +++ b/.github/workflows/all-post-commit-workflows.yaml @@ -154,8 +154,7 @@ jobs: matrix: test-group: [ { arch: wormhole_b0, runner-label: N150 }, - # Disabled due to https://github.com/tenstorrent/tt-metal/issues/16012 - # { arch: wormhole_b0, runner-label: N300 }, + { arch: wormhole_b0, runner-label: N300 }, ] uses: ./.github/workflows/tt-train-post-commit.yaml with: diff --git a/tt-train/tests/core/n300_utils_test.cpp b/tt-train/tests/core/n300_utils_test.cpp index e4f05a45bf0..358c5475420 100644 --- a/tt-train/tests/core/n300_utils_test.cpp +++ b/tt-train/tests/core/n300_utils_test.cpp @@ -236,3 +236,30 @@ TEST_F(N300UtilsTest, DropoutDifferentSeed) { EXPECT_FALSE(xt::allclose(xtensors_back[0], xtensors_back[1], /*rtol=*/1e-4, /*atol=*/1e-3)); } } + +TEST_F(N300UtilsTest, MorehClipGradNorm) { + auto* device = &ttml::autograd::ctx().get_device(); + auto mesh_shape = device->shape(); + xt::xarray xtensor = xt::ones({4, 1, 20, 5}); + + ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); + auto tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer, ttnn::Layout::TILE); + auto do_it = [&tensor]() { + ttnn::moreh_clip_grad_norm( + std::vector{tensor}, + 1.0F, + 2.0F, + false, + /* total_norm */ std::nullopt, + /* memory_config */ std::nullopt, + ttml::core::ComputeKernelConfig::precise()); + }; + // ensure that moreh clip grad norm works without throwing a + // bad_variant_access on n300. 
+ EXPECT_NO_THROW(do_it()); + xt::xarray expected_res = xt::full_like(xtensor, 0.05F); + + ttml::core::MeshToXTensorVariant identity_composer = ttml::core::VectorMeshToXTensor(mesh_shape); + auto res_back = ttml::core::to_xtensor(tensor, identity_composer)[0]; + EXPECT_TRUE(xt::allclose(expected_res, res_back, 2e-2F)); +} diff --git a/tt-train/tests/ttnn_fixed/concat_op_test.cpp b/tt-train/tests/ttnn_fixed/concat_op_test.cpp new file mode 100644 index 00000000000..e0ec985d33f --- /dev/null +++ b/tt-train/tests/ttnn_fixed/concat_op_test.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "autograd/auto_context.hpp" +#include "core/tt_tensor_utils.hpp" + +class ConcatOpTest : public ::testing::Test { +protected: + void SetUp() override { + ttml::autograd::ctx().open_device(); + } + + void TearDown() override { + ttml::autograd::ctx().close_device(); + } +}; + +TEST_F(ConcatOpTest, TestConcatLastDim) { + auto* device = &ttml::autograd::ctx().get_device(); + device->enable_async(true); + auto N = 1; + auto C = 1; + auto H = 12; + auto W = 50; + auto prod = N * C * H * W; + xt::xarray xtensor_a = xt::arange(0.F, prod).reshape({N, C, H, W}); + xt::xarray xtensor_b = xt::arange(prod, 2 * prod).reshape({N, C, H, W}); + + xt::xarray expected = xt::concatenate(xt::xtuple(xtensor_a, xtensor_b), 3); + + auto tensor_a = ttml::core::from_xtensor(xtensor_a, device); + auto tensor_b = ttml::core::from_xtensor(xtensor_b, device); + + auto ttnn_concat = ttnn::concat(std::vector{tensor_a, tensor_b}, 3); + auto ttnn_concat_xtensor = ttml::core::to_xtensor(ttnn_concat); + EXPECT_TRUE(xt::allclose(ttnn_concat_xtensor, expected, 7e-3F, 1e-6F)); +} diff --git a/ttnn/cpp/ttnn/decorators.hpp b/ttnn/cpp/ttnn/decorators.hpp index 7a08ad5d57c..3e9d8ac323a 100644 --- a/ttnn/cpp/ttnn/decorators.hpp +++ b/ttnn/cpp/ttnn/decorators.hpp @@ -105,6 +105,9 @@ auto map_launch_op_args_to_execute_on_worker_thread_args( &optional_output_tensor_index, &optional_output_tensors](auto&& arg) { using T = std::decay_t; + if constexpr (std::is_same_v>) { + return input_tensors; + } if constexpr (std::is_same_v) { return input_tensors.at(input_tensor_index++); } else if constexpr (std::is_same_v>) { @@ -304,9 +307,24 @@ struct registered_operation_t { using execute_on_worker_thread_return_t = decltype(operation_t::invoke(args...)); - const Tensors input_tensors = detail::extract_args_to_vector(args...); + Tensors single_input_tensor = detail::extract_args_to_vector(args...); const OptionalConstTensors optional_input_tensors = detail::extract_args_to_vector>(args...); + std::vector> vec_input_tensors = + detail::extract_args_to_vector>(args...); + if (!(single_input_tensor.empty() || vec_input_tensors.empty())) { + TT_THROW( + "Only one of single_input_tensor or vec_input_tensors can be specified." + "Ensure that your invoke function does not have both Tensor and std::vector as input " + "parameters"); + } + if (single_input_tensor.empty() && vec_input_tensors.size() > 1) { + TT_THROW( + "You have more than one std::vector input parameters in the invoke. Only one vector is " + "allowed"); + } + + auto& input_tensors = !vec_input_tensors.empty() ? 
vec_input_tensors[0] : single_input_tensor; auto output_tensors = detail::create_async_output_tensors( input_tensors, optional_input_tensors, args...); diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp index d0192a1a4b6..fb9c6581982 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp @@ -159,9 +159,6 @@ MassagedConcat build_untilize_rm_retilize_concat( const std::vector& tensors, int dim, unsigned int groups) -> ttnn::Tensor { std::vector itensors(tensors); auto res = concat_impl(itensors, dim, groups, output_memory_config); - for (auto& tensor : itensors) { - tensor.deallocate(); - } return res; }}); } @@ -323,9 +320,6 @@ ttnn::Tensor ConcatOperation::invoke( std::vector itensors(input_tensors); auto res = massaged_concat(itensors, dim, groups); - for (auto& tensor : itensors) { - tensor.deallocate(); - } return res; } diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/moreh_getitem.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/moreh_getitem.cpp index 86a484e901c..a6bf89b6635 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/moreh_getitem.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/moreh_getitem.cpp @@ -6,12 +6,19 @@ namespace ttnn::operations::moreh::moreh_getitem { Tensor MorehGetItem::invoke( - const Tensor& input, + const std::optional& input, const std::vector& index_tensors, const ttnn::SmallVector& index_dims, const std::optional& output, // const CoreRange core_range, const std::optional& memory_config) { - return ttnn::prim::moreh_getitem(input, index_tensors, index_dims, output, memory_config); + if (!input.has_value()) { + // FIXME: This is a hack to work around limitations in the decorator + // infra which requires either an input tensor or a vector of input + // tensors but not both; wrapping the input tensor in an optional allows + // us to work around this without rewriting half of the runtime. + TT_THROW("Input tensor is required for moreh_getitem operation."); + } + return ttnn::prim::moreh_getitem(input.value(), index_tensors, index_dims, output, memory_config); } } // namespace ttnn::operations::moreh::moreh_getitem diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/moreh_getitem.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/moreh_getitem.hpp index a983404bcf7..5c08d20edea 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/moreh_getitem.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/moreh_getitem.hpp @@ -10,7 +10,7 @@ namespace ttnn::operations::moreh::moreh_getitem { struct MorehGetItem { static Tensor invoke( - const Tensor& input, + const std::optional& input, const std::vector& index_tensors, const ttnn::SmallVector& index_dims, const std::optional& output, From a409dad3b19c521837e7ae526cc85ef382938943 Mon Sep 17 00:00:00 2001 From: Atul Krishnadas Date: Fri, 21 Feb 2025 19:17:05 -0800 Subject: [PATCH 237/316] =?UTF-8?q?#17077:=20convert=20bfp8=20to=20bf16=20?= =?UTF-8?q?before=20performing=20fillpad,=20and=20convert=20b=E2=80=A6=20(?= =?UTF-8?q?#18063)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ack to bf8 after ### Ticket [#17077 ](https://github.com/tenstorrent/tt-metal/issues/17077) ### Problem description Support BFP8 for fil_implicit_pad Also going to address some comments from the original PR merge for fill_pad ### What's changed Just convert bfp8 to bfp16 and back. 
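For context, a self-contained sketch of the typecast wrapper this change adds around the op (the `DataType` enum, `Tensor` struct, and `run_fill_pad` callback below are placeholders for illustration; the real code uses ttnn::typecast and operation::run_without_autoformat):

```cpp
#include <functional>

// Placeholder types so the sketch stands alone; the real code uses ttnn::Tensor
// and ttnn::DataType.
enum class DataType { BFLOAT8_B, BFLOAT16 };
struct Tensor { DataType dtype; };

Tensor typecast(Tensor t, DataType to) { t.dtype = to; return t; }

// Mirrors the change in fill_pad.cpp: BFLOAT8_B inputs are upcast to BFLOAT16,
// padded, then cast back so the caller still receives a BFLOAT8_B tensor.
Tensor fill_pad_with_bfp8_support(Tensor input, const std::function<Tensor(Tensor)>& run_fill_pad) {
    const bool was_bfp8 = input.dtype == DataType::BFLOAT8_B;
    if (was_bfp8) {
        input = typecast(input, DataType::BFLOAT16);
    }
    Tensor output = run_fill_pad(input);
    return was_bfp8 ? typecast(output, DataType::BFLOAT8_B) : output;
}

int main() {
    Tensor t{DataType::BFLOAT8_B};
    Tensor out = fill_pad_with_bfp8_support(t, [](Tensor x) { return x; });
    return out.dtype == DataType::BFLOAT8_B ? 0 : 1;
}
```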
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13466079605) --- .../unit_tests/operations/test_fill_pad.py | 95 ++++++++++++++++++- .../device/fill_pad_program_factory.cpp | 8 +- .../kernels/dataflow/fill_pad_writer.cpp | 17 ++-- .../data_movement/fill_pad/fill_pad.cpp | 21 ++-- 4 files changed, 121 insertions(+), 20 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_fill_pad.py b/tests/ttnn/unit_tests/operations/test_fill_pad.py index 4b7884503f5..22bbdd3bda8 100644 --- a/tests/ttnn/unit_tests/operations/test_fill_pad.py +++ b/tests/ttnn/unit_tests/operations/test_fill_pad.py @@ -51,9 +51,100 @@ def create_nd_padded_tiled_tensor(shape, tile_size, fill_value, dtype): ttnn_dtype_to_torch_dtype = { ttnn.uint32: torch.int32, ttnn.bfloat16: torch.float32, + ttnn.bfloat8_b: torch.bfloat16, } +@pytest.mark.parametrize( + "shape", + [ + (1, 16), + (16, 1), + (1, 17), + (17, 1), + (16, 16), + (17, 17), + (31, 31), + (33, 33), + (65, 65), + (97, 97), + (1, 2, 3, 2, 1, 2, 97, 97), + ], +) +@pytest.mark.parametrize("fill_value", [1.5, float("inf"), float("-inf")]) +@pytest.mark.parametrize("dtype", [ttnn.bfloat16]) +@pytest.mark.parametrize("input_mem_config", [ttnn.DRAM_MEMORY_CONFIG]) +@pytest.mark.parametrize("output_mem_config", [ttnn.DRAM_MEMORY_CONFIG]) +def test_fill_pad_bfloat16( + device, + shape, + fill_value, + dtype, + input_mem_config, + output_mem_config, +): + torch.manual_seed(1234) + torch_input_tensor, padded_torch_tensor = create_nd_padded_tiled_tensor( + shape, 32, fill_value, ttnn_dtype_to_torch_dtype[dtype] + ) + input_tensor = ttnn.to_device( + ttnn.from_torch(torch_input_tensor, dtype=dtype, layout=ttnn.TILE_LAYOUT), + device, + memory_config=input_mem_config, + ) + + output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=output_mem_config) + padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch_with_padded_shape() + + assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor) + + +@pytest.mark.parametrize( + "shape", + [ + (1, 32), + (16, 32), + (1, 32), + (17, 32), + (16, 32), + (17, 32), + (31, 32), + (33, 32), + (65, 64), + (97, 96), + (1, 2, 3, 2, 1, 2, 97, 96), + ], +) + +# separate test for bfloat8_b where last dim is tile_width aligned (required for bf8b) +@pytest.mark.parametrize("fill_value", [1.5, float("inf"), float("-inf")]) +@pytest.mark.parametrize("dtype", [ttnn.bfloat8_b]) +@pytest.mark.parametrize("input_mem_config", [ttnn.DRAM_MEMORY_CONFIG]) +@pytest.mark.parametrize("output_mem_config", [ttnn.DRAM_MEMORY_CONFIG]) +def test_fill_pad_bfloat8_b( + device, + shape, + fill_value, + dtype, + input_mem_config, + output_mem_config, +): + torch.manual_seed(1234) + torch_input_tensor, padded_torch_tensor = create_nd_padded_tiled_tensor( + shape, 32, fill_value, ttnn_dtype_to_torch_dtype[dtype] + ) + input_tensor = ttnn.to_device( + ttnn.from_torch(torch_input_tensor, dtype=dtype, layout=ttnn.TILE_LAYOUT), + device, + memory_config=input_mem_config, + ) + + output_tensor = ttnn.fill_implicit_tile_padding(input_tensor, fill_value, memory_config=output_mem_config) + padded_torch_output_tensor = ttnn.from_device(output_tensor).to_torch_with_padded_shape() + + assert_with_pcc(padded_torch_tensor, padded_torch_output_tensor) + + @pytest.mark.parametrize( "shape", [ @@ -71,10 +162,10 @@ def create_nd_padded_tiled_tensor(shape, tile_size, fill_value, dtype): ], ) @pytest.mark.parametrize("fill_value", [1]) -@pytest.mark.parametrize("dtype", [ttnn.uint32, 
ttnn.bfloat16]) +@pytest.mark.parametrize("dtype", [ttnn.uint32]) @pytest.mark.parametrize("input_mem_config", [ttnn.DRAM_MEMORY_CONFIG]) @pytest.mark.parametrize("output_mem_config", [ttnn.DRAM_MEMORY_CONFIG]) -def test_fill_pad( +def test_fill_pad_int( device, shape, fill_value, diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp index b07c6e65bf0..fa2895ea815 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp @@ -85,18 +85,20 @@ operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, (std::uint32_t)tiles_per_2d_tensor, (std::uint32_t)tiles_per_tile_row, (std::uint32_t)tt::constants::TILE_HEIGHT, - (std::uint32_t)tt::constants::FACE_HEIGHT, - (std::uint32_t)sharded}; + (std::uint32_t)tt::constants::FACE_HEIGHT}; + std::map compute_defines; if (sharded) { shard_builder::extend_sharding_compile_time_args(input_tensor, writer_compile_time_args); + compute_defines["SHARDED"] = "1"; } tt::tt_metal::KernelHandle writer_kernel_id = tt::tt_metal::CreateKernel( program, "ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp", all_cores, - tt_metal::WriterDataMovementConfig(writer_compile_time_args)); // writer only for in-place operation + tt_metal::WriterDataMovementConfig( + writer_compile_time_args, compute_defines)); // writer only for in-place operation auto cores = grid_to_cores(num_cores, num_cores_x, num_cores_y, false); std::vector writer_runtime_args = { diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp index e2ecff02ddc..0d074e6da54 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/kernels/dataflow/fill_pad_writer.cpp @@ -21,7 +21,6 @@ void kernel_main() { constexpr uint32_t tile_size = get_compile_time_arg_val(10); constexpr uint32_t tile_hw = tile_size * tile_size; constexpr uint32_t face_size = get_compile_time_arg_val(11); -#define SHARDED get_compile_time_arg_val(12) == 1 constexpr uint32_t face_hw = face_size * face_size; constexpr uint32_t alignment_adjustor = 16; @@ -31,15 +30,15 @@ void kernel_main() { uint32_t starting_tile_offset = get_arg_val(rt_arg_ind++); uint32_t num_2d_tensors = get_arg_val(rt_arg_ind++); -#if (SHARDED) +#ifdef SHARDED typedef ShardedInfo< - get_compile_time_arg_val(13), - get_compile_time_arg_val(14), - get_compile_time_arg_val(15), - get_compile_time_arg_val(16), - get_compile_time_arg_val(17), - get_compile_time_arg_val(18), - get_compile_time_arg_val(19)> + get_compile_time_arg_val(12), // Memory layout + get_compile_time_arg_val(13), // The number of sharding cores + get_compile_time_arg_val(14), // The page size we offset each write to + get_compile_time_arg_val(15), // The number of pages in each sharding row not including padding pages + get_compile_time_arg_val(16), // This defines times when contiguous pages can't be calculated + get_compile_time_arg_val(17), // pages_per_shard_x + get_compile_time_arg_val(18)> // pages_per_shard_y tensor_shard_info; const auto [mapping_table, rt_increment] = diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp index 85a08a96718..26074b26045 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp @@ -9,6 +9,7 @@ #include "ttnn/common/queue_id.hpp" #include "ttnn/operations/core/core.hpp" #include +#include "cpp/ttnn/operations/copy.hpp" using namespace tt::tt_metal; @@ -27,10 +28,14 @@ ttnn::Tensor FillPadOperation::invoke( if (padded_width == input_tensor.get_logical_shape()[-1] && padded_height == input_tensor.get_logical_shape()[-2]) { return input_tensor; } + auto mutable_input_tensor = input_tensor; auto output_memory_config = memory_config.value_or(input_tensor.memory_config()); + if (input_tensor.get_dtype() == DataType::BFLOAT8_B) { + mutable_input_tensor = ttnn::typecast(mutable_input_tensor, DataType::BFLOAT16); + } // if input_tensor is rank > 3, then we need to reshape it to rank 3 such that the last 2 dims are the same - if (input_tensor.get_logical_shape().rank() > 3) { - ttnn::Shape original_shape = input_tensor.get_logical_shape(); + if (mutable_input_tensor.get_logical_shape().rank() > 3) { + ttnn::Shape original_shape = mutable_input_tensor.get_logical_shape(); uint32_t third_dim = 1; for (uint32_t i = 0; i < original_shape.rank() - 2; i++) { @@ -38,16 +43,20 @@ ttnn::Tensor FillPadOperation::invoke( } ttnn::Shape new_shape = ttnn::Shape{std::array{third_dim, original_shape[-2], original_shape[-1]}}; - auto reshaped_tensor = ttnn::reshape(input_tensor, new_shape); + auto reshaped_tensor = ttnn::reshape(mutable_input_tensor, new_shape); reshaped_tensor = operation::run_without_autoformat( FillPad{fill_value, output_memory_config}, {reshaped_tensor}, {}, {}, queue_id) .at(0); return ttnn::reshape(reshaped_tensor, original_shape); } - return operation::run_without_autoformat( - FillPad{fill_value, output_memory_config}, {input_tensor}, {}, {}, queue_id) - .at(0); + auto output_tensor = operation::run_without_autoformat( + FillPad{fill_value, output_memory_config}, {mutable_input_tensor}, {}, {}, queue_id) + .at(0); + if (input_tensor.get_dtype() == DataType::BFLOAT8_B) { + return ttnn::typecast(output_tensor, DataType::BFLOAT8_B); + } + return output_tensor; } } // namespace ttnn::operations::data_movement From 43df51324b49877d2efd658221eb5a83a9489cb0 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Sat, 22 Feb 2025 09:22:41 -0800 Subject: [PATCH 238/316] [skip [skip ci] Update remove-stale-branches.yaml (#18176) --- .github/workflows/remove-stale-branches.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/remove-stale-branches.yaml b/.github/workflows/remove-stale-branches.yaml index 8a7823368ea..274af73f61d 100644 --- a/.github/workflows/remove-stale-branches.yaml +++ b/.github/workflows/remove-stale-branches.yaml @@ -7,6 +7,7 @@ on: jobs: remove-stale-branches: + if: github.repository == 'tenstorrent/tt-metal' runs-on: ubuntu-latest steps: - uses: blozano-tt/remove-stale-branches@379c5b1430ca2951a1365427e7eb6574cfc4c7dd From 5a2c003f1ff928fa3766a5a4d96f81f3eb703b1e Mon Sep 17 00:00:00 2001 From: Sankar Manoj Date: Sat, 22 Feb 2025 09:43:02 -0800 Subject: [PATCH 239/316] #14080: Preprocess weights for Conv2D on Device (#16750) ### Ticket #14080 ### Problem description Currently weights preprocessing takes place on the host, on a single thread. This is slow, especially when there is a large weights matrix, and Debug mode is enabled. 
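The change described under "What's changed" below adds two Conv2dConfig flags for this; here is a minimal sketch of the resulting dispatch (only the flag names are taken from the diff, everything else is a simplified stand-in):

```cpp
#include <cstdio>

// Simplified stand-in for the two new Conv2dConfig knobs; the real struct in
// conv2d_op.hpp has many more fields.
struct Conv2dConfigSketch {
    bool preprocess_weights_on_device = true;  // prepare weights with on-device ops
    bool always_preprocess_weights = false;    // reprocess even if weights already live on device
};

// Mirrors the branch added in conv2d.cpp: weights are prepared on the host only
// when the on-device path is explicitly disabled.
const char* weight_prep_path(const Conv2dConfigSketch& cfg, bool weights_on_device) {
    if (weights_on_device && !cfg.always_preprocess_weights) {
        return "use weights as-is";
    }
    return cfg.preprocess_weights_on_device ? "prepare on device (pad, permute, tilize)"
                                            : "prepare on host, then move to device";
}

int main() {
    Conv2dConfigSketch cfg;
    std::printf("%s\n", weight_prep_path(cfg, /*weights_on_device=*/false));
    return 0;
}
```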
### What's changed The weights are loaded to the device in the same format as PyTorch. All other processing, including permute, padding, etc are done on the Device. ### Checklist - [x] Post commit CI [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13315764885) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [x] New/Existing tests provide coverage for changes --- .../unit_tests/operations/test_new_conv2d.py | 39 ++- .../operations/test_prepare_conv_weights.py | 130 -------- .../ttnn/operations/conv/conv2d/conv2d.cpp | 49 ++- .../operations/conv/conv2d/conv2d_pybind.cpp | 6 + .../operations/conv/conv2d/conv2d_utils.cpp | 7 +- .../conv/conv2d/device/conv2d_op.hpp | 11 + .../conv2d_op_sharded_program_factory.cpp | 151 +++++++-- .../conv/conv2d/prepare_conv2d_weights.cpp | 303 +++++++++++++++++- .../conv/conv2d/prepare_conv2d_weights.hpp | 16 + .../pad/device/pad_program_factory.cpp | 15 +- .../ttnn/operations/data_movement/pad/pad.cpp | 12 +- 11 files changed, 549 insertions(+), 190 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 082cb3c90fa..c9e6e60576e 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -58,6 +58,7 @@ def run_conv( config_override, dilation=1, use_shallow_conv_variant=False, + transpose_shards=True, # https://github.com/tenstorrent/tt-metal/issues/17897 fp32_accum=False, packer_l1_acc=False, output_layout=ttnn.TILE_LAYOUT, @@ -72,6 +73,7 @@ def run_conv( weight_mesh_mapper=None, output_mesh_composer=None, enable_split_reader=False, + preprocess_weights_on_device=True, ): if isinstance(device, ttnn.MeshDevice): assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh" @@ -91,7 +93,7 @@ def run_conv( torch_input_tensor = torch.permute(torch_input_tensor_nchw, (0, 2, 3, 1)) torch_weight_tensor = randomize_torch_tensor(torch_tensor_map, conv_weight_shape) - torch_bias_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) if has_bias else None + torch_bias_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) * 10 if has_bias else None torch_out_golden_tensor = torch.nn.functional.conv2d( torch_input_tensor_nchw, @@ -134,6 +136,9 @@ def run_conv( enable_split_reader=enable_split_reader, enable_subblock_padding=False, output_layout=output_layout, + transpose_shards=transpose_shards, + preprocess_weights_on_device=preprocess_weights_on_device, + always_preprocess_weights=True, ) compute_config = ttnn.init_device_compute_kernel_config( device.arch(), @@ -153,7 +158,7 @@ def run_conv( conv_config.override_sharding_config = True print("Setting num_cores_nhw to 98") - [tt_output_tensor_on_device, [out_height, out_width]] = ttnn.conv2d( + [tt_output_tensor_on_device, [out_height, out_width], [d_w, d_b]] = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, in_channels=input_channels, @@ -174,8 +179,8 @@ def run_conv( groups=groups, memory_config=memory_config, return_output_dim=True, + return_weights_and_bias=True, ) - tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) torch_output_tensor = ttnn.to_torch(tt_output_tensor, mesh_composer=output_mesh_composer) @@ -191,6 +196,8 @@ def run_conv( if not fp32_accum: pcc = 0.985 + if input_channels * filter_height * filter_width > 10000: + pcc = 0.97 elif 
math_fidelity == ttnn.MathFidelity.LoFi and activations_dtype == ttnn.bfloat8_b: pcc = 0.996 else: @@ -384,6 +391,9 @@ def test_conv_features( if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat8_b: pytest.skip("Row major layout not compatible with bfloat8_b") + if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat16 and packer_l1_acc and fp32_accum: + pytest.skip("skipping due to pack_untilize_dst issue!") + run_conv( device, torch_tensor_map, @@ -407,6 +417,7 @@ def test_conv_features( has_bias=True, fp32_accum=fp32_accum, packer_l1_acc=packer_l1_acc, + preprocess_weights_on_device=True, ) @@ -778,7 +789,7 @@ def test_conv_for_segformer_512x512( ) @pytest.mark.parametrize( "weights_dtype", - [ttnn.bfloat16, ttnn.bfloat8_b], + [ttnn.bfloat16], ) @pytest.mark.parametrize( "activations_dtype", @@ -961,6 +972,7 @@ def test_resnet50_conv_wh( pad_w, config_override=config_override, use_shallow_conv_variant=use_shallow_conv_variant, + transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH packer_l1_acc=packer_l1_acc, fp32_accum=False, has_bias=has_bias, @@ -1022,6 +1034,7 @@ def test_conv_mem_config_wh( shard_layout=shard_layout, config_override=config_override, use_shallow_conv_variant=use_shallow_conv_variant, + transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH packer_l1_acc=True, fp32_accum=False, has_bias=True, @@ -1207,7 +1220,7 @@ def test_resnet50_conv_wh_fp32( ) @pytest.mark.parametrize( "weights_dtype", - [ttnn.bfloat8_b], + [ttnn.bfloat16], ) @pytest.mark.parametrize( "activations_dtype", @@ -1349,7 +1362,7 @@ def test_sd_conv( ) @pytest.mark.parametrize( "activations_dtype", - [ttnn.bfloat16, ttnn.bfloat8_b], + [ttnn.bfloat16], ) @pytest.mark.parametrize( "fp32_accum", @@ -1490,7 +1503,7 @@ def test_sd_conv_wh( ) @pytest.mark.parametrize( "weights_dtype", - [ttnn.bfloat8_b], + [ttnn.bfloat16], ) @pytest.mark.parametrize( "activations_dtype", @@ -1642,6 +1655,7 @@ def test_unet_conv_wh( config_override, shard_layout=shard_layout, use_shallow_conv_variant=use_shallow_conv_variant, + transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH output_layout=output_layout, auto_shard=auto_shard, ) @@ -1740,6 +1754,7 @@ def test_unet_conv_groups_2_wh( config_override, shard_layout=shard_layout, use_shallow_conv_variant=use_shallow_conv_variant, + transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH output_layout=output_layout, auto_shard=auto_shard, groups=groups, @@ -1837,6 +1852,7 @@ def test_unet_conv_groups_4_6_wh( config_override, shard_layout=shard_layout, use_shallow_conv_variant=use_shallow_conv_variant, + transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH output_layout=output_layout, groups=groups, ) @@ -1935,12 +1951,14 @@ def test_unet_conv_groups_8_wh( config_override, shard_layout=shard_layout, use_shallow_conv_variant=use_shallow_conv_variant, + transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH output_layout=output_layout, auto_shard=auto_shard, groups=groups, ) +@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize( "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, config_override", @@ -2002,6 +2020,7 @@ def test_halo_reshard_conv( ) +@skip_for_grayskull() @pytest.mark.skip("New API needs to be tested") @pytest.mark.parametrize("device_params", 
[{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize( @@ -2243,6 +2262,7 @@ def test_conv_groups( ) +@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize( "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant, groups", @@ -2363,6 +2383,7 @@ def test_yolov4_conv_groups_larger_than_one( ) +@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize( " output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant, groups", @@ -2651,6 +2672,7 @@ def test_shallow_conv_with_tiled_input(device): # Tests running conv2d which maps to matmul w/o sharding the input tensor. # Output tensor is in DRAM. +@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize("tiled_input", [True, False]) @pytest.mark.parametrize("input_on_device", [True, False]) @@ -2776,6 +2798,9 @@ def test_small_in_large_out_channels_auto_shard(device, torch_tensor_map): padding = (0, 0) height = 128 width = 128 + if device.core_grid.y != 8 and is_wormhole_b0(): + pytest.skip("Needs 8x8 grid for wormhole_b0") + run_conv( device, torch_tensor_map, diff --git a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py index c71c5cfbd26..1543913a051 100644 --- a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py +++ b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py @@ -196,133 +196,3 @@ def test_prepare_conv_weights( passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_output_tensor, torch_out_golden_tensor, pcc=pcc) logger.info(f"PCC = {pcc_msg}. 
Threshold = {pcc}") assert passing - - -@skip_for_grayskull() -@skip_for_blackhole() -# @skip_for_wormhole_b0() -@pytest.mark.parametrize( - "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override", - ( - # rn50 layer1 - (8, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None), - (16, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None), - (20, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None), - ), -) -@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"]) -@pytest.mark.parametrize("has_bias", [True, False], ids=["has_bias", "no_bias"]) -@pytest.mark.parametrize("device_params", [{"l1_small_size": 2**15}], indirect=True) -def test_prepare_bias( - batch_size, - output_channels, - input_channels, - input_height, - input_width, - filter_height, - filter_width, - stride_h, - stride_w, - pad_h, - pad_w, - use_1d_systolic_array, - packer_l1_acc, - config_override, - has_bias, - device, -): - if device.core_grid.y == 7: - pytest.skip("Issue #6992: Statically allocated circular buffers in program clash with L1 buffers on core range") - - if batch_size == 20 and ( - output_channels == 64 or (stride_h == 2 and (output_channels == 256 or output_channels == 128)) - ): - pytest.skip("Skipping test because it won't fit in L1!") - - inp_shape = (batch_size, input_channels, input_height, input_width) - conv_weight_shape = (output_channels, input_channels, filter_height, filter_width) - torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16) - torch_input_tensor = torch.randn(inp_shape, dtype=torch.bfloat16) - torch_bias_tensor = torch.randn((1, 1, 1, output_channels), dtype=torch.bfloat16) if has_bias else None - - torch_out_golden_tensor = torch.nn.functional.conv2d( - torch_input_tensor, - torch_weight_tensor, - bias=torch_bias_tensor.reshape(-1) if has_bias else None, - stride=(stride_h, stride_w), - padding=(pad_h, pad_w), - dilation=(1, 1), - groups=1, - ).permute(0, 2, 3, 1) - - tt_input_tensor = ttnn.from_torch(torch_input_tensor.transpose(-3, -2).transpose(-2, -1), ttnn.bfloat16) - tt_weight_tensor = ttnn.from_torch(torch_weight_tensor, ttnn.bfloat16) - tt_bias_tensor = ttnn.from_torch(torch_bias_tensor, ttnn.bfloat16) if has_bias else None - - conv_config = ttnn.Conv2dConfig( - dtype=ttnn.bfloat16, - weights_dtype=ttnn.bfloat16, - input_channels_alignment=(16 if input_channels == 16 and input_height == 115 else 32), - enable_act_double_buffer=False, - enable_split_reader=False, - enable_subblock_padding=False, - ) - compute_config = ttnn.init_device_compute_kernel_config(device.arch(), packer_l1_acc=packer_l1_acc) - if config_override and "act_block_h" in config_override: - conv_config.act_block_h_override = config_override["act_block_h"] - - if config_override and "act_block_w_div" in config_override: - conv_config.act_block_w_div = config_override["act_block_w_div"] - - if config_override and "num_cores_nhw" in config_override: - if config_override["num_cores_nhw"] == 98: - conv_config.core_grid = ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (11, 7)), ttnn.CoreRange((0, 8), (1, 8))}) - conv_config.override_sharding_config = True - print("Setting num_cores_nhw to 98") - - conv_kwargs = { - "input_layout": ttnn.ROW_MAJOR_LAYOUT, - "in_channels": input_channels, - "out_channels": output_channels, - "batch_size": batch_size, - "input_height": input_height, - "input_width": input_width, - "kernel_size": (filter_height, filter_width), - "stride": 
(stride_h, stride_w), - "padding": (pad_h, pad_w), - "dilation": (1, 1), - "groups": 1, - "device": device, - "conv_config": conv_config, - } - - tt_input_tensor = ttnn.to_device(tt_input_tensor, device) - - tt_bias_tensor_formatted = ( - ttnn.prepare_conv_bias( - bias_tensor=tt_bias_tensor, input_memory_config=tt_input_tensor.memory_config(), **conv_kwargs - ) - if has_bias - else None - ) - - tt_bias_tensor_formatted = ttnn.to_device(tt_bias_tensor_formatted, device) if has_bias else None - (k := next(iter(conv_kwargs)), conv_kwargs.pop(k)) ##removing 1st element from dict - tt_output_tensor_on_device = ttnn.conv2d( - input_tensor=tt_input_tensor, - weight_tensor=tt_weight_tensor, - bias_tensor=tt_bias_tensor_formatted, - **conv_kwargs, - compute_config=compute_config, - ) - - tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) - torch_output_tensor = ttnn.to_torch(tt_output_tensor) - - torch_output_tensor = torch_output_tensor[:, :, :, :output_channels] - torch_output_tensor = torch_output_tensor.reshape(torch_out_golden_tensor.shape) - - pcc = 0.99 - passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_output_tensor, torch_out_golden_tensor, pcc=pcc) - logger.info(f"PCC = {pcc_msg}. Threshold = {pcc}") - assert passing diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp index a3928a36629..3f856572366 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp @@ -119,22 +119,41 @@ Result conv2d( bool weight_is_on_device = ttnn::is_tensor_on_device_or_multidevice(weight_tensor); ttnn::Tensor weight_tensor_on_device = weight_tensor; std::optional bias_tensor_on_device = bias_tensor; - if (!weight_is_on_device) { + if (!weight_is_on_device || conv_config.always_preprocess_weights) { // prepare weights in desired layout and move to device - tie(weight_tensor_on_device, bias_tensor_on_device) = prepare_conv_weights_biases_and_move_to_device( - weight_tensor, - bias_tensor, - conv_config.input_channels_alignment, - conv_config.weights_dtype, - opt_conv_op_block_config.act_block_w_ntiles, - opt_conv_op_block_config.out_subblock_w_ntiles, - parallel_config, - output_parallel_config, - device, - groups, - opt_conv_op_block_config.act_block_h_ntiles, - input_width, - true); + + // TODO: Implement heuristic to decide if weights should be preprocessed on device. 
+ if (conv_config.preprocess_weights_on_device == false) { + tie(weight_tensor_on_device, bias_tensor_on_device) = prepare_conv_weights_biases_and_move_to_device( + weight_tensor, + bias_tensor, + conv_config.input_channels_alignment, + conv_config.weights_dtype, + opt_conv_op_block_config.act_block_w_ntiles, + opt_conv_op_block_config.out_subblock_w_ntiles, + parallel_config, + output_parallel_config, + device, + groups, + opt_conv_op_block_config.act_block_h_ntiles, + input_width, + true); + } else { + tie(weight_tensor_on_device, bias_tensor_on_device) = prepare_conv_weights_biases_on_device( + weight_tensor, + bias_tensor, + conv_config.input_channels_alignment, + conv_config.weights_dtype, + opt_conv_op_block_config.act_block_w_ntiles, + opt_conv_op_block_config.out_subblock_w_ntiles, + parallel_config, + output_parallel_config, + device, + groups, + opt_conv_op_block_config.act_block_h_ntiles, + input_width, + true); + } } // if 1x1 conv w/ stride 1, convert input tensor to tile layout if required if (mm_conv) { diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp index 0591ed02d0c..8d169240b72 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp @@ -335,6 +335,8 @@ void py_bind_conv2d(py::module& module) { bool, bool, bool, + bool, + bool, bool>(), py::kw_only(), py::arg("dtype") = DataType::BFLOAT16, @@ -351,6 +353,8 @@ void py_bind_conv2d(py::module& module) { py::arg("core_grid") = std::nullopt, py::arg("transpose_shards") = true, py::arg("output_layout") = Layout::TILE, + py::arg("preprocess_weights_on_device") = true, + py::arg("always_preprocess_weights") = false, py::arg("enable_act_double_buffer") = false, py::arg("enable_weights_double_buffer") = false, py::arg("enable_split_reader") = false, @@ -369,6 +373,8 @@ void py_bind_conv2d(py::module& module) { py_conv_config.def_readwrite("core_grid", &Conv2dConfig::core_grid); py_conv_config.def_readwrite("transpose_shards", &Conv2dConfig::transpose_shards); py_conv_config.def_readwrite("output_layout", &Conv2dConfig::output_layout); + py_conv_config.def_readwrite("preprocess_weights_on_device", &Conv2dConfig::preprocess_weights_on_device); + py_conv_config.def_readwrite("always_preprocess_weights", &Conv2dConfig::always_preprocess_weights); py_conv_config.def_readwrite("enable_act_double_buffer", &Conv2dConfig::enable_act_double_buffer); py_conv_config.def_readwrite("enable_weights_double_buffer", &Conv2dConfig::enable_weights_double_buffer); py_conv_config.def_readwrite("enable_split_reader", &Conv2dConfig::enable_split_reader); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp index 6f67fb238a6..7bdc858a526 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp @@ -869,9 +869,12 @@ std::tuple #include "ttnn/operations/sliding_window/sliding_window.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" @@ -61,6 +62,13 @@ struct Conv2dConfig { // BFLOAT8 is always Tile layout. Layout output_layout = Layout::TILE; + // Select between preprocessing weights on device or on host. + bool preprocess_weights_on_device = true; + + // If false, only preprocess weights if they are originally located on host. + // If true, preprocess weights regarding of original location. 
+ bool always_preprocess_weights = false; + // Doubles the size of the CBs for activation. // Increased perf, but increased L1 usage. bool enable_act_double_buffer = false; @@ -73,6 +81,7 @@ struct Conv2dConfig { bool enable_split_reader = false; bool enable_subblock_padding = false; + static constexpr auto attribute_names = std::make_tuple( "dtype", "weights_dtype", @@ -88,6 +97,7 @@ struct Conv2dConfig { "core_grid", "transpose_shards", "output_layout", + "preprocess_weights_on_device", "enable_act_double_buffer", "enable_weights_double_buffer", "enable_split_reader", @@ -108,6 +118,7 @@ struct Conv2dConfig { std::cref(this->core_grid), std::cref(this->transpose_shards), std::cref(this->output_layout), + std::cref(this->preprocess_weights_on_device), std::cref(this->enable_act_double_buffer), std::cref(this->enable_weights_double_buffer), std::cref(this->enable_split_reader), diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index 32fd24971e8..ce2999e4ca8 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -474,7 +474,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( } } - // assert(out_block_h_ntiles == act_block_h_ntiles); // TODO: fix output block sizing + // TT_FATAL(out_block_h_ntiles == act_block_h_ntiles); // TODO: fix output block sizing TT_FATAL( out_block_h_ntiles >= act_block_h_ntiles, "Output block height (in # of tiles) ({}) should be greater than or equal to activation block height (in # of " @@ -578,8 +578,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( sliding_window_config, parallelization_config.num_cores_nhw, out_block_h_ntiles); - assert(act_matrix_shape.size() == 3); - assert(act_matrix_shape[0] == 1); + TT_FATAL(act_matrix_shape.size() == 3, "act_matrix_shape should have be of size 3"); + TT_FATAL(act_matrix_shape[0] == 1, "act_matrix_shape should have 1 as the first dimension"); uint32_t act_matrix_height = (uint32_t)act_matrix_shape[1]; uint32_t act_matrix_width = (uint32_t)act_matrix_shape[2]; if (block_sharded) { @@ -589,7 +589,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t act_matrix_height_unpadded = (uint32_t)act_matrix_shape_unpadded[1]; uint32_t act_matrix_width_unpadded = (uint32_t)act_matrix_shape_unpadded[2]; - // TODO: Move all these asserts/checks to validate? + // TODO: Move all these TT_FATALs/checks to validate? 
uint32_t input_width = ashape[2]; uint32_t input_channels = ashape[3]; @@ -611,7 +611,10 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // matrix multiplication shape check valid for all convs except depthwise conv1d if (!is_conv_1d_depthwise_conv) { TT_FATAL( - act_matrix_width == weight_matrix_height, "The width of tensor a needs to match the height of tensor b"); + act_matrix_width == weight_matrix_height, + "The width of tensor a {} needs to match the height of tensor b {}", + act_matrix_width, + weight_matrix_height); } // Tile size divisibility checks TT_FATAL(act_matrix_height % TILE_HEIGHT == 0, "Height of activation matrix needs to be divisible by 32"); @@ -635,10 +638,26 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t act_matrix_height_ntiles = act_matrix_height / TILE_HEIGHT; uint32_t act_matrix_width_ntiles = act_matrix_width / TILE_WIDTH; - assert(act_matrix_height_ntiles % act_block_h_ntiles == 0); - assert(act_matrix_width_ntiles % act_block_w_ntiles == 0); - assert(weight_matrix_width_ntiles % weight_block_w_ntiles == 0); - assert(act_matrix_height_ntiles % out_block_h_ntiles == 0); + TT_FATAL( + act_matrix_height_ntiles % act_block_h_ntiles == 0, + "act_matrix_height_ntiles {} should be divisible by act_block_h_ntiles {}", + act_matrix_height_ntiles, + act_block_h_ntiles); + TT_FATAL( + act_matrix_width_ntiles % act_block_w_ntiles == 0, + "act_matrix_width_ntiles {} should be divisible by act_block_w_ntiles {}", + act_matrix_width_ntiles, + act_block_w_ntiles); + TT_FATAL( + weight_matrix_width_ntiles % weight_block_w_ntiles == 0, + "weight_matrix_width_ntiles {} should be divisible by weight_block_w_ntiles {}", + weight_matrix_width_ntiles, + weight_block_w_ntiles); + TT_FATAL( + act_matrix_height_ntiles % out_block_h_ntiles == 0, + "act_matrix_height_ntiles {} should be divisible by out_block_h_ntiles {}", + act_matrix_height_ntiles, + out_block_h_ntiles); uint32_t num_blocks_act_h = act_matrix_height_ntiles / act_block_h_ntiles; uint32_t num_blocks_out_h = act_matrix_height_ntiles / out_block_h_ntiles; @@ -672,7 +691,11 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // weight block info uint32_t weight_block_w_datums = weight_matrix_width / num_blocks_weight_w; - assert(weight_block_w_ntiles % out_subblock_w_ntiles == 0); + TT_FATAL( + weight_block_w_ntiles % out_subblock_w_ntiles == 0, + "weight_block_w_ntiles {} should be divisible by weight_block_w_ntiles {}", + weight_block_w_ntiles, + out_subblock_w_ntiles); uint32_t weight_num_subblocks = weight_block_w_ntiles / out_subblock_w_ntiles; uint32_t weight_block_h_ntiles = is_conv_1d_depthwise_conv ? 
act_block_h_ntiles : act_block_w_ntiles; uint32_t weight_block_num_tiles = weight_block_w_ntiles * weight_block_h_ntiles; @@ -681,14 +704,21 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // writer of conv op partially removes padding on the width // it removes the padding done for block width but it doesn't remove padding done for tiled width uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); - assert(output_channels_padded_to_tile_width <= weight_matrix_width); + TT_FATAL( + output_channels_padded_to_tile_width <= weight_matrix_width, + "output_channels_padded_to_tile_width {} should be less than or equal to weight_matrix_width {}", + output_channels_padded_to_tile_width, + weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; uint32_t num_blocks_output_w = (uint32_t)std::ceil((double)output_channels_padded_to_tile_width / (double)weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); - assert(last_block_width_datums % TILE_WIDTH == 0); + TT_FATAL( + last_block_width_datums % TILE_WIDTH == 0, + "last_block_width_datums {} should be divisible by TILE_WIDTH", + last_block_width_datums); uint32_t out_block_h_datums = out_block_h_ntiles * TILE_HEIGHT; @@ -706,9 +736,12 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // act uint32_t act_dram_addr = src0_dram_buffer->address(); - assert(act_matrix_width_ntiles % act_block_w_ntiles == 0); - assert(act_block_h_ntiles % out_subblock_h_ntiles == 0); - // assert(out_block_h_ntiles % out_subblock_h_ntiles == 0); + TT_FATAL( + act_block_h_ntiles % out_subblock_h_ntiles == 0, + "act_block_h_ntiles {} should be divisible by out_subblock_h_ntiles {}", + act_block_h_ntiles, + out_subblock_h_ntiles); + // TT_FATAL(out_block_h_ntiles % out_subblock_h_ntiles == 0); uint32_t act_num_subblocks = act_block_h_ntiles / out_subblock_h_ntiles; uint32_t act_block_num_tiles = act_block_h_ntiles * act_block_w_ntiles; uint32_t act_subblock_h_ntiles = out_subblock_h_ntiles; @@ -743,7 +776,11 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t output_height_padded_to_tile_height = round_up(act_matrix_height_unpadded, TILE_HEIGHT); uint32_t output_height_num_tiles = output_height_padded_to_tile_height / TILE_HEIGHT; - assert(output_height_num_tiles <= act_matrix_height_ntiles); + TT_FATAL( + output_height_num_tiles <= act_matrix_height_ntiles, + "output_height_num_tiles {} should be less than or equal to act_matrix_height_ntiles {}", + output_height_num_tiles, + act_matrix_height_ntiles); uint32_t src_dram_act_buffer_size_bytes = src0_dram_buffer->size(); uint32_t src_dram_weight_buffer_size_bytes = src1_dram_buffer->size(); @@ -840,46 +877,94 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( reader_defines["WINDOW_INNER"] = std::to_string(window_inner); log_debug(LogOp, "window_outer: {}, window_inner: {}", window_outer, window_inner); - assert(weight_matrix_width_ntiles % per_core_out_matrix_width_ntiles == 0); - assert(per_core_out_matrix_width_ntiles % weight_block_w_ntiles == 0); + TT_FATAL( + weight_matrix_width_ntiles % per_core_out_matrix_width_ntiles == 0, + "weight_matrix_width_ntiles {} should be divisible by per_core_out_matrix_width_ntiles {}", + weight_matrix_width_ntiles, + 
per_core_out_matrix_width_ntiles); + TT_FATAL( + per_core_out_matrix_width_ntiles % weight_block_w_ntiles == 0, + "per_core_out_matrix_width_ntiles {} should be divisible by weight_block_w_ntiles {}", + per_core_out_matrix_width_ntiles, + weight_block_w_ntiles); uint32_t num_blocks_weight_w_per_core = per_core_out_matrix_width_ntiles / weight_block_w_ntiles; if (not weight_width_sliced) { - assert(num_blocks_weight_w_per_core == num_blocks_weight_w); + TT_FATAL( + num_blocks_weight_w_per_core == num_blocks_weight_w, + "num_blocks_weight_w_per_core {} should be equal to num_blocks_weight_w {}", + num_blocks_weight_w_per_core, + num_blocks_weight_w); } uint32_t num_weight_slices_width = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles; uint32_t total_num_cores_per_weight_slice = 0; uint32_t total_num_cores_per_act_slice = 0; // only used when (BLOCK_SHARDING && !transpose_mcast) if (weight_width_sliced) { if (transpose_mcast) { - assert(num_cores_y % num_weight_slices_width == 0); + TT_FATAL( + num_cores_y % num_weight_slices_width == 0, + "num_cores_y {} should be divisible by num_weight_slices_width {}", + num_cores_y, + num_weight_slices_width); uint32_t num_cores_y_per_weight_slice_width = num_cores_y / num_weight_slices_width; total_num_cores_per_weight_slice = num_cores_y_per_weight_slice_width * num_cores_x; } else { - assert(num_cores_x % num_weight_slices_width == 0); + TT_FATAL( + num_cores_x % num_weight_slices_width == 0, + "num_cores_x {} should be divisible by num_weight_slices_width {}", + num_cores_x, + num_weight_slices_width); uint32_t num_cores_x_per_weight_slice_width = num_cores_x / num_weight_slices_width; uint32_t num_act_slices_height = act_matrix_height_ntiles / per_core_out_matrix_height_ntiles; total_num_cores_per_act_slice = num_cores_x * num_cores_y / num_act_slices_height; log_debug(LogOp, "total_num_cores_per_act_slice: {}", total_num_cores_per_act_slice); total_num_cores_per_weight_slice = num_cores_x_per_weight_slice_width * num_cores_y; } - assert(total_num_cores_per_weight_slice * per_core_out_matrix_height_ntiles == act_matrix_height_ntiles); + TT_FATAL( + total_num_cores_per_weight_slice * per_core_out_matrix_height_ntiles == act_matrix_height_ntiles, + "total_num_cores_per_weight_slice {} * per_core_out_matrix_height_ntiles {} should be equal to " + "act_matrix_height_ntiles {}", + total_num_cores_per_weight_slice, + per_core_out_matrix_height_ntiles, + act_matrix_height_ntiles); } else { - assert(num_cores_y % num_weight_slices_width == 0); + TT_FATAL( + num_cores_y % num_weight_slices_width == 0, + "num_cores_y {} should be divisible by num_weight_slices_width {}", + num_cores_y, + num_weight_slices_width); uint32_t num_cores_y_per_weight_slice_width = num_cores_y / num_weight_slices_width; total_num_cores_per_weight_slice = num_cores_y_per_weight_slice_width * num_cores_x; - assert(total_num_cores * per_core_out_matrix_height_ntiles >= act_matrix_height_ntiles); + TT_FATAL( + total_num_cores * per_core_out_matrix_height_ntiles >= act_matrix_height_ntiles, + "total_num_cores {} * per_core_out_matrix_height_ntiles {} should be greater than or equal to " + "act_matrix_height_ntiles {}", + total_num_cores, + per_core_out_matrix_height_ntiles, + act_matrix_height_ntiles); } - assert(per_core_out_matrix_height_ntiles % act_block_h_ntiles == 0); + TT_FATAL( + per_core_out_matrix_height_ntiles % act_block_h_ntiles == 0, + "per_core_out_matrix_height_ntiles {} should be divisible by act_block_h_ntiles {}", + per_core_out_matrix_height_ntiles, + 
act_block_h_ntiles); uint32_t num_blocks_act_h_per_core = per_core_out_matrix_height_ntiles / act_block_h_ntiles; - // assert(per_core_out_matrix_height_ntiles % out_block_h_ntiles == 0); + // TT_FATAL(per_core_out_matrix_height_ntiles % out_block_h_ntiles == 0); // uint32_t num_blocks_out_h_per_core = per_core_out_matrix_height_ntiles / out_block_h_ntiles; uint32_t num_blocks_out_h_per_core = (per_core_out_matrix_height_ntiles + out_block_h_ntiles - 1) / out_block_h_ntiles; bool act_height_sliced = per_core_out_matrix_height_ntiles < act_matrix_height_ntiles; if (not act_height_sliced) { - TT_FATAL(num_blocks_act_h_per_core == num_blocks_act_h, "Error"); - TT_FATAL(num_blocks_out_h_per_core == num_blocks_out_h, "Error"); - TT_FATAL(num_cores_x == 1, "Error"); + TT_FATAL( + num_blocks_act_h_per_core == num_blocks_act_h, + "num_blocks_act_h_per_core {} should be equal to num_blocks_act_h {}", + num_blocks_act_h_per_core, + num_blocks_act_h); + TT_FATAL( + num_blocks_out_h_per_core == num_blocks_out_h, + "num_blocks_out_h_per_core {} should be equal to num_blocks_out_h {}", + num_blocks_out_h_per_core, + num_blocks_out_h); + TT_FATAL(num_cores_x == 1, "num_cores_x {} should be equal to 1", num_cores_x); } uint32_t act_block_h_datums_last_block = (per_core_out_matrix_height_ntiles - (num_blocks_act_h_per_core - 1) * act_block_h_ntiles) * TILE_HEIGHT; @@ -1135,7 +1220,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( if (filter_h >= 1 and filter_w >= 1) { if (!is_conv1d and weight_width_sliced) { // 2D conv - assert(read_window_in_inner_loop == true); + TT_FATAL(read_window_in_inner_loop == true, "read_window_in_inner_loop should be true for this conv"); reader_kernel = "ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/" "reader_conv_activations_2d_mcast_padded_with_halo_3x3_weights_v2.cpp"; @@ -1447,7 +1532,11 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t out_start_tile_id_w = weight_slice_i * per_core_out_matrix_width_ntiles; uint32_t bias_tile_offset = weight_slice_i * per_core_out_matrix_width_ntiles; if (has_bias) { - assert(bias_tile_offset < bias_ntiles); + TT_FATAL( + bias_tile_offset < bias_ntiles, + "bias_tile_offset {} should be less than bias_ntiles {}", + bias_tile_offset, + bias_ntiles); } if (weight_width_sliced) { diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp index 2f7b82a170e..726b4ba4049 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp @@ -9,6 +9,10 @@ #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/data_movement/pad/pad.hpp" #include "ttnn/tensor/types.hpp" +#include "ttnn/operations/data_movement/permute/permute.hpp" +#include "ttnn/operations/data_movement/reshape_view/reshape.hpp" +#include "ttnn/operations/data_movement/tilize/tilize.hpp" +#include "ttnn/operations/sliding_window/sliding_window.hpp" using namespace tt; namespace ttnn { namespace operations::conv { @@ -475,8 +479,6 @@ Tensor convert_conv_weight_tensor_to_depthwise_layout( } void validate_weight_tensor(const ttnn::Tensor& weight_tensor) { - TT_FATAL( - !ttnn::has_storage_type_of(weight_tensor, ttnn::DEVICE_STORAGE_TYPE), "conv weight should be placed on host"); TT_FATAL(weight_tensor.get_layout() == Layout::ROW_MAJOR, "conv weight layout should be in row_major layout"); TT_FATAL(weight_tensor.get_logical_shape().rank() == 
4, "conv weight should be 4D tensor"); } @@ -631,6 +633,272 @@ static OptimizedConvBlockConfig get_opt_block_config( conv_config.enable_split_reader); } +template +std::pair> prepare_conv_weights_biases_on_device( + const ttnn::Tensor& weight_tensor, + std::optional& bias_tensor, + uint32_t input_channels_alignment, + DataType weights_bias_dtype, + uint32_t weight_block_h_ntiles, + uint32_t weight_block_w_ntiles, + const sliding_window::ParallelConfig& input_parallel_config, + const sliding_window::ParallelConfig& output_parallel_config, + T* device, + uint32_t groups, + uint32_t act_block_h_ntiles, + uint32_t input_width, + const bool parameters_on_device) { + validate_weight_tensor(weight_tensor); + ttnn::Tensor weight_tensor_; // tensor to return + ttnn::Tensor bias_tensor_; + + auto original_weights_shape = weight_tensor.get_logical_shape(); + uint32_t original_weights_out_channels = original_weights_shape[0]; + uint32_t original_weights_in_channels = original_weights_shape[1]; + uint32_t original_weights_window_h = original_weights_shape[2]; + uint32_t original_weights_window_w = original_weights_shape[3]; + + bool is_conv1d = original_weights_window_w == 1 && input_width == 1; + bool is_depthwise_conv = groups == original_weights_out_channels && original_weights_in_channels == 1; + + weight_tensor_ = weight_tensor; + // Convert weight tensor to 0 padded shape if groups > 1 + if (groups > 1 and is_tensor_on_device_or_multidevice(weight_tensor_)) { + TT_THROW( + "Grouped Convolution not supported when weights are on device. Please move the weights tensor to host"); + } + if (!is_conv1d and groups > 1) { + weight_tensor_ = convert_conv_weight_tensor_to_grouped_layout(weight_tensor_, groups, weights_bias_dtype); + } else if (is_conv1d and groups > 1) { + if (is_depthwise_conv) { + weight_tensor_ = + convert_conv_weight_tensor_to_depthwise_layout(weight_tensor_, act_block_h_ntiles, weights_bias_dtype); + weight_block_h_ntiles = act_block_h_ntiles; + } else { + weight_tensor_ = convert_conv_weight_tensor_to_grouped_layout(weight_tensor_, groups, weights_bias_dtype); + } + } + + weight_tensor_ = ttnn::operations::core::to_device(weight_tensor_, device, std::nullopt); + + auto weights_shape = weight_tensor_.get_logical_shape(); + uint32_t out_channels = weights_shape[0]; + uint32_t in_channels = weights_shape[1]; + uint32_t window_h = weights_shape[2]; + uint32_t window_w = weights_shape[3]; + + uint32_t input_num_cores_channels = get_num_cores_channels_from_parallel_config(input_parallel_config); + uint32_t output_num_cores_channels = get_num_cores_channels_from_parallel_config(output_parallel_config); + + uint32_t out_channels_padded = tt::round_up(out_channels, output_num_cores_channels * tt::constants::TILE_WIDTH); + uint32_t in_channels_padded = tt::round_up(in_channels, input_num_cores_channels * input_channels_alignment); + uint32_t out_channel_padding = out_channels_padded - out_channels; + + ttnn::Shape weights_channels_padded_shape( + std::array({out_channels_padded, in_channels_padded, window_h, window_w})); + if (weights_bias_dtype == DataType::BFLOAT8_B) { + TT_ASSERT(weight_tensor_.get_dtype() == DataType::FLOAT32); + if (bias_tensor.has_value()) { + TT_ASSERT(bias_tensor.value().get_dtype() == DataType::FLOAT32); + } + } else { + // TODO: fix the need to check this. 
We should be able to accept any datatype and convert + TT_ASSERT(weight_tensor_.get_dtype() == weights_bias_dtype); + if (bias_tensor.has_value()) { + TT_ASSERT(bias_tensor.value().get_dtype() == weights_bias_dtype); + } + } + weight_tensor_ = ttnn::pad( + weight_tensor_, + weights_channels_padded_shape.to_array_4D(), + tt::tt_metal::Array4D({0, 0, 0, 0}), + 0.0f, + true, + std::nullopt); + + // Block sharding re-orders the weights by dividing the input_channels along number of in_channel_cores. + if (input_parallel_config.shard_scheme == TensorMemoryLayout::BLOCK_SHARDED) { + TT_FATAL( + input_num_cores_channels == output_num_cores_channels, + "Input and output cores must be the same for Block Sharded Conv2d"); + TT_FATAL( + in_channels_padded % input_num_cores_channels == 0, + "Input channels {} must be divisble by num cores {}", + in_channels_padded, + input_num_cores_channels); + auto in_channels_per_core = in_channels_padded / input_num_cores_channels; + + TT_FATAL( + out_channels_padded % output_num_cores_channels == 0, + "output channels {} must be divisble by num cores {}", + out_channels_padded, + output_num_cores_channels); + auto out_channels_per_core = out_channels_padded / output_num_cores_channels; + auto rounded_weight_block_height = + tt::round_up(window_h * window_w * in_channels_per_core, constants::TILE_HEIGHT); + auto rounded_weight_block_width = tt::round_up(out_channels_per_core, constants::TILE_WIDTH); + + auto final_out_channels_padded = rounded_weight_block_width * output_num_cores_channels; + + if (final_out_channels_padded != out_channels_padded) { + weight_tensor_ = ttnn::reshape( + weight_tensor_, + ttnn::Shape( + {output_num_cores_channels, out_channels_per_core, in_channels_padded * window_h, window_w})); + + weight_tensor_ = ttnn::pad( + weight_tensor_, + tt::tt_metal::Array4D( + {output_num_cores_channels, rounded_weight_block_width, in_channels_padded * window_h, window_w}), + tt::tt_metal::Array4D({0, 0, 0, 0}), + 0, + true, + std::nullopt); + } + weight_tensor_ = ttnn::reshape( + weight_tensor_, + ttnn::Shape( + {final_out_channels_padded, input_num_cores_channels, in_channels_per_core, window_h, window_w})); + + weight_tensor_ = ttnn::permute(weight_tensor_, ttnn::SmallVector({1, 3, 4, 2, 0})); + // Shape is now {input_num_cores_channels, window_h, window_w, in_channels_per_core, out_channels_padded} + + weight_tensor_ = ttnn::reshape( + weight_tensor_, + ttnn::Shape( + {1, input_num_cores_channels, in_channels_per_core * window_h * window_w, final_out_channels_padded})); + weight_tensor_ = ttnn::pad( + weight_tensor_, + tt::tt_metal::Array4D( + {1, input_num_cores_channels, rounded_weight_block_height, final_out_channels_padded}), + tt::tt_metal::Array4D({0, 0, 0, 0}), + 0, + true, + std::nullopt); + weight_tensor_ = ttnn::reshape( + weight_tensor_, + ttnn::Shape({1, 1, rounded_weight_block_height * input_num_cores_channels, final_out_channels_padded})); + } else { + // Reshape the weights to 5D, and permute in 5D. 
+ weight_tensor_ = ttnn::reshape( + weight_tensor_, ttnn::Shape({1, out_channels_padded, in_channels_padded, window_h, window_w})); + + weight_tensor_ = ttnn::permute(weight_tensor_, ttnn::SmallVector({0, 3, 4, 2, 1})); + // Shape is now {1, window_h, window_w, in_channels_padded, out_channels_padded} + auto weight_block_h_datums = weight_block_h_ntiles * constants::TILE_HEIGHT; + if ((weight_block_h_datums > (window_w * in_channels_padded)) && + (input_parallel_config.shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED)) { + weight_tensor_ = ttnn::reshape( + weight_tensor_, ttnn::Shape({1, window_h, window_w * in_channels_padded, out_channels_padded})); + weight_tensor_ = ttnn::pad( + weight_tensor_, + tt::tt_metal::Array4D({1, window_h, weight_block_h_datums, out_channels_padded}), + tt::tt_metal::Array4D({0, 0, 0, 0}), + 0.0f, + true, + std::nullopt); + weight_tensor_ = ttnn::reshape( + weight_tensor_, ttnn::Shape({1, 1, window_h * weight_block_h_datums, out_channels_padded})); + } else { + weight_tensor_ = ttnn::reshape( + weight_tensor_, ttnn::Shape({1, 1, window_h * window_w * in_channels_padded, out_channels_padded})); + } + } + weight_tensor_ = ttnn::tilize( + weight_tensor_, + ttnn::MemoryConfig( + {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, + .buffer_type = tt::tt_metal::BufferType::DRAM}), + weights_bias_dtype, + true); + + uint32_t weight_matrix_height = in_channels * window_h * window_w; + int32_t weight_matrix_height_padding = weight_tensor_.get_logical_shape()[2] - weight_matrix_height; + TT_FATAL(weight_matrix_height_padding >= 0, " Matrix Height Padding can't be negative"); + + ttnn::Shape target_shape(std::array{1, 1, weight_matrix_height, out_channels}); + + weight_tensor_ = ttnn::reshape(weight_tensor_, target_shape, weight_tensor_.get_padded_shape()); + + if (bias_tensor.has_value()) { + bias_tensor_ = bias_tensor.value(); + bool is_bias_tensor_is_on_device = ttnn::is_tensor_on_device_or_multidevice(bias_tensor_); + if (!is_bias_tensor_is_on_device) { + bias_tensor_ = ttnn::operations::core::to_device(bias_tensor_, device, std::nullopt); + } + if (input_parallel_config.shard_scheme == TensorMemoryLayout::BLOCK_SHARDED) { + auto bias_out_channels = bias_tensor_.get_logical_shape()[3]; + ttnn::Shape bias_channels_padded_shape({1, 1, 1, out_channels_padded}); + bias_tensor_ = ttnn::pad( + bias_tensor_, + bias_channels_padded_shape.to_array_4D(), + tt::tt_metal::Array4D{0, 0, 0, 0}, + 0, + true, + std::nullopt); + auto out_channels_per_core = out_channels_padded / output_num_cores_channels; + auto rounded_weight_block_width = tt::round_up(out_channels_per_core, constants::TILE_WIDTH); + + auto final_out_channels_padded = rounded_weight_block_width * output_num_cores_channels; + + if (final_out_channels_padded != out_channels_padded) { + bias_tensor_ = + ttnn::reshape(bias_tensor_, ttnn::Shape({1, 1, output_num_cores_channels, out_channels_per_core})); + + bias_tensor_ = ttnn::pad( + bias_tensor_, + tt::tt_metal::Array4D({1, 1, output_num_cores_channels, rounded_weight_block_width}), + tt::tt_metal::Array4D({0, 0, 0, 0}), + 0, + true, + std::nullopt); + } + bias_tensor_ = ttnn::reshape(bias_tensor_, ttnn::Shape({1, 1, 1, final_out_channels_padded})); + bias_tensor_ = ttnn::pad( + bias_tensor_, + tt::tt_metal::Array4D({1, 1, 32, final_out_channels_padded}), + tt::tt_metal::Array4D{0, 0, 0, 0}, + 0, + true, + std::nullopt); + } else { + ttnn::Shape bias_channels_padded_shape({1, 1, 32, round_up(out_channels, weight_block_w_ntiles * 32)}); + bias_tensor_ = 
ttnn::pad( + bias_tensor_, + bias_channels_padded_shape.to_array_4D(), + tt::tt_metal::Array4D{0, 0, 0, 0}, + 0, + true, + std::nullopt); + } + bias_tensor_ = ttnn::tilize( + bias_tensor_, + ttnn::MemoryConfig( + {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, + .buffer_type = tt::tt_metal::BufferType::DRAM}), + weights_bias_dtype, + true); + + ttnn::Shape bias_target_shape(std::array{1, 1, 1, out_channels}); + bias_tensor_ = ttnn::reshape(bias_tensor_, bias_target_shape, bias_tensor_.get_padded_shape()); + + // TT_FATAL( + // bias_tensor_.get_logical_shape()[3] == out_channels, + // "Bias must have the same length as output channels"); + // bias_tensor_ = conv_bias_layout_convert( + // bias_tensor_, + // weights_bias_dtype, + // weight_block_h_ntiles, + // weight_block_w_ntiles, + // output_parallel_config, + // device, + // out_channels_padded, + // is_non_tile_mul_width); + } + + return {weight_tensor_, bias_tensor.has_value() ? bias_tensor_ : std::optional()}; +} + template std::pair> prepare_conv_weights_biases_and_move_to_device( const ttnn::Tensor& weight_tensor, @@ -703,7 +971,6 @@ std::pair> prepare_conv_weights_biases } weight_tensor_ = ttnn::pad(weight_tensor_, weights_channels_padded_shape.to_array_4D(), tt::tt_metal::Array4D({0, 0, 0, 0}), 0); - // for conv op, pad the weights to block shape if (input_parallel_config.shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED) { weight_tensor_ = convert_conv_weight_tensor_to_special_padding_tiled_layout( @@ -985,6 +1252,36 @@ template ttnn::Tensor prepare_conv_weights( const std::optional& conv_config_, const std::optional& compute_config_); +template std::pair> prepare_conv_weights_biases_on_device( + const ttnn::Tensor& weight_tensor, + std::optional& bias_tensor, + uint32_t input_channels_alignment, + DataType weights_bias_dtype, + uint32_t weight_block_h_ntiles, + uint32_t weight_block_w_ntiles, + const sliding_window::ParallelConfig& input_parallel_config, + const sliding_window::ParallelConfig& output_parallel_config, + IDevice* device, + uint32_t groups, + uint32_t act_block_h_ntiles, + uint32_t input_width, + const bool parameters_on_device); + +template std::pair> prepare_conv_weights_biases_on_device( + const ttnn::Tensor& weight_tensor, + std::optional& bias_tensor, + uint32_t input_channels_alignment, + DataType weights_bias_dtype, + uint32_t weight_block_h_ntiles, + uint32_t weight_block_w_ntiles, + const sliding_window::ParallelConfig& input_parallel_config, + const sliding_window::ParallelConfig& output_parallel_config, + MeshDevice* device, + uint32_t groups, + uint32_t act_block_h_ntiles, + uint32_t input_width, + const bool parameters_on_device); + template std::pair> prepare_conv_weights_biases_and_move_to_device( const ttnn::Tensor& weight_tensor, std::optional& bias_tensor, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp index 5377a62a345..2824a9cd4fe 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp @@ -103,6 +103,22 @@ ttnn::Tensor prepare_conv_bias( const std::optional& conv_config_, const std::optional& compute_config_); +template +std::pair> prepare_conv_weights_biases_on_device( + const ttnn::Tensor& weight_tensor, + std::optional& bias_tensor, + uint32_t input_channels_alignment, + DataType weights_bias_dtype, + uint32_t weight_block_h_ntiles, + uint32_t weight_block_w_ntiles, + const 
sliding_window::ParallelConfig& input_parallel_config, + const sliding_window::ParallelConfig& output_parallel_config, + T* device, + uint32_t groups, + uint32_t act_block_h_ntiles, + uint32_t input_width, + const bool parameters_on_device); + template std::pair> prepare_conv_weights_biases_and_move_to_device( const ttnn::Tensor& weight_tensor, diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp index a009d7d00aa..7f34adea279 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp @@ -792,6 +792,13 @@ std::vector, std::vector>> get_runtime return ret_val; } +uint32_t get_num_max_sticks(uint32_t num_sticks_to_read, uint32_t stick_size, uint32_t max_read_size) { + uint32_t num_sticks = tt::round_up(max_read_size, stick_size) / stick_size; + while (num_sticks * stick_size > max_read_size || num_sticks_to_read % num_sticks != 0) { + num_sticks--; + } + return num_sticks; +} operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2( const Tensor& a, Tensor& output, @@ -841,8 +848,14 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2( ? num_sticks_padded_per_core_group_1 : num_sticks_padded_per_core_group_2; + uint32_t max_read_size = 256 * 1024; + uint32_t W_bytes = a.get_padded_shape()[3] * a.element_size(); + auto num_sticks_per_core_read = get_num_max_sticks(num_sticks, W_bytes, max_read_size); + auto input_cb_pages = std::min(num_sticks_per_core_read, num_sticks); + tt::tt_metal::CircularBufferConfig cb_src0_config = - tt::tt_metal::CircularBufferConfig(num_sticks * stick_size_padded_aligned, {{src0_cb_index, cb_data_format}}) + tt::tt_metal::CircularBufferConfig( + input_cb_pages * stick_size_padded_aligned, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, stick_size_padded_aligned); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp index 9e4382f3d73..d8c78a70cdd 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp @@ -51,7 +51,17 @@ static ttnn::Tensor pad_impl( const auto rank = input_tensor_shape.rank(); TT_FATAL(rank == 4, "ttnn.pad: input tensor passed to pad_impl must have rank == 4, but got rank {}.", rank); - + bool input_output_same = true; + for (size_t i = 0; i < rank; i++) { + if (input_tensor_shape[i] != output_padded_shape[i]) { + input_output_same = false; + break; + } + } + if (input_output_same) { + tt::log_debug("Pad Input and Output Shapes are the same. 
Skipping pad and returning input tensor."); + return input_tensor; + } using ShardStrategy = ttnn::operations::data_movement::ShardStrategy; using ShardOrientation = tt::tt_metal::ShardOrientation; using Layout = tt::tt_metal::Layout; From 4036f9b8ef02c0e5ae12740235308096fc7d67c3 Mon Sep 17 00:00:00 2001 From: asaigal Date: Fri, 21 Feb 2025 00:35:04 +0000 Subject: [PATCH 240/316] Add TT-Mesh Programming example demonstrating MeshTrace and Multi-MeshCQ --- tests/scripts/t3000/run_t3000_unit_tests.sh | 1 + .../CMakeLists.txt | 18 ++ .../distributed_trace_and_events.cpp | 285 ++++++++++++++++++ .../distributed/CMakeLists.txt | 1 + 4 files changed, 305 insertions(+) create mode 100644 tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt create mode 100644 tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index e4e54a510b1..7f709db3316 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -26,6 +26,7 @@ run_t3000_ttmetal_tests() { ./build/programming_examples/distributed/distributed_program_dispatch ./build/programming_examples/distributed/distributed_buffer_rw ./build/programming_examples/distributed/distributed_eltwise_add + ./build/programming_examples/distributed/distributed_trace_and_events # Record the end time end_time=$(date +%s) diff --git a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt new file mode 100644 index 00000000000..736e5ddcf76 --- /dev/null +++ b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt @@ -0,0 +1,18 @@ +set(DISTRIBUTED_TRCE_AND_EVENTS ${CMAKE_CURRENT_SOURCE_DIR}/distributed_trace_and_events.cpp) +add_executable(distributed_trace_and_events ${DISTRIBUTED_TRCE_AND_EVENTS}) + +target_link_libraries( + distributed_trace_and_events + PUBLIC + tt_metal + pthread +) + +target_include_directories(distributed_trace_and_events PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + +set_target_properties( + distributed_trace_and_events + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/programming_examples/distributed +) diff --git a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp new file mode 100644 index 00000000000..c438e65dcb3 --- /dev/null +++ b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp @@ -0,0 +1,285 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +using namespace tt; +using namespace tt::tt_metal; +using namespace tt::tt_metal::distributed; + +// The following is an advanced programming example that demonstrates: +// +// 1. Initializing a MeshDevice with 2 MeshCommandQueues and a dedicated memory region to store MeshWorkload Traces +// 2. Loading a SubDevice configuration on a Virtual Mesh, and how this configuration gets replicated across all +// physical devices +// 3. Allocating MeshBuffers in the distributed memory space exposed by the Virtual Mesh, to shard data across physical +// devices +// 4. Constructing programs targeting different SubDevices +// 5. 
Constructing homogenous (same program dispatched to all physical devices) and heterogenous (different programs +// dispatched +// to physical different devices) MeshWorkloads from programs +// 6. Capturing the execution of MeshWorkloads inside a MeshTrace that gets loaded onto the Virtual Mesh +// 7. Performing IO and MeshTrace execution on different MeshCommandQueues and using MeshEvents for MeshCQ <--> MeshCQ +// synchronization + +std::shared_ptr EltwiseBinaryProgramGenerator( + std::shared_ptr src0_buf, + std::shared_ptr src1_buf, + std::shared_ptr output_buf, + const SubDevice& sub_device_for_program, + uint32_t num_tiles, + uint32_t single_tile_size, + uint32_t eltwise_op_index) { + // Program Generation helper function: Can be used to run addition, multiplication and subtraction + // on a SubDevice. + // Requires: + // 1. The src (input) and output buffers + // 2. The SubDevice being targeted + // 3. The number of tiles that must be processed by the op + // 4. The size of the tile in bytes + // The op specifier: Addition (0), Multiplication (1), Subtraction (2) + const std::vector op_id_to_op_define = {"add_tiles", "mul_tiles", "sub_tiles"}; + const std::vector op_id_to_op_type_define = { + "EltwiseBinaryType::ELWADD", "EltwiseBinaryType::ELWMUL", "EltwiseBinaryType::ELWSUB"}; + + const auto cores_for_program = sub_device_for_program.cores(HalProgrammableCoreType::TENSIX); + + std::shared_ptr program = std::make_shared(); + + uint32_t src0_cb_index = tt::CBIndex::c_0; + uint32_t num_input_tiles = 2; + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt_metal::CreateCircularBuffer(*program, cores_for_program, cb_src0_config); + + uint32_t src1_cb_index = tt::CBIndex::c_1; + tt_metal::CircularBufferConfig cb_src1_config = + tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src1_cb_index, single_tile_size); + auto cb_src1 = tt_metal::CreateCircularBuffer(*program, cores_for_program, cb_src1_config); + + uint32_t output_cb_index = tt::CBIndex::c_16; + uint32_t num_output_tiles = 2; + tt_metal::CircularBufferConfig cb_output_config = + tt_metal::CircularBufferConfig( + num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(output_cb_index, single_tile_size); + auto cb_output = tt_metal::CreateCircularBuffer(*program, cores_for_program, cb_output_config); + + auto binary_reader_kernel = tt_metal::CreateKernel( + *program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp", + cores_for_program, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); + + auto unary_writer_kernel = tt_metal::CreateKernel( + *program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp", + cores_for_program, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); + + std::vector compute_kernel_args = {}; + + bool fp32_dest_acc_en = false; + bool math_approx_mode = false; + std::map binary_defines = { + {"ELTWISE_OP", op_id_to_op_define[eltwise_op_index]}, + {"ELTWISE_OP_TYPE", op_id_to_op_type_define[eltwise_op_index]}}; + auto eltwise_binary_kernel = tt_metal::CreateKernel( + *program, + 
"tt_metal/kernels/compute/eltwise_binary.cpp", + cores_for_program, + tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines}); + + SetRuntimeArgs(*program, eltwise_binary_kernel, cores_for_program, {num_tiles, 1}); + + const std::array reader_args = { + src0_buf->address(), 0, num_tiles, src1_buf->address(), 0, num_tiles, 0}; + + const std::array writer_args = {output_buf->address(), 0, num_tiles}; + + SetRuntimeArgs(*program, unary_writer_kernel, cores_for_program, writer_args); + SetRuntimeArgs(*program, binary_reader_kernel, cores_for_program, reader_args); + + return program; +} + +int main(int argc, char** argv) { + using tt::constants::TILE_HEIGHT; + using tt::constants::TILE_WIDTH; + // Initialize constants used to define the workload + constexpr uint32_t ADD_OP_ID = 0; + constexpr uint32_t MULTIPLY_OP_ID = 1; + constexpr uint32_t SUBTRACT_OP_ID = 2; + // Create a 2x4 MeshDevice with 2 MeshCQs, 16MB allocated to the trace region and Ethernet Dispatch enabled + auto mesh_device = MeshDevice::create( + MeshDeviceConfig{.mesh_shape = MeshShape(2, 4)}, // Shape of MeshDevice + 0, // l1 small size + 16 << 20, // trace region size + 2, // num MeshCQs + DispatchCoreType::ETH /* Dispatch Configuration: 8 Chip Wormhole systems can only support 2 MeshCQs when Ethernet Dispatch is enabled */); + + // Initialize command queue ids used for data movement and workload dispatch + constexpr uint8_t data_movement_cq_id = 1; + constexpr uint8_t workload_cq_id = 0; + auto data_movement_cq = mesh_device->mesh_command_queue(data_movement_cq_id); + auto workload_cq = mesh_device->mesh_command_queue(workload_cq_id); + + // =========== Step 1: Initialize and load two SubDevices =========== + // Each SubDevice contains a single core. This SubDevice configuration is loaded on each physical device + // in the Virtual Mesh + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {0, 0}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(CoreRange({1, 1}, {1, 1}))}); + auto sub_device_manager = mesh_device->create_sub_device_manager( + {sub_device_1, sub_device_2}, 3200 /* size of L1 region allocated for the SubDevices */); + mesh_device->load_sub_device_manager(sub_device_manager); + + // =========== Step 2: Initialize IO Buffers and Workload parameters =========== + uint32_t single_tile_size = sizeof(bfloat16) * TILE_HEIGHT * TILE_WIDTH; // Using bfloat16 in this example + uint32_t num_tiles_per_device = 2048; // Number of tiles sent to each physical device + uint32_t num_tiles_in_mesh = + num_tiles_per_device * mesh_device->num_devices(); // The total number of tiles in the distributed memory space + + // Specify data layout in distributed memory space - Data will be sharded in row major order across the Virtual Mesh + tt::tt_metal::distributed::ShardedBufferConfig global_buffer_config{ + .global_size = single_tile_size * num_tiles_in_mesh, // Total size of the sharded buffer + .global_buffer_shape = + {num_tiles_in_mesh * TILE_WIDTH, TILE_HEIGHT}, // Data represents horizontally concatenated tiles + .shard_shape = {num_tiles_per_device * TILE_WIDTH, TILE_HEIGHT}, // Row major sharding + .shard_orientation = ShardOrientation::ROW_MAJOR // Row major sharding + }; + // Specify data layout on a single physical device + DeviceLocalBufferConfig per_device_buffer_config{ + .page_size = single_tile_size, + .buffer_type = tt_metal::BufferType::DRAM, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = true}; + // Allocate buffers in distributed memory space 
for first MeshWorkload + auto add_src0_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto add_src1_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto add_output_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + // Allocate buffers in distributed memory space for second MeshWorkload + auto mul_sub_src0_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto mul_sub_src1_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto mul_sub_output_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + + // =========== Step 3: Create Workloads to run on the Virtual Mesh =========== + // Specify Device Ranges on which the Workloads will run + LogicalDeviceRange all_devices({0, 0}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); + LogicalDeviceRange top_row({0, 0}, {mesh_device->num_cols() - 1, 0}); + LogicalDeviceRange bottom_row( + {0, mesh_device->num_rows() - 1}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); + // Create three eltwise binary ops using a simple program generation function + auto add_program = EltwiseBinaryProgramGenerator( + add_src0_buf, + add_src1_buf, + add_output_buf, + sub_device_1, // Addition runs on the first SubDevice + num_tiles_per_device, + single_tile_size, + ADD_OP_ID); + auto multiply_program = EltwiseBinaryProgramGenerator( + mul_sub_src0_buf, + mul_sub_src1_buf, + mul_sub_output_buf, + sub_device_2, // Multiplication runs on the second SubDevice + num_tiles_per_device, + single_tile_size, + MULTIPLY_OP_ID); + auto subtract_program = EltwiseBinaryProgramGenerator( + mul_sub_src0_buf, + mul_sub_src1_buf, + mul_sub_output_buf, + sub_device_2, // Subtraction runs on the second SubDevice + num_tiles_per_device, + single_tile_size, + SUBTRACT_OP_ID); + // Create MeshWorkloads and add programs to them. A MeshWorkload allows a program to target + // multiple Physical Devices in the Virtual Mesh. 
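+    // The first MeshWorkload below is homogeneous: the same addition program is dispatched to every physical device
+    // in the Virtual Mesh (on SubDevice 1). The second is heterogeneous: the top row of the mesh runs the
+    // multiplication program while the bottom row runs the subtraction program (both on SubDevice 2).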
+ auto add_mesh_workload = CreateMeshWorkload(); + auto multiply_and_subtract_mesh_workload = CreateMeshWorkload(); + AddProgramToMeshWorkload( + add_mesh_workload, *add_program, all_devices); // Addition runs on the full grid (sub_device 1) + AddProgramToMeshWorkload( + multiply_and_subtract_mesh_workload, + *multiply_program, + top_row); // Multiplication runs on the top row (sub_device 2) + AddProgramToMeshWorkload( + multiply_and_subtract_mesh_workload, + *subtract_program, + bottom_row); // Subtraction runs on the bottom row (sub device 2) + + // =========== Step 4: Compile and Load Workloads on the Mesh =========== + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), add_mesh_workload, true); + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), multiply_and_subtract_mesh_workload, true); + // =========== Step 5: Trace the MeshWorkloads using the Workload Dispatch CQ =========== + auto trace_id = BeginTraceCapture(mesh_device.get(), workload_cq_id); + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), add_mesh_workload, false); + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), multiply_and_subtract_mesh_workload, false); + EndTraceCapture(mesh_device.get(), workload_cq_id, trace_id); + + // =========== Step 6: Populate inputs =========== + uint32_t workload_0_src0_val = 2; + uint32_t workload_0_src1_val = 3; + uint32_t workload_1_src0_val = 7; + uint32_t workload_1_src1_val = 5; + // Uniform values passed to the add operation + std::vector add_src0_vec = create_constant_vector_of_bfloat16(add_src0_buf->size(), workload_0_src0_val); + std::vector add_src1_vec = create_constant_vector_of_bfloat16(add_src1_buf->size(), workload_0_src1_val); + // Uniform values passed to the multiply and subtract operations (the top row runs multiplication with subtraction + // on the bottom row of the Virtual Mesh) + std::vector mul_sub_src0_vec = + create_constant_vector_of_bfloat16(mul_sub_src0_buf->size(), workload_1_src0_val); + std::vector mul_sub_src1_vec = + create_constant_vector_of_bfloat16(mul_sub_src1_buf->size(), workload_1_src1_val); + + // =========== Step 7: Write inputs on MeshCQ1 =========== + // IO is done through MeshCQ1 and Workload dispatch is done through MeshCQ0. Use MeshEvents to synchronize the + // independent MeshCQs. 
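+    // The record/wait pairs below enforce the required ordering across the two MeshCQs: the input writes issued on
+    // MeshCQ1 must land before the trace replays on MeshCQ0, and the trace must finish before the output reads are
+    // issued back on MeshCQ1.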
+ std::shared_ptr write_event = std::make_shared(); + std::shared_ptr trace_event = std::make_shared(); + + EnqueueWriteMeshBuffer(data_movement_cq, add_src0_buf, add_src0_vec); + EnqueueWriteMeshBuffer(data_movement_cq, add_src1_buf, add_src1_vec); + EnqueueWriteMeshBuffer(data_movement_cq, mul_sub_src0_buf, mul_sub_src0_vec); + EnqueueWriteMeshBuffer(data_movement_cq, mul_sub_src1_buf, mul_sub_src1_vec); + // Synchronize + EnqueueRecordEvent(data_movement_cq, write_event); + EnqueueWaitForEvent(workload_cq, write_event); + // =========== Step 8: Run MeshTrace on MeshCQ0 =========== + ReplayTrace(mesh_device.get(), workload_cq_id, trace_id, false); + // Synchronize + EnqueueRecordEvent(workload_cq, trace_event); + EnqueueWaitForEvent(data_movement_cq, trace_event); + // =========== Step 9: Read Outputs on MeshCQ1 =========== + std::vector add_dst_vec = {}; + std::vector mul_sub_dst_vec = {}; + EnqueueReadMeshBuffer(data_movement_cq, add_dst_vec, add_output_buf); + EnqueueReadMeshBuffer(data_movement_cq, mul_sub_dst_vec, mul_sub_output_buf); + + // =========== Step 10: Verify Outputs =========== + bool pass = true; + for (int i = 0; i < add_dst_vec.size(); i++) { + pass &= (add_dst_vec[i].to_float() == workload_0_src0_val + workload_0_src1_val); + } + for (int i = 0; i < mul_sub_dst_vec.size(); i++) { + if (i < mul_sub_dst_vec.size() / 2) { + pass &= (mul_sub_dst_vec[i].to_float() == workload_1_src0_val * workload_1_src1_val); + } else { + pass &= (mul_sub_dst_vec[i].to_float() == workload_1_src0_val - workload_1_src1_val); + } + } + ReleaseTrace(mesh_device.get(), trace_id); + if (pass) { + std::cout << "Running EltwiseBinary MeshTraces on 2 MeshCQs Passed!" << std::endl; + return 0; + } else { + std::cout << "Running EltwiseBinary MeshTraces on 2 MeshCQs Failed with Incorrect Outputs!" 
<< std::endl; + return 1; + } +} diff --git a/tt_metal/programming_examples/distributed/CMakeLists.txt b/tt_metal/programming_examples/distributed/CMakeLists.txt index e887109662d..7dcd7fc8583 100644 --- a/tt_metal/programming_examples/distributed/CMakeLists.txt +++ b/tt_metal/programming_examples/distributed/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(1_distributed_program_dispatch) add_subdirectory(2_distributed_buffer_rw) add_subdirectory(3_distributed_eltwise_add) +add_subdirectory(4_distributed_trace_and_events) From 29650dd6962728d31f012124c2fb77ad6a2d93f7 Mon Sep 17 00:00:00 2001 From: asaigal Date: Sat, 22 Feb 2025 17:46:01 -0800 Subject: [PATCH 241/316] #0: Resolve clang-tidy errors in distributed programming example --- .../distributed_trace_and_events.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp index c438e65dcb3..f64154f3c74 100644 --- a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp +++ b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp @@ -25,9 +25,9 @@ using namespace tt::tt_metal::distributed; // synchronization std::shared_ptr EltwiseBinaryProgramGenerator( - std::shared_ptr src0_buf, - std::shared_ptr src1_buf, - std::shared_ptr output_buf, + const std::shared_ptr& src0_buf, + const std::shared_ptr& src1_buf, + const std::shared_ptr& output_buf, const SubDevice& sub_device_for_program, uint32_t num_tiles, uint32_t single_tile_size, From fc42103f31b0155edd8f55c299ef640dc72ce404 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Fri, 21 Feb 2025 23:05:28 +0000 Subject: [PATCH 242/316] #18184: Add low latency routing mode to EDM This encodes the full mcast/unicast path in the packet header to simplify decoding on the routers --- .../gtests/ccl/kernels/edm_fabric_writer.cpp | 21 +- ...c_erisc_datamover_sender_worker_reader.cpp | 2 +- ...c_erisc_datamover_sender_worker_sender.cpp | 6 +- .../fabric_worker_sender_multi_input.cpp | 6 +- .../ccl/kernels/test_kernels.common.hpp | 7 +- .../kernel_common/kernel_writers.hpp | 12 +- .../ccl/common/kernels/ccl_send_reader.cpp | 2 +- .../kernels/ccl_send_reader_two_input.cpp | 22 +- .../ccl/common/kernels/ccl_send_utils.hpp | 20 +- .../ccl/common/kernels/ccl_send_writer.cpp | 4 +- .../edm_fabric/edm_fabric_worker_adapters.hpp | 12 +- .../edm_fabric/fabric_edm_packet_header.hpp | 207 +++++++++++++++++- .../fabric_edm_packet_header_validate.hpp | 5 + .../fabric_edm_packet_transmission.hpp | 33 ++- .../edm_fabric/fabric_erisc_datamover.cpp | 89 +++++--- .../fabric_erisc_datamover_channels.hpp | 8 +- .../interleaved_dim3_1_1_32_any_writer.cpp | 14 +- .../llama_post_binary_matmul_shape_writer.cpp | 14 +- .../device/kernels/minimal_ccl_common.hpp | 8 +- 19 files changed, 371 insertions(+), 121 deletions(-) diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp index 91fe40d181e..c22ae1d57f3 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp @@ -18,8 +18,8 @@ static constexpr bool enable_any_synchronization = enable_start_synchronization FORCE_INLINE void line_sync( FabricConnectionManager& fabric_connection, - 
volatile tt::fabric::PacketHeader* mcast_fwd_packet_header, - volatile tt::fabric::PacketHeader* mcast_bwd_packet_header, + volatile PACKET_HEADER_TYPE* mcast_fwd_packet_header, + volatile PACKET_HEADER_TYPE* mcast_bwd_packet_header, size_t sync_bank_addr, size_t sync_noc_x, size_t sync_noc_y, @@ -33,7 +33,7 @@ FORCE_INLINE void line_sync( fabric_connection.get_forward_connection().wait_for_empty_write_slot(); print_pkt_header(mcast_fwd_packet_header); fabric_connection.get_forward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_fwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_fwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -41,7 +41,7 @@ FORCE_INLINE void line_sync( fabric_connection.get_backward_connection().wait_for_empty_write_slot(); print_pkt_header(mcast_bwd_packet_header); fabric_connection.get_backward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_bwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_bwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } noc_semaphore_inc(get_noc_addr(sync_noc_x, sync_noc_y, sync_bank_addr), 1); if (sync_noc_x == my_x[0] && sync_noc_y == my_y[0]) { @@ -98,11 +98,11 @@ void kernel_main() { const auto source_l1_buffer_address = get_write_ptr(source_l1_cb_index); const auto packet_header_buffer_address = get_write_ptr(packet_header_cb); - auto* mcast_fwd_packet_header = reinterpret_cast(packet_header_buffer_address); + auto* mcast_fwd_packet_header = reinterpret_cast(packet_header_buffer_address); auto* mcast_bwd_packet_header = - reinterpret_cast(packet_header_buffer_address + sizeof(tt::fabric::PacketHeader)); + reinterpret_cast(packet_header_buffer_address + sizeof(PACKET_HEADER_TYPE)); auto* unicast_packet_header = - reinterpret_cast(packet_header_buffer_address + sizeof(tt::fabric::PacketHeader) * 2); + reinterpret_cast(packet_header_buffer_address + sizeof(PACKET_HEADER_TYPE) * 2); mcast_fwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_fwd_hops)}); mcast_bwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_bwd_hops)}); @@ -146,7 +146,7 @@ void kernel_main() { fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_fwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_fwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -157,7 +157,7 @@ void kernel_main() { fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_bwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_bwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } { noc_async_writes_flushed(); @@ -174,8 +174,7 @@ void kernel_main() { fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); - fabric_conn.send_payload_blocking_from_address( - (uint32_t)unicast_packet_header, sizeof(tt::fabric::PacketHeader)); + 
fabric_conn.send_payload_blocking_from_address((uint32_t)unicast_packet_header, sizeof(PACKET_HEADER_TYPE)); } if (enable_finish_synchronization) { diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp index 976f579ab4d..46c421049f0 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp @@ -30,7 +30,7 @@ void kernel_main() { uint32_t pages_to_read = std::min(pages_per_edm_buffer, num_pages_to_read_total - num_pages_read); cb_reserve_back(cb_id_in0, pages_to_read); uint32_t local_l1_read_addr = get_write_ptr(cb_id_in0); - local_l1_read_addr += sizeof(tt::fabric::PacketHeader); + local_l1_read_addr += sizeof(PACKET_HEADER_TYPE); for (uint32_t p = 0; p < pages_to_read; ++p) { uint64_t src_noc_addr = get_noc_addr(num_pages_read + p, source_address_generator); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index b210f32efb5..7bc4ad00b90 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -122,9 +122,9 @@ void kernel_main() { // bit of a hack to extract X/Y const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); + const size_t packet_size = page_size + sizeof(PACKET_HEADER_TYPE); auto packet_addr = get_read_ptr(cb_id_in0); - auto* packet_header = reinterpret_cast(packet_addr); + auto* packet_header = reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header ->to_chip_multicast( @@ -145,7 +145,7 @@ void kernel_main() { if constexpr (!mcast_mode) { sender.wait_for_empty_write_slot(); - auto& packet_header = *reinterpret_cast(a_packet_header_addr); + auto& packet_header = *reinterpret_cast(a_packet_header_addr); ASSERT(*last_message_semaphore_address == 0); uint64_t last_message_semaphore_noc0_addr = safe_get_noc_addr(my_x[0], my_y[0], (uint32_t)last_message_semaphore_address, 0); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp index eaa14a0e40f..23b9789b998 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp @@ -52,10 +52,10 @@ auto forward_to_fabric_from_cb( // bit of a hack to extract X/Y const auto noc0_dest_address = get_noc_addr(current_page, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); + const size_t packet_size = page_size + sizeof(PACKET_HEADER_TYPE); auto packet_addr = get_read_ptr(cb_id); - auto &packet_header = *reinterpret_cast(packet_addr); + auto& packet_header = *reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) @@ -182,7 +182,7 @@ void kernel_main() { sender.wait_for_empty_write_slot(); constexpr size_t kLoopbackNumHopsToMyChip = 2; - auto &packet_header = 
*reinterpret_cast(a_packet_header_addr); + auto& packet_header = *reinterpret_cast(a_packet_header_addr); ASSERT(*last_message_semaphore_address == 0); packet_header.reserved = 0xE; packet_header.reserved2 = 0xFFFF; diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp index ae5e9135a2b..8f5287ee0d7 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp @@ -29,13 +29,14 @@ bool terminate_fabric_endpoints_farthest_to_nearest ( get_noc_addr(edm_noc_x, edm_noc_y, termination_addr), tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE); } else { - auto &packet_header = *reinterpret_cast(a_packet_header_addr); - reinterpret_cast(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; + auto& packet_header = *reinterpret_cast(a_packet_header_addr); + reinterpret_cast(a_packet_header_addr)[sizeof(PACKET_HEADER_TYPE) >> 2] = + tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; sender.wait_for_empty_write_slot(); packet_header.to_chip_unicast(static_cast(distance)) .to_noc_unicast_write( tt::fabric::NocUnicastCommandHeader{termination_sig_noc_addr}, - sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)); + sizeof(PACKET_HEADER_TYPE) + sizeof(uint32_t)); sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header()); noc_async_writes_flushed(); } diff --git a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp index fd6bae7f5ee..aa8fd3f04f0 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp @@ -28,7 +28,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( uint32_t payload_size_bytes) { const size_t payload_l1_address = l1_read_addr; - auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); + auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); #ifdef DEBUG_PRINT_ENABLED pkt_hdr->reserved2 = my_chip_id; #endif @@ -44,7 +44,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( pkt_hdr->to_chip_unicast(unicast_args.distance_in_hops); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); - fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } break; case ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { noc_async_write( @@ -57,7 +57,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -67,7 +67,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( 
fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } } break; default: { @@ -87,8 +87,8 @@ FORCE_INLINE void write_payload_then_advance_read_address( size_t& l1_read_addr, size_t payload_size_bytes) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(tt::fabric::PacketHeader) is not a power of two which violates the below assertion"); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: [[fallthrough]]; diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp index bb62676afbf..172222d7abf 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp @@ -162,7 +162,7 @@ void kernel_main() { for (uint32_t p = 0; p < command_tensor.worker_pages_per_slice; p += packet_size_in_pages) { cb_reserve_back(cb_id, packet_size_in_pages); const uint32_t local_l1_scratch_buffer_address = - get_write_ptr(cb_id) + sizeof(tt::fabric::PacketHeader); + get_write_ptr(cb_id) + sizeof(PACKET_HEADER_TYPE); uint32_t n_pages = std::min(packet_size_in_pages, command_tensor.worker_pages_per_slice - p); ASSERT(command_tensor.worker_start_offset_in_slice.w == 0); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp index 731ed70359e..8107d2d992e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp @@ -437,7 +437,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) ASSERT(cmd_ctx.core_desc_type == ttnn::ccl::cmd::CclCommandCoreDescriptorType::NOC_XY); ASSERT(cmd_ctx.packet_header_buffer_addr != 0); - auto* pkt_hdr = reinterpret_cast(cmd_ctx.packet_header_buffer_addr); + auto* pkt_hdr = reinterpret_cast(cmd_ctx.packet_header_buffer_addr); uint64_t dest_noc_addr_for_pkt = safe_get_noc_addr(dest_noc0_x, dest_noc0_y, dest_bank_addr, 0); if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) { @@ -457,7 +457,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) : cmd_ctx.fabric_connection.get_backward_connection(); fabric_connection.wait_for_empty_write_slot(); fabric_connection.send_payload_flush_blocking_from_address( - cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); + cmd_ctx.packet_header_buffer_addr, sizeof(PACKET_HEADER_TYPE)); } break; case ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { write_local = true; @@ -467,7 +467,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) 1, static_cast(mcast_args.num_targets_forward_direction)}); cmd_ctx.fabric_connection.get_forward_connection().wait_for_empty_write_slot(); cmd_ctx.fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); + 
cmd_ctx.packet_header_buffer_addr, sizeof(PACKET_HEADER_TYPE)); } // Write the mcast packet (backward) @@ -476,7 +476,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) 1, static_cast(mcast_args.num_targets_backward_direction)}); cmd_ctx.fabric_connection.get_backward_connection().wait_for_empty_write_slot(); cmd_ctx.fabric_connection.get_backward_connection().send_payload_non_blocking_from_address( - cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); + cmd_ctx.packet_header_buffer_addr, sizeof(PACKET_HEADER_TYPE)); } } break; @@ -559,7 +559,7 @@ void write_and_advance_local_read_address_for_fabric_write( uint32_t payload_size_bytes) { const size_t payload_l1_address = l1_read_addr; - auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); + auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); @@ -573,7 +573,7 @@ void write_and_advance_local_read_address_for_fabric_write( fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); - fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } break; case ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_noc_addr); @@ -588,7 +588,7 @@ void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -598,7 +598,7 @@ void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } } break; default: { @@ -618,8 +618,8 @@ FORCE_INLINE void write_payload_then_advance_read_address( size_t& l1_read_addr, size_t payload_size_bytes) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(PACKET_HEADER_TYPE) is not a power of two which violates the below assertion"); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: [[fallthrough]]; @@ -933,7 +933,7 @@ void kernel_main() { cb_reserve_back(reserved_packet_header_cb_id, num_packet_headers_storable); auto packet_header_buffer_addr0 = get_write_ptr(reserved_packet_header_cb_id); auto packet_header_buffer_addr1 = - packet_header_buffer_addr0 + (num_packet_headers_storable >> 2) * sizeof(tt::fabric::PacketHeader); + packet_header_buffer_addr0 + (num_packet_headers_storable >> 2) * sizeof(PACKET_HEADER_TYPE); auto operand_0_cmd_ctx = command_context_t( fabric_connection, diff --git 
a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp index 904cd775a9a..decb79c8070 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp @@ -96,7 +96,7 @@ void mcast_contig_pages_to_noc_address( size_t backward_direction_num_hops) { const size_t payload_size_bytes = contig_pages_advanced * payload_page_size; const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_addr); - const size_t payload_l1_address = l1_read_addr + sizeof(tt::fabric::PacketHeader); + const size_t payload_l1_address = l1_read_addr + sizeof(PACKET_HEADER_TYPE); // Local chip write noc_async_write( @@ -106,15 +106,15 @@ void mcast_contig_pages_to_noc_address( // coords it is necessary get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr, noc_index), payload_size_bytes); - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); + size_t packet_send_size_bytes = payload_size_bytes + sizeof(PACKET_HEADER_TYPE); // Forward fabric connection if (has_forward_fabric_connection) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(tt::fabric::PacketHeader) is not a power of two which violates the below assertion"); - auto& pkt_hdr = *reinterpret_cast(l1_read_addr); + auto& pkt_hdr = *reinterpret_cast(l1_read_addr); pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(forward_direction_num_hops)}) @@ -125,7 +125,7 @@ void mcast_contig_pages_to_noc_address( // Backward fabric connection if (has_backward_fabric_connection) { - auto& pkt_hdr = *reinterpret_cast(l1_read_addr); + auto& pkt_hdr = *reinterpret_cast(l1_read_addr); pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(backward_direction_num_hops)}) @@ -286,11 +286,11 @@ void mcast_sync_signal_to_addr( size_t remote_sem_l1_addr, size_t directional_num_hops) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); - ASSERT((pkt_addr & (sizeof(tt::fabric::PacketHeader) - 1)) == 0); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(tt::fabric::PacketHeader) is not a power of two which violates the below assertion"); + ASSERT((pkt_addr & (sizeof(PACKET_HEADER_TYPE) - 1)) == 0); - auto& pkt_hdr = *reinterpret_cast(pkt_addr); + auto& pkt_hdr = *reinterpret_cast(pkt_addr); pkt_hdr .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{1, static_cast(directional_num_hops)}) .to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp index 71865c224e5..766cdd0b688 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp @@ -125,7 +125,7 @@ void kernel_main() { // out when we start enabling other modes const size_t packet_size_in_pages = get_arg_val(arg_idx++); const size_t payload_page_size = get_arg_val(arg_idx++); - const size_t l1_scratch_page_size = payload_page_size + 
sizeof(tt::fabric::PacketHeader); + const size_t l1_scratch_page_size = payload_page_size + sizeof(PACKET_HEADER_TYPE); const size_t forward_direction_num_hops = get_arg_val(arg_idx++); const size_t backward_direction_num_hops = get_arg_val(arg_idx++); const bool has_forward_fabric_connection = get_arg_val(arg_idx++) != 0; @@ -248,7 +248,7 @@ void kernel_main() { DPRINT << "ccl_send_writer Sending payload completion sync signals\n"; ASSERT(some_buffering_addr != 0); some_buffering_addr = - (some_buffering_addr + (sizeof(tt::fabric::PacketHeader))) & ~(sizeof(tt::fabric::PacketHeader) - 1); + (some_buffering_addr + (sizeof(PACKET_HEADER_TYPE))) & ~(sizeof(PACKET_HEADER_TYPE) - 1); mcast_sync_signal_to_addr( some_buffering_addr, diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp index 564ed163999..87ba5ea5fba 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp @@ -326,7 +326,7 @@ struct WorkerToFabricEdmSenderImpl { FORCE_INLINE void send_packet_header_and_notify_fabric(uint32_t source_address) { uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); - send_chunk_from_address(source_address, 1, sizeof(tt::fabric::PacketHeader), buffer_address); + send_chunk_from_address(source_address, 1, sizeof(PACKET_HEADER_TYPE), buffer_address); post_send_payload_increment_pointers(); } @@ -335,23 +335,23 @@ struct WorkerToFabricEdmSenderImpl { uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); // skip past the first part of the buffer which will be occupied by the packet header - send_chunk_from_address(source_address, 1, size_bytes, buffer_address + sizeof(tt::fabric::PacketHeader)); + send_chunk_from_address(source_address, 1, size_bytes, buffer_address + sizeof(PACKET_HEADER_TYPE)); } template FORCE_INLINE void send_payload_from_address_impl(uint32_t source_address, size_t size_bytes) { uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); ASSERT(size_bytes <= this->buffer_size_bytes); - ASSERT(tt::fabric::is_valid(*const_cast( - reinterpret_cast(source_address)))); + ASSERT(tt::fabric::is_valid(*const_cast( + reinterpret_cast(source_address)))); send_chunk_from_address(source_address, 1, size_bytes, buffer_address); post_send_payload_increment_pointers(); } template FORCE_INLINE void send_payload_from_address_with_trid_impl(uint32_t source_address, size_t size_bytes, uint8_t trid) { ASSERT(size_bytes <= this->buffer_size_bytes); - ASSERT(tt::fabric::is_valid(*const_cast( - reinterpret_cast(source_address)))); + ASSERT(tt::fabric::is_valid(*const_cast( + reinterpret_cast(source_address)))); send_chunk_from_address_with_trid(source_address, 1, size_bytes, this->edm_buffer_addr, trid, this->edm_noc_cmd_buf); post_send_payload_increment_pointers(); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index af3c53f27b5..c6ba0fe24e0 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -89,11 +89,11 @@ struct NocMulticastAtomicIncCommandHeader { uint8_t size_x; uint8_t size_y; }; -static_assert(sizeof(NocUnicastCommandHeader) == 8, "NocUnicastCommandHeader size is not 1 byte"); 
-static_assert(sizeof(NocMulticastCommandHeader) == 8, "NocMulticastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocUnicastInlineWriteCommandHeader) == 16, "NocMulticastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastCommandHeader) == 8, "NocUnicastCommandHeader size is not 8 bytes"); +static_assert(sizeof(NocMulticastCommandHeader) == 8, "NocMulticastCommandHeader size is not 8 bytes"); +static_assert(sizeof(NocUnicastInlineWriteCommandHeader) == 16, "NocMulticastCommandHeader size is not 16 bytes"); +static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 16 bytes"); +static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 12 bytes"); union NocCommandFields{ NocUnicastCommandHeader unicast_write; NocUnicastInlineWriteCommandHeader unicast_inline_write; @@ -251,11 +251,208 @@ struct PacketHeader { inline void set_src_ch_id(uint8_t ch_id) volatile { this->src_ch_id = ch_id; } }; +struct LowLatencyRoutingFields { + static constexpr uint32_t FIELD_WIDTH = 2; + static constexpr uint32_t FIELD_MASK = 0b11; + static constexpr uint32_t NOOP = 0b00; + static constexpr uint32_t WRITE_ONLY = 0b01; + static constexpr uint32_t FORWARD_ONLY = 0b10; + static constexpr uint32_t WRITE_AND_FORWARD = 0b11; + static constexpr uint32_t FWD_ONLY_FIELD = 0xAAAAAAAA; + static constexpr uint32_t WR_AND_FWD_FIELD = 0xFFFFFFFF; + uint32_t value; +}; + +// TODO: wrap this in a debug version that holds type info so we can assert for field/command/ +struct LowLatencyPacketHeader { + // TODO: trim this down noc_send_type 2 bits (4 values): + // -> unicast_write, mcast_write, unicast_seminc, mcast_seminc + // For now, kept it separate so I could do reads which would be handled differently + // but for our purposes we shouldn't need read so we should be able to omit the support + NocSendType noc_send_type : 4; + + // Used only by the EDM sender and receiver channels. Populated by EDM sender channel to + // indicate to the receiver channel what channel was the source of this packet. Reserved + // otherwise. + uint8_t src_ch_id : 4; + + LowLatencyRoutingFields routing_fields; + uint16_t payload_size_bytes; // excludes header size + NocCommandFields command_fields; // size = 16B due to uint64_t alignment + + // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned + // To simplify worker kernel code, we for now decide to pad up the packet header + // to 32B so the user can simplify shift into their CB chunk by sizeof(tt::fabric::PacketHeader) + // and automatically work around the DRAM read alignment bug. + // + // Future changes will remove this padding and require the worker kernel to be aware of this bug + // and pad their own CBs conditionally when reading from DRAM. It'll be up to the users to + // manage this complexity. 
+ + inline void set_noc_send_type(NocSendType &type) { this->noc_send_type = type; } + inline void set_routing_fields(LowLatencyRoutingFields &fields) { this->routing_fields = fields; } + inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } + + // Returns size of payload in bytes - TODO: convert to words (4B) + size_t get_payload_size_excluding_header() volatile const { + return this->payload_size_bytes; + } + inline size_t get_payload_size_including_header() volatile const { + return get_payload_size_excluding_header() + sizeof(LowLatencyPacketHeader); + } + + inline LowLatencyPacketHeader& to_chip_unicast(uint8_t distance_in_hops) { + // Example of unicast 3 hops away + // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) + // Last line will do 0b01 << 4 = 0b010000. This means that on the 3rd chip, we will write only + // Together this means the final encoding is 0b011010 + this->routing_fields.value = + (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | + (LowLatencyRoutingFields::WRITE_ONLY << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH); + return *this; + } + inline LowLatencyPacketHeader& to_chip_multicast( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { + + // Example of starting 3 hops away mcasting to 2 chips + // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) + // Second line will do 0xFFFFFFFF & 0b11 = 0b11. 0b11 << 4 = 0b110000. This means starting from the 3rd chip, we will write and forward once + // Last line will do 0b01 << 6 = 0b01000000. 
This means that on the 5th chip, we will write only + // Together this means the final encoding is 0b01111010 + this->routing_fields.value = + (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | + (LowLatencyRoutingFields::WR_AND_FWD_FIELD & ((1 << (chip_multicast_command_header.range_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1) << + ((chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH)) | + (LowLatencyRoutingFields::WRITE_ONLY << (chip_multicast_command_header.start_distance_in_hops + chip_multicast_command_header.range_hops - 2) * LowLatencyRoutingFields::FIELD_WIDTH); + return *this; + } + + inline LowLatencyPacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { + this->noc_send_type = NOC_UNICAST_WRITE; + this->command_fields.unicast_write = noc_unicast_command_header; + this->payload_size_bytes = payload_size_bytes; + return *this; + } + inline LowLatencyPacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { + this->noc_send_type = NOC_UNICAST_INLINE_WRITE; + this->command_fields.unicast_inline_write = noc_unicast_command_header; + this->payload_size_bytes = 0; + return *this; + } + inline LowLatencyPacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { + this->noc_send_type = NOC_MULTICAST_WRITE; + this->command_fields.mcast_write = noc_multicast_command_header; + this->payload_size_bytes = payload_size_bytes; + return *this; + } + inline LowLatencyPacketHeader &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { + this->noc_send_type = NOC_UNICAST_ATOMIC_INC; + this->command_fields.unicast_seminc = noc_unicast_atomic_inc_command_header; + this->payload_size_bytes = 0; + return *this; + } + inline LowLatencyPacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { + #if defined(KERNEL_BUILD) || defined(FW_BUILD) + ASSERT(false); + while (1) {}; + #endif + this->payload_size_bytes = payload_size_bytes; + return *this; + } + + inline volatile LowLatencyPacketHeader* to_chip_unicast(uint8_t distance_in_hops) volatile { + // Example of unicast 3 hops away + // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) + // Last line will do 0b01 << 4 = 0b010000. This means that on the 3rd chip, we will write only + // Together this means the final encoding is 0b011010 + this->routing_fields.value = + (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | + (LowLatencyRoutingFields::WRITE_ONLY << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH); + return this; + } + inline volatile LowLatencyPacketHeader* to_chip_multicast( + const MulticastRoutingCommandHeader& chip_multicast_command_header) volatile { + // Example of starting 3 hops away mcasting to 2 chips + // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) + // Second line will do 0xFFFFFFFF & 0b11 = 0b11. 0b11 << 4 = 0b110000. 
This means starting from the 3rd chip, we will write and forward once + // Last line will do 0b01 << 6 = 0b01000000. This means that on the 5th chip, we will write only + // Together this means the final encoding is 0b01111010 + this->routing_fields.value = + (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | + (LowLatencyRoutingFields::WR_AND_FWD_FIELD & ((1 << (chip_multicast_command_header.range_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1) << + ((chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH)) | + (LowLatencyRoutingFields::WRITE_ONLY << (chip_multicast_command_header.start_distance_in_hops + chip_multicast_command_header.range_hops - 2) * LowLatencyRoutingFields::FIELD_WIDTH); + return this; + } + inline volatile LowLatencyPacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { + this->noc_send_type = NOC_UNICAST_WRITE; + this->command_fields.unicast_write.noc_address = noc_unicast_command_header.noc_address; + this->payload_size_bytes = payload_size_bytes; + + return this; + } + inline volatile LowLatencyPacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { + this->noc_send_type = NOC_UNICAST_INLINE_WRITE; + this->command_fields.unicast_inline_write.noc_address = noc_unicast_command_header.noc_address; + this->command_fields.unicast_inline_write.value = noc_unicast_command_header.value; + this->payload_size_bytes = 0; + return *this; + } + inline volatile LowLatencyPacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { + this->noc_send_type = NOC_MULTICAST_WRITE; + this->command_fields.mcast_write.mcast_rect_size_x = noc_multicast_command_header.mcast_rect_size_x; + this->command_fields.mcast_write.mcast_rect_size_y = noc_multicast_command_header.mcast_rect_size_y; + this->command_fields.mcast_write.noc_x_start = noc_multicast_command_header.noc_x_start; + this->command_fields.mcast_write.noc_y_start = noc_multicast_command_header.noc_y_start; + this->payload_size_bytes = payload_size_bytes; + this->command_fields.mcast_write.address = noc_multicast_command_header.address; + + return this; + } + inline volatile LowLatencyPacketHeader *to_noc_unicast_atomic_inc( + NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) volatile { + this->noc_send_type = NOC_UNICAST_ATOMIC_INC; + this->command_fields.unicast_seminc.noc_address = noc_unicast_atomic_inc_command_header.noc_address; + this->command_fields.unicast_seminc.val = noc_unicast_atomic_inc_command_header.val; + this->command_fields.unicast_seminc.wrap = noc_unicast_atomic_inc_command_header.wrap; + this->payload_size_bytes = 0; + + return this; + } + inline volatile LowLatencyPacketHeader *to_noc_multicast_atomic_inc( + NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) volatile { + this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; + this->command_fields.mcast_seminc.address = noc_multicast_atomic_inc_command_header.address; + this->command_fields.mcast_seminc.noc_x_start = noc_multicast_atomic_inc_command_header.noc_x_start; + this->command_fields.mcast_seminc.noc_y_start = noc_multicast_atomic_inc_command_header.noc_y_start; + this->command_fields.mcast_seminc.size_x = 
noc_multicast_atomic_inc_command_header.size_x; + this->command_fields.mcast_seminc.size_y = noc_multicast_atomic_inc_command_header.size_y; + this->command_fields.mcast_seminc.val = noc_multicast_atomic_inc_command_header.val; + this->command_fields.mcast_seminc.wrap = noc_multicast_atomic_inc_command_header.wrap; + this->payload_size_bytes = payload_size_bytes; + + return this; + } + inline void set_src_ch_id(uint8_t ch_id) volatile { this->src_ch_id = ch_id; } +}; + // TODO: When we remove the 32B padding requirement, reduce to 16B size check static_assert(sizeof(PacketHeader) == 32, "sizeof(PacketHeader) is not equal to 32B"); +// Host code still hardcoded to sizeof(PacketHeader) so we need to keep this check +static_assert(sizeof(LowLatencyPacketHeader) == sizeof(PacketHeader), "sizeof(LowLatencyPacketHeader) is not equal to 32B"); static constexpr size_t header_size_bytes = sizeof(PacketHeader); +#define FABRIC_LOW_LATENCY_MODE 1 + +#if defined FABRIC_LOW_LATENCY_MODE and FABRIC_LOW_LATENCY_MODE == 1 +#define PACKET_HEADER_TYPE tt::fabric::LowLatencyPacketHeader +#define ROUTING_FIELDS_TYPE tt::fabric::LowLatencyRoutingFields +#else +#define PACKET_HEADER_TYPE tt::fabric::PacketHeader +#define ROUTING_FIELDS_TYPE tt::fabric::RoutingFields +#endif + } // namespace tt::fabric diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp index 2589c8f526a..a284320d4d1 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp @@ -16,4 +16,9 @@ FORCE_INLINE bool is_valid(PacketHeader const& packet_header) { return (packet_header.chip_send_type <= CHIP_SEND_TYPE_LAST) && (packet_header.noc_send_type <= NOC_SEND_TYPE_LAST); } +FORCE_INLINE void validate(const LowLatencyPacketHeader& packet_header) {} +FORCE_INLINE bool is_valid(const LowLatencyPacketHeader& packet_header) { + return (packet_header.noc_send_type <= NOC_SEND_TYPE_LAST); +} + } // namespace tt::fabric diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp index 85553bf6dab..5e8f59954c2 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp @@ -32,7 +32,13 @@ FORCE_INLINE void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader #endif } -FORCE_INLINE void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet_start) { +FORCE_INLINE void print_pkt_hdr_routing_fields(volatile tt::fabric::LowLatencyPacketHeader *const packet_start) { + #ifdef DEBUG_PRINT_ENABLED + DPRINT << "ROUTE:" << packet_start->routing_fields.value << "\n"; + #endif +} + +FORCE_INLINE void print_pkt_header_noc_fields(volatile PACKET_HEADER_TYPE *const packet_start) { #ifdef DEBUG_PRINT_ENABLED switch (packet_start->noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { @@ -62,12 +68,23 @@ FORCE_INLINE void print_pkt_header(volatile tt::fabric::PacketHeader *const pack #endif } +FORCE_INLINE void print_pkt_header(volatile tt::fabric::LowLatencyPacketHeader *const packet_start) { + #ifdef DEBUG_PRINT_ENABLED + auto const& header = *packet_start; + DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type 
<< + ", src_chip:" << (uint32_t) packet_start->src_ch_id << + ", payload_size_bytes:" << (uint32_t) packet_start->payload_size_bytes << "\n"; + print_pkt_hdr_routing_fields(packet_start); + print_pkt_header_noc_fields(packet_start); + #endif + } + // Since we unicast to local, we must omit the packet header FORCE_INLINE void execute_chip_unicast_to_local_chip( - volatile tt::fabric::PacketHeader *const packet_start, uint16_t payload_size_bytes, uint32_t transaction_id) { + volatile PACKET_HEADER_TYPE *const packet_start, uint16_t payload_size_bytes, uint32_t transaction_id) { auto const& header = *packet_start; - uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(tt::fabric::PacketHeader); + uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(PACKET_HEADER_TYPE); tt::fabric::NocSendType noc_send_type = packet_start->noc_send_type; switch (noc_send_type) { @@ -116,6 +133,10 @@ FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::PacketH packet_header->routing_fields.value = cached_routing_fields.value - decrement_val; } +FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::LowLatencyPacketHeader * packet_header, tt::fabric::LowLatencyRoutingFields cached_routing_fields) { + packet_header->routing_fields.value >>= tt::fabric::LowLatencyRoutingFields::FIELD_WIDTH; +} + // This function forwards a packet to the downstream EDM channel for eventual sending // to the next chip in the line/ring // @@ -127,9 +148,9 @@ FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::PacketH // !!!WARNING!!! template FORCE_INLINE void forward_payload_to_downstream_edm( - volatile tt::fabric::PacketHeader *packet_header, + volatile PACKET_HEADER_TYPE *packet_header, uint16_t payload_size_bytes, - tt::fabric::RoutingFields cached_routing_fields, + ROUTING_FIELDS_TYPE cached_routing_fields, tt::fabric::EdmToEdmSender &downstream_edm_interface, uint8_t transaction_id ) { @@ -141,6 +162,6 @@ FORCE_INLINE void forward_payload_to_downstream_edm( update_packet_header_for_next_hop(packet_header, cached_routing_fields); downstream_edm_interface.send_payload_non_blocking_from_address_with_trid( reinterpret_cast(packet_header), - payload_size_bytes + sizeof(tt::fabric::PacketHeader), + payload_size_bytes + sizeof(PACKET_HEADER_TYPE), transaction_id); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index be1ec45d50d..f80505d936d 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -433,18 +433,18 @@ struct ReceiverChannelPointers { }; struct PacketHeaderRecorder { - volatile tt::fabric::PacketHeader *buffer_ptr; + volatile uint32_t *buffer_ptr; size_t buffer_n_headers; size_t buffer_index; - PacketHeaderRecorder(volatile tt::fabric::PacketHeader *buffer_ptr, size_t buffer_n_headers) : buffer_ptr(buffer_ptr), buffer_n_headers(buffer_n_headers), buffer_index(0) {} + PacketHeaderRecorder(volatile uint32_t *buffer_ptr, size_t buffer_n_headers) : buffer_ptr(buffer_ptr), buffer_n_headers(buffer_n_headers), buffer_index(0) {} - void record_packet_header(volatile tt::fabric::PacketHeader *packet_header_ptr) { - uint32_t dest_l1_addr = (uint32_t)buffer_ptr + buffer_index * sizeof(tt::fabric::PacketHeader); + void record_packet_header(volatile uint32_t *packet_header_ptr) { + uint32_t dest_l1_addr = 
(uint32_t)buffer_ptr + buffer_index * sizeof(PACKET_HEADER_TYPE); noc_async_write( (uint32_t)packet_header_ptr, get_noc_addr(my_x[0], my_y[0], dest_l1_addr), - sizeof(tt::fabric::PacketHeader), + sizeof(PACKET_HEADER_TYPE), 1 - noc_index // avoid the contention on main noc ); buffer_index++; @@ -541,8 +541,8 @@ FORCE_INLINE void send_next_data( // NOTE: if we always send full packet, then we don't need the second branch below dedicated for // channel sync auto volatile *pkt_header = - reinterpret_cast(sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index)); - ASSERT(tt::fabric::is_valid(*const_cast(pkt_header))); + reinterpret_cast(sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index)); + ASSERT(tt::fabric::is_valid(*const_cast(pkt_header))); size_t payload_size_bytes = pkt_header->get_payload_size_including_header(); pkt_header->src_ch_id = sender_channel_index; @@ -582,7 +582,7 @@ FORCE_INLINE void receiver_send_received_ack( // Set the acknowledgement bits. We have a different location than the auto receiver_buffer_index = receiver_channel_ptr.get_buffer_index(); - auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); + auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); const auto src_id = pkt_header->src_ch_id; remote_update_ptr_val(to_sender_packets_acked_streams[src_id], 1); } @@ -597,7 +597,7 @@ FORCE_INLINE void receiver_send_completion_ack( auto receiver_buffer_index = receiver_channel_ptr.get_buffer_index(); - auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); + auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); const auto src_id = pkt_header->src_ch_id; remote_update_ptr_val(to_sender_packets_completed_streams[src_id], 1); receiver_channel_ptr.increment(); @@ -607,11 +607,16 @@ FORCE_INLINE void receiver_send_completion_ack( template FORCE_INLINE bool can_forward_packet_completely( - tt::fabric::RoutingFields cached_routing_fields, + ROUTING_FIELDS_TYPE cached_routing_fields, tt::fabric::EdmToEdmSender& downstream_edm_interface) { // We always check if it is the terminal mcast packet value. We can do this because all unicast packets have the // mcast terminal value masked in to the routing field. This simplifies the check here to a single compare. 
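
For intuition on why the low-latency check reduces to a single compare: each chip only inspects the low two bits of the routing value and shifts it right by one field before forwarding (the `update_packet_header_for_next_hop` overload above), so `WRITE_ONLY` in the low bits is the terminal condition. A self-contained walk-through of that consumption loop, written as illustrative driver code rather than a tt-metal kernel:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    constexpr uint32_t FIELD_WIDTH = 2, FIELD_MASK = 0b11;
    constexpr uint32_t WRITE_ONLY = 0b01, FORWARD_ONLY = 0b10, WRITE_AND_FORWARD = 0b11;

    // Example value from the header comments: start 3 hops away, mcast to 2 chips.
    uint32_t route = 0b01111010;
    for (int hop = 1; route != 0; ++hop) {
        const uint32_t cmd = route & FIELD_MASK;
        if (cmd == FORWARD_ONLY) {
            std::printf("hop %d: forward only\n", hop);
        } else if (cmd == WRITE_AND_FORWARD) {
            std::printf("hop %d: write locally and forward\n", hop);
        } else if (cmd == WRITE_ONLY) {
            std::printf("hop %d: write locally, terminal\n", hop);
            break;  // the "deliver locally only" case: nothing left to forward
        }
        route >>= FIELD_WIDTH;  // same update applied before handing off to the next hop
    }
    return 0;
}
```
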
- bool deliver_locally_only = cached_routing_fields.value == tt::fabric::RoutingFields::LAST_MCAST_VAL; + bool deliver_locally_only; + if constexpr (std::is_same_v) { + deliver_locally_only = cached_routing_fields.value == tt::fabric::RoutingFields::LAST_MCAST_VAL; + } else if constexpr (std::is_same_v) { + deliver_locally_only = (cached_routing_fields.value & tt::fabric::LowLatencyRoutingFields::FIELD_MASK) == tt::fabric::LowLatencyRoutingFields::WRITE_ONLY; + } return deliver_locally_only || downstream_edm_interface.edm_has_space_for_packet(); } @@ -619,19 +624,39 @@ FORCE_INLINE bool can_forward_packet_completely( template FORCE_INLINE void receiver_forward_packet( // TODO: have a separate cached copy of the packet header to save some additional L1 loads - volatile tt::fabric::PacketHeader *packet_start, - tt::fabric::RoutingFields cached_routing_fields, + volatile PACKET_HEADER_TYPE *packet_start, + ROUTING_FIELDS_TYPE cached_routing_fields, tt::fabric::EdmToEdmSender &downstream_edm_interface, uint8_t transaction_id) { - bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; - uint16_t payload_size_bytes = packet_start->payload_size_bytes; - if (start_distance_is_terminal_value) { - execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); - } - bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; - if (not_last_destination_device) { - forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + if constexpr (std::is_same_v) { + // If the packet is a terminal packet, then we can just deliver it locally + bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; + uint16_t payload_size_bytes = packet_start->payload_size_bytes; + if (start_distance_is_terminal_value) { + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + } + bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; + if (not_last_destination_device) { + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + } + } else if constexpr (std::is_same_v) { + uint32_t routing = cached_routing_fields.value & tt::fabric::LowLatencyRoutingFields::FIELD_MASK; + uint16_t payload_size_bytes = packet_start->payload_size_bytes; + switch (routing) { + case tt::fabric::LowLatencyRoutingFields::WRITE_ONLY: + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + break; + case tt::fabric::LowLatencyRoutingFields::FORWARD_ONLY: + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + break; + case tt::fabric::LowLatencyRoutingFields::WRITE_AND_FORWARD: + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + break; + default: + ASSERT(false); + } } } @@ -663,10 +688,10 @@ FORCE_INLINE bool run_sender_channel_step( bool sender_backpressured_from_sender_side = 
!(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS); if (!sender_backpressured_from_sender_side) { did_something = true; - auto packet_header = reinterpret_cast(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index())); + auto packet_header = reinterpret_cast(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index())); if constexpr (enable_packet_header_recording) { tt::fabric::validate(*packet_header); - packet_header_recorder.record_packet_header(packet_header); + packet_header_recorder.record_packet_header(reinterpret_cast(packet_header)); } send_next_data( local_sender_channel, @@ -780,9 +805,9 @@ FORCE_INLINE void run_receiver_channel_step( bool unwritten_packets = !wr_sent_ptr.is_caught_up_to(ack_ptr); if (unwritten_packets) { auto receiver_buffer_index = wr_sent_ptr.get_buffer_index(); - volatile auto packet_header = local_receiver_channel.get_packet_header(receiver_buffer_index); + volatile auto packet_header = local_receiver_channel.template get_packet_header(receiver_buffer_index); - tt::fabric::RoutingFields cached_routing_fields = const_cast(packet_header)->routing_fields; + ROUTING_FIELDS_TYPE cached_routing_fields = const_cast(packet_header)->routing_fields; bool can_send_to_all_local_chip_receivers = can_forward_packet_completely( cached_routing_fields, downstream_edm_interface); @@ -1054,14 +1079,14 @@ void kernel_main() { std::array sender_channel_packet_recorders{ PacketHeaderRecorder( - reinterpret_cast(sender_0_completed_packet_header_cb_address), + reinterpret_cast(sender_0_completed_packet_header_cb_address), sender_0_completed_packet_header_cb_size_headers), PacketHeaderRecorder( - reinterpret_cast(sender_1_completed_packet_header_cb_address), + reinterpret_cast(sender_1_completed_packet_header_cb_address), sender_1_completed_packet_header_cb_size_headers) }; PacketHeaderRecorder receiver_channel_packet_recorder( - reinterpret_cast(receiver_completed_packet_header_cb_address), + reinterpret_cast(receiver_completed_packet_header_cb_address), receiver_completed_packet_header_cb_size_headers); static_assert(SENDER_NUM_BUFFERS > 0, "compile time argument [1]: SENDER_NUM_BUFFERS must be > 0"); @@ -1178,14 +1203,14 @@ void kernel_main() { auto local_receiver_channel = tt::fabric::EthChannelBuffer( local_receiver_channel_buffer_address, channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), eth_transaction_ack_word_addr, // Assume for receiver channel, this address points to a chunk of memory that // can fit 2 eth_channel_syncs cfor ack receiver_channel_id); auto remote_receiver_channel = tt::fabric::EthChannelBuffer( remote_receiver_channel_buffer_address, channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), eth_transaction_ack_word_addr, // Assume for receiver channel, this address points to a chunk of memory that // can fit 2 eth_channel_syncs cfor ack receiver_channel_id); @@ -1196,13 +1221,13 @@ void kernel_main() { new (&local_sender_channels[i]) tt::fabric::EthChannelBuffer( local_sender_buffer_addresses[i], channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), 0, // For sender channels there is no eth_transaction_ack_word_addr because they don't send acks i); new (&remote_sender_channels[i]) tt::fabric::EthChannelBuffer( remote_sender_buffer_addresses[i], channel_buffer_size, - 
tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), 0, // For sender channels there is no eth_transaction_ack_word_addr because they don't send acks i); diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp index 369c4f57f33..4bf3cad530e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp @@ -64,12 +64,14 @@ class EthChannelBuffer final { return this->buffer_addresses[buffer_index]; } - [[nodiscard]] FORCE_INLINE volatile PacketHeader *get_packet_header(BufferIndex const& buffer_index) const { - return reinterpret_cast(this->buffer_addresses[buffer_index]); + template + [[nodiscard]] FORCE_INLINE volatile T *get_packet_header(BufferIndex const& buffer_index) const { + return reinterpret_cast(this->buffer_addresses[buffer_index]); } + template [[nodiscard]] FORCE_INLINE size_t get_payload_size(BufferIndex const& buffer_index) const { - return get_packet_header(buffer_index)->get_payload_size_including_header(); + return get_packet_header(buffer_index)->get_payload_size_including_header(); } [[nodiscard]] FORCE_INLINE size_t get_channel_buffer_max_size_in_bytes(BufferIndex const& buffer_index) const { return this->buffer_size_in_bytes; diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp index a8dbeb8ade7..487df3be943 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp @@ -94,10 +94,10 @@ void kernel_main() { DPRINT << "packet_header_buffer_seminc: " << (uint32_t)packet_header_buffer_seminc << "\n"; // pre-populate packet headers - volatile tt::fabric::PacketHeader* pkt_hdr_forward = - reinterpret_cast(packet_header_buffer_addr_forward); - volatile tt::fabric::PacketHeader* pkt_hdr_backward = - reinterpret_cast(packet_header_buffer_addr_backward); + volatile PACKET_HEADER_TYPE* pkt_hdr_forward = + reinterpret_cast(packet_header_buffer_addr_forward); + volatile PACKET_HEADER_TYPE* pkt_hdr_backward = + reinterpret_cast(packet_header_buffer_addr_backward); pkt_hdr_forward->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); pkt_hdr_backward->to_chip_multicast( @@ -152,7 +152,7 @@ void kernel_main() { // 2. 
mcast output ready semaphore uint64_t out_ready_sem_noc_addr_in_pkt = safe_get_noc_addr(out_ready_sem_noc0_x, out_ready_sem_noc0_y, out_ready_sem_bank_addr, 0); - auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); + auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); pkt_hdr->to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ out_ready_sem_noc_addr_in_pkt, static_cast(1), // increment 1 @@ -163,7 +163,7 @@ void kernel_main() { pkt_hdr->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // Write the mcast packet (backward) if (fabric_connection.has_backward_connection()) { @@ -171,7 +171,7 @@ void kernel_main() { tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_backward_direction)}); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); fabric_connection.get_backward_connection().send_payload_non_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // increment locally uint64_t out_ready_sem_noc_addr = diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp index b9f306cc42b..aad1e889c68 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp @@ -103,10 +103,10 @@ void kernel_main() { DPRINT << "packet_header_buffer_seminc: " << (uint32_t)packet_header_buffer_seminc << "\n"; // pre-populate packet headers - volatile tt::fabric::PacketHeader* pkt_hdr_forward = - reinterpret_cast(packet_header_buffer_addr_forward); - volatile tt::fabric::PacketHeader* pkt_hdr_backward = - reinterpret_cast(packet_header_buffer_addr_backward); + volatile PACKET_HEADER_TYPE* pkt_hdr_forward = + reinterpret_cast(packet_header_buffer_addr_forward); + volatile PACKET_HEADER_TYPE* pkt_hdr_backward = + reinterpret_cast(packet_header_buffer_addr_backward); pkt_hdr_forward->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); pkt_hdr_backward->to_chip_multicast( @@ -158,7 +158,7 @@ void kernel_main() { } // 2. 
mcast output ready semaphore - auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); + auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); uint64_t out_ready_sem_noc_addr_in_pkt = safe_get_noc_addr(out_ready_sem_noc0_x, out_ready_sem_noc0_y, out_ready_sem_bank_addr, 0); pkt_hdr->to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ @@ -171,7 +171,7 @@ void kernel_main() { pkt_hdr->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // Write the mcast packet (backward) if (fabric_connection.has_backward_connection()) { @@ -179,7 +179,7 @@ void kernel_main() { tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_backward_direction)}); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); fabric_connection.get_backward_connection().send_payload_non_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // increment locally uint64_t out_ready_sem_noc_addr = diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp index 641e6cee244..55e2668d5d1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp @@ -12,8 +12,8 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( uint64_t noc0_dest_noc_addr, - volatile tt::fabric::PacketHeader* pkt_hdr_forward, - volatile tt::fabric::PacketHeader* pkt_hdr_backward, + volatile PACKET_HEADER_TYPE* pkt_hdr_forward, + volatile PACKET_HEADER_TYPE* pkt_hdr_backward, FabricConnectionManager& fabric_connection, size_t& l1_read_addr, uint32_t payload_size_bytes) { @@ -29,7 +29,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr_forward, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr_forward, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -37,7 +37,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr_backward, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr_backward, sizeof(PACKET_HEADER_TYPE)); } noc_async_writes_flushed(); From 87dcfd7a78293b515cba915c64c9177866ea7e2b Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Sat, 22 Feb 2025 16:47:40 +0000 Subject: [PATCH 243/316] #18184: Use CRTP for packet header structs --- .../edm_fabric/fabric_edm_packet_header.hpp | 337 +++++++----------- 1 file changed, 129 insertions(+), 208 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp 
b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index c6ba0fe24e0..468777220e8 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -104,116 +104,107 @@ union NocCommandFields{ static_assert(sizeof(NocCommandFields) <= 16, "CommandFields size is not 16 bytes"); // TODO: wrap this in a debug version that holds type info so we can assert for field/command/ -struct PacketHeader { +template +struct PacketHeaderBase { + NocCommandFields command_fields; // size = 16B due to uint64_t alignment + uint16_t payload_size_bytes; // TODO: trim this down noc_send_type 2 bits (4 values): // -> unicast_write, mcast_write, unicast_seminc, mcast_seminc // For now, kept it separate so I could do reads which would be handled differently // but for our purposes we shouldn't need read so we should be able to omit the support NocSendType noc_send_type : 3; + // ChipSendType only used by PacketHeader, but keep here for now for bit-fields ChipSendType chip_send_type : 1; - // Used only by the EDM sender and receiver channels. Populated by EDM sender channel to // indicate to the receiver channel what channel was the source of this packet. Reserved // otherwise. uint8_t src_ch_id : 4; - RoutingFields routing_fields; - uint16_t payload_size_bytes; // excludes header size - NocCommandFields command_fields; // size = 16B due to uint64_t alignment - - // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned - // To simplify worker kernel code, we for now decide to pad up the packet header - // to 32B so the user can simplify shift into their CB chunk by sizeof(tt::fabric::PacketHeader) - // and automatically work around the DRAM read alignment bug. - // - // Future changes will remove this padding and require the worker kernel to be aware of this bug - // and pad their own CBs conditionally when reading from DRAM. It'll be up to the users to - // manage this complexity. 
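
The refactor introduced here is the standard CRTP shape: the shared builder methods live in `PacketHeaderBase`, hand back the concrete header type through a cast to `Derived`, and size accounting uses `sizeof(Derived)` (visible just below), so the two header layouts share code without any virtual dispatch. A reduced sketch of the mechanics with made-up field names, not the real headers:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

template <typename Derived>
struct HeaderBase {
    uint16_t payload_size_bytes = 0;

    // Shared builder: chainable, returns the concrete type.
    Derived& with_payload(uint16_t bytes) {
        payload_size_bytes = bytes;
        return *static_cast<Derived*>(this);
    }
    // Size accounting uses the derived type's real size.
    std::size_t total_size() const { return payload_size_bytes + sizeof(Derived); }
};

struct SmallHeader : HeaderBase<SmallHeader> { uint16_t route = 0; };
struct BigHeader   : HeaderBase<BigHeader>   { uint32_t route = 0; uint32_t padding[3] = {}; };

int main() {
    SmallHeader s;
    BigHeader b;
    std::printf("%zu %zu\n", s.with_payload(64).total_size(), b.with_payload(64).total_size());
}
```
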
- uint32_t padding0; - uint32_t padding1; - - inline void set_chip_send_type(ChipSendType &type) { this->chip_send_type = type; } - inline void set_noc_send_type(NocSendType &type) { this->noc_send_type = type; } - inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } - inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } - // Returns size of payload in bytes - TODO: convert to words (4B) size_t get_payload_size_excluding_header() volatile const { return this->payload_size_bytes; } + inline size_t get_payload_size_including_header() volatile const { - return get_payload_size_excluding_header() + sizeof(PacketHeader); + return get_payload_size_excluding_header() + sizeof(Derived); } - inline PacketHeader &to_chip_unicast(uint8_t distance_in_hops) { - this->chip_send_type = CHIP_UNICAST; - this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; - return *this; + // Setters for noc_send_type, routing_fields, and command_fields + inline void set_noc_send_type(NocSendType &type) { this->noc_send_type = type; } + inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } + + inline Derived &to_chip_unicast(uint8_t distance_in_hops) { + static_cast(this)->to_chip_unicast_impl(distance_in_hops); + return *static_cast(this); } - inline PacketHeader &to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) { - this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.value = ((static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH)) | static_cast(chip_multicast_command_header.start_distance_in_hops); - return *this; + + inline Derived &to_chip_multicast(MulticastRoutingCommandHeader const &mcast_routing_command_header) { + static_cast(this)->to_chip_multicast_impl(mcast_routing_command_header); + return *static_cast(this); } - inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { + inline Derived &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write = noc_unicast_command_header; this->payload_size_bytes = payload_size_bytes; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { + + inline Derived &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { this->noc_send_type = NOC_UNICAST_INLINE_WRITE; this->command_fields.unicast_inline_write = noc_unicast_command_header; this->payload_size_bytes = 0; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { + + inline Derived &to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write = noc_multicast_command_header; this->payload_size_bytes = payload_size_bytes; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { + + inline Derived &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const 
&noc_unicast_atomic_inc_command_header) { this->noc_send_type = NOC_UNICAST_ATOMIC_INC; this->command_fields.unicast_seminc = noc_unicast_atomic_inc_command_header; this->payload_size_bytes = 0; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { - #if defined(KERNEL_BUILD) || defined(FW_BUILD) - ASSERT(false); - while (1) {}; - #endif + + inline Derived &to_noc_multicast_atomic_inc( + NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) { + this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; + this->command_fields.mcast_seminc = noc_multicast_atomic_inc_command_header; this->payload_size_bytes = payload_size_bytes; - return *this; + return *static_cast(this); } - inline volatile PacketHeader *to_chip_unicast(uint8_t distance_in_hops) volatile { - this->chip_send_type = CHIP_UNICAST; - this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; - return this; + inline volatile Derived* to_chip_unicast(uint8_t distance_in_hops) volatile { + static_cast(this)->to_chip_unicast_impl(distance_in_hops); + return static_cast(this); } - inline volatile PacketHeader *to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) volatile { - this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.value = (static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH) | chip_multicast_command_header.start_distance_in_hops; - return this; + + inline volatile Derived* to_chip_multicast(MulticastRoutingCommandHeader const &mcast_routing_command_header) volatile { + static_cast(this)->to_chip_multicast_impl(mcast_routing_command_header); + return static_cast(this); } - inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { + + inline volatile Derived* to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write.noc_address = noc_unicast_command_header.noc_address; this->payload_size_bytes = payload_size_bytes; - - return this; + return static_cast(this); } - inline volatile PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { + + inline volatile Derived* to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { this->noc_send_type = NOC_UNICAST_INLINE_WRITE; this->command_fields.unicast_inline_write.noc_address = noc_unicast_command_header.noc_address; this->command_fields.unicast_inline_write.value = noc_unicast_command_header.value; this->payload_size_bytes = 0; - return *this; + return static_cast(this); } - inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { + + inline volatile Derived* to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write.mcast_rect_size_x = noc_multicast_command_header.mcast_rect_size_x; this->command_fields.mcast_write.mcast_rect_size_y = noc_multicast_command_header.mcast_rect_size_y; @@ -221,20 +212,19 @@ struct 
PacketHeader { this->command_fields.mcast_write.noc_y_start = noc_multicast_command_header.noc_y_start; this->payload_size_bytes = payload_size_bytes; this->command_fields.mcast_write.address = noc_multicast_command_header.address; - - return this; + return static_cast(this); } - inline volatile PacketHeader *to_noc_unicast_atomic_inc( - NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) volatile { + + inline volatile Derived* to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) volatile { this->noc_send_type = NOC_UNICAST_ATOMIC_INC; this->command_fields.unicast_seminc.noc_address = noc_unicast_atomic_inc_command_header.noc_address; this->command_fields.unicast_seminc.val = noc_unicast_atomic_inc_command_header.val; this->command_fields.unicast_seminc.wrap = noc_unicast_atomic_inc_command_header.wrap; this->payload_size_bytes = 0; - - return this; + return static_cast(this); } - inline volatile PacketHeader *to_noc_multicast_atomic_inc( + + inline volatile Derived *to_noc_multicast_atomic_inc( NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; this->command_fields.mcast_seminc.address = noc_multicast_atomic_inc_command_header.address; @@ -245,41 +235,16 @@ struct PacketHeader { this->command_fields.mcast_seminc.val = noc_multicast_atomic_inc_command_header.val; this->command_fields.mcast_seminc.wrap = noc_multicast_atomic_inc_command_header.wrap; this->payload_size_bytes = payload_size_bytes; - - return this; + return static_cast(this); } - inline void set_src_ch_id(uint8_t ch_id) volatile { this->src_ch_id = ch_id; } -}; -struct LowLatencyRoutingFields { - static constexpr uint32_t FIELD_WIDTH = 2; - static constexpr uint32_t FIELD_MASK = 0b11; - static constexpr uint32_t NOOP = 0b00; - static constexpr uint32_t WRITE_ONLY = 0b01; - static constexpr uint32_t FORWARD_ONLY = 0b10; - static constexpr uint32_t WRITE_AND_FORWARD = 0b11; - static constexpr uint32_t FWD_ONLY_FIELD = 0xAAAAAAAA; - static constexpr uint32_t WR_AND_FWD_FIELD = 0xFFFFFFFF; - uint32_t value; + inline void set_src_ch_id(uint8_t ch_id) volatile { + this->src_ch_id = ch_id; + } }; -// TODO: wrap this in a debug version that holds type info so we can assert for field/command/ -struct LowLatencyPacketHeader { - // TODO: trim this down noc_send_type 2 bits (4 values): - // -> unicast_write, mcast_write, unicast_seminc, mcast_seminc - // For now, kept it separate so I could do reads which would be handled differently - // but for our purposes we shouldn't need read so we should be able to omit the support - NocSendType noc_send_type : 4; - - // Used only by the EDM sender and receiver channels. Populated by EDM sender channel to - // indicate to the receiver channel what channel was the source of this packet. Reserved - // otherwise. 
- uint8_t src_ch_id : 4; - - LowLatencyRoutingFields routing_fields; - uint16_t payload_size_bytes; // excludes header size - NocCommandFields command_fields; // size = 16B due to uint64_t alignment - +struct PacketHeader : public PacketHeaderBase { + RoutingFields routing_fields; // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned // To simplify worker kernel code, we for now decide to pad up the packet header // to 32B so the user can simplify shift into their CB chunk by sizeof(tt::fabric::PacketHeader) @@ -288,155 +253,111 @@ struct LowLatencyPacketHeader { // Future changes will remove this padding and require the worker kernel to be aware of this bug // and pad their own CBs conditionally when reading from DRAM. It'll be up to the users to // manage this complexity. + uint32_t padding0; + uint32_t padding1; - inline void set_noc_send_type(NocSendType &type) { this->noc_send_type = type; } - inline void set_routing_fields(LowLatencyRoutingFields &fields) { this->routing_fields = fields; } - inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } + private: - // Returns size of payload in bytes - TODO: convert to words (4B) - size_t get_payload_size_excluding_header() volatile const { - return this->payload_size_bytes; + inline static uint32_t calculate_chip_unicast_routing_fields_value(uint8_t distance_in_hops) { + return RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; } - inline size_t get_payload_size_including_header() volatile const { - return get_payload_size_excluding_header() + sizeof(LowLatencyPacketHeader); + inline static uint32_t calculate_chip_multicast_routing_fields_value( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { + return ((static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH)) | static_cast(chip_multicast_command_header.start_distance_in_hops); } - inline LowLatencyPacketHeader& to_chip_unicast(uint8_t distance_in_hops) { - // Example of unicast 3 hops away - // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) - // Last line will do 0b01 << 4 = 0b010000. This means that on the 3rd chip, we will write only - // Together this means the final encoding is 0b011010 - this->routing_fields.value = - (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | - (LowLatencyRoutingFields::WRITE_ONLY << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH); - return *this; - } - inline LowLatencyPacketHeader& to_chip_multicast( - const MulticastRoutingCommandHeader& chip_multicast_command_header) { + public: - // Example of starting 3 hops away mcasting to 2 chips - // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) - // Second line will do 0xFFFFFFFF & 0b11 = 0b11. 0b11 << 4 = 0b110000. This means starting from the 3rd chip, we will write and forward once - // Last line will do 0b01 << 6 = 0b01000000. 
This means that on the 5th chip, we will write only - // Together this means the final encoding is 0b01111010 - this->routing_fields.value = - (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | - (LowLatencyRoutingFields::WR_AND_FWD_FIELD & ((1 << (chip_multicast_command_header.range_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1) << - ((chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH)) | - (LowLatencyRoutingFields::WRITE_ONLY << (chip_multicast_command_header.start_distance_in_hops + chip_multicast_command_header.range_hops - 2) * LowLatencyRoutingFields::FIELD_WIDTH); - return *this; - } + // Setters for PacketHeader-specific fields + inline void set_chip_send_type(ChipSendType &type) { this->chip_send_type = type; } - inline LowLatencyPacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { - this->noc_send_type = NOC_UNICAST_WRITE; - this->command_fields.unicast_write = noc_unicast_command_header; - this->payload_size_bytes = payload_size_bytes; - return *this; - } - inline LowLatencyPacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { - this->noc_send_type = NOC_UNICAST_INLINE_WRITE; - this->command_fields.unicast_inline_write = noc_unicast_command_header; - this->payload_size_bytes = 0; - return *this; + inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) { + this->chip_send_type = CHIP_UNICAST; + this->routing_fields.value = PacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); } - inline LowLatencyPacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { - this->noc_send_type = NOC_MULTICAST_WRITE; - this->command_fields.mcast_write = noc_multicast_command_header; - this->payload_size_bytes = payload_size_bytes; - return *this; + inline void to_chip_multicast_impl(MulticastRoutingCommandHeader const &chip_multicast_command_header) { + this->chip_send_type = CHIP_MULTICAST; + this->routing_fields.value = PacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); } - inline LowLatencyPacketHeader &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { - this->noc_send_type = NOC_UNICAST_ATOMIC_INC; - this->command_fields.unicast_seminc = noc_unicast_atomic_inc_command_header; - this->payload_size_bytes = 0; - return *this; + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) volatile { + this->chip_send_type = CHIP_UNICAST; + this->routing_fields.value = PacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); } - inline LowLatencyPacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { - #if defined(KERNEL_BUILD) || defined(FW_BUILD) - ASSERT(false); - while (1) {}; - #endif - this->payload_size_bytes = payload_size_bytes; - return *this; + inline void to_chip_multicast_impl(MulticastRoutingCommandHeader const &chip_multicast_command_header) volatile{ + this->chip_send_type = CHIP_MULTICAST; + this->routing_fields.value = PacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); 
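
One detail worth noting about the `*_impl` methods above: each exists in a non-volatile and a volatile-qualified form because the packet headers are written through `volatile` pointers, and a member function can only be invoked on a volatile object if it is itself volatile-qualified; the shared bit math is hoisted into the static `calculate_*` helpers so both overloads stay one-liners. A minimal standalone illustration of that language rule, using a toy `Header` type rather than the real structs:

```cpp
#include <cstdint>

struct Header {
    uint32_t route = 0;

    static uint32_t encode(uint8_t hops) { return 2u * hops; }  // shared helper, like the calculate_* functions

    void set_route(uint8_t hops)          { route = encode(hops); }  // for ordinary objects
    void set_route(uint8_t hops) volatile { route = encode(hops); }  // for objects reached via a volatile pointer
};

int main() {
    Header h;
    h.set_route(3);            // picks the non-volatile overload

    volatile Header* vh = &h;  // e.g. a header living in device-visible L1
    vh->set_route(3);          // only legal because a volatile-qualified overload exists
    return h.route == 6 ? 0 : 1;
}
```
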
} +}; + +struct LowLatencyRoutingFields { + static constexpr uint32_t FIELD_WIDTH = 2; + static constexpr uint32_t FIELD_MASK = 0b11; + static constexpr uint32_t NOOP = 0b00; + static constexpr uint32_t WRITE_ONLY = 0b01; + static constexpr uint32_t FORWARD_ONLY = 0b10; + static constexpr uint32_t WRITE_AND_FORWARD = 0b11; + static constexpr uint32_t FWD_ONLY_FIELD = 0xAAAAAAAA; + static constexpr uint32_t WR_AND_FWD_FIELD = 0xFFFFFFFF; + uint32_t value; +}; + +struct LowLatencyPacketHeader : public PacketHeaderBase { + uint8_t padding0; + LowLatencyRoutingFields routing_fields; + uint32_t padding1; + + private: - inline volatile LowLatencyPacketHeader* to_chip_unicast(uint8_t distance_in_hops) volatile { + inline static uint32_t calculate_chip_unicast_routing_fields_value(uint8_t distance_in_hops) { // Example of unicast 3 hops away // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) // Last line will do 0b01 << 4 = 0b010000. This means that on the 3rd chip, we will write only // Together this means the final encoding is 0b011010 - this->routing_fields.value = + return (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | (LowLatencyRoutingFields::WRITE_ONLY << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH); - return this; } - inline volatile LowLatencyPacketHeader* to_chip_multicast( - const MulticastRoutingCommandHeader& chip_multicast_command_header) volatile { + inline static uint32_t calculate_chip_multicast_routing_fields_value( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { // Example of starting 3 hops away mcasting to 2 chips // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) // Second line will do 0xFFFFFFFF & 0b11 = 0b11. 0b11 << 4 = 0b110000. This means starting from the 3rd chip, we will write and forward once // Last line will do 0b01 << 6 = 0b01000000. 
This means that on the 5th chip, we will write only // Together this means the final encoding is 0b01111010 - this->routing_fields.value = + return (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | (LowLatencyRoutingFields::WR_AND_FWD_FIELD & ((1 << (chip_multicast_command_header.range_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1) << ((chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH)) | (LowLatencyRoutingFields::WRITE_ONLY << (chip_multicast_command_header.start_distance_in_hops + chip_multicast_command_header.range_hops - 2) * LowLatencyRoutingFields::FIELD_WIDTH); - return this; } - inline volatile LowLatencyPacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { - this->noc_send_type = NOC_UNICAST_WRITE; - this->command_fields.unicast_write.noc_address = noc_unicast_command_header.noc_address; - this->payload_size_bytes = payload_size_bytes; - return this; - } - inline volatile LowLatencyPacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { - this->noc_send_type = NOC_UNICAST_INLINE_WRITE; - this->command_fields.unicast_inline_write.noc_address = noc_unicast_command_header.noc_address; - this->command_fields.unicast_inline_write.value = noc_unicast_command_header.value; - this->payload_size_bytes = 0; - return *this; - } - inline volatile LowLatencyPacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { - this->noc_send_type = NOC_MULTICAST_WRITE; - this->command_fields.mcast_write.mcast_rect_size_x = noc_multicast_command_header.mcast_rect_size_x; - this->command_fields.mcast_write.mcast_rect_size_y = noc_multicast_command_header.mcast_rect_size_y; - this->command_fields.mcast_write.noc_x_start = noc_multicast_command_header.noc_x_start; - this->command_fields.mcast_write.noc_y_start = noc_multicast_command_header.noc_y_start; - this->payload_size_bytes = payload_size_bytes; - this->command_fields.mcast_write.address = noc_multicast_command_header.address; + public: - return this; + // Specialized implementations for LowLatencyPacketHeader + inline void set_routing_fields(LowLatencyRoutingFields &fields) { + this->routing_fields = fields; } - inline volatile LowLatencyPacketHeader *to_noc_unicast_atomic_inc( - NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) volatile { - this->noc_send_type = NOC_UNICAST_ATOMIC_INC; - this->command_fields.unicast_seminc.noc_address = noc_unicast_atomic_inc_command_header.noc_address; - this->command_fields.unicast_seminc.val = noc_unicast_atomic_inc_command_header.val; - this->command_fields.unicast_seminc.wrap = noc_unicast_atomic_inc_command_header.wrap; - this->payload_size_bytes = 0; - return this; + inline void to_chip_unicast_impl(uint8_t distance_in_hops) { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); } - inline volatile LowLatencyPacketHeader *to_noc_multicast_atomic_inc( - NocMulticastAtomicIncCommandHeader const 
&noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) volatile { - this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; - this->command_fields.mcast_seminc.address = noc_multicast_atomic_inc_command_header.address; - this->command_fields.mcast_seminc.noc_x_start = noc_multicast_atomic_inc_command_header.noc_x_start; - this->command_fields.mcast_seminc.noc_y_start = noc_multicast_atomic_inc_command_header.noc_y_start; - this->command_fields.mcast_seminc.size_x = noc_multicast_atomic_inc_command_header.size_x; - this->command_fields.mcast_seminc.size_y = noc_multicast_atomic_inc_command_header.size_y; - this->command_fields.mcast_seminc.val = noc_multicast_atomic_inc_command_header.val; - this->command_fields.mcast_seminc.wrap = noc_multicast_atomic_inc_command_header.wrap; - this->payload_size_bytes = payload_size_bytes; - return this; + inline void to_chip_unicast_impl(uint8_t distance_in_hops) volatile { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl( + const MulticastRoutingCommandHeader& chip_multicast_command_header) volatile { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); } - inline void set_src_ch_id(uint8_t ch_id) volatile { this->src_ch_id = ch_id; } }; - // TODO: When we remove the 32B padding requirement, reduce to 16B size check static_assert(sizeof(PacketHeader) == 32, "sizeof(PacketHeader) is not equal to 32B"); // Host code still hardcoded to sizeof(PacketHeader) so we need to keep this check From c674c26e79c251e19723309744ceb2b65b1a36f2 Mon Sep 17 00:00:00 2001 From: David Ma Date: Fri, 21 Feb 2025 19:09:05 +0000 Subject: [PATCH 244/316] #0: Suppress device init warnings after the first to avoid spam --- tt_metal/api/tt-metalium/device_impl.hpp | 2 ++ tt_metal/impl/device/device.cpp | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 21d017789c0..878569038d2 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -270,6 +270,8 @@ class Device : public IDevice { program_cache::detail::ProgramCache program_cache_; uint32_t trace_buffers_size_ = 0; + bool uninitialized_error_fired_ = + false; // To avoid spam with warnings about calling Device methods when it's not initialized. }; } // namespace v0 diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 8df3eb90854..4afa1b342a7 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -961,7 +961,10 @@ bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t t void Device::push_work(std::function work, bool blocking) { if (not this->initialized_) { - log_warning("Attempting to push work to Device {} which is not initialized. Ignoring...", this->id_); + if (!uninitialized_error_fired_) { + log_fatal("Attempting to push work to Device {} which is not initialized. 
Ignoring...", this->id_); + uninitialized_error_fired_ = true; + } return; } this->work_executor_.push_work(std::move(work), blocking); From 27f749fa4954e869dbeb98ed9341a2e1b8de392d Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Sun, 23 Feb 2025 18:37:07 -0500 Subject: [PATCH 245/316] #0: Add ttnn distributed tests to t3k unit tests suite (#18165) ### Ticket N/A ### Problem description ttnn distributed tests weren't part of the CI. ### What's changed * Move TTNN distributed tests to the metal directory. These tests don't have any dependencies on TTNN, so it makes sense to consolidate. * Remove the `test_distributed_atexit.cpp` test: * The test is currently broken, and we decided not to support this use case. The problem is that the `MeshDevice` destructor attempts to close devices that were previously closed by the `DevicePool` singleton. There is no way to make it work without hacks that implicitly instantiate `DevicePool` or make `MeshDevice` aware that `DevicePool` might have closed the devices silently behind the scenes. * The test was "passing" at the initial commit because the executable was bundled with other test files, which extended the `DevicePool` lifetime until after the function-local static variable in the test was destroyed. * In general, static data with non-trivial destructors is a bad idea. It is commonly banned altogether; e.g. see https://google.github.io/styleguide/cppguide.html#Static_and_Global_Variables. * Adopt `Indestructible` for `SystemMesh` (a sketch of the idiom follows the checklist below). * Better `operator<<` for `MeshCoordinate`. * Use `get_physical_device_id`, as it performs boundary and dimensionality checks that produce user-friendly error messages. ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13466935830) - [X] New/Existing tests provide coverage for changes. Confirmed the tests are passing locally on a t3k machine.
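For context on the `Indestructible` adoption listed above: the fix leans on the usual "never destroy the singleton" idiom. Below is a minimal sketch of that idiom, assuming only what the diff in this patch shows (in-place construction and a `get()` accessor); the real `tt::stl::Indestructible` implementation may differ in its details, and the `Service`/`service_instance` names are purely illustrative.

```cpp
// Minimal sketch, not the actual tt::stl implementation: construct T in raw
// aligned storage and never invoke ~T(), so the object stays valid even while
// other static objects are being destroyed at process exit.
#include <new>
#include <utility>

template <typename T>
class IndestructibleSketch {
public:
    template <typename... Args>
    explicit IndestructibleSketch(Args&&... args)
        : ptr_(::new (storage_) T(std::forward<Args>(args)...)) {}  // placement-new, no matching destruction

    // Default destructor: deliberately does NOT call ptr_->~T().
    ~IndestructibleSketch() = default;

    IndestructibleSketch(const IndestructibleSketch&) = delete;
    IndestructibleSketch& operator=(const IndestructibleSketch&) = delete;

    T& get() { return *ptr_; }

private:
    alignas(T) unsigned char storage_[sizeof(T)];
    T* ptr_;
};

// Usage pattern matching the SystemMesh::instance() change in this patch:
// a function-local static wrapper whose payload is constructed once and leaked on purpose.
struct Service {
    int answer() const { return 42; }
};

Service& service_instance() {
    static IndestructibleSketch<Service> wrapper;  // never destroyed; avoids destruction-order issues
    return wrapper.get();
}
```

Intentionally leaking the instance sidesteps the static destruction-order problem described above, which is in line with the style-guide guidance linked in the bullet list.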
--- tests/tt_metal/distributed/CMakeLists.txt | 3 +- .../tt_metal/distributed/test_distributed.cpp | 31 ---- .../tt_metal/distributed/test_mesh_device.cpp | 93 +++++++++++ .../distributed/test_mesh_device_reshape.cpp} | 144 ++++++++++-------- tests/ttnn/CMakeLists.txt | 1 - tests/ttnn/distributed/CMakeLists.txt | 13 -- tests/ttnn/distributed/test_distributed.cpp | 99 ------------ .../distributed/test_distributed_atexit.cpp | 27 ---- tt_metal/CMakeLists.txt | 2 +- tt_metal/api/tt-metalium/system_mesh.hpp | 9 +- tt_metal/common/mesh_coord.cpp | 9 +- tt_metal/distributed/mesh_device.cpp | 4 +- tt_metal/distributed/system_mesh.cpp | 24 ++- 13 files changed, 198 insertions(+), 261 deletions(-) delete mode 100644 tests/tt_metal/distributed/test_distributed.cpp create mode 100644 tests/tt_metal/distributed/test_mesh_device.cpp rename tests/{ttnn/distributed/test_distributed_reshape.cpp => tt_metal/distributed/test_mesh_device_reshape.cpp} (62%) delete mode 100644 tests/ttnn/distributed/CMakeLists.txt delete mode 100644 tests/ttnn/distributed/test_distributed.cpp delete mode 100644 tests/ttnn/distributed/test_distributed_atexit.cpp diff --git a/tests/tt_metal/distributed/CMakeLists.txt b/tests/tt_metal/distributed/CMakeLists.txt index 922e19ef993..88890c7eded 100644 --- a/tests/tt_metal/distributed/CMakeLists.txt +++ b/tests/tt_metal/distributed/CMakeLists.txt @@ -1,7 +1,8 @@ set(UNIT_TESTS_DISTRIBUTED_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/test_distributed.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_coord.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_device_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_workload.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_sub_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_allocator.cpp diff --git a/tests/tt_metal/distributed/test_distributed.cpp b/tests/tt_metal/distributed/test_distributed.cpp deleted file mode 100644 index bf8877879e3..00000000000 --- a/tests/tt_metal/distributed/test_distributed.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" - -namespace tt::tt_metal::distributed::test { -namespace { - -TEST_F(T3000MeshDeviceFixture, SimpleMeshDeviceTest) { - EXPECT_EQ(mesh_device_->num_devices(), 8); - EXPECT_EQ(mesh_device_->num_rows(), 2); - EXPECT_EQ(mesh_device_->num_cols(), 4); -} - -TEST(MeshDeviceSuite, Test1x1SystemMeshInitialize) { - auto& sys = tt::tt_metal::distributed::SystemMesh::instance(); - - auto config = tt::tt_metal::distributed::MeshDeviceConfig{.mesh_shape = MeshShape(1, 1)}; - - EXPECT_NO_THROW({ - auto mesh = tt::tt_metal::distributed::MeshDevice::create( - config, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - mesh->close(); - }); -} - -} // namespace -} // namespace tt::tt_metal::distributed::test diff --git a/tests/tt_metal/distributed/test_mesh_device.cpp b/tests/tt_metal/distributed/test_mesh_device.cpp new file mode 100644 index 00000000000..c87c87cae35 --- /dev/null +++ b/tests/tt_metal/distributed/test_mesh_device.cpp @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "mesh_device.hpp" +#include "system_mesh.hpp" + +#include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" + +namespace tt::tt_metal::distributed { +namespace { + +using ::testing::IsEmpty; +using ::testing::SizeIs; +using ::tt::tt_metal::distributed::MeshContainer; + +TEST(MeshDeviceInitTest, Init1x1Mesh) { + auto& sys = SystemMesh::instance(); + + auto config = tt::tt_metal::distributed::MeshDeviceConfig{.mesh_shape = MeshShape(1, 1)}; + + EXPECT_NO_THROW({ + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + config, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); + mesh->close(); + }); +} + +using MeshDeviceTest = T3000MeshDeviceFixture; + +TEST_F(MeshDeviceTest, SystemMeshTearDownWithoutClose) { + auto& sys = SystemMesh::instance(); + + const auto system_shape = sys.get_shape(); + ASSERT_EQ(system_shape.dims(), 2); + EXPECT_EQ(system_shape[0], 2); + EXPECT_EQ(system_shape[1], 4); +} + +TEST_F(MeshDeviceTest, MemoryAllocationStatistics) { + auto stats = mesh_device_->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); + for (auto* device : mesh_device_->get_devices()) { + auto device_stats = device->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); + EXPECT_EQ(stats.total_allocatable_size_bytes, device_stats.total_allocatable_size_bytes); + } +} + +TEST_F(MeshDeviceTest, NumDramChannels) { + EXPECT_EQ(mesh_device_->num_dram_channels(), 96); // 8 devices * 12 channels +} + +TEST_F(MeshDeviceTest, ViewIs2D) { + std::vector devices = mesh_device_->get_devices(); + + MeshContainer container_1d(SimpleMeshShape(8), devices); + MeshDeviceView view_1d(container_1d); + EXPECT_FALSE(view_1d.is_mesh_2d()); + + MeshContainer container_2d(SimpleMeshShape(2, 4), devices); + MeshDeviceView view_2d(container_2d); + EXPECT_TRUE(view_2d.is_mesh_2d()); + + MeshContainer container_3d(SimpleMeshShape(2, 2, 2), devices); + MeshDeviceView view_3d(container_3d); + EXPECT_FALSE(view_3d.is_mesh_2d()); +} + +TEST_F(MeshDeviceTest, Submesh) { + EXPECT_EQ(mesh_device_->shape().num_rows, 2); + EXPECT_EQ(mesh_device_->shape().num_cols, 4); + EXPECT_THAT(mesh_device_->get_devices(), SizeIs(8)); + EXPECT_TRUE(mesh_device_->is_parent_mesh()); + EXPECT_THAT(mesh_device_->get_submeshes(), IsEmpty()); + + auto submesh = mesh_device_->create_submesh(MeshShape{1, 2}, MeshOffset{1, 1}); + EXPECT_THAT(mesh_device_->get_submeshes(), SizeIs(1)); + EXPECT_EQ(submesh->shape().num_rows, 1); + EXPECT_EQ(submesh->shape().num_cols, 2); + EXPECT_THAT(submesh->get_devices(), SizeIs(2)); + EXPECT_FALSE(submesh->is_parent_mesh()); + EXPECT_THAT(submesh->get_submeshes(), IsEmpty()); + + // Verify coordinates are correct. 
+ EXPECT_EQ(mesh_device_->get_device(MeshCoordinate{1, 1})->id(), submesh->get_device(MeshCoordinate{0, 0})->id()); + EXPECT_EQ(mesh_device_->get_device(MeshCoordinate{1, 2})->id(), submesh->get_device(MeshCoordinate{0, 1})->id()); + EXPECT_EQ(submesh->get_device(1, 1), nullptr); +} + +} // namespace +} // namespace tt::tt_metal::distributed diff --git a/tests/ttnn/distributed/test_distributed_reshape.cpp b/tests/tt_metal/distributed/test_mesh_device_reshape.cpp similarity index 62% rename from tests/ttnn/distributed/test_distributed_reshape.cpp rename to tests/tt_metal/distributed/test_mesh_device_reshape.cpp index f3a085d0700..893ad9aca1a 100644 --- a/tests/ttnn/distributed/test_distributed_reshape.cpp +++ b/tests/tt_metal/distributed/test_mesh_device_reshape.cpp @@ -6,29 +6,20 @@ #include #include #include -#include -#include + +#include "host_api.hpp" +#include "mesh_config.hpp" +#include "mesh_device.hpp" #include "mesh_coord.hpp" + +#include "system_mesh.hpp" #include "tests/tt_metal/test_utils/env_vars.hpp" -namespace ttnn::distributed::test { +namespace tt::tt_metal::distributed { namespace { using ::testing::SizeIs; -// Helper function to check test environment -void check_t3k_test_environment() { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - const auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (slow_dispatch) { - GTEST_SKIP() << "Skipping Multi-Device test suite, since it can only be run in Fast Dispatch Mode."; - } - if (num_devices < 8 or arch != tt::ARCH::WORMHOLE_B0) { - GTEST_SKIP() << "Skipping T3K Multi-Device test suite on non T3K machine."; - } -} - std::vector get_physical_device_ids(const MeshDevice& mesh) { std::vector device_ids; for (auto* device : mesh.get_devices()) { @@ -37,46 +28,56 @@ std::vector get_physical_device_ids(const MeshDevice& mesh) { return device_ids; } -static constexpr std::array kMeshShapes{ - {{1, 1}, {1, 2}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, {2, 1}, {2, 2}, {2, 3}, {2, 4}, - {3, 1}, {3, 2}, {4, 1}, {4, 2}, {8, 1}, {7, 1}, {6, 1}, {5, 1}, {4, 1}, {3, 1}, {2, 1}, {1, 1}}}; - -class MeshConfigurationTest : public ::testing::TestWithParam { -protected: - void SetUp() override { check_t3k_test_environment(); } +class T3KTestFixture : public ::testing::Test { +public: + void SetUp() override { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + const auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (slow_dispatch) { + GTEST_SKIP() << "Skipping Multi-Device test suite, since it can only be run in Fast Dispatch Mode."; + } + if (num_devices < 8 or arch != tt::ARCH::WORMHOLE_B0) { + GTEST_SKIP() << "Skipping T3K Multi-Device test suite on non T3K machine."; + } + } }; +constexpr std::array kMeshShapes{{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, + {2, 1}, {2, 2}, {2, 3}, {2, 4}, {3, 1}, {3, 2}, {4, 1}, {4, 2}, + {8, 1}, {7, 1}, {6, 1}, {5, 1}, {4, 1}, {3, 1}, {2, 1}, {1, 1}}}; + +class MeshConfigurationTest : public T3KTestFixture, public ::testing::WithParamInterface {}; + TEST_P(MeshConfigurationTest, MeshConfigurations) { const auto& shape = GetParam(); - auto mesh = ttnn::distributed::open_mesh_device( - {shape.num_rows, shape.num_cols}, + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(shape.num_rows, shape.num_cols)}, 
DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); EXPECT_EQ(mesh->num_rows(), shape.num_rows); EXPECT_EQ(mesh->num_cols(), shape.num_cols); - ttnn::distributed::close_mesh_device(mesh); + mesh->close(); } TEST_P(MeshConfigurationTest, GetPhysicalDeviceIds) { const auto& shape = GetParam(); - auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); + auto& system_mesh = SystemMesh::instance(); EXPECT_THAT( system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(shape)}), SizeIs(shape.num_cols * shape.num_rows)); } // Test all possible mesh configurations on T3000 -INSTANTIATE_TEST_SUITE_P(MeshShapes, MeshConfigurationTest, ::testing::ValuesIn(kMeshShapes)); +INSTANTIATE_TEST_SUITE_P(AllMeshShapes, MeshConfigurationTest, ::testing::ValuesIn(kMeshShapes)); -class MeshReshapeTest : public ::testing::TestWithParam> { -protected: - void SetUp() override { check_t3k_test_environment(); } -}; +class MeshDeviceReshapeRoundtripTest : public T3KTestFixture, + public ::testing::WithParamInterface> {}; -TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { +TEST_P(MeshDeviceReshapeRoundtripTest, ReshapeBetweenConfigurations) { const auto& [old_shape, new_shape] = GetParam(); if ((old_shape.num_rows * old_shape.num_cols) != (new_shape.num_rows * new_shape.num_cols)) { @@ -86,8 +87,8 @@ TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { GTEST_SKIP() << "Old shape is 1xN or Nx1; we test this in From1x4To2x2Invalid"; } - auto mesh = ttnn::distributed::open_mesh_device( - {old_shape.num_rows, old_shape.num_cols}, + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(old_shape.num_rows, old_shape.num_cols)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, @@ -112,17 +113,14 @@ TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { // Generate all possible combinations of shapes from kMeshShapes INSTANTIATE_TEST_SUITE_P( - ReshapeConfigurations, - MeshReshapeTest, + AllMeshShapes, + MeshDeviceReshapeRoundtripTest, ::testing::Combine(::testing::ValuesIn(kMeshShapes), ::testing::ValuesIn(kMeshShapes))); // Base class for non-parameterized tests -class T3000ReshapeTest : public ::testing::Test { -protected: - void SetUp() override { check_t3k_test_environment(); } -}; +using MeshDeviceReshapeTest = T3KTestFixture; -TEST_F(T3000ReshapeTest, InvalidRequestedShape) { +TEST_F(MeshDeviceReshapeTest, InvalidRequestedShape) { auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); // Shape too big. 
@@ -144,9 +142,13 @@ TEST_F(T3000ReshapeTest, InvalidRequestedShape) { MeshDeviceConfig{.mesh_shape = SimpleMeshShape(8), .offset = MeshCoordinate(1)})); } -TEST_F(T3000ReshapeTest, InvalidReshapeDimensions) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, InvalidReshapeDimensions) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); // Test reshaping to dimensions that don't match total device count EXPECT_THROW(mesh->reshape({3, 3}), std::runtime_error); // 9 devices != 8 @@ -157,9 +159,13 @@ TEST_F(T3000ReshapeTest, InvalidReshapeDimensions) { EXPECT_EQ(mesh->num_cols(), 8); } -TEST_F(T3000ReshapeTest, From1x8To2x4ThenBackTo1x8) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, From1x8To2x4ThenBackTo1x8) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); EXPECT_EQ(mesh->num_rows(), 1); EXPECT_EQ(mesh->num_cols(), 8); @@ -187,9 +193,13 @@ TEST_F(T3000ReshapeTest, From1x8To2x4ThenBackTo1x8) { EXPECT_EQ(mesh->get_device_ids(), original_order); } -TEST_F(T3000ReshapeTest, InvalidTotalDeviceCount) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, InvalidTotalDeviceCount) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); // Test reshaping to dimensions that don't match total device count EXPECT_THROW(mesh->reshape({3, 3}), std::runtime_error); // 9 devices != 8 @@ -200,15 +210,19 @@ TEST_F(T3000ReshapeTest, InvalidTotalDeviceCount) { EXPECT_EQ(mesh->num_cols(), 8); } -TEST_F(T3000ReshapeTest, From1x4To2x2Invalid) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, From1x4To2x2Invalid) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 4)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); // This is an invalid reshape because the 1x4 mesh does not fully cover the 2x2 mesh EXPECT_THROW(mesh->reshape({2, 2}), std::runtime_error); } -TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { +TEST_F(MeshDeviceReshapeTest, From1x4To2x2Valid) { auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); // Fetch the device ids for a physically connected 2x2 mesh. @@ -218,14 +232,12 @@ TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { // Supply the physical device ids to the mesh constructor that we know we know is 2x2 physically connected. // We will create a 1x4 mesh and then reshape it to 2x2. 
- auto mesh = ttnn::distributed::open_mesh_device( - {1, 4}, + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 4), .physical_device_ids = physical_device_ids}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, - tt::tt_metal::DispatchCoreType::WORKER, - MeshOffset{0, 0}, - physical_device_ids); + tt::tt_metal::DispatchCoreType::WORKER); mesh->reshape({2, 2}); EXPECT_EQ(mesh->num_rows(), 2); @@ -236,9 +248,13 @@ TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { } } -TEST_F(T3000ReshapeTest, From2x2To1x4) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 2}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, From2x2To1x4) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 2)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); auto mesh_2x2_device_ids = mesh->get_device_ids(); @@ -258,4 +274,4 @@ TEST_F(T3000ReshapeTest, From2x2To1x4) { } } // namespace -} // namespace ttnn::distributed::test +} // namespace tt::tt_metal::distributed diff --git a/tests/ttnn/CMakeLists.txt b/tests/ttnn/CMakeLists.txt index 3117e6b8920..7e3c43ea023 100644 --- a/tests/ttnn/CMakeLists.txt +++ b/tests/ttnn/CMakeLists.txt @@ -25,5 +25,4 @@ function(setup_ttnn_test_target target_name) ) endfunction() -add_subdirectory(distributed) add_subdirectory(unit_tests/gtests) diff --git a/tests/ttnn/distributed/CMakeLists.txt b/tests/ttnn/distributed/CMakeLists.txt deleted file mode 100644 index 5823925eec3..00000000000 --- a/tests/ttnn/distributed/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -add_executable( - test_distributed - test_distributed.cpp - test_distributed_reshape.cpp -) -add_executable(test_distributed_atexit test_distributed_atexit.cpp) - -# Set up properties for the target -setup_ttnn_test_target(test_distributed) -setup_ttnn_test_target(test_distributed_atexit) -# Add test to CTest -add_test(NAME test_distributed COMMAND test_distributed) -add_test(NAME test_distributed_atexit COMMAND test_distributed_atexit) diff --git a/tests/ttnn/distributed/test_distributed.cpp b/tests/ttnn/distributed/test_distributed.cpp deleted file mode 100644 index ee9d2f83fb4..00000000000 --- a/tests/ttnn/distributed/test_distributed.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include - -#include -#include -#include "ttnn/distributed/types.hpp" - -namespace ttnn::distributed::test { - -using ::testing::IsEmpty; -using ::testing::SizeIs; -using ::tt::tt_metal::distributed::MeshContainer; - -class DistributedTest : public ::testing::Test { -protected: - void SetUp() override {} - void TearDown() override {} -}; - -TEST_F(DistributedTest, TestSystemMeshTearDownWithoutClose) { - auto& sys = SystemMesh::instance(); - auto mesh = ttnn::distributed::open_mesh_device( - /*mesh_shape=*/{2, 4}, - DEFAULT_L1_SMALL_SIZE, - DEFAULT_TRACE_REGION_SIZE, - 1, - tt::tt_metal::DispatchCoreType::WORKER); - - const auto system_shape = sys.get_shape(); - ASSERT_EQ(system_shape.dims(), 2); - EXPECT_EQ(system_shape[0], 2); - EXPECT_EQ(system_shape[1], 4); -} - -TEST_F(DistributedTest, TestMemoryAllocationStatistics) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - auto stats = mesh->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); - for (auto* device : mesh->get_devices()) { - auto device_stats = device->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); - EXPECT_EQ(stats.total_allocatable_size_bytes, device_stats.total_allocatable_size_bytes); - } -} - -TEST_F(DistributedTest, TestNumDramChannels) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - EXPECT_EQ(mesh->num_dram_channels(), 96); // 8 devices * 12 channels -} - -TEST_F(DistributedTest, ViewIs2D) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - std::vector devices = mesh->get_devices(); - - MeshContainer container_1d(SimpleMeshShape(8), devices); - MeshDeviceView view_1d(container_1d); - EXPECT_FALSE(view_1d.is_mesh_2d()); - - MeshContainer container_2d(SimpleMeshShape(2, 4), devices); - MeshDeviceView view_2d(container_2d); - EXPECT_TRUE(view_2d.is_mesh_2d()); - - MeshContainer container_3d(SimpleMeshShape(2, 2, 2), devices); - MeshDeviceView view_3d(container_3d); - EXPECT_FALSE(view_3d.is_mesh_2d()); -} - -TEST_F(DistributedTest, Submesh) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - - EXPECT_EQ(mesh->shape().num_rows, 2); - EXPECT_EQ(mesh->shape().num_cols, 4); - EXPECT_THAT(mesh->get_devices(), SizeIs(8)); - EXPECT_TRUE(mesh->is_parent_mesh()); - EXPECT_THAT(mesh->get_submeshes(), IsEmpty()); - - auto submesh = mesh->create_submesh(MeshShape{1, 2}, MeshOffset{1, 1}); - EXPECT_THAT(mesh->get_submeshes(), SizeIs(1)); - EXPECT_EQ(submesh->shape().num_rows, 1); - EXPECT_EQ(submesh->shape().num_cols, 2); - EXPECT_THAT(submesh->get_devices(), SizeIs(2)); - EXPECT_FALSE(submesh->is_parent_mesh()); - EXPECT_THAT(submesh->get_submeshes(), IsEmpty()); - - // Verify coordinates are correct. 
- EXPECT_EQ(mesh->get_device(MeshCoordinate{1, 1})->id(), submesh->get_device(MeshCoordinate{0, 0})->id()); - EXPECT_EQ(mesh->get_device(MeshCoordinate{1, 2})->id(), submesh->get_device(MeshCoordinate{0, 1})->id()); - EXPECT_EQ(submesh->get_device(1, 1), nullptr); - -} // namespace ttnn::distributed::test -} // namespace ttnn::distributed::test diff --git a/tests/ttnn/distributed/test_distributed_atexit.cpp b/tests/ttnn/distributed/test_distributed_atexit.cpp deleted file mode 100644 index 6d4461f7386..00000000000 --- a/tests/ttnn/distributed/test_distributed_atexit.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include -#include - -namespace ttnn::distributed::test { - -// Simplified test without fixture, and mesh variable moved inside test -TEST(DistributedTestStandalone, TestSystemMeshTearDownWithoutClose) { - static std::shared_ptr mesh; - auto& sys = tt::tt_metal::distributed::SystemMesh::instance(); - mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - - const auto system_shape = sys.get_shape(); - ASSERT_EQ(system_shape.dims(), 2); - EXPECT_EQ(system_shape[0], 2); - EXPECT_EQ(system_shape[1], 4); -} - -} // namespace ttnn::distributed::test diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index 7d96a44a239..9dce6002708 100644 --- a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -142,7 +142,7 @@ if(BUILD_PROGRAMMING_EXAMPLES) endif() # Allow internal files to access the public API "by default" and without the -# scoping that external consumers must use. Scaoping may still be used if desired. +# scoping that external consumers must use. Scoping may still be used if desired. include_directories( api api/tt-metalium diff --git a/tt_metal/api/tt-metalium/system_mesh.hpp b/tt_metal/api/tt-metalium/system_mesh.hpp index 1ee91588dcc..f904de46044 100644 --- a/tt_metal/api/tt-metalium/system_mesh.hpp +++ b/tt_metal/api/tt-metalium/system_mesh.hpp @@ -9,7 +9,7 @@ #include "mesh_config.hpp" #include "mesh_coord.hpp" - +#include "indestructible.hpp" namespace tt::tt_metal::distributed { // SystemMesh creates a virtualization over the physical devices in the system. 
@@ -21,6 +21,8 @@ class SystemMesh { std::unique_ptr pimpl_; SystemMesh(); + friend class tt::stl::Indestructible; + public: static SystemMesh& instance(); SystemMesh(const SystemMesh&) = delete; @@ -28,12 +30,13 @@ class SystemMesh { SystemMesh(SystemMesh&&) = delete; SystemMesh& operator=(SystemMesh&&) = delete; + // Returns the shape of the system mesh const SimpleMeshShape& get_shape() const; - // Gets the physical device ID for a given logical row and column index + // Returns the physical device ID for a given logical row and column index chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; - // Get the physical device IDs mapped to a MeshDevice + // Returns the physical device IDs mapped to a MeshDevice std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; }; diff --git a/tt_metal/common/mesh_coord.cpp b/tt_metal/common/mesh_coord.cpp index 19dab608c35..88f4309cd90 100644 --- a/tt_metal/common/mesh_coord.cpp +++ b/tt_metal/common/mesh_coord.cpp @@ -82,9 +82,12 @@ bool operator==(const MeshCoordinate& lhs, const MeshCoordinate& rhs) { bool operator!=(const MeshCoordinate& lhs, const MeshCoordinate& rhs) { return !(lhs == rhs); } std::ostream& operator<<(std::ostream& os, const MeshCoordinate& coord) { - os << "MeshCoordinate(" << coord.dims() << ", ["; - for (size_t dim : coord.coords()) { - os << dim << ", "; + os << "MeshCoordinate(["; + for (size_t i = 0; i < coord.dims(); ++i) { + if (i > 0) { + os << ", "; + } + os << coord[i]; } os << "])"; return os; diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 7190e8e3806..80535e32674 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -69,9 +69,7 @@ MeshDevice::ScopedDevices::ScopedDevices( size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, const MeshDeviceConfig& config) { - auto& system_mesh = SystemMesh::instance(); - auto physical_device_ids = system_mesh.request_available_devices(config); - + auto physical_device_ids = SystemMesh::instance().request_available_devices(config); opened_devices_ = tt::tt_metal::detail::CreateDevices( physical_device_ids, num_command_queues, l1_small_size, trace_region_size, dispatch_core_config); diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index b2eff3b89d2..10a20b6e433 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -8,6 +8,7 @@ #include "umd/device/types/cluster_descriptor_types.h" #include "tt_metal/distributed/coordinate_translation.hpp" +#include "indestructible.hpp" #include "mesh_coord.hpp" #include "tt_cluster.hpp" @@ -29,8 +30,6 @@ class SystemMesh::Impl { const SimpleMeshShape& get_shape() const; std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; - - IDevice* get_device(const chip_id_t physical_device_id) const; chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; }; @@ -128,7 +127,7 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me auto line_length = config.mesh_shape.mesh_size(); for (const auto& logical_coordinate : MeshDeviceView::get_line_coordinates(line_length, shape_2d)) { - auto physical_device_id = logical_to_device_id_.at(logical_coordinate); + auto physical_device_id = get_physical_device_id(logical_coordinate); 
physical_device_ids.push_back(physical_device_id); log_debug( @@ -176,14 +175,9 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me MeshCoordinateRange system_range(system_offset, MeshCoordinate(end_coord)); for (const auto& system_coord : system_range) { - auto physical_device_id = logical_to_device_id_.find(system_coord); - TT_FATAL( - physical_device_id != logical_to_device_id_.end(), - "Logical coordinate: {} not found in SystemMesh of shape {}", - system_coord, - logical_mesh_shape_); - physical_device_ids.push_back(physical_device_id->second); - log_debug(LogMetal, "Logical coordinate: {}, Physical device ID: {}", system_coord, physical_device_id->second); + auto physical_device_id = get_physical_device_id(system_coord); + physical_device_ids.push_back(physical_device_id); + log_debug(LogMetal, "Logical coordinate: {}, Physical device ID: {}", system_coord, physical_device_id); } return physical_device_ids; } @@ -201,11 +195,11 @@ std::vector SystemMesh::Impl::request_available_devices(const MeshDev SystemMesh::SystemMesh() : pimpl_(std::make_unique()) {} SystemMesh& SystemMesh::instance() { - static SystemMesh instance; - if (!instance.pimpl_->is_system_mesh_initialized()) { - instance.pimpl_->initialize(); + static tt::stl::Indestructible instance; + if (!instance.get().pimpl_->is_system_mesh_initialized()) { + instance.get().pimpl_->initialize(); } - return instance; + return instance.get(); } chip_id_t SystemMesh::get_physical_device_id(const MeshCoordinate& coord) const { From c05fd330528603ef5cfdc3ac58df94822670e620 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Sun, 23 Feb 2025 16:32:11 -0800 Subject: [PATCH 246/316] Remove `test_common.hpp` from public API (#18169) --- .../tt_metal/test_utils}/test_common.hpp | 4 ++-- tests/tt_metal/tt_metal/CMakeLists.txt | 1 + .../test_dram_read_remote_cb.cpp | 2 ++ .../test_remote_cb_sync_matmul.cpp | 2 ++ .../perf_microbenchmark/1_compute_mm/test_compute_mm.cpp | 2 ++ .../2_noc_adjacent/test_noc_adjacent.cpp | 2 ++ .../perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp | 2 ++ .../3_pcie_transfer/test_pull_from_pcie.cpp | 2 ++ .../perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp | 2 ++ .../6_dram_offchip/test_dram_offchip.cpp | 2 ++ .../7_kernel_launch/test_kernel_launch.cpp | 2 ++ .../8_dram_adjacent_core_read/test_dram_read.cpp | 2 ++ .../test_dram_read_l1_write.cpp | 2 ++ tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt | 1 + .../perf_microbenchmark/dispatch/test_bw_and_latency.cpp | 2 +- .../perf_microbenchmark/dispatch/test_dispatcher.cpp | 2 ++ .../perf_microbenchmark/dispatch/test_pgm_dispatch.cpp | 2 +- .../perf_microbenchmark/dispatch/test_prefetcher.cpp | 2 ++ .../perf_microbenchmark/old/matmul/matmul_global_l1.cpp | 2 +- .../perf_microbenchmark/old/matmul/matmul_local_l1.cpp | 2 +- .../old/noc/test_noc_read_global_l1.cpp | 2 +- .../perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp | 2 +- .../old/pcie/test_enqueue_rw_buffer.cpp | 2 +- .../perf_microbenchmark/old/pcie/test_rw_buffer.cpp | 2 +- .../perf_microbenchmark/old/pcie/test_rw_device_dram.cpp | 2 +- .../perf_microbenchmark/old/pcie/test_rw_device_l1.cpp | 2 +- .../routing/{test_common.hpp => routing_test_common.hpp} | 7 ++++--- .../perf_microbenchmark/routing/test_mux_demux.cpp | 1 + .../perf_microbenchmark/routing/test_mux_demux_2level.cpp | 1 + .../routing/test_tt_fabric_multi_hop_sanity.cpp | 1 + .../perf_microbenchmark/routing/test_tt_fabric_sanity.cpp | 1 + .../routing/test_tt_fabric_socket_sanity.cpp | 1 + 
.../tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp | 1 + .../perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp | 1 + .../perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp | 1 + .../routing/test_vc_loopback_tunnel.cpp | 1 + .../perf_microbenchmark/routing/test_vc_mux_demux.cpp | 1 + .../perf_microbenchmark/routing/test_vc_uni_tunnel.cpp | 1 + tests/tt_metal/tt_metal/test_interleaved_layouts.cpp | 2 ++ .../tt_metal/test_matmul_multi_core_multi_dram.cpp | 2 ++ tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp | 2 ++ tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp | 2 +- tt_metal/llrt/tt_cluster.cpp | 1 - tt_metal/llrt/tt_cluster.hpp | 1 - 44 files changed, 62 insertions(+), 18 deletions(-) rename {tt_metal/api/tt-metalium => tests/tt_metal/test_utils}/test_common.hpp (99%) rename tests/tt_metal/tt_metal/perf_microbenchmark/routing/{test_common.hpp => routing_test_common.hpp} (80%) diff --git a/tt_metal/api/tt-metalium/test_common.hpp b/tests/tt_metal/test_utils/test_common.hpp similarity index 99% rename from tt_metal/api/tt-metalium/test_common.hpp rename to tests/tt_metal/test_utils/test_common.hpp index 7a81c7a0732..dbcf2c50e25 100644 --- a/tt_metal/api/tt-metalium/test_common.hpp +++ b/tests/tt_metal/test_utils/test_common.hpp @@ -23,7 +23,7 @@ template constexpr std::false_type always_false{}; template -T parse(std::string const& s) { +T parse(const std::string& s) { if constexpr (std::is_same_v) { return std::stoul(s, 0, 0); } else if constexpr (std::is_same_v) { @@ -39,7 +39,7 @@ T parse(std::string const& s) { } } -inline std::string strip(std::string const& s) { +inline std::string strip(const std::string& s) { std::string whitespace = " \t\n"; std::size_t start = s.find_first_not_of(whitespace); std::size_t end = s.find_last_not_of(whitespace); diff --git a/tests/tt_metal/tt_metal/CMakeLists.txt b/tests/tt_metal/tt_metal/CMakeLists.txt index bafab7885dd..9065d45acd5 100644 --- a/tests/tt_metal/tt_metal/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/CMakeLists.txt @@ -48,6 +48,7 @@ foreach(TEST_SRC ${TT_METAL_TESTS_SRCS}) PRIVATE "$" ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/test_utils ${CMAKE_CURRENT_SOURCE_DIR} ) set_target_properties( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp index ff359239b1e..4ab8453a76d 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp @@ -28,6 +28,8 @@ #include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include +#include "test_common.hpp" + using std::vector; using namespace tt; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp index 16ceb8092cd..784f8814af0 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp @@ -27,6 +27,8 @@ #include "tt_metal/test_utils/deprecated/tensor.hpp" #include "tt_metal/tt_metal/common/matmul_test_utils.hpp" +#include "test_common.hpp" + 
using std::vector; using namespace tt; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp index 38b82e910e7..24382b4ff73 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp @@ -28,6 +28,8 @@ #include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include +#include "test_common.hpp" + using std::vector; using namespace tt; //////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp index 11944860693..a877ef09d0a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp @@ -13,6 +13,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp index 661d0018769..27cb5adcff2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp @@ -14,6 +14,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp index 8d83a1b175b..9e7ff0e7f05 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp @@ -16,6 +16,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp index 306c3463bd3..6ce45cc0efe 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp @@ -17,6 +17,8 @@ #include "tt_cluster.hpp" #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp index df8fe9407aa..3a9589bc218 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp @@ -19,6 +19,8 @@ #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" #include +#include 
"test_common.hpp" + using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp index 9889aa430b9..2bc2d18553f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp @@ -13,6 +13,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp index d40e9384635..554c85e559c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp @@ -21,6 +21,8 @@ #include #include +#include "test_common.hpp" + using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp index 301ceea8c21..9340465fe2c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp @@ -24,6 +24,8 @@ #include #include +#include "test_common.hpp" + using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt index e4178cba02b..598e4125424 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt @@ -79,6 +79,7 @@ foreach(arch ${ARCHITECTURES}) "$" ${PROJECT_SOURCE_DIR}/ttnn/cpp/ttnn/deprecated # this all should go away and be replaced with link to ttnn ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/test_utils ${CMAKE_CURRENT_SOURCE_DIR} ) set_target_properties( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 3053fd4c7ed..31f7c2296ed 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -12,7 +12,7 @@ #include "logger.hpp" #include #include -#include +#include "test_common.hpp" #include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp index e751187a2ab..d0f0fea005b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp @@ -14,6 +14,8 @@ #include #include "common.h" +#include "test_common.hpp" + constexpr uint32_t DEFAULT_ITERATIONS = 10000; constexpr uint32_t DEFAULT_WARMUP_ITERATIONS = 100; constexpr 
uint32_t DEFAULT_DISPATCH_BUFFER_LOG_PAGE_SIZE = 12; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index b9e3aaaf083..0d9c0eefd8f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index 0b1dc88bec3..0c6b581e7c3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -21,6 +21,8 @@ #include #include "llrt.hpp" +#include "test_common.hpp" + #define CQ_PREFETCH_CMD_BARE_MIN_SIZE tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST) constexpr uint32_t DEFAULT_TEST_TYPE = 0; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp index 13eb1015602..73c0fb19225 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include #include "dprint_server.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp index b15d222a21d..acef9bfcd07 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp index 24580476130..20ce9327a65 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp index a08ec04c278..9ae53cb1e28 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp index caa962ab89e..da12baa481f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp 
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include "test_common.hpp" #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp index 714e0b2af26..c1f5b1426f9 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include "test_common.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp index 4ab4568663b..89dedffba0a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include "test_common.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp index 04ae58dc362..844d2e4bb9e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include "test_common.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/routing_test_common.hpp similarity index 80% rename from tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp rename to tests/tt_metal/tt_metal/perf_microbenchmark/routing/routing_test_common.hpp index ad6c6eff13b..1dcd801b127 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/routing_test_common.hpp @@ -17,7 +17,8 @@ static inline std::string to_string(pkt_dest_size_choices_t choice) { } } -static inline void log_phys_coord_to_json(nlohmann::json& config, const std::vector& phys_cores, const std::string& name) { +static inline void log_phys_coord_to_json( + nlohmann::json& config, const std::vector& phys_cores, const std::string& name) { for (int i = 0; i < phys_cores.size(); ++i) { config[fmt::format("{}_{}", name, i)] = fmt::format("({}, {})", phys_cores[i].x, phys_cores[i].y); } @@ -28,9 +29,9 @@ static inline void log_phys_coord_to_json(nlohmann::json& config, const CoreCoor } inline uint64_t get_64b_result(uint32_t* buf, uint32_t index) { - return (((uint64_t)buf[index]) << 32) | buf[index+1]; + return (((uint64_t)buf[index]) << 32) | buf[index + 1]; } inline uint64_t get_64b_result(const std::vector& vec, uint32_t index) { - return (((uint64_t)vec[index]) << 32) | vec[index+1]; + return (((uint64_t)vec[index]) << 32) | vec[index + 1]; } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp index f267a746382..eda89407079 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp @@ -9,6 +9,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" 
+#include "routing_test_common.hpp" #include "llrt.hpp" using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp index dc4a8f132fd..2834227a93e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp @@ -8,6 +8,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "llrt.hpp" using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index bacca186d10..00761a5843a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -11,6 +11,7 @@ // #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "eth_l1_address_map.h" #include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index f495c0b5e7b..c6d48b3f670 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -13,6 +13,7 @@ //#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "eth_l1_address_map.h" #include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp index b6a5e0182c8..198246ce0da 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp @@ -11,6 +11,7 @@ // #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "eth_l1_address_map.h" #include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp index a645b972fa6..d8a5c7263bd 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp @@ -8,6 +8,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "utils.hpp" #include "llrt.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp index 99d271f3ce0..bfaaadb2a0c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp index 8c70290d9c3..23a4e9db4f7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp index 0b9cf4ae5b4..c34eea39242 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp index 805ea48ca01..28a89013e54 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp @@ -9,6 +9,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp index 32d69fb8586..b4c37a1ff14 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp @@ -9,6 +9,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp index f6ffce0e797..9cb9cf85c0c 100644 --- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp +++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp @@ -15,6 +15,8 @@ #include "dprint_server.hpp" +#include "test_common.hpp" + ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp index 2b11027b701..2affd969e68 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp @@ -13,6 +13,8 @@ #include #include +#include "test_common.hpp" + 
////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp index 2d457de3e58..b50fdd0f708 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp @@ -12,6 +12,8 @@ #include "tt_metal/test_utils/deprecated/tensor.hpp" #include +#include "test_common.hpp" + ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp index 2ab7e642602..d69f71d3588 100644 --- a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp @@ -18,7 +18,7 @@ #include "logger.hpp" #include #include -#include +#include "test_common.hpp" #include #include #include diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index afa0a600254..b7cecc47732 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -23,7 +23,6 @@ #include "fmt/base.h" #include #include -#include #include #include "umd/device/types/arch.h" #include "umd/device/tt_cluster_descriptor.h" diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 1b54e3a1213..6f91b01300e 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -8,7 +8,6 @@ #include #include "metal_soc_descriptor.h" -#include "test_common.hpp" #include "tt_backend_api_types.hpp" #include "umd/device/device_api_metal.h" #include "umd/device/tt_cluster_descriptor.h" From c895538d56c9f207ae9f104bbe34d41cefdfe7c4 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Sun, 23 Feb 2025 20:29:31 -0500 Subject: [PATCH 247/316] Afuller/bisect patch (#18187) ### Ticket None ### Problem description Bisect is useful, but sometimes we need to adjust each trial. eg: by reverting a pre-existing breakage to find the hidden breakage. ### What's changed Added a `patch` option to have applied at each step of the bisect. 
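For example, the new option composes with the existing flags roughly as follows. This is an illustrative sketch only: `<fix-commit>` is a placeholder commit-ish, and the other values are borrowed from the script's own usage text.

```
# Illustrative only: <fix-commit> is a placeholder; the remaining arguments
# reuse the example values from the script's usage message.
./tests/scripts/tt_bisect.sh \
  -f ./build/test/tt_metal/test_add_two_ints \
  -b HEAD \
  -g 1eb7930 \
  -p <fix-commit> \
  -t 10m
```

At each bisect step the script cherry-picks `<fix-commit>` before building, then resets it away again after the test, so `git bisect good`/`bad` still walks the original history.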
--- .github/workflows/bisect-dispatch.yaml | 13 ++++++++----- tests/scripts/tt_bisect.sh | 21 +++++++++++++++++++-- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/.github/workflows/bisect-dispatch.yaml b/.github/workflows/bisect-dispatch.yaml index dce44222ea7..3905c86cd0c 100644 --- a/.github/workflows/bisect-dispatch.yaml +++ b/.github/workflows/bisect-dispatch.yaml @@ -50,12 +50,12 @@ on: required: true type: string description: "Timeout (eg: 5m, 1h)" - description: - type: string - default: "Git bisect dispatch" + patch: required: false + type: string + description: "Commit-ish to cherry-pick for each step" -run-name: ${{ inputs.description }} +run-name: "Bisect on ${{ inputs.runner-label }}" jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml @@ -88,8 +88,11 @@ jobs: - uses: ./.github/actions/install-python-deps - name: Run Git Bisect shell: bash + env: + GIT_COMMITTER_NAME: "GitHub Actions" + GIT_COMMITTER_EMAIL: "actions@github.com" run: | source ${{ github.workspace }}/python_env/bin/activate cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME - ./tests/scripts/tt_bisect.sh -t ${{ inputs.timeout }} -f "${{ inputs.command }}" -b ${{ inputs.bad-commit }} -g ${{ inputs.good-commit }} + ./tests/scripts/tt_bisect.sh -t ${{ inputs.timeout }} -f "${{ inputs.command }}" -b ${{ inputs.bad-commit }} -g ${{ inputs.good-commit }} -p "${{ inputs.patch }}" diff --git a/tests/scripts/tt_bisect.sh b/tests/scripts/tt_bisect.sh index 5304803d18b..077656456d8 100755 --- a/tests/scripts/tt_bisect.sh +++ b/tests/scripts/tt_bisect.sh @@ -7,6 +7,8 @@ Flags: -f | --file : test file to run, also the test that broke -g | --good : good commit to start bisect -b | --bad : bad commit to start bisect + -p | --path : commit-ish to cherry-pick onto each commit before building + -t | --timeout : timeout duration for the test Example: ./tests/scripts/tt_bisect.sh -f ./build/test/tt_metal/test_add_two_ints -b HEAD -g 1eb7930 If the test involves multiple words you have to do "test_file": @@ -19,7 +21,8 @@ source python_env/bin/activate export PYTHONPATH=$TT_METAL_HOME timeout_duration=2m -while getopts "f:g:b:t:" opt; do +patch="" +while getopts "f:g:b:t:p:" opt; do case $opt in f | file) test=$OPTARG @@ -33,6 +36,9 @@ while getopts "f:g:b:t:" opt; do t | timeout) timeout_duration=$OPTARG ;; + p | patch) + patch=$OPTARG + ;; \?) echo "Invalid option: -$OPTARG" >&2 exit 1 @@ -48,14 +54,20 @@ fi echo "Time to find who broke it :)" echo "Good commit:" $good_commit echo "Bad commit:" $bad_commit +if ([ ! -z "$patch" ]); then + echo "Cherry-pick commit:" $patch +fi found=false git bisect start $bad_commit $good_commit -- while [[ "$found" = "false" ]]; do - git submodule update --recursive echo "::group::Building `git rev-parse HEAD`" + if ([ ! -z "$patch" ]); then + git cherry-pick $patch + fi + git submodule update --recursive build_rc=0 ./build_metal.sh --build-tests > /dev/null || build_rc=$? echo "::endgroup::" @@ -70,6 +82,11 @@ while [[ "$found" = "false" ]]; do timeout_rc=0 timeout "$timeout_duration" bash -c "$test" || timeout_rc=$? echo "Exit code: $timeout_rc" + + if ([ ! 
-z "$patch" ]); then + # Must reset HEAD or git bisect good/bad will retry the merge base and we'll be stuck in a loop + git reset --hard HEAD^ + fi echo "::endgroup::" if [ $timeout_rc -eq 0 ]; then From d8837b68b0c0a0650bc66999dfdfb3446e2637f3 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Sun, 23 Feb 2025 23:23:34 -0500 Subject: [PATCH 248/316] Dockerize TGG Unit Tests (#18189) ### Ticket #18188 ### Problem description This workflow was limited to the OS of the host machine. ### What's changed Dockerized the workflow. ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13487539999) - [x] TGG Unit Tests [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13488696140) --- .github/workflows/tgg-unit-tests-impl.yaml | 65 ++++++++++++++++++---- .github/workflows/tgg-unit-tests.yaml | 5 ++ tests/scripts/run_tests.sh | 8 ++- 3 files changed, 65 insertions(+), 13 deletions(-) diff --git a/.github/workflows/tgg-unit-tests-impl.yaml b/.github/workflows/tgg-unit-tests-impl.yaml index 140230c82b2..22a56b63189 100644 --- a/.github/workflows/tgg-unit-tests-impl.yaml +++ b/.github/workflows/tgg-unit-tests-impl.yaml @@ -2,6 +2,13 @@ name: "[internal] TGG unit tests impl" on: workflow_call: + inputs: + docker-image: + required: true + type: string + wheel-artifact-name: + required: true + type: string jobs: TGG-tests: @@ -17,26 +24,60 @@ jobs: }, ] name: ${{ matrix.test-group.name }} - env: - ARCH_NAME: ${{ matrix.test-group.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib runs-on: ${{ matrix.test-group.runs-on }} + container: + image: ${{ inputs.docker-image }} + env: + TT_METAL_HOME: /work + PYTHONPATH: /work + LD_LIBRARY_PATH: /work/build/lib + LOGURU_LEVEL: INFO + ARCH_NAME: ${{ matrix.test-group.arch }} + volumes: + - ${{ github.workspace }}/docker-job:/work # Subdir to workaround https://github.com/actions/runner/issues/691 + - /dev/hugepages-1G:/dev/hugepages-1G + - /mnt/MLPerf:/mnt/MLPerf + options: "--device /dev/tenstorrent" + defaults: + run: + shell: bash + working-directory: /work # https://github.com/actions/runner/issues/878 steps: - - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - name: ⬇️ Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end - uses: actions/download-artifact@v4 with: name: TTMetal_build_any + path: /work - name: Extract files run: tar -xvf ttm_any.tar - - uses: ./.github/actions/install-python-deps + - name: ⬇️ Download Wheel + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.wheel-artifact-name }} + path: /work + - name: Install Wheel + run: | + WHEEL_FILENAME=$(ls -1 *.whl) + pip3 install $WHEEL_FILENAME - name: Run unit regression tests timeout-minutes: 60 run: | - source ${{ github.workspace }}/python_env/bin/activate - cd $TT_METAL_HOME - export PYTHONPATH=$TT_METAL_HOME + set -x + pwd + echo $PYTHONPATH + ls -al ${{ matrix.test-group.cmd }} + - name: Cleanup + if: always() + run: | + # We are forced to checkout the repo into a subdir of the host's workdir; this pollutes the host + # with root-owned files. Be sure to clean up after ourselves in case we're on a non-ephemeral runner. 
+ echo "pre rm" + ls -al /__w/tt-metal/tt-metal + rm -rf /__w/tt-metal/tt-metal/docker-job + echo "post rm" + ls -al /__w/tt-metal/tt-metal diff --git a/.github/workflows/tgg-unit-tests.yaml b/.github/workflows/tgg-unit-tests.yaml index 6c42ff61f4f..9d1bba42a64 100644 --- a/.github/workflows/tgg-unit-tests.yaml +++ b/.github/workflows/tgg-unit-tests.yaml @@ -9,7 +9,12 @@ jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml secrets: inherit + with: + build-wheel: true TGG-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/tgg-unit-tests-impl.yaml + with: + docker-image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} + wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }} diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 0f4d4480a11..a048cd440c5 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -431,7 +431,13 @@ set_up_chdir() { return fi done - echo "Could not find the 'tt-metal' directory in your PYTHONPATH." 1>&2 + for ENTRY in "${ENTRIES[@]}"; do + if [[ -d "$ENTRY/tt_metal" ]]; then + cd "$ENTRY" + return + fi + done + echo "Could not find the 'tt-metal' directory in your PYTHONPATH." 1>&2 exit 1 } From a64bb70f0801ea93e01371c206dd6fcdf8c065fa Mon Sep 17 00:00:00 2001 From: pjosipovic Date: Sun, 23 Feb 2025 16:27:19 +0000 Subject: [PATCH 249/316] Add tensor cache to conv2d UT --- tests/ttnn/unit_tests/operations/test_new_conv2d.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index c9e6e60576e..f1aa3faa084 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -2676,7 +2676,7 @@ def test_shallow_conv_with_tiled_input(device): @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize("tiled_input", [True, False]) @pytest.mark.parametrize("input_on_device", [True, False]) -def test_dram_input_mm_conv(device, tiled_input, input_on_device): +def test_dram_input_mm_conv(device, torch_tensor_map, tiled_input, input_on_device): batch_size = 1 out_channels, in_channels = 256, 1024 img_h, img_w = 128, 128 @@ -2689,10 +2689,10 @@ def test_dram_input_mm_conv(device, tiled_input, input_on_device): pad = (0, 0) kernel_shape = (out_channels, in_channels, kernel_h, kernel_w) - torch_kernel = torch.randn(kernel_shape, dtype=torch.bfloat16) + torch_kernel = randomize_torch_tensor(torch_tensor_map, kernel_shape) tt_kernel = ttnn.from_torch(torch_kernel) - torch_input = torch.randn(input_shape, dtype=torch.bfloat16) + torch_input = randomize_torch_tensor(torch_tensor_map, input_shape) if input_on_device: tt_input = ttnn.from_torch(torch_input, device=device) tt_input = ttnn.permute(tt_input, (0, 2, 3, 1)) From 4fb909f38abdc23c5c000d8679b66c3b33ff6bf7 Mon Sep 17 00:00:00 2001 From: Slavko Krstic Date: Mon, 24 Feb 2025 11:34:08 +0100 Subject: [PATCH 250/316] Enable Conv2d_Transposed tests for blackhole (#18194) - removed `skip_for_blackhole()` - removed unused code from `tests/ttnn/unit_tests/operations/test_conv_transpose2d.py` --- .../operations/test_conv_transpose2d.py | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py b/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py index 00285276dc5..c324250f237 100644 --- 
a/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py +++ b/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py @@ -9,28 +9,13 @@ from models.utility_functions import ( is_wormhole_b0, skip_for_grayskull, - is_grayskull, - is_wormhole_b0, - is_x2_harvested, - is_blackhole, - skip_for_blackhole, - is_blackhole, ) -from tests.ttnn.utils_for_testing import assert_with_pcc, check_with_pcc, check_with_pcc_without_tensor_printout +from tests.ttnn.utils_for_testing import check_with_pcc_without_tensor_printout import ttnn -import readline # optional, will allow Up/Down/History in the console -import code torch.set_printoptions(linewidth=400, profile="full", sci_mode=False) -def drop_to_interpreter(): - variables = globals().copy() - variables.update(locals()) - shell = code.InteractiveConsole(variables) - shell.interact() - - def run_conv_transpose2d( device, math_fidelity, @@ -178,7 +163,6 @@ def run_conv_transpose2d( assert passing -@skip_for_blackhole() @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 64 * 1024}], indirect=True) @pytest.mark.parametrize( @@ -234,8 +218,8 @@ def test_simple_conv_t2d( shard_layout, mirror_kernel, ): - if device.core_grid.y != 8: - pytest.skip("Needs 8x8 Grid") + if device.core_grid.y != 8 and is_wormhole_b0(): + pytest.skip("Needs 8x8 Grid for Wormhole_b0") run_conv_transpose2d( device, math_fidelity=ttnn.MathFidelity.HiFi4, From 42adc106c4018a4e455b577d90cfd2b232fd064b Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Mon, 24 Feb 2025 08:52:13 -0500 Subject: [PATCH 251/316] Apply Various 1D Fabric Optimizations - Improve Performance by ~500 MB/s for 4k packet size (#18186) Apply various small optimizations. The transformations and their performance deltas are listed below. Note that the measurements below are when -O3 is enabled for fabric kernel build, even though -Os is used in main. The reason for this is that -O3 will be enabled later this week - currently blocked by some dependencies - so this is the most representative performance delta. Baselining and measuring at -Os would not be representative. ``` Baseline unicast 112816548 -> 15.43 GB/s mcast 274540294 -> 12.68 GB/s # Cache noc addr: 110155221 -> 15.8 GB/s 276839301 -> 12.57 GB/s ## Flatten main loop sender, 1st branch nest: 107584162 unicast -> 16.18 GB/s 269844156 mcast -> 12.9 GB/s ## Flatten receiver last branch nest: 106827158 unicast -> 16.3 267551029 mcast -> 13.0 GB/s Swapping fwd vs local noc write order to do forwarding write first: 104042988 unicast -> 16.7 GB/s 258379905 mcast -> 13.47 GB/s ``` Note that the cached noc addr showed a minor perf degradation for mcast, although there is no reason it should cause a slow down. I did try dropping that commit but keeping the rest of the change sequence and saw a net perf degradation of 1-3% so I think the cached_noc_addr change was probably perturbing other code indirectly and causing a degradation. When applied as a last commit there is an improvement. 
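To make the "flatten branch nest" entries above concrete, the sketch below shows the before/after shape of the sender hot path. It is a self-contained illustration with stand-in state and helper names, not the actual EDM kernel interfaces.

```
// Minimal sketch of the branch-flattening transformation; all names here are
// illustrative stand-ins, not the real fabric EDM code.
#include <cstdio>

namespace {
// Stand-in guard state; the real code derives these from channel/queue state.
bool receiver_has_space = true;
bool eth_txq_busy = false;
bool has_unsent_packet = true;
bool sender_backpressured = false;

void send_next_data() { std::puts("send_next_data"); }

// Before: three nested early-exit branches in the hot loop.
bool try_send_nested() {
    if (receiver_has_space && !eth_txq_busy) {
        if (has_unsent_packet) {
            if (!sender_backpressured) {
                send_next_data();
                return true;
            }
        }
    }
    return false;
}

// After: compute the guards up front and branch once on the fused condition.
bool try_send_flat() {
    bool can_send =
        receiver_has_space && !eth_txq_busy && has_unsent_packet && !sender_backpressured;
    if (can_send) {
        send_next_data();
    }
    return can_send;
}
}  // namespace

int main() {
    std::printf("nested: %d, flat: %d\n", int(try_send_nested()), int(try_send_flat()));
    return 0;
}
```

The receiver-side completion path in the diff below gets the same reshaping: its nested flush/transaction-id checks are collapsed into a single `can_send_completion` condition.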
Update after rebase on top of @tt-aho's latest changes to routing fields in packet header: new numbers are mcast -> 13.81 GB/s, up from 13.3 GB/s. --- .../edm_fabric/fabric_erisc_datamover.cpp | 66 +++++++++---------- .../fabric_erisc_datamover_channels.hpp | 24 ++++--- 2 files changed, 46 insertions(+), 44 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index f80505d936d..e345fc70b8b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -633,13 +633,13 @@ FORCE_INLINE void receiver_forward_packet( // If the packet is a terminal packet, then we can just deliver it locally bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; uint16_t payload_size_bytes = packet_start->payload_size_bytes; - if (start_distance_is_terminal_value) { - execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); - } bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; if (not_last_destination_device) { forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); } + if (start_distance_is_terminal_value) { + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + } } else if constexpr (std::is_same_v) { uint32_t routing = cached_routing_fields.value & tt::fabric::LowLatencyRoutingFields::FIELD_MASK; uint16_t payload_size_bytes = packet_start->payload_size_bytes; @@ -682,25 +682,22 @@ FORCE_INLINE bool run_sender_channel_step( // when moving to stream regs to manage rd/wr ptrs // TODO: update to be stream reg based.
Initialize to space available and simply check for non-zero bool receiver_has_space_for_packet = outbound_to_receiver_channel_pointers.has_space_for_packet(); - if (receiver_has_space_for_packet && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) { - bool has_unsent_packet = local_sender_channel_worker_interface.has_unsent_payload(); - if (has_unsent_packet) { - bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS); - if (!sender_backpressured_from_sender_side) { - did_something = true; - auto packet_header = reinterpret_cast(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index())); - if constexpr (enable_packet_header_recording) { - tt::fabric::validate(*packet_header); - packet_header_recorder.record_packet_header(reinterpret_cast(packet_header)); - } - send_next_data( - local_sender_channel, - local_sender_channel_worker_interface, - outbound_to_receiver_channel_pointers, - remote_receiver_channel, - sender_channel_index); - } + bool has_unsent_packet = local_sender_channel_worker_interface.has_unsent_payload(); + bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS); + bool can_send = receiver_has_space_for_packet && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ) && has_unsent_packet && !sender_backpressured_from_sender_side; + if (can_send) { + did_something = true; + auto packet_header = reinterpret_cast(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index())); + if constexpr (enable_packet_header_recording) { + tt::fabric::validate(*packet_header); + packet_header_recorder.record_packet_header(reinterpret_cast(packet_header)); } + send_next_data( + local_sender_channel, + local_sender_channel_worker_interface, + outbound_to_receiver_channel_pointers, + remote_receiver_channel, + sender_channel_index); } // Process COMPLETIONs from receiver @@ -753,6 +750,7 @@ FORCE_INLINE bool run_sender_channel_step( } did_something = true; channel_connection_established = true; + local_sender_channel_worker_interface.cache_producer_noc_addr(); if constexpr (enable_first_level_ack) { local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_ackptr.get_ptr()); } else { @@ -848,19 +846,19 @@ FORCE_INLINE void run_receiver_channel_step( auto &wr_flush_ptr = receiver_channel_pointers.wr_flush_ptr; // Currently unclear if it's better to loop here or not... Also unclear if merging these // two pointers is better or not... 
Seems to be maybe 5-10% better merged but need more data - if (!wr_flush_ptr.is_caught_up_to(wr_sent_ptr) && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) { - auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); - bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); - if (next_trid_flushed) { - auto &completion_ptr = receiver_channel_pointers.completion_ptr; - wr_flush_ptr.increment(); - receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index); - receiver_send_completion_ack( - remote_eth_sender_wrptrs, - remote_sender_channnels, - completion_ptr, - local_receiver_channel); - } + bool unflushed_writes_and_eth_txq_not_busy = !wr_flush_ptr.is_caught_up_to(wr_sent_ptr) && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); + auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); + bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); + bool can_send_completion = unflushed_writes_and_eth_txq_not_busy && next_trid_flushed; + if (can_send_completion) { + auto &completion_ptr = receiver_channel_pointers.completion_ptr; + wr_flush_ptr.increment(); + receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index); + receiver_send_completion_ack( + remote_eth_sender_wrptrs, + remote_sender_channnels, + completion_ptr, + local_receiver_channel); } } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp index 4bf3cad530e..3c1801b294d 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp @@ -117,6 +117,7 @@ template struct EdmChannelWorkerInterface { EdmChannelWorkerInterface() : worker_location_info_ptr(nullptr), + cached_worker_semaphore_address(0), remote_producer_wrptr(nullptr), connection_live_semaphore(nullptr), local_wrptr(), @@ -134,6 +135,7 @@ struct EdmChannelWorkerInterface { volatile tt_l1_ptr uint32_t *const remote_producer_wrptr, volatile tt_l1_ptr uint32_t *const connection_live_semaphore) : worker_location_info_ptr(worker_location_info_ptr), + cached_worker_semaphore_address(0), remote_producer_wrptr(remote_producer_wrptr), connection_live_semaphore(connection_live_semaphore), local_wrptr(), @@ -155,14 +157,11 @@ struct EdmChannelWorkerInterface { } [[nodiscard]] FORCE_INLINE uint32_t get_worker_semaphore_address() const { - return worker_location_info_ptr->worker_semaphore_address; + return cached_worker_semaphore_address & 0xFFFFFFFF; } FORCE_INLINE void update_worker_copy_of_read_ptr(BufferPtr new_ptr_val) { - auto const &worker_info = *worker_location_info_ptr; - uint64_t worker_semaphore_address = get_noc_addr( - (uint32_t)worker_info.worker_xy.x, (uint32_t)worker_info.worker_xy.y, worker_info.worker_semaphore_address); - noc_inline_dw_write(worker_semaphore_address, new_ptr_val); + noc_inline_dw_write(this->cached_worker_semaphore_address, new_ptr_val); } // Connection management methods @@ -180,6 +179,15 @@ struct EdmChannelWorkerInterface { noc_semaphore_inc(worker_semaphore_address, 1); } + FORCE_INLINE void cache_producer_noc_addr() { + auto const &worker_info = *worker_location_info_ptr; + uint64_t worker_semaphore_address = get_noc_addr( + (uint32_t)worker_info.worker_xy.x, + (uint32_t)worker_info.worker_xy.y, + worker_info.worker_semaphore_address); + 
this->cached_worker_semaphore_address = worker_semaphore_address; + } + FORCE_INLINE bool all_eth_packets_acked() const { return this->local_ackptr.is_caught_up_to(this->local_wrptr); } @@ -187,15 +195,11 @@ struct EdmChannelWorkerInterface { return this->local_rdptr.is_caught_up_to(this->local_wrptr); } - // Call to keep the connection flow control info fresh with worker. - FORCE_INLINE void propagate_ackptr_to_connection_info() { - worker_location_info_ptr->edm_rdptr = local_ackptr.get_ptr(); - } - [[nodiscard]] FORCE_INLINE bool has_worker_teardown_request() const { return *connection_live_semaphore == tt::fabric::EdmToEdmSender<0>::close_connection_request_value; } [[nodiscard]] FORCE_INLINE bool connection_is_live() const { return *connection_live_semaphore == tt::fabric::EdmToEdmSender<0>::open_connection_value; } volatile EDMChannelWorkerLocationInfo *worker_location_info_ptr; + uint64_t cached_worker_semaphore_address = 0; volatile tt_l1_ptr uint32_t *const remote_producer_wrptr; volatile tt_l1_ptr uint32_t *const connection_live_semaphore; From 190547b5dcdbd12724b4717b40a72ac627a2196b Mon Sep 17 00:00:00 2001 From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com> Date: Mon, 24 Feb 2025 08:56:47 -0500 Subject: [PATCH 252/316] Revert "#14080: Preprocess weights for Conv2D on Device (#16750)" (#18203) This reverts commit 5a2c003f1ff928fa3766a5a4d96f81f3eb703b1e. This is to fix functional unet on single card nightly ttnn and conv2d sweeps. --- .../unit_tests/operations/test_new_conv2d.py | 39 +-- .../operations/test_prepare_conv_weights.py | 130 ++++++++ .../ttnn/operations/conv/conv2d/conv2d.cpp | 49 +-- .../operations/conv/conv2d/conv2d_pybind.cpp | 6 - .../operations/conv/conv2d/conv2d_utils.cpp | 7 +- .../conv/conv2d/device/conv2d_op.hpp | 11 - .../conv2d_op_sharded_program_factory.cpp | 151 ++------- .../conv/conv2d/prepare_conv2d_weights.cpp | 303 +----------------- .../conv/conv2d/prepare_conv2d_weights.hpp | 16 - .../pad/device/pad_program_factory.cpp | 15 +- .../ttnn/operations/data_movement/pad/pad.cpp | 12 +- 11 files changed, 190 insertions(+), 549 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index f1aa3faa084..471e2aa3817 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -58,7 +58,6 @@ def run_conv( config_override, dilation=1, use_shallow_conv_variant=False, - transpose_shards=True, # https://github.com/tenstorrent/tt-metal/issues/17897 fp32_accum=False, packer_l1_acc=False, output_layout=ttnn.TILE_LAYOUT, @@ -73,7 +72,6 @@ def run_conv( weight_mesh_mapper=None, output_mesh_composer=None, enable_split_reader=False, - preprocess_weights_on_device=True, ): if isinstance(device, ttnn.MeshDevice): assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh" @@ -93,7 +91,7 @@ def run_conv( torch_input_tensor = torch.permute(torch_input_tensor_nchw, (0, 2, 3, 1)) torch_weight_tensor = randomize_torch_tensor(torch_tensor_map, conv_weight_shape) - torch_bias_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) * 10 if has_bias else None + torch_bias_tensor = randomize_torch_tensor(torch_tensor_map, conv_bias_shape) if has_bias else None torch_out_golden_tensor = torch.nn.functional.conv2d( torch_input_tensor_nchw, @@ -136,9 +134,6 @@ def run_conv( enable_split_reader=enable_split_reader, enable_subblock_padding=False, output_layout=output_layout, - 
transpose_shards=transpose_shards, - preprocess_weights_on_device=preprocess_weights_on_device, - always_preprocess_weights=True, ) compute_config = ttnn.init_device_compute_kernel_config( device.arch(), @@ -158,7 +153,7 @@ def run_conv( conv_config.override_sharding_config = True print("Setting num_cores_nhw to 98") - [tt_output_tensor_on_device, [out_height, out_width], [d_w, d_b]] = ttnn.conv2d( + [tt_output_tensor_on_device, [out_height, out_width]] = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, in_channels=input_channels, @@ -179,8 +174,8 @@ def run_conv( groups=groups, memory_config=memory_config, return_output_dim=True, - return_weights_and_bias=True, ) + tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) torch_output_tensor = ttnn.to_torch(tt_output_tensor, mesh_composer=output_mesh_composer) @@ -196,8 +191,6 @@ def run_conv( if not fp32_accum: pcc = 0.985 - if input_channels * filter_height * filter_width > 10000: - pcc = 0.97 elif math_fidelity == ttnn.MathFidelity.LoFi and activations_dtype == ttnn.bfloat8_b: pcc = 0.996 else: @@ -391,9 +384,6 @@ def test_conv_features( if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat8_b: pytest.skip("Row major layout not compatible with bfloat8_b") - if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat16 and packer_l1_acc and fp32_accum: - pytest.skip("skipping due to pack_untilize_dst issue!") - run_conv( device, torch_tensor_map, @@ -417,7 +407,6 @@ def test_conv_features( has_bias=True, fp32_accum=fp32_accum, packer_l1_acc=packer_l1_acc, - preprocess_weights_on_device=True, ) @@ -789,7 +778,7 @@ def test_conv_for_segformer_512x512( ) @pytest.mark.parametrize( "weights_dtype", - [ttnn.bfloat16], + [ttnn.bfloat16, ttnn.bfloat8_b], ) @pytest.mark.parametrize( "activations_dtype", @@ -972,7 +961,6 @@ def test_resnet50_conv_wh( pad_w, config_override=config_override, use_shallow_conv_variant=use_shallow_conv_variant, - transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH packer_l1_acc=packer_l1_acc, fp32_accum=False, has_bias=has_bias, @@ -1034,7 +1022,6 @@ def test_conv_mem_config_wh( shard_layout=shard_layout, config_override=config_override, use_shallow_conv_variant=use_shallow_conv_variant, - transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH packer_l1_acc=True, fp32_accum=False, has_bias=True, @@ -1220,7 +1207,7 @@ def test_resnet50_conv_wh_fp32( ) @pytest.mark.parametrize( "weights_dtype", - [ttnn.bfloat16], + [ttnn.bfloat8_b], ) @pytest.mark.parametrize( "activations_dtype", @@ -1362,7 +1349,7 @@ def test_sd_conv( ) @pytest.mark.parametrize( "activations_dtype", - [ttnn.bfloat16], + [ttnn.bfloat16, ttnn.bfloat8_b], ) @pytest.mark.parametrize( "fp32_accum", @@ -1503,7 +1490,7 @@ def test_sd_conv_wh( ) @pytest.mark.parametrize( "weights_dtype", - [ttnn.bfloat16], + [ttnn.bfloat8_b], ) @pytest.mark.parametrize( "activations_dtype", @@ -1655,7 +1642,6 @@ def test_unet_conv_wh( config_override, shard_layout=shard_layout, use_shallow_conv_variant=use_shallow_conv_variant, - transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH output_layout=output_layout, auto_shard=auto_shard, ) @@ -1754,7 +1740,6 @@ def test_unet_conv_groups_2_wh( config_override, shard_layout=shard_layout, use_shallow_conv_variant=use_shallow_conv_variant, - transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH output_layout=output_layout, auto_shard=auto_shard, groups=groups, @@ -1852,7 +1837,6 @@ def 
test_unet_conv_groups_4_6_wh( config_override, shard_layout=shard_layout, use_shallow_conv_variant=use_shallow_conv_variant, - transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH output_layout=output_layout, groups=groups, ) @@ -1951,14 +1935,12 @@ def test_unet_conv_groups_8_wh( config_override, shard_layout=shard_layout, use_shallow_conv_variant=use_shallow_conv_variant, - transpose_shards=True, ## use RM (transpose_mcast=False) with 2D on WH output_layout=output_layout, auto_shard=auto_shard, groups=groups, ) -@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize( "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, config_override", @@ -2020,7 +2002,6 @@ def test_halo_reshard_conv( ) -@skip_for_grayskull() @pytest.mark.skip("New API needs to be tested") @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize( @@ -2262,7 +2243,6 @@ def test_conv_groups( ) -@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize( "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant, groups", @@ -2383,7 +2363,6 @@ def test_yolov4_conv_groups_larger_than_one( ) -@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize( " output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant, groups", @@ -2672,7 +2651,6 @@ def test_shallow_conv_with_tiled_input(device): # Tests running conv2d which maps to matmul w/o sharding the input tensor. # Output tensor is in DRAM. -@skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize("tiled_input", [True, False]) @pytest.mark.parametrize("input_on_device", [True, False]) @@ -2798,9 +2776,6 @@ def test_small_in_large_out_channels_auto_shard(device, torch_tensor_map): padding = (0, 0) height = 128 width = 128 - if device.core_grid.y != 8 and is_wormhole_b0(): - pytest.skip("Needs 8x8 grid for wormhole_b0") - run_conv( device, torch_tensor_map, diff --git a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py index 1543913a051..c71c5cfbd26 100644 --- a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py +++ b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py @@ -196,3 +196,133 @@ def test_prepare_conv_weights( passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_output_tensor, torch_out_golden_tensor, pcc=pcc) logger.info(f"PCC = {pcc_msg}. 
Threshold = {pcc}") assert passing + + +@skip_for_grayskull() +@skip_for_blackhole() +# @skip_for_wormhole_b0() +@pytest.mark.parametrize( + "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override", + ( + # rn50 layer1 + (8, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None), + (16, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None), + (20, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None), + ), +) +@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"]) +@pytest.mark.parametrize("has_bias", [True, False], ids=["has_bias", "no_bias"]) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 2**15}], indirect=True) +def test_prepare_bias( + batch_size, + output_channels, + input_channels, + input_height, + input_width, + filter_height, + filter_width, + stride_h, + stride_w, + pad_h, + pad_w, + use_1d_systolic_array, + packer_l1_acc, + config_override, + has_bias, + device, +): + if device.core_grid.y == 7: + pytest.skip("Issue #6992: Statically allocated circular buffers in program clash with L1 buffers on core range") + + if batch_size == 20 and ( + output_channels == 64 or (stride_h == 2 and (output_channels == 256 or output_channels == 128)) + ): + pytest.skip("Skipping test because it won't fit in L1!") + + inp_shape = (batch_size, input_channels, input_height, input_width) + conv_weight_shape = (output_channels, input_channels, filter_height, filter_width) + torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16) + torch_input_tensor = torch.randn(inp_shape, dtype=torch.bfloat16) + torch_bias_tensor = torch.randn((1, 1, 1, output_channels), dtype=torch.bfloat16) if has_bias else None + + torch_out_golden_tensor = torch.nn.functional.conv2d( + torch_input_tensor, + torch_weight_tensor, + bias=torch_bias_tensor.reshape(-1) if has_bias else None, + stride=(stride_h, stride_w), + padding=(pad_h, pad_w), + dilation=(1, 1), + groups=1, + ).permute(0, 2, 3, 1) + + tt_input_tensor = ttnn.from_torch(torch_input_tensor.transpose(-3, -2).transpose(-2, -1), ttnn.bfloat16) + tt_weight_tensor = ttnn.from_torch(torch_weight_tensor, ttnn.bfloat16) + tt_bias_tensor = ttnn.from_torch(torch_bias_tensor, ttnn.bfloat16) if has_bias else None + + conv_config = ttnn.Conv2dConfig( + dtype=ttnn.bfloat16, + weights_dtype=ttnn.bfloat16, + input_channels_alignment=(16 if input_channels == 16 and input_height == 115 else 32), + enable_act_double_buffer=False, + enable_split_reader=False, + enable_subblock_padding=False, + ) + compute_config = ttnn.init_device_compute_kernel_config(device.arch(), packer_l1_acc=packer_l1_acc) + if config_override and "act_block_h" in config_override: + conv_config.act_block_h_override = config_override["act_block_h"] + + if config_override and "act_block_w_div" in config_override: + conv_config.act_block_w_div = config_override["act_block_w_div"] + + if config_override and "num_cores_nhw" in config_override: + if config_override["num_cores_nhw"] == 98: + conv_config.core_grid = ttnn.CoreRangeSet({ttnn.CoreRange((0, 0), (11, 7)), ttnn.CoreRange((0, 8), (1, 8))}) + conv_config.override_sharding_config = True + print("Setting num_cores_nhw to 98") + + conv_kwargs = { + "input_layout": ttnn.ROW_MAJOR_LAYOUT, + "in_channels": input_channels, + "out_channels": output_channels, + "batch_size": batch_size, + "input_height": input_height, + "input_width": input_width, + "kernel_size": (filter_height, filter_width), + "stride": 
(stride_h, stride_w), + "padding": (pad_h, pad_w), + "dilation": (1, 1), + "groups": 1, + "device": device, + "conv_config": conv_config, + } + + tt_input_tensor = ttnn.to_device(tt_input_tensor, device) + + tt_bias_tensor_formatted = ( + ttnn.prepare_conv_bias( + bias_tensor=tt_bias_tensor, input_memory_config=tt_input_tensor.memory_config(), **conv_kwargs + ) + if has_bias + else None + ) + + tt_bias_tensor_formatted = ttnn.to_device(tt_bias_tensor_formatted, device) if has_bias else None + (k := next(iter(conv_kwargs)), conv_kwargs.pop(k)) ##removing 1st element from dict + tt_output_tensor_on_device = ttnn.conv2d( + input_tensor=tt_input_tensor, + weight_tensor=tt_weight_tensor, + bias_tensor=tt_bias_tensor_formatted, + **conv_kwargs, + compute_config=compute_config, + ) + + tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) + torch_output_tensor = ttnn.to_torch(tt_output_tensor) + + torch_output_tensor = torch_output_tensor[:, :, :, :output_channels] + torch_output_tensor = torch_output_tensor.reshape(torch_out_golden_tensor.shape) + + pcc = 0.99 + passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_output_tensor, torch_out_golden_tensor, pcc=pcc) + logger.info(f"PCC = {pcc_msg}. Threshold = {pcc}") + assert passing diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp index 3f856572366..a3928a36629 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp @@ -119,41 +119,22 @@ Result conv2d( bool weight_is_on_device = ttnn::is_tensor_on_device_or_multidevice(weight_tensor); ttnn::Tensor weight_tensor_on_device = weight_tensor; std::optional bias_tensor_on_device = bias_tensor; - if (!weight_is_on_device || conv_config.always_preprocess_weights) { + if (!weight_is_on_device) { // prepare weights in desired layout and move to device - - // TODO: Implement heuristic to decide if weights should be preprocessed on device. 
- if (conv_config.preprocess_weights_on_device == false) { - tie(weight_tensor_on_device, bias_tensor_on_device) = prepare_conv_weights_biases_and_move_to_device( - weight_tensor, - bias_tensor, - conv_config.input_channels_alignment, - conv_config.weights_dtype, - opt_conv_op_block_config.act_block_w_ntiles, - opt_conv_op_block_config.out_subblock_w_ntiles, - parallel_config, - output_parallel_config, - device, - groups, - opt_conv_op_block_config.act_block_h_ntiles, - input_width, - true); - } else { - tie(weight_tensor_on_device, bias_tensor_on_device) = prepare_conv_weights_biases_on_device( - weight_tensor, - bias_tensor, - conv_config.input_channels_alignment, - conv_config.weights_dtype, - opt_conv_op_block_config.act_block_w_ntiles, - opt_conv_op_block_config.out_subblock_w_ntiles, - parallel_config, - output_parallel_config, - device, - groups, - opt_conv_op_block_config.act_block_h_ntiles, - input_width, - true); - } + tie(weight_tensor_on_device, bias_tensor_on_device) = prepare_conv_weights_biases_and_move_to_device( + weight_tensor, + bias_tensor, + conv_config.input_channels_alignment, + conv_config.weights_dtype, + opt_conv_op_block_config.act_block_w_ntiles, + opt_conv_op_block_config.out_subblock_w_ntiles, + parallel_config, + output_parallel_config, + device, + groups, + opt_conv_op_block_config.act_block_h_ntiles, + input_width, + true); } // if 1x1 conv w/ stride 1, convert input tensor to tile layout if required if (mm_conv) { diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp index 8d169240b72..0591ed02d0c 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp @@ -335,8 +335,6 @@ void py_bind_conv2d(py::module& module) { bool, bool, bool, - bool, - bool, bool>(), py::kw_only(), py::arg("dtype") = DataType::BFLOAT16, @@ -353,8 +351,6 @@ void py_bind_conv2d(py::module& module) { py::arg("core_grid") = std::nullopt, py::arg("transpose_shards") = true, py::arg("output_layout") = Layout::TILE, - py::arg("preprocess_weights_on_device") = true, - py::arg("always_preprocess_weights") = false, py::arg("enable_act_double_buffer") = false, py::arg("enable_weights_double_buffer") = false, py::arg("enable_split_reader") = false, @@ -373,8 +369,6 @@ void py_bind_conv2d(py::module& module) { py_conv_config.def_readwrite("core_grid", &Conv2dConfig::core_grid); py_conv_config.def_readwrite("transpose_shards", &Conv2dConfig::transpose_shards); py_conv_config.def_readwrite("output_layout", &Conv2dConfig::output_layout); - py_conv_config.def_readwrite("preprocess_weights_on_device", &Conv2dConfig::preprocess_weights_on_device); - py_conv_config.def_readwrite("always_preprocess_weights", &Conv2dConfig::always_preprocess_weights); py_conv_config.def_readwrite("enable_act_double_buffer", &Conv2dConfig::enable_act_double_buffer); py_conv_config.def_readwrite("enable_weights_double_buffer", &Conv2dConfig::enable_weights_double_buffer); py_conv_config.def_readwrite("enable_split_reader", &Conv2dConfig::enable_split_reader); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp index 7bdc858a526..6f67fb238a6 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp @@ -869,12 +869,9 @@ std::tuple #include "ttnn/operations/sliding_window/sliding_window.hpp" #include "ttnn/tensor/tensor.hpp" #include 
"ttnn/run_operation.hpp" @@ -62,13 +61,6 @@ struct Conv2dConfig { // BFLOAT8 is always Tile layout. Layout output_layout = Layout::TILE; - // Select between preprocessing weights on device or on host. - bool preprocess_weights_on_device = true; - - // If false, only preprocess weights if they are originally located on host. - // If true, preprocess weights regarding of original location. - bool always_preprocess_weights = false; - // Doubles the size of the CBs for activation. // Increased perf, but increased L1 usage. bool enable_act_double_buffer = false; @@ -81,7 +73,6 @@ struct Conv2dConfig { bool enable_split_reader = false; bool enable_subblock_padding = false; - static constexpr auto attribute_names = std::make_tuple( "dtype", "weights_dtype", @@ -97,7 +88,6 @@ struct Conv2dConfig { "core_grid", "transpose_shards", "output_layout", - "preprocess_weights_on_device", "enable_act_double_buffer", "enable_weights_double_buffer", "enable_split_reader", @@ -118,7 +108,6 @@ struct Conv2dConfig { std::cref(this->core_grid), std::cref(this->transpose_shards), std::cref(this->output_layout), - std::cref(this->preprocess_weights_on_device), std::cref(this->enable_act_double_buffer), std::cref(this->enable_weights_double_buffer), std::cref(this->enable_split_reader), diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index ce2999e4ca8..32fd24971e8 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -474,7 +474,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( } } - // TT_FATAL(out_block_h_ntiles == act_block_h_ntiles); // TODO: fix output block sizing + // assert(out_block_h_ntiles == act_block_h_ntiles); // TODO: fix output block sizing TT_FATAL( out_block_h_ntiles >= act_block_h_ntiles, "Output block height (in # of tiles) ({}) should be greater than or equal to activation block height (in # of " @@ -578,8 +578,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( sliding_window_config, parallelization_config.num_cores_nhw, out_block_h_ntiles); - TT_FATAL(act_matrix_shape.size() == 3, "act_matrix_shape should have be of size 3"); - TT_FATAL(act_matrix_shape[0] == 1, "act_matrix_shape should have 1 as the first dimension"); + assert(act_matrix_shape.size() == 3); + assert(act_matrix_shape[0] == 1); uint32_t act_matrix_height = (uint32_t)act_matrix_shape[1]; uint32_t act_matrix_width = (uint32_t)act_matrix_shape[2]; if (block_sharded) { @@ -589,7 +589,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t act_matrix_height_unpadded = (uint32_t)act_matrix_shape_unpadded[1]; uint32_t act_matrix_width_unpadded = (uint32_t)act_matrix_shape_unpadded[2]; - // TODO: Move all these TT_FATALs/checks to validate? + // TODO: Move all these asserts/checks to validate? 
uint32_t input_width = ashape[2]; uint32_t input_channels = ashape[3]; @@ -611,10 +611,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // matrix multiplication shape check valid for all convs except depthwise conv1d if (!is_conv_1d_depthwise_conv) { TT_FATAL( - act_matrix_width == weight_matrix_height, - "The width of tensor a {} needs to match the height of tensor b {}", - act_matrix_width, - weight_matrix_height); + act_matrix_width == weight_matrix_height, "The width of tensor a needs to match the height of tensor b"); } // Tile size divisibility checks TT_FATAL(act_matrix_height % TILE_HEIGHT == 0, "Height of activation matrix needs to be divisible by 32"); @@ -638,26 +635,10 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t act_matrix_height_ntiles = act_matrix_height / TILE_HEIGHT; uint32_t act_matrix_width_ntiles = act_matrix_width / TILE_WIDTH; - TT_FATAL( - act_matrix_height_ntiles % act_block_h_ntiles == 0, - "act_matrix_height_ntiles {} should be divisible by act_block_h_ntiles {}", - act_matrix_height_ntiles, - act_block_h_ntiles); - TT_FATAL( - act_matrix_width_ntiles % act_block_w_ntiles == 0, - "act_matrix_width_ntiles {} should be divisible by act_block_w_ntiles {}", - act_matrix_width_ntiles, - act_block_w_ntiles); - TT_FATAL( - weight_matrix_width_ntiles % weight_block_w_ntiles == 0, - "weight_matrix_width_ntiles {} should be divisible by weight_block_w_ntiles {}", - weight_matrix_width_ntiles, - weight_block_w_ntiles); - TT_FATAL( - act_matrix_height_ntiles % out_block_h_ntiles == 0, - "act_matrix_height_ntiles {} should be divisible by out_block_h_ntiles {}", - act_matrix_height_ntiles, - out_block_h_ntiles); + assert(act_matrix_height_ntiles % act_block_h_ntiles == 0); + assert(act_matrix_width_ntiles % act_block_w_ntiles == 0); + assert(weight_matrix_width_ntiles % weight_block_w_ntiles == 0); + assert(act_matrix_height_ntiles % out_block_h_ntiles == 0); uint32_t num_blocks_act_h = act_matrix_height_ntiles / act_block_h_ntiles; uint32_t num_blocks_out_h = act_matrix_height_ntiles / out_block_h_ntiles; @@ -691,11 +672,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // weight block info uint32_t weight_block_w_datums = weight_matrix_width / num_blocks_weight_w; - TT_FATAL( - weight_block_w_ntiles % out_subblock_w_ntiles == 0, - "weight_block_w_ntiles {} should be divisible by weight_block_w_ntiles {}", - weight_block_w_ntiles, - out_subblock_w_ntiles); + assert(weight_block_w_ntiles % out_subblock_w_ntiles == 0); uint32_t weight_num_subblocks = weight_block_w_ntiles / out_subblock_w_ntiles; uint32_t weight_block_h_ntiles = is_conv_1d_depthwise_conv ? 
act_block_h_ntiles : act_block_w_ntiles; uint32_t weight_block_num_tiles = weight_block_w_ntiles * weight_block_h_ntiles; @@ -704,21 +681,14 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // writer of conv op partially removes padding on the width // it removes the padding done for block width but it doesn't remove padding done for tiled width uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); - TT_FATAL( - output_channels_padded_to_tile_width <= weight_matrix_width, - "output_channels_padded_to_tile_width {} should be less than or equal to weight_matrix_width {}", - output_channels_padded_to_tile_width, - weight_matrix_width); + assert(output_channels_padded_to_tile_width <= weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; uint32_t num_blocks_output_w = (uint32_t)std::ceil((double)output_channels_padded_to_tile_width / (double)weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); - TT_FATAL( - last_block_width_datums % TILE_WIDTH == 0, - "last_block_width_datums {} should be divisible by TILE_WIDTH", - last_block_width_datums); + assert(last_block_width_datums % TILE_WIDTH == 0); uint32_t out_block_h_datums = out_block_h_ntiles * TILE_HEIGHT; @@ -736,12 +706,9 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // act uint32_t act_dram_addr = src0_dram_buffer->address(); - TT_FATAL( - act_block_h_ntiles % out_subblock_h_ntiles == 0, - "act_block_h_ntiles {} should be divisible by out_subblock_h_ntiles {}", - act_block_h_ntiles, - out_subblock_h_ntiles); - // TT_FATAL(out_block_h_ntiles % out_subblock_h_ntiles == 0); + assert(act_matrix_width_ntiles % act_block_w_ntiles == 0); + assert(act_block_h_ntiles % out_subblock_h_ntiles == 0); + // assert(out_block_h_ntiles % out_subblock_h_ntiles == 0); uint32_t act_num_subblocks = act_block_h_ntiles / out_subblock_h_ntiles; uint32_t act_block_num_tiles = act_block_h_ntiles * act_block_w_ntiles; uint32_t act_subblock_h_ntiles = out_subblock_h_ntiles; @@ -776,11 +743,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t output_height_padded_to_tile_height = round_up(act_matrix_height_unpadded, TILE_HEIGHT); uint32_t output_height_num_tiles = output_height_padded_to_tile_height / TILE_HEIGHT; - TT_FATAL( - output_height_num_tiles <= act_matrix_height_ntiles, - "output_height_num_tiles {} should be less than or equal to act_matrix_height_ntiles {}", - output_height_num_tiles, - act_matrix_height_ntiles); + assert(output_height_num_tiles <= act_matrix_height_ntiles); uint32_t src_dram_act_buffer_size_bytes = src0_dram_buffer->size(); uint32_t src_dram_weight_buffer_size_bytes = src1_dram_buffer->size(); @@ -877,94 +840,46 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( reader_defines["WINDOW_INNER"] = std::to_string(window_inner); log_debug(LogOp, "window_outer: {}, window_inner: {}", window_outer, window_inner); - TT_FATAL( - weight_matrix_width_ntiles % per_core_out_matrix_width_ntiles == 0, - "weight_matrix_width_ntiles {} should be divisible by per_core_out_matrix_width_ntiles {}", - weight_matrix_width_ntiles, - per_core_out_matrix_width_ntiles); - TT_FATAL( - per_core_out_matrix_width_ntiles % weight_block_w_ntiles == 0, - "per_core_out_matrix_width_ntiles {} should be 
divisible by weight_block_w_ntiles {}", - per_core_out_matrix_width_ntiles, - weight_block_w_ntiles); + assert(weight_matrix_width_ntiles % per_core_out_matrix_width_ntiles == 0); + assert(per_core_out_matrix_width_ntiles % weight_block_w_ntiles == 0); uint32_t num_blocks_weight_w_per_core = per_core_out_matrix_width_ntiles / weight_block_w_ntiles; if (not weight_width_sliced) { - TT_FATAL( - num_blocks_weight_w_per_core == num_blocks_weight_w, - "num_blocks_weight_w_per_core {} should be equal to num_blocks_weight_w {}", - num_blocks_weight_w_per_core, - num_blocks_weight_w); + assert(num_blocks_weight_w_per_core == num_blocks_weight_w); } uint32_t num_weight_slices_width = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles; uint32_t total_num_cores_per_weight_slice = 0; uint32_t total_num_cores_per_act_slice = 0; // only used when (BLOCK_SHARDING && !transpose_mcast) if (weight_width_sliced) { if (transpose_mcast) { - TT_FATAL( - num_cores_y % num_weight_slices_width == 0, - "num_cores_y {} should be divisible by num_weight_slices_width {}", - num_cores_y, - num_weight_slices_width); + assert(num_cores_y % num_weight_slices_width == 0); uint32_t num_cores_y_per_weight_slice_width = num_cores_y / num_weight_slices_width; total_num_cores_per_weight_slice = num_cores_y_per_weight_slice_width * num_cores_x; } else { - TT_FATAL( - num_cores_x % num_weight_slices_width == 0, - "num_cores_x {} should be divisible by num_weight_slices_width {}", - num_cores_x, - num_weight_slices_width); + assert(num_cores_x % num_weight_slices_width == 0); uint32_t num_cores_x_per_weight_slice_width = num_cores_x / num_weight_slices_width; uint32_t num_act_slices_height = act_matrix_height_ntiles / per_core_out_matrix_height_ntiles; total_num_cores_per_act_slice = num_cores_x * num_cores_y / num_act_slices_height; log_debug(LogOp, "total_num_cores_per_act_slice: {}", total_num_cores_per_act_slice); total_num_cores_per_weight_slice = num_cores_x_per_weight_slice_width * num_cores_y; } - TT_FATAL( - total_num_cores_per_weight_slice * per_core_out_matrix_height_ntiles == act_matrix_height_ntiles, - "total_num_cores_per_weight_slice {} * per_core_out_matrix_height_ntiles {} should be equal to " - "act_matrix_height_ntiles {}", - total_num_cores_per_weight_slice, - per_core_out_matrix_height_ntiles, - act_matrix_height_ntiles); + assert(total_num_cores_per_weight_slice * per_core_out_matrix_height_ntiles == act_matrix_height_ntiles); } else { - TT_FATAL( - num_cores_y % num_weight_slices_width == 0, - "num_cores_y {} should be divisible by num_weight_slices_width {}", - num_cores_y, - num_weight_slices_width); + assert(num_cores_y % num_weight_slices_width == 0); uint32_t num_cores_y_per_weight_slice_width = num_cores_y / num_weight_slices_width; total_num_cores_per_weight_slice = num_cores_y_per_weight_slice_width * num_cores_x; - TT_FATAL( - total_num_cores * per_core_out_matrix_height_ntiles >= act_matrix_height_ntiles, - "total_num_cores {} * per_core_out_matrix_height_ntiles {} should be greater than or equal to " - "act_matrix_height_ntiles {}", - total_num_cores, - per_core_out_matrix_height_ntiles, - act_matrix_height_ntiles); + assert(total_num_cores * per_core_out_matrix_height_ntiles >= act_matrix_height_ntiles); } - TT_FATAL( - per_core_out_matrix_height_ntiles % act_block_h_ntiles == 0, - "per_core_out_matrix_height_ntiles {} should be divisible by act_block_h_ntiles {}", - per_core_out_matrix_height_ntiles, - act_block_h_ntiles); + assert(per_core_out_matrix_height_ntiles % 
act_block_h_ntiles == 0); uint32_t num_blocks_act_h_per_core = per_core_out_matrix_height_ntiles / act_block_h_ntiles; - // TT_FATAL(per_core_out_matrix_height_ntiles % out_block_h_ntiles == 0); + // assert(per_core_out_matrix_height_ntiles % out_block_h_ntiles == 0); // uint32_t num_blocks_out_h_per_core = per_core_out_matrix_height_ntiles / out_block_h_ntiles; uint32_t num_blocks_out_h_per_core = (per_core_out_matrix_height_ntiles + out_block_h_ntiles - 1) / out_block_h_ntiles; bool act_height_sliced = per_core_out_matrix_height_ntiles < act_matrix_height_ntiles; if (not act_height_sliced) { - TT_FATAL( - num_blocks_act_h_per_core == num_blocks_act_h, - "num_blocks_act_h_per_core {} should be equal to num_blocks_act_h {}", - num_blocks_act_h_per_core, - num_blocks_act_h); - TT_FATAL( - num_blocks_out_h_per_core == num_blocks_out_h, - "num_blocks_out_h_per_core {} should be equal to num_blocks_out_h {}", - num_blocks_out_h_per_core, - num_blocks_out_h); - TT_FATAL(num_cores_x == 1, "num_cores_x {} should be equal to 1", num_cores_x); + TT_FATAL(num_blocks_act_h_per_core == num_blocks_act_h, "Error"); + TT_FATAL(num_blocks_out_h_per_core == num_blocks_out_h, "Error"); + TT_FATAL(num_cores_x == 1, "Error"); } uint32_t act_block_h_datums_last_block = (per_core_out_matrix_height_ntiles - (num_blocks_act_h_per_core - 1) * act_block_h_ntiles) * TILE_HEIGHT; @@ -1220,7 +1135,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( if (filter_h >= 1 and filter_w >= 1) { if (!is_conv1d and weight_width_sliced) { // 2D conv - TT_FATAL(read_window_in_inner_loop == true, "read_window_in_inner_loop should be true for this conv"); + assert(read_window_in_inner_loop == true); reader_kernel = "ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/" "reader_conv_activations_2d_mcast_padded_with_halo_3x3_weights_v2.cpp"; @@ -1532,11 +1447,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t out_start_tile_id_w = weight_slice_i * per_core_out_matrix_width_ntiles; uint32_t bias_tile_offset = weight_slice_i * per_core_out_matrix_width_ntiles; if (has_bias) { - TT_FATAL( - bias_tile_offset < bias_ntiles, - "bias_tile_offset {} should be less than bias_ntiles {}", - bias_tile_offset, - bias_ntiles); + assert(bias_tile_offset < bias_ntiles); } if (weight_width_sliced) { diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp index 726b4ba4049..2f7b82a170e 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp @@ -9,10 +9,6 @@ #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/data_movement/pad/pad.hpp" #include "ttnn/tensor/types.hpp" -#include "ttnn/operations/data_movement/permute/permute.hpp" -#include "ttnn/operations/data_movement/reshape_view/reshape.hpp" -#include "ttnn/operations/data_movement/tilize/tilize.hpp" -#include "ttnn/operations/sliding_window/sliding_window.hpp" using namespace tt; namespace ttnn { namespace operations::conv { @@ -479,6 +475,8 @@ Tensor convert_conv_weight_tensor_to_depthwise_layout( } void validate_weight_tensor(const ttnn::Tensor& weight_tensor) { + TT_FATAL( + !ttnn::has_storage_type_of(weight_tensor, ttnn::DEVICE_STORAGE_TYPE), "conv weight should be placed on host"); TT_FATAL(weight_tensor.get_layout() == Layout::ROW_MAJOR, "conv weight layout should be in row_major layout"); TT_FATAL(weight_tensor.get_logical_shape().rank() 
== 4, "conv weight should be 4D tensor"); } @@ -633,272 +631,6 @@ static OptimizedConvBlockConfig get_opt_block_config( conv_config.enable_split_reader); } -template -std::pair> prepare_conv_weights_biases_on_device( - const ttnn::Tensor& weight_tensor, - std::optional& bias_tensor, - uint32_t input_channels_alignment, - DataType weights_bias_dtype, - uint32_t weight_block_h_ntiles, - uint32_t weight_block_w_ntiles, - const sliding_window::ParallelConfig& input_parallel_config, - const sliding_window::ParallelConfig& output_parallel_config, - T* device, - uint32_t groups, - uint32_t act_block_h_ntiles, - uint32_t input_width, - const bool parameters_on_device) { - validate_weight_tensor(weight_tensor); - ttnn::Tensor weight_tensor_; // tensor to return - ttnn::Tensor bias_tensor_; - - auto original_weights_shape = weight_tensor.get_logical_shape(); - uint32_t original_weights_out_channels = original_weights_shape[0]; - uint32_t original_weights_in_channels = original_weights_shape[1]; - uint32_t original_weights_window_h = original_weights_shape[2]; - uint32_t original_weights_window_w = original_weights_shape[3]; - - bool is_conv1d = original_weights_window_w == 1 && input_width == 1; - bool is_depthwise_conv = groups == original_weights_out_channels && original_weights_in_channels == 1; - - weight_tensor_ = weight_tensor; - // Convert weight tensor to 0 padded shape if groups > 1 - if (groups > 1 and is_tensor_on_device_or_multidevice(weight_tensor_)) { - TT_THROW( - "Grouped Convolution not supported when weights are on device. Please move the weights tensor to host"); - } - if (!is_conv1d and groups > 1) { - weight_tensor_ = convert_conv_weight_tensor_to_grouped_layout(weight_tensor_, groups, weights_bias_dtype); - } else if (is_conv1d and groups > 1) { - if (is_depthwise_conv) { - weight_tensor_ = - convert_conv_weight_tensor_to_depthwise_layout(weight_tensor_, act_block_h_ntiles, weights_bias_dtype); - weight_block_h_ntiles = act_block_h_ntiles; - } else { - weight_tensor_ = convert_conv_weight_tensor_to_grouped_layout(weight_tensor_, groups, weights_bias_dtype); - } - } - - weight_tensor_ = ttnn::operations::core::to_device(weight_tensor_, device, std::nullopt); - - auto weights_shape = weight_tensor_.get_logical_shape(); - uint32_t out_channels = weights_shape[0]; - uint32_t in_channels = weights_shape[1]; - uint32_t window_h = weights_shape[2]; - uint32_t window_w = weights_shape[3]; - - uint32_t input_num_cores_channels = get_num_cores_channels_from_parallel_config(input_parallel_config); - uint32_t output_num_cores_channels = get_num_cores_channels_from_parallel_config(output_parallel_config); - - uint32_t out_channels_padded = tt::round_up(out_channels, output_num_cores_channels * tt::constants::TILE_WIDTH); - uint32_t in_channels_padded = tt::round_up(in_channels, input_num_cores_channels * input_channels_alignment); - uint32_t out_channel_padding = out_channels_padded - out_channels; - - ttnn::Shape weights_channels_padded_shape( - std::array({out_channels_padded, in_channels_padded, window_h, window_w})); - if (weights_bias_dtype == DataType::BFLOAT8_B) { - TT_ASSERT(weight_tensor_.get_dtype() == DataType::FLOAT32); - if (bias_tensor.has_value()) { - TT_ASSERT(bias_tensor.value().get_dtype() == DataType::FLOAT32); - } - } else { - // TODO: fix the need to check this. 
We should be able to accept any datatype and convert - TT_ASSERT(weight_tensor_.get_dtype() == weights_bias_dtype); - if (bias_tensor.has_value()) { - TT_ASSERT(bias_tensor.value().get_dtype() == weights_bias_dtype); - } - } - weight_tensor_ = ttnn::pad( - weight_tensor_, - weights_channels_padded_shape.to_array_4D(), - tt::tt_metal::Array4D({0, 0, 0, 0}), - 0.0f, - true, - std::nullopt); - - // Block sharding re-orders the weights by dividing the input_channels along number of in_channel_cores. - if (input_parallel_config.shard_scheme == TensorMemoryLayout::BLOCK_SHARDED) { - TT_FATAL( - input_num_cores_channels == output_num_cores_channels, - "Input and output cores must be the same for Block Sharded Conv2d"); - TT_FATAL( - in_channels_padded % input_num_cores_channels == 0, - "Input channels {} must be divisble by num cores {}", - in_channels_padded, - input_num_cores_channels); - auto in_channels_per_core = in_channels_padded / input_num_cores_channels; - - TT_FATAL( - out_channels_padded % output_num_cores_channels == 0, - "output channels {} must be divisble by num cores {}", - out_channels_padded, - output_num_cores_channels); - auto out_channels_per_core = out_channels_padded / output_num_cores_channels; - auto rounded_weight_block_height = - tt::round_up(window_h * window_w * in_channels_per_core, constants::TILE_HEIGHT); - auto rounded_weight_block_width = tt::round_up(out_channels_per_core, constants::TILE_WIDTH); - - auto final_out_channels_padded = rounded_weight_block_width * output_num_cores_channels; - - if (final_out_channels_padded != out_channels_padded) { - weight_tensor_ = ttnn::reshape( - weight_tensor_, - ttnn::Shape( - {output_num_cores_channels, out_channels_per_core, in_channels_padded * window_h, window_w})); - - weight_tensor_ = ttnn::pad( - weight_tensor_, - tt::tt_metal::Array4D( - {output_num_cores_channels, rounded_weight_block_width, in_channels_padded * window_h, window_w}), - tt::tt_metal::Array4D({0, 0, 0, 0}), - 0, - true, - std::nullopt); - } - weight_tensor_ = ttnn::reshape( - weight_tensor_, - ttnn::Shape( - {final_out_channels_padded, input_num_cores_channels, in_channels_per_core, window_h, window_w})); - - weight_tensor_ = ttnn::permute(weight_tensor_, ttnn::SmallVector({1, 3, 4, 2, 0})); - // Shape is now {input_num_cores_channels, window_h, window_w, in_channels_per_core, out_channels_padded} - - weight_tensor_ = ttnn::reshape( - weight_tensor_, - ttnn::Shape( - {1, input_num_cores_channels, in_channels_per_core * window_h * window_w, final_out_channels_padded})); - weight_tensor_ = ttnn::pad( - weight_tensor_, - tt::tt_metal::Array4D( - {1, input_num_cores_channels, rounded_weight_block_height, final_out_channels_padded}), - tt::tt_metal::Array4D({0, 0, 0, 0}), - 0, - true, - std::nullopt); - weight_tensor_ = ttnn::reshape( - weight_tensor_, - ttnn::Shape({1, 1, rounded_weight_block_height * input_num_cores_channels, final_out_channels_padded})); - } else { - // Reshape the weights to 5D, and permute in 5D. 
- weight_tensor_ = ttnn::reshape( - weight_tensor_, ttnn::Shape({1, out_channels_padded, in_channels_padded, window_h, window_w})); - - weight_tensor_ = ttnn::permute(weight_tensor_, ttnn::SmallVector({0, 3, 4, 2, 1})); - // Shape is now {1, window_h, window_w, in_channels_padded, out_channels_padded} - auto weight_block_h_datums = weight_block_h_ntiles * constants::TILE_HEIGHT; - if ((weight_block_h_datums > (window_w * in_channels_padded)) && - (input_parallel_config.shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED)) { - weight_tensor_ = ttnn::reshape( - weight_tensor_, ttnn::Shape({1, window_h, window_w * in_channels_padded, out_channels_padded})); - weight_tensor_ = ttnn::pad( - weight_tensor_, - tt::tt_metal::Array4D({1, window_h, weight_block_h_datums, out_channels_padded}), - tt::tt_metal::Array4D({0, 0, 0, 0}), - 0.0f, - true, - std::nullopt); - weight_tensor_ = ttnn::reshape( - weight_tensor_, ttnn::Shape({1, 1, window_h * weight_block_h_datums, out_channels_padded})); - } else { - weight_tensor_ = ttnn::reshape( - weight_tensor_, ttnn::Shape({1, 1, window_h * window_w * in_channels_padded, out_channels_padded})); - } - } - weight_tensor_ = ttnn::tilize( - weight_tensor_, - ttnn::MemoryConfig( - {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, - .buffer_type = tt::tt_metal::BufferType::DRAM}), - weights_bias_dtype, - true); - - uint32_t weight_matrix_height = in_channels * window_h * window_w; - int32_t weight_matrix_height_padding = weight_tensor_.get_logical_shape()[2] - weight_matrix_height; - TT_FATAL(weight_matrix_height_padding >= 0, " Matrix Height Padding can't be negative"); - - ttnn::Shape target_shape(std::array{1, 1, weight_matrix_height, out_channels}); - - weight_tensor_ = ttnn::reshape(weight_tensor_, target_shape, weight_tensor_.get_padded_shape()); - - if (bias_tensor.has_value()) { - bias_tensor_ = bias_tensor.value(); - bool is_bias_tensor_is_on_device = ttnn::is_tensor_on_device_or_multidevice(bias_tensor_); - if (!is_bias_tensor_is_on_device) { - bias_tensor_ = ttnn::operations::core::to_device(bias_tensor_, device, std::nullopt); - } - if (input_parallel_config.shard_scheme == TensorMemoryLayout::BLOCK_SHARDED) { - auto bias_out_channels = bias_tensor_.get_logical_shape()[3]; - ttnn::Shape bias_channels_padded_shape({1, 1, 1, out_channels_padded}); - bias_tensor_ = ttnn::pad( - bias_tensor_, - bias_channels_padded_shape.to_array_4D(), - tt::tt_metal::Array4D{0, 0, 0, 0}, - 0, - true, - std::nullopt); - auto out_channels_per_core = out_channels_padded / output_num_cores_channels; - auto rounded_weight_block_width = tt::round_up(out_channels_per_core, constants::TILE_WIDTH); - - auto final_out_channels_padded = rounded_weight_block_width * output_num_cores_channels; - - if (final_out_channels_padded != out_channels_padded) { - bias_tensor_ = - ttnn::reshape(bias_tensor_, ttnn::Shape({1, 1, output_num_cores_channels, out_channels_per_core})); - - bias_tensor_ = ttnn::pad( - bias_tensor_, - tt::tt_metal::Array4D({1, 1, output_num_cores_channels, rounded_weight_block_width}), - tt::tt_metal::Array4D({0, 0, 0, 0}), - 0, - true, - std::nullopt); - } - bias_tensor_ = ttnn::reshape(bias_tensor_, ttnn::Shape({1, 1, 1, final_out_channels_padded})); - bias_tensor_ = ttnn::pad( - bias_tensor_, - tt::tt_metal::Array4D({1, 1, 32, final_out_channels_padded}), - tt::tt_metal::Array4D{0, 0, 0, 0}, - 0, - true, - std::nullopt); - } else { - ttnn::Shape bias_channels_padded_shape({1, 1, 32, round_up(out_channels, weight_block_w_ntiles * 32)}); - bias_tensor_ = 
ttnn::pad( - bias_tensor_, - bias_channels_padded_shape.to_array_4D(), - tt::tt_metal::Array4D{0, 0, 0, 0}, - 0, - true, - std::nullopt); - } - bias_tensor_ = ttnn::tilize( - bias_tensor_, - ttnn::MemoryConfig( - {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, - .buffer_type = tt::tt_metal::BufferType::DRAM}), - weights_bias_dtype, - true); - - ttnn::Shape bias_target_shape(std::array{1, 1, 1, out_channels}); - bias_tensor_ = ttnn::reshape(bias_tensor_, bias_target_shape, bias_tensor_.get_padded_shape()); - - // TT_FATAL( - // bias_tensor_.get_logical_shape()[3] == out_channels, - // "Bias must have the same length as output channels"); - // bias_tensor_ = conv_bias_layout_convert( - // bias_tensor_, - // weights_bias_dtype, - // weight_block_h_ntiles, - // weight_block_w_ntiles, - // output_parallel_config, - // device, - // out_channels_padded, - // is_non_tile_mul_width); - } - - return {weight_tensor_, bias_tensor.has_value() ? bias_tensor_ : std::optional()}; -} - template std::pair> prepare_conv_weights_biases_and_move_to_device( const ttnn::Tensor& weight_tensor, @@ -971,6 +703,7 @@ std::pair> prepare_conv_weights_biases } weight_tensor_ = ttnn::pad(weight_tensor_, weights_channels_padded_shape.to_array_4D(), tt::tt_metal::Array4D({0, 0, 0, 0}), 0); + // for conv op, pad the weights to block shape if (input_parallel_config.shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED) { weight_tensor_ = convert_conv_weight_tensor_to_special_padding_tiled_layout( @@ -1252,36 +985,6 @@ template ttnn::Tensor prepare_conv_weights( const std::optional& conv_config_, const std::optional& compute_config_); -template std::pair> prepare_conv_weights_biases_on_device( - const ttnn::Tensor& weight_tensor, - std::optional& bias_tensor, - uint32_t input_channels_alignment, - DataType weights_bias_dtype, - uint32_t weight_block_h_ntiles, - uint32_t weight_block_w_ntiles, - const sliding_window::ParallelConfig& input_parallel_config, - const sliding_window::ParallelConfig& output_parallel_config, - IDevice* device, - uint32_t groups, - uint32_t act_block_h_ntiles, - uint32_t input_width, - const bool parameters_on_device); - -template std::pair> prepare_conv_weights_biases_on_device( - const ttnn::Tensor& weight_tensor, - std::optional& bias_tensor, - uint32_t input_channels_alignment, - DataType weights_bias_dtype, - uint32_t weight_block_h_ntiles, - uint32_t weight_block_w_ntiles, - const sliding_window::ParallelConfig& input_parallel_config, - const sliding_window::ParallelConfig& output_parallel_config, - MeshDevice* device, - uint32_t groups, - uint32_t act_block_h_ntiles, - uint32_t input_width, - const bool parameters_on_device); - template std::pair> prepare_conv_weights_biases_and_move_to_device( const ttnn::Tensor& weight_tensor, std::optional& bias_tensor, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp index 2824a9cd4fe..5377a62a345 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp @@ -103,22 +103,6 @@ ttnn::Tensor prepare_conv_bias( const std::optional& conv_config_, const std::optional& compute_config_); -template -std::pair> prepare_conv_weights_biases_on_device( - const ttnn::Tensor& weight_tensor, - std::optional& bias_tensor, - uint32_t input_channels_alignment, - DataType weights_bias_dtype, - uint32_t weight_block_h_ntiles, - uint32_t weight_block_w_ntiles, - const 
sliding_window::ParallelConfig& input_parallel_config, - const sliding_window::ParallelConfig& output_parallel_config, - T* device, - uint32_t groups, - uint32_t act_block_h_ntiles, - uint32_t input_width, - const bool parameters_on_device); - template std::pair> prepare_conv_weights_biases_and_move_to_device( const ttnn::Tensor& weight_tensor, diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp index 7f34adea279..a009d7d00aa 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp @@ -792,13 +792,6 @@ std::vector, std::vector>> get_runtime return ret_val; } -uint32_t get_num_max_sticks(uint32_t num_sticks_to_read, uint32_t stick_size, uint32_t max_read_size) { - uint32_t num_sticks = tt::round_up(max_read_size, stick_size) / stick_size; - while (num_sticks * stick_size > max_read_size || num_sticks_to_read % num_sticks != 0) { - num_sticks--; - } - return num_sticks; -} operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2( const Tensor& a, Tensor& output, @@ -848,14 +841,8 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2( ? num_sticks_padded_per_core_group_1 : num_sticks_padded_per_core_group_2; - uint32_t max_read_size = 256 * 1024; - uint32_t W_bytes = a.get_padded_shape()[3] * a.element_size(); - auto num_sticks_per_core_read = get_num_max_sticks(num_sticks, W_bytes, max_read_size); - auto input_cb_pages = std::min(num_sticks_per_core_read, num_sticks); - tt::tt_metal::CircularBufferConfig cb_src0_config = - tt::tt_metal::CircularBufferConfig( - input_cb_pages * stick_size_padded_aligned, {{src0_cb_index, cb_data_format}}) + tt::tt_metal::CircularBufferConfig(num_sticks * stick_size_padded_aligned, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, stick_size_padded_aligned); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp index d8c78a70cdd..9e4382f3d73 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp @@ -51,17 +51,7 @@ static ttnn::Tensor pad_impl( const auto rank = input_tensor_shape.rank(); TT_FATAL(rank == 4, "ttnn.pad: input tensor passed to pad_impl must have rank == 4, but got rank {}.", rank); - bool input_output_same = true; - for (size_t i = 0; i < rank; i++) { - if (input_tensor_shape[i] != output_padded_shape[i]) { - input_output_same = false; - break; - } - } - if (input_output_same) { - tt::log_debug("Pad Input and Output Shapes are the same. Skipping pad and returning input tensor."); - return input_tensor; - } + using ShardStrategy = ttnn::operations::data_movement::ShardStrategy; using ShardOrientation = tt::tt_metal::ShardOrientation; using Layout = tt::tt_metal::Layout; From 724237adf794581997107b11686b9a5f7a7dbffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:22:51 +0100 Subject: [PATCH 253/316] [UMD] Remove usage of outdated UMD apis (#17645) ### Ticket Related to https://github.com/tenstorrent/tt-metal/issues/17002 ### Problem description Alter some APIs and remove some usages. 
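For context, the call-site migration looks roughly like the sketch below (the names come from the diff in this patch; it illustrates the pattern rather than being a complete program):

```cpp
// Old pattern: ask the cluster driver directly for harvested rows.
uint32_t harvested_rows_mask = tt::Cluster::instance().get_harvested_rows(device_id);

// New pattern: fetch the tensix harvesting mask and, only where physical NOC0
// row numbers are needed, shuffle the mask into NOC0 row order via the UMD
// CoordinateManager.
uint32_t harvesting_mask = tt::Cluster::instance().get_harvesting_mask(device_id);
uint32_t harvested_noc_rows = CoordinateManager::shuffle_tensix_harvesting_mask_to_noc0_coords(
    tt::Cluster::instance().get_soc_desc(device_id).arch, harvesting_mask);
```

Callers that only count or compare harvested rows (e.g. the core descriptor and galaxy cluster tests) can use `get_harvesting_mask` directly; the shuffle step is only for consumers that index NOC0 rows, such as firmware initialization.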
### What's changed - Remove harvesting mask from metal_soc_descriptor - Remove usage of getting array of soc descriptor - Change getting harvesting mask. ### Checklist - [x] All post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13498939862 - [x] Blackhole post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13498941755 - [ ] (Single-card) Model perf tests : https://github.com/tenstorrent/tt-metal/actions/runs/13498943871 - [ ] (Single-card) Device perf regressions : https://github.com/tenstorrent/tt-metal/actions/runs/13498946279 - [ ] (T3K) T3000 unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13498948232 - [ ] (T3K) T3000 demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13498950335 - [ ] (TG) TG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13498952411 - [ ] (TG) TG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13498954557 - [x] (TGG) TGG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13498956548 - [x] (TGG) TGG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13498958682 --- .../TT-Distributed-Architecture-1219.md | 1 - .../tt_metal/api/test_soc_descriptor.cpp | 5 +++-- .../device/test_galaxy_cluster_api.cpp | 4 ++-- .../api/tt-metalium/metal_soc_descriptor.h | 2 +- tt_metal/common/metal_soc_descriptor.cpp | 3 +-- tt_metal/impl/device/device.cpp | 3 ++- tt_metal/llrt/core_descriptor.cpp | 2 +- tt_metal/llrt/tt_cluster.cpp | 21 +++++-------------- tt_metal/llrt/tt_cluster.hpp | 7 ++----- 9 files changed, 17 insertions(+), 31 deletions(-) diff --git a/tech_reports/TT-Distributed/TT-Distributed-Architecture-1219.md b/tech_reports/TT-Distributed/TT-Distributed-Architecture-1219.md index 869d52930df..52a57229e1d 100644 --- a/tech_reports/TT-Distributed/TT-Distributed-Architecture-1219.md +++ b/tech_reports/TT-Distributed/TT-Distributed-Architecture-1219.md @@ -2332,7 +2332,6 @@ bool Device::is_mmio_capable() const; const metal_SocDescriptor& tt_cluster::get_soc_desc(chip_id_t chip) const; // Get harvesting information for this chip -uint32_t tt_cluster::get_harvested_rows(chip_id_t chip) const; uint32_t tt_cluster::get_harvesting_mask(chip_id_t chip) const; // Get the clock frequency for this chip diff --git a/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp index 220fdb277c2..abb9ec14ba4 100644 --- a/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp +++ b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp @@ -18,7 +18,8 @@ using namespace tt::test_utils; namespace unit_tests::basic::soc_desc { std::unordered_set get_harvested_rows(chip_id_t device_id) { - uint32_t harvested_rows_mask = tt::Cluster::instance().get_harvested_rows(device_id); + uint32_t harvested_rows_mask = CoordinateManager::shuffle_tensix_harvesting_mask_to_noc0_coords( + tt::Cluster::instance().get_soc_desc(device_id).arch, tt::Cluster::instance().get_harvesting_mask(device_id)); std::unordered_set harvested_rows; int row_coordinate = 0; int tmp = harvested_rows_mask; @@ -51,7 +52,7 @@ TEST(SOC, TensixValidateLogicalToPhysicalCoreCoordHostMapping) { num_devices = (arch == tt::ARCH::GRAYSKULL) ? 
1 : num_devices; for (int device_id = 0; device_id < num_devices; device_id++) { tt_metal::IDevice* device = tt_metal::CreateDevice(device_id); - uint32_t harvested_rows_mask = tt::Cluster::instance().get_harvested_rows(device_id); + uint32_t harvested_rows_mask = tt::Cluster::instance().get_harvesting_mask(device_id); const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); log_info(LogTest, "Device {} harvesting mask {}", device_id, harvested_rows_mask); std::unordered_set harvested_rows = unit_tests::basic::soc_desc::get_harvested_rows(device_id); diff --git a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp index 8c998b1705e..f645182a350 100644 --- a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp @@ -115,7 +115,7 @@ TEST_F(GalaxyFixture, ValidateAllGalaxyChipsAreUnharvested) { for (IDevice* device : this->devices_) { const chip_id_t device_id = device->id(); if (is_galaxy_device(device_id)) { - const uint32_t harvest_mask = tt::Cluster::instance().get_harvested_rows(device_id); + const uint32_t harvest_mask = tt::Cluster::instance().get_harvesting_mask(device_id); ASSERT_TRUE(harvest_mask == 0) << "Harvest mask for chip " << device_id << ": " << harvest_mask << std::endl; } @@ -128,7 +128,7 @@ TEST_F(GalaxyFixture, ValidateAllMMIOChipsHaveSingleRowHarvested) { const chip_id_t device_id = device->id(); if (!is_galaxy_device(device_id)) { uint32_t num_rows_harvested = 0; - uint32_t harvest_mask = tt::Cluster::instance().get_harvested_rows(device_id); + uint32_t harvest_mask = tt::Cluster::instance().get_harvesting_mask(device_id); while (harvest_mask) { if (harvest_mask & 1) { num_rows_harvested++; diff --git a/tt_metal/api/tt-metalium/metal_soc_descriptor.h b/tt_metal/api/tt-metalium/metal_soc_descriptor.h index e554e1b7040..26d17e84fd8 100644 --- a/tt_metal/api/tt-metalium/metal_soc_descriptor.h +++ b/tt_metal/api/tt-metalium/metal_soc_descriptor.h @@ -25,7 +25,7 @@ struct metal_SocDescriptor : public tt_SocDescriptor { std::map logical_eth_core_to_chan_map; - metal_SocDescriptor(const tt_SocDescriptor& other, uint32_t harvesting_mask, const BoardType& board_type); + metal_SocDescriptor(const tt_SocDescriptor& other, const BoardType& board_type); metal_SocDescriptor() = default; CoreCoord get_preferred_worker_core_for_dram_view(int dram_view) const; diff --git a/tt_metal/common/metal_soc_descriptor.cpp b/tt_metal/common/metal_soc_descriptor.cpp index 7b41d62c8cf..e85f6e0ccb3 100644 --- a/tt_metal/common/metal_soc_descriptor.cpp +++ b/tt_metal/common/metal_soc_descriptor.cpp @@ -226,8 +226,7 @@ void metal_SocDescriptor::update_pcie_cores(const BoardType& board_type) { // removing the harvested physical coordiniates Metal needs the true harvesting state so we generate physical // descriptors from virtual coordinates We also initialize additional lookup tables to translate physical coordinates to // virtual coordinates because UMD APIs expect virtual coordinates. 
-metal_SocDescriptor::metal_SocDescriptor( - const tt_SocDescriptor& other, uint32_t harvesting_mask, const BoardType& board_type) : +metal_SocDescriptor::metal_SocDescriptor(const tt_SocDescriptor& other, const BoardType& board_type) : tt_SocDescriptor(other) { this->load_dram_metadata_from_device_descriptor(); this->generate_logical_eth_coords_mapping(); diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 4afa1b342a7..f92904fa902 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -638,7 +638,8 @@ void Device::initialize_and_launch_firmware() { // Determine which noc-coords are harvested // TODO(PGK/Almeet): fix this w/ new UMD std::vector harvested_rows; - uint32_t harvested_noc_rows = tt::Cluster::instance().get_harvested_rows(this->id()); + uint32_t harvested_noc_rows = CoordinateManager::shuffle_tensix_harvesting_mask_to_noc0_coords( + tt::Cluster::instance().get_soc_desc(this->id()).arch, tt::Cluster::instance().get_harvesting_mask(this->id())); for (uint32_t y = 0; y < soc_d.grid_size.y; y++) { bool row_harvested = (harvested_noc_rows >> y) & 0x1; if (row_harvested) { diff --git a/tt_metal/llrt/core_descriptor.cpp b/tt_metal/llrt/core_descriptor.cpp index 99fd72ec096..a4f04dbde80 100644 --- a/tt_metal/llrt/core_descriptor.cpp +++ b/tt_metal/llrt/core_descriptor.cpp @@ -66,7 +66,7 @@ const core_descriptor_t& get_core_descriptor_config( config_by_arch; ARCH arch = tt::Cluster::instance().arch(); - uint32_t harvesting_mask = tt::Cluster::instance().get_harvested_rows(device_id); + uint32_t harvesting_mask = tt::Cluster::instance().get_harvesting_mask(device_id); std::bitset<32> mask_bitset(harvesting_mask); uint32_t num_harvested_rows = mask_bitset.count(); diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index b7cecc47732..d6f678b7217 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -240,12 +240,10 @@ void Cluster::assign_mem_channels_to_devices( } } -void Cluster::get_metal_desc_from_tt_desc( - const std::unordered_map &input, - const std::unordered_map &per_chip_id_harvesting_masks) { - for (const auto& it : input) { - chip_id_t id = it.first; - this->sdesc_per_chip_.emplace(id, metal_SocDescriptor(it.second, per_chip_id_harvesting_masks.at(id), this->cluster_desc_->get_board_type(id))); +void Cluster::get_metal_desc_from_tt_desc() { + for (const auto& id : this->driver_->get_target_device_ids()) { + this->sdesc_per_chip_.emplace( + id, metal_SocDescriptor(this->driver_->get_soc_descriptor(id), this->cluster_desc_->get_board_type(id))); } } @@ -297,9 +295,8 @@ void Cluster::open_driver(const bool &skip_driver_allocs) { } device_driver->set_barrier_address_params(barrier_params); - this->get_metal_desc_from_tt_desc( - device_driver->get_virtual_soc_descriptors(), device_driver->get_harvesting_masks_for_soc_descriptors()); this->driver_ = std::move(device_driver); + this->get_metal_desc_from_tt_desc(); } void Cluster::start_driver(tt_device_params &device_params) const { @@ -474,14 +471,6 @@ const std::unordered_map Cluster::get_worker_logical_to_virtual_y(chip return worker_logical_to_virtual_y; } -uint32_t Cluster::get_harvested_rows(chip_id_t chip) const { - if (this->target_type_ == TargetDevice::Simulator) { - return 0; - } else { - return this->driver_->harvested_rows_per_target.at(chip); - } -} - int Cluster::get_device_aiclk(const chip_id_t &chip_id) const { if (this->arch_ == tt::ARCH::BLACKHOLE) { // For Blackhole bring up remove AICLK query due to lack 
of ARC message support diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 6f91b01300e..34f56508a75 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -86,9 +86,8 @@ class Cluster { const std::unordered_set& get_virtual_worker_cores(chip_id_t chip_id) const; const std::unordered_set& get_virtual_eth_cores(chip_id_t chip_id) const; - uint32_t get_harvested_rows(chip_id_t chip) const; uint32_t get_harvesting_mask(chip_id_t chip) const { - return this->driver_->get_harvesting_masks_for_soc_descriptors().at(chip); + return this->driver_->get_soc_descriptor(chip).harvesting_masks.tensix_harvesting_mask; } //! device driver and misc apis @@ -282,9 +281,7 @@ class Cluster { void open_driver(const bool& skip_driver_allocs = false); void start_driver(tt_device_params& device_params) const; - void get_metal_desc_from_tt_desc( - const std::unordered_map& input, - const std::unordered_map& per_chip_id_harvesting_masks); + void get_metal_desc_from_tt_desc(); void generate_virtual_to_umd_coord_mapping(); void generate_virtual_to_profiler_flat_id_mapping(); From f1cc691ceade44ee7eb6572a1d0c30cdc2a0945a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:27:23 +0100 Subject: [PATCH 254/316] [UMD] Remove a couple of leftover usages of old soc descriptor API (#17707) ### Ticket Related to https://github.com/tenstorrent/tt-metal/issues/17002 ### Problem description A couple of leftover usages of old soc descriptor API. After this and other PRs from this set, tt-metal will finally build with harvesting code completely removed from tt::umd::Cluster and members of tt_SocDescriptor made private, so that all usages are forced through get_cores() and other APIs. 
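As a rough illustration of the accessor-based pattern this enforces (a sketch based on the tlb_config change in this patch; the per-core body is a placeholder for whatever setup the caller performs):

```cpp
// Old pattern: iterate the raw member vectors (sdesc.workers, sdesc.ethernet_cores).
// New pattern: request cores of a given type from the descriptor, in the
// coordinate system the UMD layer expects.
for (const CoreCoord& core : sdesc.get_cores(CoreType::TENSIX, sdesc.get_umd_coord_system())) {
    // per-core setup, e.g. configuring a static TLB for this worker core
}
for (const CoreCoord& core : sdesc.get_cores(CoreType::ETH, sdesc.get_umd_coord_system())) {
    // same setup for ethernet cores
}
```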
Related PRs: https://github.com/tenstorrent/tt-metal/pull/17620 https://github.com/tenstorrent/tt-metal/pull/17642 https://github.com/tenstorrent/tt-metal/pull/17645 https://github.com/tenstorrent/tt-metal/pull/17674 https://github.com/tenstorrent/tt-metal/pull/17678 ### What's changed - .dram_cores changed with get_cores_for_dram_channel - dram_cores.size() changed with get_grid_size - replace .workers and .ethernet_cores from tlb_config with get_cores ### Checklist - [x] All post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197621962 - [x] Newest All post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13499189013 - [x] Blackhole post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197623746 - [ ] (Single-card) Model perf tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197626137 - [ ] (Single-card) Device perf regressions : https://github.com/tenstorrent/tt-metal/actions/runs/13197628487 - [ ] (T3K) T3000 unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197630092 - [ ] (T3K) T3000 demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197632086 - [ ] (TG) TG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197633394 - [ ] (TG) TG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197635275 - [x] (TGG) TGG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197637219 - [x] (TGG) TGG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13197639736 --- tt_metal/common/metal_soc_descriptor.cpp | 14 +++++++++----- tt_metal/llrt/tlb_config.cpp | 13 +++++++------ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/tt_metal/common/metal_soc_descriptor.cpp b/tt_metal/common/metal_soc_descriptor.cpp index e85f6e0ccb3..80341239f3b 100644 --- a/tt_metal/common/metal_soc_descriptor.cpp +++ b/tt_metal/common/metal_soc_descriptor.cpp @@ -134,18 +134,18 @@ void metal_SocDescriptor::load_dram_metadata_from_device_descriptor() { int worker_endpoint = dram_view["worker_endpoint"].as(); size_t address_offset = dram_view["address_offset"].as(); - if (channel >= dram_cores.size()) { + if (channel >= get_grid_size(CoreType::DRAM).x) { TT_THROW( "DRAM channel {} does not exist in the device descriptor, but is specified in dram_view.channel", channel); } - if (eth_endpoint >= dram_cores[channel].size()) { + if (eth_endpoint >= get_grid_size(CoreType::DRAM).y) { TT_THROW( "DRAM subchannel {} does not exist in the device descriptor, but is specified in " "dram_view.eth_endpoint", eth_endpoint); } - if (worker_endpoint >= dram_cores[channel].size()) { + if (worker_endpoint >= get_grid_size(CoreType::DRAM).y) { TT_THROW( "DRAM subchannel {} does not exist in the device descriptor, but is specified in " "dram_view.worker_endpoint", @@ -153,8 +153,12 @@ void metal_SocDescriptor::load_dram_metadata_from_device_descriptor() { } this->dram_view_channels.push_back(channel); - this->dram_view_eth_cores.push_back(dram_cores[channel][eth_endpoint]); - this->dram_view_worker_cores.push_back(dram_cores[channel][worker_endpoint]); + tt::umd::CoreCoord eth_dram_endpoint_coord = + get_dram_core_for_channel(channel, eth_endpoint, CoordSystem::VIRTUAL); + this->dram_view_eth_cores.push_back({eth_dram_endpoint_coord.x, eth_dram_endpoint_coord.y}); + tt::umd::CoreCoord worker_endpoint_coord = + get_dram_core_for_channel(channel, worker_endpoint, CoordSystem::VIRTUAL); + this->dram_view_worker_cores.push_back({worker_endpoint_coord.x, 
worker_endpoint_coord.y}); this->dram_view_address_offsets.push_back(address_offset); } } diff --git a/tt_metal/llrt/tlb_config.cpp b/tt_metal/llrt/tlb_config.cpp index e5459ca4c3d..1113be07843 100644 --- a/tt_metal/llrt/tlb_config.cpp +++ b/tt_metal/llrt/tlb_config.cpp @@ -172,14 +172,10 @@ void configure_static_tlbs( default: TT_THROW("Configuring static TLBs is not supported for {}", tt::get_string(arch)); } - auto statically_mapped_cores = sdesc.workers; - statically_mapped_cores.insert( - statically_mapped_cores.end(), sdesc.ethernet_cores.begin(), sdesc.ethernet_cores.end()); std::int32_t address = 0; - // Setup static TLBs for all worker cores - for (auto& core : statically_mapped_cores) { - auto tlb_index = get_static_tlb_index(core); + for (const CoreCoord& core : sdesc.get_cores(CoreType::TENSIX, sdesc.get_umd_coord_system())) { + auto tlb_index = get_static_tlb_index({core.x, core.y}); // TODO // Note: see issue #10107 // Strict is less performant than Posted, however, metal doesn't presently @@ -188,6 +184,11 @@ void configure_static_tlbs( // Revisit this when we have a more flexible UMD api device_driver.configure_tlb(mmio_device_id, core, tlb_index, address, TLB_DATA::Strict); } + // Setup static TLBs for all eth cores + for (const CoreCoord& core : sdesc.get_cores(CoreType::ETH, sdesc.get_umd_coord_system())) { + auto tlb_index = get_static_tlb_index({core.x, core.y}); + device_driver.configure_tlb(mmio_device_id, core, tlb_index, address, TLB_DATA::Strict); + } // TODO (#9932): Remove workaround for BH if (arch != tt::ARCH::BLACKHOLE) { From 504cd3d1f35924f91a3499511bbf74301c152f47 Mon Sep 17 00:00:00 2001 From: William Ly Date: Mon, 24 Feb 2025 09:38:06 -0500 Subject: [PATCH 255/316] [skip ci] #0: Fix crash when gtest xml contains no tests (#18208) ### Ticket ### Problem description Produce data flow started crashing due to an xml file where there are no tests ([job isn't running tests?](https://github.com/tenstorrent/tt-metal/actions/runs/13498939862/job/37712914831)) https://github.com/tenstorrent/tt-metal/actions/runs/13499985487/job/37715810765 ### What's changed Make sure that the length of the xml element tree has len > 0 before indexing into element 0 ### Checklist - [ ] New/Existing tests provide coverage for changes Rerun on existing failed run in fix branch: https://github.com/tenstorrent/tt-metal/actions/runs/13500449797/job/37717313193 Added unit test --- infra/data_collection/github/workflows.py | 2 +- infra/data_collection/junit_xml_utils.py | 4 ++-- .../distributed_unit_tests_wormhole_b0.xml | 3 +++ infra/tests/data_collection/test_cicd.py | 9 +++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_job_37712709106/distributed_unit_tests_wormhole_b0.xml diff --git a/infra/data_collection/github/workflows.py b/infra/data_collection/github/workflows.py index be5fbe661c6..1b2d979934a 100644 --- a/infra/data_collection/github/workflows.py +++ b/infra/data_collection/github/workflows.py @@ -264,5 +264,5 @@ def get_tests_from_test_report_path(test_report_path): return tests else: - logger.warning("XML is not pytest junit format (gtest?), skipping for now") + logger.warning("XML is not pytest junit or gtest format, or no tests were found in the XML, skipping for now") return [] diff --git a/infra/data_collection/junit_xml_utils.py b/infra/data_collection/junit_xml_utils.py index 310c5d74a6b..d19f8c3cb6c 100644 --- a/infra/data_collection/junit_xml_utils.py +++ 
b/infra/data_collection/junit_xml_utils.py @@ -31,7 +31,7 @@ def sanity_check_test_xml_(root_element, is_pytest=True): def is_pytest_junit_xml(root_element): - is_pytest = root_element[0].get("name") == "pytest" + is_pytest = len(root_element) > 0 and root_element[0].get("name") == "pytest" if is_pytest: sanity_check_test_xml_(root_element) @@ -40,7 +40,7 @@ def is_pytest_junit_xml(root_element): def is_gtest_xml(root_element): - is_gtest = root_element[0].get("name") != "pytest" + is_gtest = len(root_element) > 0 and root_element[0].get("name") != "pytest" if is_gtest: sanity_check_test_xml_(root_element, is_pytest=False) diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_job_37712709106/distributed_unit_tests_wormhole_b0.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_job_37712709106/distributed_unit_tests_wormhole_b0.xml new file mode 100644 index 00000000000..ab29f6fa648 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_job_37712709106/distributed_unit_tests_wormhole_b0.xml @@ -0,0 +1,3 @@ + + + diff --git a/infra/tests/data_collection/test_cicd.py b/infra/tests/data_collection/test_cicd.py index 99a97230ef2..440cd4ea115 100644 --- a/infra/tests/data_collection/test_cicd.py +++ b/infra/tests/data_collection/test_cicd.py @@ -1,6 +1,7 @@ import pytest import pathlib +from infra.data_collection.github import workflows from infra.data_collection.cicd import create_cicd_json_for_data_analysis from infra.data_collection.models import InfraErrorV1 @@ -224,3 +225,11 @@ def test_create_pipeline_json_for_gtest_testcases(workflow_run_gh_environment): # fails validation, job is expected be skipped assert len([x for x in pipeline.jobs if x.github_job_id == 37190219113]) == 0 + + +def test_empty_gtest_xml(workflow_run_gh_environment): + github_runner_environment = workflow_run_gh_environment + workflow_outputs_dir = pathlib.Path("tests/_data/data_collection/cicd/all_post_commit_job_37712709106/").resolve() + assert ( + workflows.get_tests_from_test_report_path(workflow_outputs_dir / "distributed_unit_tests_wormhole_b0.xml") == [] + ) From 70afbb0f2b4ecf3eed3e55cf17a92776d7477440 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Mon, 24 Feb 2025 11:06:25 -0500 Subject: [PATCH 256/316] #0: add .ttinsn to sfpi (#17800) ### Ticket NA ### Problem description We currently create bespoke sfpu insns using .word, that has 2 shortcomings 1) because of mapping symbols, they never disassemble to instructions (the dissassembler always shows them as data) 2) we have to manually swizzle them (move 2 bits from the front to the back) We can't use `.insn` because spu insns do not use the regular riscv encoding, so its length checking goes wrong. ### What's changed Added `.ttinsn` to the assembler. This will do the swizzling, and arrange for them to be disassemblable. Alter the ckernel_ops.h header appropriately (although this is machine-generated the generator is unavailable) There are other instances of the ckernel_ops.h headers in submodules, that will also need updating. This makes disassembling kernels much more pleasant. 
See SFPI release notes for example (https://github.com/tenstorrent/sfpi/releases/tag/v6.2.0) ### Checklist - [ YES] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ YES] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- tt_metal/hw/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tt_metal/hw/CMakeLists.txt b/tt_metal/hw/CMakeLists.txt index ced61995a75..75023541f37 100644 --- a/tt_metal/hw/CMakeLists.txt +++ b/tt_metal/hw/CMakeLists.txt @@ -21,8 +21,8 @@ set(TYPES include(FetchContent) set(SFPI_x86_64_Linux_RELEASE - "v6.1.0/sfpi-release.tgz" - "da98a135fe95a462c3b6b4e054dc159f" + "v6.2.0/sfpi-release.tgz" + "c546b57c3161b06d03de7473c4add5e5" ) if(DEFINED SFPI_${CMAKE_HOST_SYSTEM_PROCESSOR}_${CMAKE_HOST_SYSTEM_NAME}_RELEASE) set(SFPI_RELEASE "${SFPI_${CMAKE_HOST_SYSTEM_PROCESSOR}_${CMAKE_HOST_SYSTEM_NAME}_RELEASE}") From 2ce3545a22ab3c111c471d1b0306514eac7f6f66 Mon Sep 17 00:00:00 2001 From: Wenbin Lyu Date: Mon, 24 Feb 2025 10:24:49 -0600 Subject: [PATCH 257/316] Fix narrowing conversion in metal header (#17890) ### Ticket N/A ### Problem description Compiling an external metal cpp program with `clang++-17` fails with the following error ``` .../tt_metal/api/tt-metalium/dispatch_settings.hpp:82:59: error: non-constant-expression cannot be narrowed from type 'int' to 'uint32_t' (aka 'unsigned int') in initializer list [-Wc++11-narrowing] 82 | DispatchSettingsContainerKey k{core_type, hw_cqs}; | ^~~~~~ ``` ### What's changed Changed the loop variable to be u32, also removed one unused header. ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes Signed-off-by: wenbinlyuTT --- tt_metal/api/tt-metalium/dispatch_settings.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index d7a7161741a..c3becfb1467 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -10,7 +10,6 @@ #include "dev_msgs.h" // go_msg_t #include "hal.hpp" #include -#include #include "umd/device/tt_core_coordinates.h" namespace tt { @@ -78,7 +77,7 @@ class DispatchSettings { static constexpr std::array k_SupportedCoreTypes{CoreType::ETH, CoreType::WORKER}; auto& store = get_store(); for (const auto& core_type : k_SupportedCoreTypes) { - for (int hw_cqs = 1; hw_cqs <= MAX_NUM_HW_CQS; ++hw_cqs) { + for (uint32_t hw_cqs = 1; hw_cqs <= MAX_NUM_HW_CQS; ++hw_cqs) { DispatchSettingsContainerKey k{core_type, hw_cqs}; store[k] = DispatchSettings::defaults(core_type, cluster, hw_cqs); } From 585beff314f036c00e8eba322ea735e877a402c4 Mon Sep 17 00:00:00 2001 From: Edwin Lee Date: Mon, 24 Feb 2025 11:28:54 -0500 Subject: [PATCH 258/316] #17482: Add matmul validation to prevent illegal width + block sharded inputs (#17891) ### Ticket [17482](https://github.com/tenstorrent/tt-metal/issues/17482) ### Problem description Performing matmul with a block sharded + a width sharded input resulted in a device hang ### What's changed Added validation to error when invalid combination of block sharded input0 + non-DRAM width sharded input1 are received ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - https://github.com/tenstorrent/tt-metal/actions/runs/13460303299 - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp index b027c70e19c..6f8df9d82ff 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp @@ -1847,6 +1847,14 @@ void Matmul::validate( auto tensor_b_memory_layout = input_tensor_b.memory_config().memory_layout; TT_FATAL(tensor_b_memory_layout == TensorMemoryLayout::WIDTH_SHARDED, "Error"); if (input_tensor_b.buffer()->buffer_type() != tt_metal::BufferType::DRAM) { + const auto tensor_a_memory_layout = input_tensor_a.memory_config().memory_layout; + TT_FATAL( + (input_tensor_a.memory_config().is_sharded() && + tensor_a_memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) || + tensor_a_memory_layout == TensorMemoryLayout::INTERLEAVED, + "Error - non-DRAM width sharded input B requires input A to be interleaved or height " + "sharded, rather than {}", + tensor_a_memory_layout); TT_FATAL( program_config.per_core_N == (input_tensor_b.shard_spec().value().shape[1] / in1_tile_shape[1]), From 6abd95911679c17e9b3412823b351a6a241c2583 Mon Sep 17 00:00:00 2001 From: Yu Gao <145494740+yugaoTT@users.noreply.github.com> Date: Mon, 24 Feb 2025 12:08:12 -0500 Subject: [PATCH 259/316] =?UTF-8?q?#0:=20bump=20up=20trace=20region=20in?= 
=?UTF-8?q?=20resnet=20since=20Matmul=20is=20slightly=20la=E2=80=A6=20(#18?= =?UTF-8?q?214)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Checklist - [ ] nightly ttnn https://github.com/tenstorrent/tt-metal/actions/runs/13501295078 --- models/demos/t3000/resnet50/tests/test_resnet50_performant.py | 4 ++-- models/demos/tg/resnet50/tests/test_resnet50_performant.py | 4 ++-- models/demos/tgg/resnet50/tests/test_resnet50_performant.py | 4 ++-- .../demos/wormhole/resnet50/tests/test_resnet50_performant.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/models/demos/t3000/resnet50/tests/test_resnet50_performant.py b/models/demos/t3000/resnet50/tests/test_resnet50_performant.py index 4b17c3cea7c..6a9cb1e5230 100644 --- a/models/demos/t3000/resnet50/tests/test_resnet50_performant.py +++ b/models/demos/t3000/resnet50/tests/test_resnet50_performant.py @@ -43,7 +43,7 @@ def test_run_resnet50_inference( @run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 800768}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 803016}], indirect=True) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), @@ -98,7 +98,7 @@ def test_run_resnet50_2cqs_inference( @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 803016, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", diff --git a/models/demos/tg/resnet50/tests/test_resnet50_performant.py b/models/demos/tg/resnet50/tests/test_resnet50_performant.py index e1d7f44db63..532fa1f413f 100644 --- a/models/demos/tg/resnet50/tests/test_resnet50_performant.py +++ b/models/demos/tg/resnet50/tests/test_resnet50_performant.py @@ -48,7 +48,7 @@ def test_run_resnet50_inference( @run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 800768}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 803016}], indirect=True) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), @@ -113,7 +113,7 @@ def test_run_resnet50_2cqs_inference( @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 803016, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", diff --git a/models/demos/tgg/resnet50/tests/test_resnet50_performant.py b/models/demos/tgg/resnet50/tests/test_resnet50_performant.py index ef56feb8199..278bb57a215 100644 --- a/models/demos/tgg/resnet50/tests/test_resnet50_performant.py +++ b/models/demos/tgg/resnet50/tests/test_resnet50_performant.py @@ -49,7 +49,7 @@ def test_run_resnet50_2cqs_inference( @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 
24576, "trace_region_size": 803016, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", @@ -114,7 +114,7 @@ def test_run_resnet50_inference( @run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 800768}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 803016}], indirect=True) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), diff --git a/models/demos/wormhole/resnet50/tests/test_resnet50_performant.py b/models/demos/wormhole/resnet50/tests/test_resnet50_performant.py index 169f99fd4a9..5f33ad884b8 100644 --- a/models/demos/wormhole/resnet50/tests/test_resnet50_performant.py +++ b/models/demos/wormhole/resnet50/tests/test_resnet50_performant.py @@ -27,7 +27,7 @@ def test_run_resnet50_inference( @run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 800768}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 803016}], indirect=True) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), @@ -67,7 +67,7 @@ def test_run_resnet50_2cqs_inference( @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 803016, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype, math_fidelity", From aa09a6f63adfd5717460751fd3d0ff06af865ef4 Mon Sep 17 00:00:00 2001 From: William Ly Date: Mon, 24 Feb 2025 12:10:35 -0500 Subject: [PATCH 260/316] [skip ci] #0: Move --exclude-warning-annotations to pytest.ini (#18220) ### Ticket ... ### Problem description Warnings thrown by pytest inside profiler regressions show up as annotations in GHA after https://github.com/tenstorrent/tt-metal/pull/18106 because the tests run inside a bash script and don't have `--exclude-warning-annotations` set. 
### What's changed Add `--exclude-warning-annotations` to pytest.ini ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../fast-dispatch-build-and-unit-tests.yaml | 16 ++++++------ .github/workflows/models-post-commit.yaml | 1 + .github/workflows/ttnn-post-commit.yaml | 26 +++++++++---------- pytest.ini | 2 +- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml index aefef4fa0e2..cf9d0391576 100644 --- a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml +++ b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml @@ -52,14 +52,14 @@ jobs: matrix: os: ["${{ inputs.os }}"] test-group: [ - {name: eager unit tests 1, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 1 --exclude-warning-annotations }, - {name: eager unit tests 2, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 2 --exclude-warning-annotations }, - {name: eager unit tests 3, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 3 --exclude-warning-annotations }, - {name: eager unit tests 4, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 4 --exclude-warning-annotations }, - {name: eager unit tests 5, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 5 --exclude-warning-annotations }, - {name: eager unit tests 6, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 6 --exclude-warning-annotations }, - {name: eager unit tests 7, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 7 --exclude-warning-annotations }, - {name: sweep, cmd: pytest tests/tt_eager/python_api_testing/sweep_tests/pytests/ -xvvv --exclude-warning-annotations }, + {name: eager unit tests 1, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 1 }, + {name: eager unit tests 2, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 2 }, + {name: eager unit tests 3, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 3 }, + {name: eager unit tests 4, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 4 }, + {name: eager unit tests 5, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 5 }, + {name: eager unit tests 6, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 6 }, + {name: eager unit tests 7, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 7 }, + {name: sweep, cmd: pytest tests/tt_eager/python_api_testing/sweep_tests/pytests/ 
-xvvv }, ] name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }} env: diff --git a/.github/workflows/models-post-commit.yaml b/.github/workflows/models-post-commit.yaml index 0bb512a3dec..fe149160875 100644 --- a/.github/workflows/models-post-commit.yaml +++ b/.github/workflows/models-post-commit.yaml @@ -70,6 +70,7 @@ jobs: docker_password: ${{ secrets.GITHUB_TOKEN }} docker_opts: | -e ARCH_NAME=${{ inputs.arch }} + -e GITHUB_ACTIONS=true run_args: | source tests/scripts/run_python_model_tests.sh && run_python_model_tests_${{ inputs.arch }} - uses: ./.github/actions/slack-report diff --git a/.github/workflows/ttnn-post-commit.yaml b/.github/workflows/ttnn-post-commit.yaml index 5d579306c12..6081b2d9910 100644 --- a/.github/workflows/ttnn-post-commit.yaml +++ b/.github/workflows/ttnn-post-commit.yaml @@ -52,31 +52,31 @@ jobs: os: ["ubuntu-20.04"] test-group: - name: ttnn group 1 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 1 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 1 -m "not disable_fast_runtime_mode" - name: ttnn group 2 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 2 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 2 -m "not disable_fast_runtime_mode" - name: ttnn group 3 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 3 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 3 -m "not disable_fast_runtime_mode" - name: ttnn group 4 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 4 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 4 -m "not disable_fast_runtime_mode" - name: ttnn group 5 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 5 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 5 -m "not disable_fast_runtime_mode" - name: ttnn group 6 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 6 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 6 -m "not disable_fast_runtime_mode" - name: ttnn group 7 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 7 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 7 -m "not disable_fast_runtime_mode" - name: ttnn group 8 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 8 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 8 -m "not disable_fast_runtime_mode" - name: ttnn group 9 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 9 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 9 -m "not disable_fast_runtime_mode" - name: ttnn group 10 - cmd: pytest 
tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 10 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 10 -m "not disable_fast_runtime_mode" - name: ttnn group 11 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 11 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 11 -m "not disable_fast_runtime_mode" - name: ttnn group 12 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 12 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 12 -m "not disable_fast_runtime_mode" - name: ttnn fast runtime off - cmd: pytest tests/ttnn/unit_tests -xv --exclude-warning-annotations -m requires_fast_runtime_mode_off + cmd: pytest tests/ttnn/unit_tests -xv -m requires_fast_runtime_mode_off fast_runtime_mode_off: true - name: ttnn example tests cmd: ./tests/scripts/run_ttnn_examples.sh diff --git a/pytest.ini b/pytest.ini index aad47e86061..74b9a432203 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] timeout = 300 minversion = 7.2 -addopts = --import-mode=importlib -vvs -rA --durations=0 --junitxml=generated/test_reports/most_recent_tests.xml +addopts = --import-mode=importlib -vvs -rA --durations=0 --junitxml=generated/test_reports/most_recent_tests.xml --exclude-warning-annotations empty_parameter_set_mark = skip markers = post_commit: mark tests to run on post-commit From f64daca995f25c96a90648ce9f7a3e1f7fe7ae52 Mon Sep 17 00:00:00 2001 From: William Ly Date: Mon, 24 Feb 2025 13:15:27 -0500 Subject: [PATCH 261/316] Revert "#0: Move --exclude-warning-annotations to pytest.ini" (#18224) Reverts tenstorrent/tt-metal#18220 --- .../fast-dispatch-build-and-unit-tests.yaml | 16 ++++++------ .github/workflows/models-post-commit.yaml | 1 - .github/workflows/ttnn-post-commit.yaml | 26 +++++++++---------- pytest.ini | 2 +- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml index cf9d0391576..aefef4fa0e2 100644 --- a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml +++ b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml @@ -52,14 +52,14 @@ jobs: matrix: os: ["${{ inputs.os }}"] test-group: [ - {name: eager unit tests 1, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 1 }, - {name: eager unit tests 2, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 2 }, - {name: eager unit tests 3, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 3 }, - {name: eager unit tests 4, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 4 }, - {name: eager unit tests 5, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 5 }, - {name: eager unit tests 6, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 6 }, - {name: eager unit tests 7, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 7 }, - {name: sweep, cmd: pytest tests/tt_eager/python_api_testing/sweep_tests/pytests/ -xvvv }, + {name: eager unit tests 1, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv 
--splits 7 --group 1 --exclude-warning-annotations }, + {name: eager unit tests 2, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 2 --exclude-warning-annotations }, + {name: eager unit tests 3, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 3 --exclude-warning-annotations }, + {name: eager unit tests 4, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 4 --exclude-warning-annotations }, + {name: eager unit tests 5, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 5 --exclude-warning-annotations }, + {name: eager unit tests 6, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 6 --exclude-warning-annotations }, + {name: eager unit tests 7, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 7 --exclude-warning-annotations }, + {name: sweep, cmd: pytest tests/tt_eager/python_api_testing/sweep_tests/pytests/ -xvvv --exclude-warning-annotations }, ] name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }} env: diff --git a/.github/workflows/models-post-commit.yaml b/.github/workflows/models-post-commit.yaml index fe149160875..0bb512a3dec 100644 --- a/.github/workflows/models-post-commit.yaml +++ b/.github/workflows/models-post-commit.yaml @@ -70,7 +70,6 @@ jobs: docker_password: ${{ secrets.GITHUB_TOKEN }} docker_opts: | -e ARCH_NAME=${{ inputs.arch }} - -e GITHUB_ACTIONS=true run_args: | source tests/scripts/run_python_model_tests.sh && run_python_model_tests_${{ inputs.arch }} - uses: ./.github/actions/slack-report diff --git a/.github/workflows/ttnn-post-commit.yaml b/.github/workflows/ttnn-post-commit.yaml index 6081b2d9910..5d579306c12 100644 --- a/.github/workflows/ttnn-post-commit.yaml +++ b/.github/workflows/ttnn-post-commit.yaml @@ -52,31 +52,31 @@ jobs: os: ["ubuntu-20.04"] test-group: - name: ttnn group 1 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 1 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 1 -m "not disable_fast_runtime_mode" - name: ttnn group 2 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 2 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 2 -m "not disable_fast_runtime_mode" - name: ttnn group 3 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 3 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 3 -m "not disable_fast_runtime_mode" - name: ttnn group 4 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 4 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 4 -m "not disable_fast_runtime_mode" - name: ttnn group 5 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 5 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 5 -m "not disable_fast_runtime_mode" - name: ttnn group 6 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 6 -m "not disable_fast_runtime_mode" + cmd: 
pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 6 -m "not disable_fast_runtime_mode" - name: ttnn group 7 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 7 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 7 -m "not disable_fast_runtime_mode" - name: ttnn group 8 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 8 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 8 -m "not disable_fast_runtime_mode" - name: ttnn group 9 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 9 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 9 -m "not disable_fast_runtime_mode" - name: ttnn group 10 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 10 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 10 -m "not disable_fast_runtime_mode" - name: ttnn group 11 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 11 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 11 -m "not disable_fast_runtime_mode" - name: ttnn group 12 - cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --group 12 -m "not disable_fast_runtime_mode" + cmd: pytest tests/ttnn/unit_tests -xv --splits ${{ inputs.num-groups }} --exclude-warning-annotations --group 12 -m "not disable_fast_runtime_mode" - name: ttnn fast runtime off - cmd: pytest tests/ttnn/unit_tests -xv -m requires_fast_runtime_mode_off + cmd: pytest tests/ttnn/unit_tests -xv --exclude-warning-annotations -m requires_fast_runtime_mode_off fast_runtime_mode_off: true - name: ttnn example tests cmd: ./tests/scripts/run_ttnn_examples.sh diff --git a/pytest.ini b/pytest.ini index 74b9a432203..aad47e86061 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] timeout = 300 minversion = 7.2 -addopts = --import-mode=importlib -vvs -rA --durations=0 --junitxml=generated/test_reports/most_recent_tests.xml --exclude-warning-annotations +addopts = --import-mode=importlib -vvs -rA --durations=0 --junitxml=generated/test_reports/most_recent_tests.xml empty_parameter_set_mark = skip markers = post_commit: mark tests to run on post-commit From 4626607a9294ee46724b6e8c7aaf50ef5aed4835 Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Mon, 24 Feb 2025 18:43:20 +0000 Subject: [PATCH 262/316] Fix all post commit Started failing after 190547b5dcdbd12724b4717b40a72ac627a2196b --- tests/ttnn/unit_tests/operations/test_new_conv2d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 471e2aa3817..7a6a83ec276 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -2668,7 +2668,7 @@ def test_dram_input_mm_conv(device, torch_tensor_map, tiled_input, input_on_devi kernel_shape = (out_channels, in_channels, kernel_h, kernel_w) torch_kernel = randomize_torch_tensor(torch_tensor_map, 
kernel_shape) - tt_kernel = ttnn.from_torch(torch_kernel) + tt_kernel = ttnn.from_torch(torch_kernel, dtype=ttnn.bfloat16) torch_input = randomize_torch_tensor(torch_tensor_map, input_shape) if input_on_device: @@ -2677,7 +2677,7 @@ def test_dram_input_mm_conv(device, torch_tensor_map, tiled_input, input_on_devi tt_input = ttnn.reshape(tt_input, (1, 1, batch_size * img_h * img_w, in_channels)) else: torch_input_nhwc = torch.permute(torch_input, (0, 2, 3, 1)) - tt_input = ttnn.from_torch(torch_input_nhwc) + tt_input = ttnn.from_torch(torch_input_nhwc, dtype=ttnn.bfloat16) if tiled_input: tt_input = ttnn.to_layout(tt_input, ttnn.TILE_LAYOUT) From 3003a67a713a0e939f6b2975a6106e53c35fe57d Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Mon, 24 Feb 2025 13:56:16 -0500 Subject: [PATCH 263/316] Move TGG Unit Tests to 22.04 (#18227) ### Ticket #14393 ### Problem description 20.04 is going the way of the Dodo. ### What's changed Flipped TGG-Unit to 22.04 ### Checklist - [x] TGG Unit [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13504215225) --- .github/workflows/tgg-unit-tests.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tgg-unit-tests.yaml b/.github/workflows/tgg-unit-tests.yaml index 9d1bba42a64..9251d491e36 100644 --- a/.github/workflows/tgg-unit-tests.yaml +++ b/.github/workflows/tgg-unit-tests.yaml @@ -10,6 +10,7 @@ jobs: uses: ./.github/workflows/build-artifact.yaml secrets: inherit with: + version: "22.04" build-wheel: true TGG-tests: needs: build-artifact From 3b3ca0ce1c7e51c6bacc7142a2ebe7f3f24e623c Mon Sep 17 00:00:00 2001 From: Marko Bezulj <156311081+mbezuljTT@users.noreply.github.com> Date: Mon, 24 Feb 2025 20:13:51 +0100 Subject: [PATCH 264/316] fix sliding window hash calculus (#18053) ### Problem description Sliding Window Infra hash calculus didn't account for is_bilinear, is_transpose and snap_to_tile. This was causing a customer model to fail. 
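The failure mode is easy to reproduce in isolation: if the string that feeds the hash omits a field, two configs that differ only in that field hash to the same key, and a cached program built for one config can be reused for the other. The sketch below uses a toy struct rather than the real `SlidingWindowConfig` to illustrate the collision and how appending the missing flags (as the fix does) separates the keys:

```cpp
// Illustrative sketch only (toy struct, not the repo's SlidingWindowConfig):
// a string-based hash that ignores boolean flags maps two different configs
// to the same key; appending the flags makes the keys distinct.
#include <functional>
#include <iostream>
#include <string>

struct ToyConfig {
    int window_h = 3;
    bool is_bilinear = false;
    bool snap_to_tile = false;

    // Old behaviour: flags never reach the string, so they never reach the hash.
    std::string to_string_old() const { return std::to_string(window_h); }

    // Fixed behaviour: flags are appended, so flipping one changes the hash.
    std::string to_string_new() const {
        return std::to_string(window_h) + (is_bilinear ? "_bilinear" : "") +
               (snap_to_tile ? "_snap_to_tile" : "");
    }
};

int main() {
    ToyConfig a;
    ToyConfig b = a;
    b.is_bilinear = true;  // differs only in a flag the old string ignored

    std::hash<std::string> h;
    std::cout << (h(a.to_string_old()) == h(b.to_string_old())) << "\n";  // 1: collision
    std::cout << (h(a.to_string_new()) == h(b.to_string_new())) << "\n";  // 0: keys differ
    return 0;
}
```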
### What's changed Updated SlidingWindowConfig::to_string(), that is used by SlidingWindowConfig::to_hash() ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- tests/ttnn/unit_tests/gtests/CMakeLists.txt | 1 + .../gtests/test_sliding_window_infra.cpp | 73 +++++++++++++++++++ .../sliding_window/sliding_window.cpp | 3 +- 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 tests/ttnn/unit_tests/gtests/test_sliding_window_infra.cpp diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt index 931739e9e6b..93fedd81a9f 100644 --- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt +++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt @@ -9,6 +9,7 @@ set(TTNN_UNIT_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/test_graph_query_op_runtime.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_reflect.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_to_and_from_json.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sliding_window_infra.cpp ) set(TTNN_CCL_UNIT_TESTS_SRC diff --git a/tests/ttnn/unit_tests/gtests/test_sliding_window_infra.cpp b/tests/ttnn/unit_tests/gtests/test_sliding_window_infra.cpp new file mode 100644 index 00000000000..c0d345c2e17 --- /dev/null +++ b/tests/ttnn/unit_tests/gtests/test_sliding_window_infra.cpp @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "gtest/gtest.h" + +#include "ttnn/operations/sliding_window/sliding_window.hpp" +#include "tt_metal/api/tt-metalium/core_coord.hpp" + +namespace ttnn::operations::sliding_window::test { + +using namespace tt::tt_metal; + +class SlidingWindowTestFixture : public testing::TestWithParam<SlidingWindowConfig> {}; + +TEST_P(SlidingWindowTestFixture, SlidingWindowHash) { + auto sliding_window_a = GetParam(); + + // start of same input + auto sliding_window_b = sliding_window_a; + log_info(tt::LogTest, "sliding_window_a:[{}] {}", sliding_window_a.get_hash(), sliding_window_a.to_string()); + log_info(tt::LogTest, "sliding_window_b:[{}] {}", sliding_window_b.get_hash(), sliding_window_b.to_string()); + EXPECT_EQ(sliding_window_a.get_hash(), sliding_window_b.get_hash()); + + // flip snap_to_tile + sliding_window_b.snap_to_tile = !sliding_window_a.snap_to_tile; + log_info(tt::LogTest, "sliding_window_a:[{}] {}", sliding_window_a.get_hash(), sliding_window_a.to_string()); + log_info(tt::LogTest, "sliding_window_b:[{}] {}", sliding_window_b.get_hash(), sliding_window_b.to_string()); + EXPECT_NE(sliding_window_a.get_hash(), sliding_window_b.get_hash()); + sliding_window_b.snap_to_tile = !sliding_window_a.snap_to_tile; + + // flip is_bilinear + sliding_window_b.is_bilinear = !sliding_window_a.is_bilinear; + log_info(tt::LogTest, "sliding_window_a:[{}] {}", sliding_window_a.get_hash(), sliding_window_a.to_string()); + log_info(tt::LogTest, "sliding_window_b:[{}] {}", sliding_window_b.get_hash(), sliding_window_b.to_string()); + EXPECT_NE(sliding_window_a.get_hash(), sliding_window_b.get_hash()); + sliding_window_b.is_bilinear = !sliding_window_a.is_bilinear; + + // flip is_transpose + sliding_window_b.is_transpose = !sliding_window_a.is_transpose; + log_info(tt::LogTest, "sliding_window_a:[{}] {}", sliding_window_a.get_hash(), sliding_window_a.to_string()); + log_info(tt::LogTest, "sliding_window_b:[{}] {}", sliding_window_b.get_hash(), sliding_window_b.to_string()); + EXPECT_NE(sliding_window_a.get_hash(), sliding_window_b.get_hash()); + sliding_window_b.is_transpose = !sliding_window_a.is_transpose; + + // flip ceil_mode + sliding_window_b.ceil_mode = !sliding_window_a.ceil_mode; + log_info(tt::LogTest, "sliding_window_a:[{}] {}", sliding_window_a.get_hash(), sliding_window_a.to_string()); + log_info(tt::LogTest, "sliding_window_b:[{}] {}", sliding_window_b.get_hash(), sliding_window_b.to_string()); + EXPECT_NE(sliding_window_a.get_hash(), sliding_window_b.get_hash()); + sliding_window_b.ceil_mode = !sliding_window_a.ceil_mode; +} + +INSTANTIATE_TEST_SUITE_P( + SlidingWindowHashTests, + SlidingWindowTestFixture, + ::testing::Values(SlidingWindowConfig{ + .batch_size = 1, + .input_hw = {32, 32}, + .window_hw = {3, 3}, + .stride_hw = {1, 1}, + .pad_hw = {1, 1}, + .output_pad_hw = {0, 0}, + .dilation_hw = {1, 1}, + .num_cores_nhw = 1, + .num_cores_c = 1, + .core_range_set = tt::tt_metal::CoreRangeSet(tt::tt_metal::CoreRange({0, 0}, {7, 7})), + .snap_to_tile = false, + .is_bilinear = false, + .is_transpose = false, + .ceil_mode = false})); + +} // namespace ttnn::operations::sliding_window::test diff --git a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp index b53e4ea806b..c6c94c857a7 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp @@ -699,7 +699,8 @@ std::string SlidingWindowConfig::to_string() const {
std::to_string(std::get<1>(stride_hw)) + "_" + std::to_string(std::get<0>(pad_hw)) + "_" + std::to_string(std::get<1>(pad_hw)) + "_" + std::to_string(std::get<0>(dilation_hw)) + "_" + std::to_string(std::get<1>(dilation_hw)) + "_" + std::to_string(num_cores_nhw) + "_" + - std::to_string(num_cores_c) + "_" + std::to_string(ceil_mode) + "_" + core_range_set.str(); + std::to_string(num_cores_c) + "_" + core_range_set.str() + (snap_to_tile ? "_snap_to_tile" : "") + + (is_bilinear ? "_bilinear" : "") + (is_transpose ? "_transpose" : "") + (ceil_mode ? "_ceil_mode" : ""); } } // namespace ttnn::operations::sliding_window From a8a19fb0137245a77eaccbb71f5c801e30db2480 Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Mon, 24 Feb 2025 14:48:29 -0500 Subject: [PATCH 265/316] #18237: Skip new conv2d test_dram_input_mm_conv test because it breaks after reverting pre-calculation changes --- tests/ttnn/unit_tests/operations/test_new_conv2d.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 7a6a83ec276..759e7255f55 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -2651,6 +2651,7 @@ def test_shallow_conv_with_tiled_input(device): # Tests running conv2d which maps to matmul w/o sharding the input tensor. # Output tensor is in DRAM. +@pytest.mark.skip("#18237: Need to fix after pre-calculation revert") @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize("tiled_input", [True, False]) @pytest.mark.parametrize("input_on_device", [True, False]) From 5e4a848bbab4ec15626f2f5ee4ff860eae91f6a1 Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Mon, 24 Feb 2025 14:49:11 -0500 Subject: [PATCH 266/316] #0: Revert "#18237: Skip new conv2d test_dram_input_mm_conv test because it breaks after reverting pre-calculation changes" This reverts commit a8a19fb0137245a77eaccbb71f5c801e30db2480. Pavle already fixed. MY BAD --- tests/ttnn/unit_tests/operations/test_new_conv2d.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 759e7255f55..7a6a83ec276 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -2651,7 +2651,6 @@ def test_shallow_conv_with_tiled_input(device): # Tests running conv2d which maps to matmul w/o sharding the input tensor. # Output tensor is in DRAM. -@pytest.mark.skip("#18237: Need to fix after pre-calculation revert") @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) @pytest.mark.parametrize("tiled_input", [True, False]) @pytest.mark.parametrize("input_on_device", [True, False]) From fea80ae3fb1b43c61fbdfe6c24ffe09d83bf800b Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Mon, 24 Feb 2025 15:12:41 -0500 Subject: [PATCH 267/316] [skip ci] Dockerize TGG Demo tests (#18233) ### Ticket #18188 ### Problem description This workflow was limited to the OS of the host machine. ### What's changed Dockerized the workflow. 
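In outline, the job now executes inside the CI build image rather than directly on the runner's host OS. A schematic sketch of the pattern, with values taken from the diff below (the real job also mounts hugepages and /mnt/MLPerf and sets the usual env vars):

```yaml
# Sketch only: run the job's steps inside the CI Docker container so the test
# environment no longer depends on the runner's host OS.
runs-on: ${{ matrix.test-group.runs-on }}
container:
  image: ${{ needs.build-artifact.outputs.ci-build-docker-image }}
  options: "--device /dev/tenstorrent"
  volumes:
    - ${{ github.workspace }}/docker-job:/work
```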
### Checklist - [x] TGG Demo [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13505602791) --- .github/workflows/tgg-demo-tests.yaml | 54 ++++++++++++++++++++------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/.github/workflows/tgg-demo-tests.yaml b/.github/workflows/tgg-demo-tests.yaml index 908fd1e0588..4b8b0c4acd7 100644 --- a/.github/workflows/tgg-demo-tests.yaml +++ b/.github/workflows/tgg-demo-tests.yaml @@ -9,6 +9,8 @@ jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml secrets: inherit + with: + build-wheel: true tgg-demo-tests: needs: build-artifact strategy: @@ -23,26 +25,52 @@ jobs: }, ] name: ${{ matrix.test-group.name }} - env: - ARCH_NAME: ${{ matrix.test-group.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib runs-on: ${{ matrix.test-group.runs-on }} + container: + image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} + env: + TT_METAL_HOME: /work + PYTHONPATH: /work + LD_LIBRARY_PATH: /work/build/lib + LOGURU_LEVEL: INFO + ARCH_NAME: ${{ matrix.test-group.arch }} + volumes: + - ${{ github.workspace }}/docker-job:/work # Subdir to workaround https://github.com/actions/runner/issues/691 + - /dev/hugepages-1G:/dev/hugepages-1G + - /mnt/MLPerf:/mnt/MLPerf + options: "--device /dev/tenstorrent" + defaults: + run: + shell: bash + working-directory: /work # https://github.com/actions/runner/issues/878 steps: - - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - name: ⬇️ Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end - uses: actions/download-artifact@v4 with: - name: TTMetal_build_any + name: ${{ needs.build-artifact.outputs.build-artifact-name }} + path: /work - name: Extract files run: tar -xvf ttm_any.tar - - uses: ./.github/actions/install-python-deps + - name: ⬇️ Download Wheel + uses: actions/download-artifact@v4 + with: + name: ${{ needs.build-artifact.outputs.wheel-artifact-name }} + path: /work + - name: Install Wheel + run: | + WHEEL_FILENAME=$(ls -1 *.whl) + pip3 install $WHEEL_FILENAME - name: Run demo regression tests timeout-minutes: 180 run: | - source ${{ github.workspace }}/python_env/bin/activate - cd $TT_METAL_HOME - export PYTHONPATH=$TT_METAL_HOME ${{ matrix.test-group.cmd }} + - name: Cleanup + if: always() + run: | + # We are forced to checkout the repo into a subdir of the host's workdir; this pollutes the host + # with root-owned files. Be sure to clean up after ourselves in case we're on a non-ephemeral runner. + rm -rf /__w/tt-metal/tt-metal/docker-job From 6885ea406c58e41d7d5e05e00755b7a67b5c679c Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Mon, 24 Feb 2025 15:47:09 -0500 Subject: [PATCH 268/316] [skip ci] Run TGG Demo on 22.04 (#18242) ### Ticket https://github.com/tenstorrent/tt-metal/issues/14393 Problem description 20.04 is going the way of the Dodo. 
What's changed Flipped TGG-Unit to 22.04 Checklist - [x] TGG Unit [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13507017565) --- .github/workflows/tgg-demo-tests.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tgg-demo-tests.yaml b/.github/workflows/tgg-demo-tests.yaml index 4b8b0c4acd7..27d53cd91ee 100644 --- a/.github/workflows/tgg-demo-tests.yaml +++ b/.github/workflows/tgg-demo-tests.yaml @@ -11,6 +11,7 @@ jobs: secrets: inherit with: build-wheel: true + version: 22.04 tgg-demo-tests: needs: build-artifact strategy: From 283fa1e9b783a2385c6e4f8999e03159398b86d3 Mon Sep 17 00:00:00 2001 From: William Ly Date: Mon, 24 Feb 2025 15:51:17 -0500 Subject: [PATCH 269/316] #17382: Classify test failure annotations into failing python/cpp test buckets for superset (#18112) ### Ticket https://github.com/tenstorrent/tt-metal/issues/17382 ### Problem description https://github.com/tenstorrent/tt-metal/pull/18106 handles generating test failure annotations in GHA. The next step is to read the annotations and bucket them separately from infra errors. ### What's changed - Create new buckets `TestErrorV1.PY_TEST_FAILURE` and `TestErrorV1.CPP_TEST_FAILURE` for python and cpp tests respectively - Add unit tests - Also add new enum field `job.job_status` (resolves https://github.com/tenstorrent/tt-metal/issues/17811) ### Checklist - [x] New/Existing tests provide coverage for changes --- infra/data_collection/github/utils.py | 26 +- infra/data_collection/models.py | 6 + infra/data_collection/pydantic_models.py | 17 + .../unit_tests_api_grayskull.xml | 232 ++++ .../most_recent_tests.xml | 5 + .../13443325356/logs/37563095078.log | 1198 +++++++++++++++++ .../logs/37563095078_annotations.json | 1 + .../13443325356/logs/37563108566.log | 570 ++++++++ .../logs/37563108566_annotations.json | 1 + .../workflow.json | 1 + .../workflow_jobs.json | 272 ++++ infra/tests/data_collection/test_cicd.py | 57 +- 12 files changed, 2380 insertions(+), 6 deletions(-) create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/artifacts/test_reports_0c4930ff-041a-4c44-ad7d-7f38b72b304a/unit_tests_api_grayskull.xml create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/artifacts/test_reports_3625ce52-baf1-4c13-89e7-fc467452e238/most_recent_tests.xml create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563095078.log create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563095078_annotations.json create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563108566.log create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563108566_annotations.json create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow.json create mode 100644 infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow_jobs.json diff --git a/infra/data_collection/github/utils.py b/infra/data_collection/github/utils.py index 1761285f225..7c58d351b5f 100644 --- a/infra/data_collection/github/utils.py +++ b/infra/data_collection/github/utils.py @@ -11,7 +11,7 @@ from loguru import logger -from infra.data_collection.models import InfraErrorV1 +from 
infra.data_collection.models import InfraErrorV1, TestErrorV1 from infra.data_collection.pydantic_models import CompleteBenchmarkRun @@ -134,10 +134,24 @@ def get_failure_signature_and_description_from_annotations(github_job, github_jo if job_id in github_job_id_to_annotations: annotation_info = github_job_id_to_annotations[job_id] - # Iterate over list of job annotation's until first failure-level annotation message - failure_description = next((d["message"] for d in annotation_info if d["annotation_level"] == "failure"), None) - if failure_description: - failure_signature = get_job_failure_signature_(github_job, failure_description) + for _annot in annotation_info: + if _annot["annotation_level"] == "failure": + # Unit test failure: a failure exists where the annotation path is not .github + if _annot["path"] != ".github": + failure_description = _annot["path"] + if ".py" in failure_description: + failure_signature = str(TestErrorV1.PY_TEST_FAILURE) + elif ".cpp" in failure_description: + failure_signature = str(TestErrorV1.CPP_TEST_FAILURE) + else: + failure_signature = str(TestErrorV1.UNKNOWN_TEST_FAILURE) + return failure_signature, failure_description + else: + # Infrastructure error + failure_description = _annot.get("message") + if failure_description: + failure_signature = get_job_failure_signature_(github_job, failure_description) + return failure_signature, failure_description return failure_signature, failure_description @@ -234,6 +248,7 @@ def get_job_row_from_github_job(github_job, github_job_id_to_annotations): # skipped jobs are considered passing jobs (nothing was run) job_success = github_job["conclusion"] in ["success", "skipped"] + job_status = github_job["conclusion"] is_build_job = "build" in name or "build" in labels @@ -260,6 +275,7 @@ def get_job_row_from_github_job(github_job, github_job_id_to_annotations): "job_start_ts": job_start_ts, "job_end_ts": job_end_ts, "job_success": job_success, + "job_status": job_status, "is_build_job": is_build_job, "job_matrix_config": job_matrix_config, "docker_image": docker_image, diff --git a/infra/data_collection/models.py b/infra/data_collection/models.py index 078e55d04c2..5adec9cbed0 100644 --- a/infra/data_collection/models.py +++ b/infra/data_collection/models.py @@ -11,3 +11,9 @@ class InfraErrorV1(enum.Enum): RUNNER_SHUTDOWN_FAILURE = enum.auto() API_RATE_LIMIT_FAILURE = enum.auto() RUNNER_CARD_IN_USE_FAILURE = enum.auto() + + +class TestErrorV1(enum.Enum): + PY_TEST_FAILURE = enum.auto() + CPP_TEST_FAILURE = enum.auto() + UNKNOWN_TEST_FAILURE = enum.auto() diff --git a/infra/data_collection/pydantic_models.py b/infra/data_collection/pydantic_models.py index 4972e446d62..d9288df28e3 100644 --- a/infra/data_collection/pydantic_models.py +++ b/infra/data_collection/pydantic_models.py @@ -9,6 +9,7 @@ from datetime import datetime from typing import List, Optional +from enum import Enum from pydantic import BaseModel, Field, model_validator @@ -35,6 +36,17 @@ class Test(BaseModel): tags: Optional[dict] = Field(None, description="Tags associated with the test, as key/value pairs.") +class JobStatus(str, Enum): + success = "success" + failure = "failure" + skipped = "skipped" + cancelled = "cancelled" + neutral = "neutral" + unknown = "unknown" + timed_out = "timed_out" + action_required = "action_required" + + class Job(BaseModel): """ Contains information about the execution of CI/CD jobs, each one associated with a @@ -61,6 +73,11 @@ class Job(BaseModel): "criteria. 
Failure mechanisms that are only descriptive of the " "job itself." ) + job_status: Optional[JobStatus] = Field( + None, + description="Job execution status, possible statuses include success, failure, " + "skipped, cancelled, neutral, etc.", + ) docker_image: Optional[str] = Field(None, description="Name of the Docker image used for the CI job.") is_build_job: bool = Field(description="Flag identifying if the job is a software build.") job_matrix_config: Optional[dict] = Field( diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/artifacts/test_reports_0c4930ff-041a-4c44-ad7d-7f38b72b304a/unit_tests_api_grayskull.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/artifacts/test_reports_0c4930ff-041a-4c44-ad7d-7f38b72b304a/unit_tests_api_grayskull.xml new file mode 100644 index 00000000000..48d8e35c6d9 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/artifacts/test_reports_0c4930ff-041a-4c44-ad7d-7f38b72b304a/unit_tests_api_grayskull.xml @@ -0,0 +1,232 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/artifacts/test_reports_3625ce52-baf1-4c13-89e7-fc467452e238/most_recent_tests.xml b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/artifacts/test_reports_3625ce52-baf1-4c13-89e7-fc467452e238/most_recent_tests.xml new file mode 100644 index 00000000000..156cdd1cb48 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/artifacts/test_reports_3625ce52-baf1-4c13-89e7-fc467452e238/most_recent_tests.xml @@ -0,0 +1,5 @@ +def test_do_not_submit(): +> assert True == False +E assert True == False + +tests/ttnn/unit_tests/operations/test_examples.py:107: AssertionError diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563095078.log b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563095078.log new file mode 100644 index 00000000000..bfe5830d6ab --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563095078.log @@ -0,0 +1,1198 @@ +2025-02-20T20:33:26.4528730Z Current runner version: '2.322.0' +2025-02-20T20:33:26.4535556Z Runner name: 'tt-metal-ci-vm-105' +2025-02-20T20:33:26.4536481Z Runner group name: 'Default' +2025-02-20T20:33:26.4537609Z Machine name: 'tt-metal-ci-vm-105' +2025-02-20T20:33:26.4541541Z ##[group]GITHUB_TOKEN Permissions +2025-02-20T20:33:26.4544075Z Actions: read +2025-02-20T20:33:26.4544743Z Contents: write +2025-02-20T20:33:26.4545547Z Metadata: read +2025-02-20T20:33:26.4546255Z Packages: write +2025-02-20T20:33:26.4546982Z Pages: write +2025-02-20T20:33:26.4547656Z PullRequests: write +2025-02-20T20:33:26.4548497Z ##[endgroup] +2025-02-20T20:33:26.4551927Z Secret source: Actions +2025-02-20T20:33:26.4552884Z Prepare 
workflow directory +2025-02-20T20:33:26.6947989Z Prepare all required actions +2025-02-20T20:33:26.6999916Z Getting action download info +2025-02-20T20:33:26.8689560Z Download action repository 'tenstorrent/tt-metal@main' (SHA:fd3ed75e96eb5b555f2f39cdefd37d8698ff8418) +2025-02-20T20:33:33.2618527Z Getting action download info +2025-02-20T20:33:33.4208520Z Download action repository 'actions/checkout@v4' (SHA:11bd71901bbe5b1630ceea73d27597364c9af683) +2025-02-20T20:33:33.9990691Z Uses: tenstorrent/tt-metal/.github/workflows/build-and-unit-tests.yaml@refs/heads/williamly/test-failure-annotations (94429171440755ffe7c62085c4807d447dd369dc) +2025-02-20T20:33:33.9993356Z ##[group] Inputs +2025-02-20T20:33:33.9994044Z build-type: Release +2025-02-20T20:33:33.9994905Z with-retries: false +2025-02-20T20:33:33.9995431Z arch: grayskull +2025-02-20T20:33:33.9995928Z runner-label: E150 +2025-02-20T20:33:33.9996906Z timeout: 35 +2025-02-20T20:33:33.9997367Z os: ubuntu-20.04 +2025-02-20T20:33:33.9997860Z ##[endgroup] +2025-02-20T20:33:33.9998474Z Complete job name: sd-unit-tests (grayskull, E150) / grayskull E150 api +2025-02-20T20:33:34.0638643Z A job started hook has been configured by the self-hosted runner administrator +2025-02-20T20:33:34.0775207Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/reset.sh' +2025-02-20T20:33:34.0792392Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:33:34.0793304Z ##[endgroup] +2025-02-20T20:33:34.0965523Z ++ date +2025-02-20T20:33:34.0966249Z Current date / time is Thu Feb 20 20:33:34 UTC 2025 +2025-02-20T20:33:34.0967043Z + echo Current date / time is Thu Feb 20 20:33:34 UTC 2025 +2025-02-20T20:33:34.0967988Z + sudo find /home/ubuntu/actions-runner/_work/tt-metal/tt-metal -user root -exec rm -rf '{}' + +2025-02-20T20:33:34.3789730Z + set_e_was_enabled=false +2025-02-20T20:33:34.3790414Z + [[ ehxB == *e* ]] +2025-02-20T20:33:34.3790866Z + set_e_was_enabled=true +2025-02-20T20:33:34.3791292Z + set +e +2025-02-20T20:33:34.3791687Z + docker image prune +2025-02-20T20:33:34.3919507Z WARNING! This will remove all dangling images. +2025-02-20T20:33:34.3964823Z ++ df +2025-02-20T20:33:34.3971114Z ++ awk '{print $5}' +2025-02-20T20:33:34.3971728Z +++ findmnt -n -o SOURCE / +2025-02-20T20:33:34.3977009Z ++ sed s/%// +2025-02-20T20:33:34.3993717Z ++ grep -w '^/dev/vda3' +2025-02-20T20:33:34.4014324Z + disk_usage_before=60 +2025-02-20T20:33:34.4028669Z Are you sure you want to continue? 
[y/N] ::notice title=disk-usage-before-startup::Disk usage is 60 % +2025-02-20T20:33:34.4030064Z + echo '::notice title=disk-usage-before-startup::Disk usage is 60 %' +2025-02-20T20:33:34.4030712Z + '[' 60 -ge 90 ']' +2025-02-20T20:33:34.4031126Z ++ df +2025-02-20T20:33:34.4031547Z ++ awk '{print $5}' +2025-02-20T20:33:34.4031983Z ++ sed s/%// +2025-02-20T20:33:34.4032425Z +++ findmnt -n -o SOURCE / +2025-02-20T20:33:34.4052072Z ++ grep -w '^/dev/vda3' +2025-02-20T20:33:34.4071565Z + disk_usage_after=60 +2025-02-20T20:33:34.4072217Z + echo '::notice title=disk-usage-after-startup::Disk usage is 60 %' +2025-02-20T20:33:34.4072856Z + '[' 60 -ge 90 ']' +2025-02-20T20:33:34.4099762Z ##[notice]Disk usage is 60 % +2025-02-20T20:33:34.4107732Z ++ lsmod +2025-02-20T20:33:34.4108249Z + lsmod_output='Module Size Used by +2025-02-20T20:33:34.4109097Z veth 28672 0 +2025-02-20T20:33:34.4109590Z xt_conntrack 16384 1 +2025-02-20T20:33:34.4110534Z xt_MASQUERADE 20480 1 +2025-02-20T20:33:34.4111042Z nf_conntrack_netlink 45056 0 +2025-02-20T20:33:34.4111598Z nfnetlink 16384 2 nf_conntrack_netlink +2025-02-20T20:33:34.4112166Z xfrm_user 36864 1 +2025-02-20T20:33:34.4112672Z xfrm_algo 16384 1 xfrm_user +2025-02-20T20:33:34.4113279Z iptable_nat 16384 1 +2025-02-20T20:33:34.4113832Z nf_nat 45056 2 iptable_nat,xt_MASQUERADE +2025-02-20T20:33:34.4114810Z nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE +2025-02-20T20:33:34.4115507Z nf_defrag_ipv6 24576 1 nf_conntrack +2025-02-20T20:33:34.4116071Z nf_defrag_ipv4 16384 1 nf_conntrack +2025-02-20T20:33:34.4116627Z xt_addrtype 16384 2 +2025-02-20T20:33:34.4117122Z iptable_filter 16384 1 +2025-02-20T20:33:34.4117620Z bpfilter 32768 0 +2025-02-20T20:33:34.4118125Z br_netfilter 28672 0 +2025-02-20T20:33:34.4118654Z bridge 176128 1 br_netfilter +2025-02-20T20:33:34.4119234Z stp 16384 1 bridge +2025-02-20T20:33:34.4119826Z llc 16384 2 bridge,stp +2025-02-20T20:33:34.4120339Z aufs 262144 0 +2025-02-20T20:33:34.4120836Z xfs 1286144 1 +2025-02-20T20:33:34.4121333Z overlay 118784 0 +2025-02-20T20:33:34.4121828Z rdma_ucm 28672 0 +2025-02-20T20:33:34.4122332Z rdma_cm 110592 1 rdma_ucm +2025-02-20T20:33:34.4122874Z iw_cm 49152 1 rdma_cm +2025-02-20T20:33:34.4123388Z ib_ipoib 131072 0 +2025-02-20T20:33:34.4123905Z ib_cm 114688 2 rdma_cm,ib_ipoib +2025-02-20T20:33:34.4124433Z ib_umad 28672 8 +2025-02-20T20:33:34.4125161Z nls_iso8859_1 16384 1 +2025-02-20T20:33:34.4125672Z dm_multipath 32768 0 +2025-02-20T20:33:34.4126167Z scsi_dh_rdac 16384 0 +2025-02-20T20:33:34.4126657Z scsi_dh_emc 16384 0 +2025-02-20T20:33:34.4127169Z scsi_dh_alua 20480 0 +2025-02-20T20:33:34.4127667Z mlx5_ib 397312 0 +2025-02-20T20:33:34.4128160Z kvm_amd 98304 0 +2025-02-20T20:33:34.4128651Z ccp 90112 1 kvm_amd +2025-02-20T20:33:34.4129248Z ib_uverbs 139264 18 rdma_ucm,mlx5_ib +2025-02-20T20:33:34.4129809Z kvm 667648 1 kvm_amd +2025-02-20T20:33:34.4130325Z joydev 24576 0 +2025-02-20T20:33:34.4130817Z input_leds 16384 0 +2025-02-20T20:33:34.4131317Z serio_raw 20480 0 +2025-02-20T20:33:34.4132019Z ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm +2025-02-20T20:33:34.4132715Z tenstorrent 40960 0 +2025-02-20T20:33:34.4133224Z sch_fq_codel 20480 45 +2025-02-20T20:33:34.4133727Z binfmt_misc 24576 1 +2025-02-20T20:33:34.4134227Z msr 16384 0 +2025-02-20T20:33:34.4134721Z efi_pstore 16384 0 +2025-02-20T20:33:34.4135215Z virtio_rng 16384 0 +2025-02-20T20:33:34.4135793Z ip_tables 32768 2 iptable_filter,iptable_nat +2025-02-20T20:33:34.4136601Z x_tables 40960 5 
xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE +2025-02-20T20:33:34.4137321Z autofs4 45056 2 +2025-02-20T20:33:34.4137790Z btrfs 1269760 0 +2025-02-20T20:33:34.4138290Z zstd_compress 167936 1 btrfs +2025-02-20T20:33:34.4138808Z raid10 61440 0 +2025-02-20T20:33:34.4139345Z raid456 155648 0 +2025-02-20T20:33:34.4139845Z async_raid6_recov 24576 1 raid456 +2025-02-20T20:33:34.4140445Z async_memcpy 20480 2 raid456,async_raid6_recov +2025-02-20T20:33:34.4141112Z async_pq 24576 2 raid456,async_raid6_recov +2025-02-20T20:33:34.4141779Z async_xor 20480 3 async_pq,raid456,async_raid6_recov +2025-02-20T20:33:34.4142565Z async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov +2025-02-20T20:33:34.4143376Z xor 24576 2 async_xor,btrfs +2025-02-20T20:33:34.4144043Z raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov +2025-02-20T20:33:34.4144780Z libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 +2025-02-20T20:33:34.4145411Z raid1 45056 0 +2025-02-20T20:33:34.4145997Z raid0 24576 0 +2025-02-20T20:33:34.4146500Z multipath 20480 0 +2025-02-20T20:33:34.4146995Z linear 20480 0 +2025-02-20T20:33:34.4147488Z hid_generic 16384 0 +2025-02-20T20:33:34.4147981Z usbhid 57344 0 +2025-02-20T20:33:34.4148520Z hid 131072 2 usbhid,hid_generic +2025-02-20T20:33:34.4149066Z crct10dif_pclmul 16384 1 +2025-02-20T20:33:34.4149603Z mlx5_core 1626112 1 mlx5_ib +2025-02-20T20:33:34.4150115Z crc32_pclmul 16384 0 +2025-02-20T20:33:34.4150617Z cirrus 16384 0 +2025-02-20T20:33:34.4151126Z ghash_clmulni_intel 16384 0 +2025-02-20T20:33:34.4151668Z drm_kms_helper 184320 3 cirrus +2025-02-20T20:33:34.4152246Z syscopyarea 16384 1 drm_kms_helper +2025-02-20T20:33:34.4152855Z sysfillrect 16384 1 drm_kms_helper +2025-02-20T20:33:34.4153459Z sysimgblt 16384 1 drm_kms_helper +2025-02-20T20:33:34.4154215Z fb_sys_fops 16384 1 drm_kms_helper +2025-02-20T20:33:34.4154837Z pci_hyperv_intf 16384 1 mlx5_core +2025-02-20T20:33:34.4155379Z mlxdevm 172032 1 mlx5_core +2025-02-20T20:33:34.4155915Z aesni_intel 372736 0 +2025-02-20T20:33:34.4156462Z auxiliary 16384 2 mlx5_ib,mlx5_core +2025-02-20T20:33:34.4157545Z mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core +2025-02-20T20:33:34.4158415Z crypto_simd 16384 1 aesni_intel +2025-02-20T20:33:34.4159045Z cryptd 24576 2 crypto_simd,ghash_clmulni_intel +2025-02-20T20:33:34.4159751Z glue_helper 16384 1 aesni_intel +2025-02-20T20:33:34.4160319Z tls 73728 1 mlx5_core +2025-02-20T20:33:34.4160851Z ahci 40960 0 +2025-02-20T20:33:34.4161340Z psmouse 155648 0 +2025-02-20T20:33:34.4161850Z libahci 36864 1 ahci +2025-02-20T20:33:34.4162379Z mlxfw 32768 1 mlx5_core +2025-02-20T20:33:34.4162956Z drm 495616 3 drm_kms_helper,cirrus +2025-02-20T20:33:34.4163511Z virtio_blk 20480 3 +2025-02-20T20:33:34.4164029Z psample 20480 1 mlx5_core' +2025-02-20T20:33:34.4164563Z + grep -q tenstorrent +2025-02-20T20:33:34.4176192Z + echo Module Size Used by veth 28672 0 xt_conntrack 16384 1 xt_MASQUERADE 20480 1 nf_conntrack_netlink 45056 0 nfnetlink 16384 2 nf_conntrack_netlink xfrm_user 36864 1 xfrm_algo 16384 1 xfrm_user iptable_nat 16384 1 nf_nat 45056 2 iptable_nat,xt_MASQUERADE nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE nf_defrag_ipv6 24576 1 nf_conntrack nf_defrag_ipv4 16384 1 nf_conntrack xt_addrtype 16384 2 iptable_filter 16384 1 bpfilter 32768 0 br_netfilter 28672 0 bridge 176128 1 br_netfilter stp 16384 1 bridge llc 16384 2 bridge,stp aufs 262144 0 xfs 1286144 1 overlay 
118784 0 rdma_ucm 28672 0 rdma_cm 110592 1 rdma_ucm iw_cm 49152 1 rdma_cm ib_ipoib 131072 0 ib_cm 114688 2 rdma_cm,ib_ipoib ib_umad 28672 8 nls_iso8859_1 16384 1 dm_multipath 32768 0 scsi_dh_rdac 16384 0 scsi_dh_emc 16384 0 scsi_dh_alua 20480 0 mlx5_ib 397312 0 kvm_amd 98304 0 ccp 90112 1 kvm_amd ib_uverbs 139264 18 rdma_ucm,mlx5_ib kvm 667648 1 kvm_amd joydev 24576 0 input_leds 16384 0 serio_raw 20480 0 ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm tenstorrent 40960 0 sch_fq_codel 20480 45 binfmt_misc 24576 1 msr 16384 0 efi_pstore 16384 0 virtio_rng 16384 0 ip_tables 32768 2 iptable_filter,iptable_nat x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE autofs4 45056 2 btrfs 1269760 0 zstd_compress 167936 1 btrfs raid10 61440 0 raid456 155648 0 async_raid6_recov 24576 1 raid456 async_memcpy 20480 2 raid456,async_raid6_recov async_pq 24576 2 raid456,async_raid6_recov async_xor 20480 3 async_pq,raid456,async_raid6_recov async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov xor 24576 2 async_xor,btrfs raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 raid1 45056 0 raid0 24576 0 multipath 20480 0 linear 20480 0 hid_generic 16384 0 usbhid 57344 0 hid 131072 2 usbhid,hid_generic crct10dif_pclmul 16384 1 mlx5_core 1626112 1 mlx5_ib crc32_pclmul 16384 0 cirrus 16384 0 ghash_clmulni_intel 16384 0 drm_kms_helper 184320 3 cirrus syscopyarea 16384 1 drm_kms_helper sysfillrect 16384 1 drm_kms_helper sysimgblt 16384 1 drm_kms_helper fb_sys_fops 16384 1 drm_kms_helper pci_hyperv_intf 16384 1 mlx5_core mlxdevm 172032 1 mlx5_core aesni_intel 372736 0 auxiliary 16384 2 mlx5_ib,mlx5_core mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core crypto_simd 16384 1 aesni_intel cryptd 24576 2 crypto_simd,ghash_clmulni_intel glue_helper 16384 1 aesni_intel tls 73728 1 mlx5_core ahci 40960 0 psmouse 155648 0 libahci 36864 1 ahci mlxfw 32768 1 mlx5_core drm 495616 3 drm_kms_helper,cirrus virtio_blk 20480 3 psample 20480 1 mlx5_core +2025-02-20T20:33:34.4187197Z + [[ 0 -ne 0 ]] +2025-02-20T20:33:34.4227641Z ++ lsof -w /dev/tenstorrent/0 +2025-02-20T20:33:34.5472150Z + lsof_output= +2025-02-20T20:33:34.5476366Z ##[notice]Touching and printing out SMI info +2025-02-20T20:33:34.5478008Z + '[' -n '' ']' +2025-02-20T20:33:34.5478448Z + i=0 +2025-02-20T20:33:34.5478850Z + iter_limit=10 +2025-02-20T20:33:34.5480026Z + echo '::notice title=printing-smi-info-startup::Touching and printing out SMI info' +2025-02-20T20:33:34.5480853Z + sleep 20 +2025-02-20T20:33:54.5485586Z + sudo touch /opt/tt_metal_infra/smi.log +2025-02-20T20:33:54.5700048Z + sudo chown ubuntu /opt/tt_metal_infra/smi.log +2025-02-20T20:33:54.5913313Z + tt-smi-metal -s -f /opt/tt_metal_infra/smi.log +2025-02-20T20:33:54.9865328Z +2025-02-20T20:33:54.9867344Z  Detected Chips: 1 +2025-02-20T20:33:54.9890256Z  +2025-02-20T20:33:54.9890946Z  Detected Chips: 1 +2025-02-20T20:33:54.9891287Z +2025-02-20T20:33:54.9891590Z  Detecting ARC: | +2025-02-20T20:33:54.9891902Z +2025-02-20T20:33:54.9892191Z  Detecting DRAM: | +2025-02-20T20:33:54.9892585Z +2025-02-20T20:33:54.9892880Z [] ETH: | +2025-02-20T20:33:54.9960266Z Gathering Information ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 +2025-02-20T20:33:55.0020281Z  Saved tt-smi log to: /opt/tt_metal_infra/smi.log  +2025-02-20T20:33:55.0624630Z + cat /opt/tt_metal_infra/smi.log 
+2025-02-20T20:33:55.0631573Z { +2025-02-20T20:33:55.0632719Z + echo '::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first' +2025-02-20T20:33:55.0633664Z + sleep 30 +2025-02-20T20:33:55.0634298Z "time": "2025-02-20T20:33:54.989103", +2025-02-20T20:33:55.0635247Z "host_info": { +2025-02-20T20:33:55.0635675Z "OS": "Linux", +2025-02-20T20:33:55.0636128Z "Distro": "Ubuntu 20.04.6 LTS", +2025-02-20T20:33:55.0636626Z "Kernel": "5.4.0-205-generic", +2025-02-20T20:33:55.0637172Z "Hostname": "tt-metal-ci-vm-105", +2025-02-20T20:33:55.0637758Z "Platform": "x86_64", +2025-02-20T20:33:55.0638285Z "Python": "3.8.10", +2025-02-20T20:33:55.0638842Z "Memory": "47.14 GB", +2025-02-20T20:33:55.0639392Z "Driver": "TTKMD 1.27.1" +2025-02-20T20:33:55.0640046Z }, +2025-02-20T20:33:55.0640781Z "device_info": [ +2025-02-20T20:33:55.0641287Z { +2025-02-20T20:33:55.0641751Z "smbus_telem": { +2025-02-20T20:33:55.0642379Z "BOARD_ID": "0x10000361152e069", +2025-02-20T20:33:55.0643185Z "SMBUS_TX_ENUM_VERSION": "0xba5e0001", +2025-02-20T20:33:55.0643791Z "SMBUS_TX_DEVICE_ID": "0xfaca1e52", +2025-02-20T20:33:55.0644369Z "SMBUS_TX_ASIC_RO": null, +2025-02-20T20:33:55.0645683Z "SMBUS_TX_ASIC_IDD": null, +2025-02-20T20:33:55.0646267Z "SMBUS_TX_BOARD_ID_HIGH": "0x1000036", +2025-02-20T20:33:55.0646894Z "SMBUS_TX_BOARD_ID_LOW": "0x1152e069", +2025-02-20T20:33:55.0647544Z "SMBUS_TX_ARC0_FW_VERSION": "0x1070000", +2025-02-20T20:33:55.0648189Z "SMBUS_TX_ARC1_FW_VERSION": "0x1070000", +2025-02-20T20:33:55.0648816Z "SMBUS_TX_ARC2_FW_VERSION": null, +2025-02-20T20:33:55.0649423Z "SMBUS_TX_ARC3_FW_VERSION": "0x1070000", +2025-02-20T20:33:55.0650057Z "SMBUS_TX_SPIBOOTROM_FW_VERSION": null, +2025-02-20T20:33:55.0650680Z "SMBUS_TX_ETH_FW_VERSION": null, +2025-02-20T20:33:55.0651352Z "SMBUS_TX_M3_BL_FW_VERSION": null, +2025-02-20T20:33:55.0651957Z "SMBUS_TX_M3_APP_FW_VERSION": null, +2025-02-20T20:33:55.0652571Z "SMBUS_TX_DDR_SPEED": "0xe74", +2025-02-20T20:33:55.0653147Z "SMBUS_TX_DDR_STATUS": "0x111111", +2025-02-20T20:33:55.0653756Z "SMBUS_TX_ETH_STATUS0": null, +2025-02-20T20:33:55.0654339Z "SMBUS_TX_ETH_STATUS1": null, +2025-02-20T20:33:55.0654925Z "SMBUS_TX_PCIE_STATUS": "0x11040042", +2025-02-20T20:33:55.0655515Z "SMBUS_TX_FAULTS": null, +2025-02-20T20:33:55.0656123Z "SMBUS_TX_ARC0_HEALTH": "0x4bf109", +2025-02-20T20:33:55.0656731Z "SMBUS_TX_ARC1_HEALTH": null, +2025-02-20T20:33:55.0657302Z "SMBUS_TX_ARC2_HEALTH": null, +2025-02-20T20:33:55.0657877Z "SMBUS_TX_ARC3_HEALTH": null, +2025-02-20T20:33:55.0658453Z "SMBUS_TX_FAN_SPEED": "0xff", +2025-02-20T20:33:55.0659405Z "SMBUS_TX_AICLK": "0x4b200fa", +2025-02-20T20:33:55.0660005Z "SMBUS_TX_AXICLK": "0x384", +2025-02-20T20:33:55.0660534Z "SMBUS_TX_ARCCLK": "0x21c", +2025-02-20T20:33:55.0661085Z "SMBUS_TX_THROTTLER": null, +2025-02-20T20:33:55.0661654Z "SMBUS_TX_VCORE": "0x2e4", +2025-02-20T20:33:55.0662254Z "SMBUS_TX_ASIC_TEMPERATURE": "0x290021d", +2025-02-20T20:33:55.0662871Z "SMBUS_TX_VREG_TEMPERATURE": null, +2025-02-20T20:33:55.0663472Z "SMBUS_TX_BOARD_TEMPERATURE": null, +2025-02-20T20:33:55.0664036Z "SMBUS_TX_TDP": "0xaa0012", +2025-02-20T20:33:55.0664579Z "SMBUS_TX_TDC": "0x12c0016", +2025-02-20T20:33:55.0665141Z "SMBUS_TX_VDD_LIMITS": "0x3a202e4", +2025-02-20T20:33:55.0665707Z "SMBUS_TX_THM_LIMITS": "0x53004b", +2025-02-20T20:33:55.0666287Z "SMBUS_TX_WH_FW_DATE": "0x45011317", +2025-02-20T20:33:55.0666883Z "SMBUS_TX_ASIC_TMON0": "0x22222221", +2025-02-20T20:33:55.0667443Z "SMBUS_TX_ASIC_TMON1": "0x2122", +2025-02-20T20:33:55.0668019Z 
"SMBUS_TX_MVDDQ_POWER": null, +2025-02-20T20:33:55.0668593Z "SMBUS_TX_GDDR_TRAIN_TEMP0": null, +2025-02-20T20:33:55.0669183Z "SMBUS_TX_GDDR_TRAIN_TEMP1": null, +2025-02-20T20:33:55.0669776Z "SMBUS_TX_BOOT_DATE": "0x5214132d", +2025-02-20T20:33:55.0670343Z "SMBUS_TX_RT_SECONDS": null, +2025-02-20T20:33:55.0670903Z "SMBUS_TX_AUX_STATUS": null, +2025-02-20T20:33:55.0671473Z "SMBUS_TX_ETH_DEBUG_STATUS0": null, +2025-02-20T20:33:55.0672089Z "SMBUS_TX_ETH_DEBUG_STATUS1": null, +2025-02-20T20:33:55.0672695Z "SMBUS_TX_TT_FLASH_VERSION": "0x20008" +2025-02-20T20:33:55.0673238Z }, +2025-02-20T20:33:55.0673644Z "board_info": { +2025-02-20T20:33:55.0674335Z "bus_id": "0000:07:00.0", +2025-02-20T20:33:55.0674862Z "board_type": "e150", +2025-02-20T20:33:55.0675357Z "board_id": "010000361152e069", +2025-02-20T20:33:55.0675916Z "coords": "N/A", +2025-02-20T20:33:55.0676406Z "dram_status": true, +2025-02-20T20:33:55.0677106Z "dram_speed": "3700", +2025-02-20T20:33:55.0677636Z "pcie_speed": 4, +2025-02-20T20:33:55.0678123Z "pcie_width": 16 +2025-02-20T20:33:55.0678588Z }, +2025-02-20T20:33:55.0678986Z "telemetry": { +2025-02-20T20:33:55.0679440Z "voltage": "0.74", +2025-02-20T20:33:55.0679950Z "current": " 22.0", +2025-02-20T20:33:55.0680457Z "power": " 18.0", +2025-02-20T20:33:55.0680962Z "aiclk": " 250", +2025-02-20T20:33:55.0681466Z "asic_temperature": "33.8" +2025-02-20T20:33:55.0682019Z }, +2025-02-20T20:33:55.0682438Z "firmwares": { +2025-02-20T20:33:55.0682925Z "arc_fw": "1.7.0.0", +2025-02-20T20:33:55.0683466Z "arc_fw_date": "2024-05-01", +2025-02-20T20:33:55.0684033Z "eth_fw": "N/A", +2025-02-20T20:33:55.0684542Z "m3_bl_fw": "N/A", +2025-02-20T20:33:55.0685057Z "m3_app_fw": "N/A", +2025-02-20T20:33:55.0685622Z "tt_flash_version": "0.2.0.8" +2025-02-20T20:33:55.0686122Z }, +2025-02-20T20:33:55.0686508Z "limits": { +2025-02-20T20:33:55.0686955Z "vdd_min": "0.74", +2025-02-20T20:33:55.0687408Z "vdd_max": "0.93", +2025-02-20T20:33:55.0687902Z "tdp_limit": "170", +2025-02-20T20:33:55.0688393Z "tdc_limit": "300", +2025-02-20T20:33:55.0688884Z "asic_fmax": "1202", +2025-02-20T20:33:55.0689413Z "therm_trip_l1_limit": "83", +2025-02-20T20:33:55.0689939Z "thm_limit": "75", +2025-02-20T20:33:55.0690440Z "bus_peak_limit": null +2025-02-20T20:33:55.0690949Z } +2025-02-20T20:33:55.0691595Z } +2025-02-20T20:33:55.0691991Z ] +2025-02-20T20:33:55.0692767Z }::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first +2025-02-20T20:34:25.0643205Z + '[' 0 -lt 10 ']' +2025-02-20T20:34:25.0643615Z + (( i++ )) +2025-02-20T20:34:25.0644227Z ++ tt-smi-metal -r 0 +2025-02-20T20:34:25.5709814Z + reset_output=' Starting tensix reset on GS board at pci index 0  +2025-02-20T20:34:25.5710583Z  Lowering clks to safe value...  +2025-02-20T20:34:25.5711091Z  Beginning reset sequence...  +2025-02-20T20:34:25.5711583Z  Finishing reset sequence...  +2025-02-20T20:34:25.5712114Z  Returning clks to original values...  +2025-02-20T20:34:25.5712686Z  Finished tensix reset on GS board at pci index 0 +2025-02-20T20:34:25.5713184Z  +2025-02-20T20:34:25.5713589Z  Re-initializing boards after reset....  
+2025-02-20T20:34:25.5714096Z +2025-02-20T20:34:25.5714330Z  Detected Chips: 1 +2025-02-20T20:34:25.5714742Z  +2025-02-20T20:34:25.5715148Z  Detected Chips: 1 +2025-02-20T20:34:25.5715405Z +2025-02-20T20:34:25.5715609Z  Detecting ARC: | +2025-02-20T20:34:25.5715906Z +2025-02-20T20:34:25.5716164Z  Detecting DRAM: | +2025-02-20T20:34:25.5716490Z +2025-02-20T20:34:25.5716740Z [] ETH: |' +2025-02-20T20:34:25.5717165Z + [[ 0 -ne 0 ]] +2025-02-20T20:34:25.5719013Z ##[notice]tt-smi reset was successful +2025-02-20T20:34:25.5722120Z + [[  Starting tensix reset on GS board at pci index 0  +2025-02-20T20:34:25.5722691Z  Lowering clks to safe value...  +2025-02-20T20:34:25.5723180Z  Beginning reset sequence...  +2025-02-20T20:34:25.5723653Z  Finishing reset sequence...  +2025-02-20T20:34:25.5724163Z  Returning clks to original values...  +2025-02-20T20:34:25.5724731Z  Finished tensix reset on GS board at pci index 0 +2025-02-20T20:34:25.5725276Z  +2025-02-20T20:34:25.5725806Z  Re-initializing boards after reset....  +2025-02-20T20:34:25.5726118Z +2025-02-20T20:34:25.5726334Z  Detected Chips: 1 +2025-02-20T20:34:25.5726726Z  +2025-02-20T20:34:25.5727087Z  Detected Chips: 1 +2025-02-20T20:34:25.5727320Z +2025-02-20T20:34:25.5727514Z  Detecting ARC: | +2025-02-20T20:34:25.5728229Z +2025-02-20T20:34:25.5728433Z  Detecting DRAM: | +2025-02-20T20:34:25.5728673Z +2025-02-20T20:34:25.5728984Z [] ETH: | == *\N\o\ \c\h\i\p\s\ \d\e\t\e\c\t\e\d* ]] +2025-02-20T20:34:25.5729527Z + break +2025-02-20T20:34:25.5729818Z + '[' 1 -eq 10 ']' +2025-02-20T20:34:25.5730346Z + echo '::notice title=reset-successful-startup::tt-smi reset was successful' +2025-02-20T20:34:25.5730974Z + check_hugepages_service_status=0 +2025-02-20T20:34:25.5731489Z + sudo systemctl status tenstorrent-hugepages.service +2025-02-20T20:34:25.5948543Z Unit tenstorrent-hugepages.service could not be found. +2025-02-20T20:34:25.5954996Z + check_hugepages_service_status=4 +2025-02-20T20:34:25.5955561Z + '[' 4 -eq 4 ']' +2025-02-20T20:34:25.5956281Z + echo '::warning title=hugepages-service-not-found-startup::Hugepages service not found. Using old rc.local method' +2025-02-20T20:34:25.5957019Z + sudo /etc/rc.local +2025-02-20T20:34:25.5960816Z ##[warning]Hugepages service not found. Using old rc.local method +2025-02-20T20:34:55.6400623Z ++ date +%s +2025-02-20T20:34:55.6406479Z + hugepages_check_start=1740083695 +2025-02-20T20:34:55.6407055Z + hugepages_check_timeout=60 +2025-02-20T20:34:55.6410252Z ++ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages +2025-02-20T20:34:55.6418181Z + [[ 1 -eq 0 ]] +2025-02-20T20:34:55.6420117Z ##[notice]Hugepages is now setup. +2025-02-20T20:34:55.6422071Z Printing out cpu information... +2025-02-20T20:34:55.6422861Z + echo '::notice title=hugepages-setup-success-startup::Hugepages is now setup.' +2025-02-20T20:34:55.6423657Z + echo 'Printing out cpu information...' 
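The `set -x` trace above interleaves the host telemetry dump with the runner's card-reset and hugepages bring-up, so the control flow is easy to lose. Below is a minimal bash sketch of what the trace implies; it is a hedged reconstruction from the logged commands, not the workflow's actual script. The retry bound of 10 and the names `i`, `reset_output`, and `check_hugepages_service_status` are read directly off the trace, while the loop structure, error handling, and exit paths are assumptions.

# Hedged reconstruction of the reset/hugepages startup step implied by the
# `set -x` trace above; the real workflow script may differ in detail.
echo '::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first'
sleep 30

i=0
while [ "$i" -lt 10 ]; do
  (( i++ ))
  reset_output=$(tt-smi-metal -r 0)                 # tensix reset on the board at PCI index 0
  [ $? -ne 0 ] && continue                          # retry on a non-zero exit code
  [[ $reset_output == *"No chips detected"* ]] && continue
  break                                             # reset succeeded
done
[ "$i" -eq 10 ] && { echo '::error::tt-smi reset kept failing'; exit 1; }   # assumed failure path
echo '::notice title=reset-successful-startup::tt-smi reset was successful'

# Hugepages: prefer the systemd unit, fall back to the legacy /etc/rc.local path.
check_hugepages_service_status=0
sudo systemctl status tenstorrent-hugepages.service || check_hugepages_service_status=$?
if [ "$check_hugepages_service_status" -eq 4 ]; then   # 4: unit not found
  echo '::warning title=hugepages-service-not-found-startup::Hugepages service not found. Using old rc.local method'
  sudo /etc/rc.local
fi
# The trace also records a 60 s polling timeout around this check; a single read is shown here.
nr_hugepages=$(cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages)
[ "$nr_hugepages" -eq 0 ] && { echo '::error::hugepages still not configured'; exit 1; }   # assumed failure path
echo '::notice title=hugepages-setup-success-startup::Hugepages is now setup.'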
+2025-02-20T20:34:55.6424177Z + lscpu +2025-02-20T20:34:55.6455999Z Architecture: x86_64 +2025-02-20T20:34:55.6457372Z CPU op-mode(s): 32-bit, 64-bit +2025-02-20T20:34:55.6458013Z Byte Order: Little Endian +2025-02-20T20:34:55.6458831Z Address sizes: 40 bits physical, 48 bits virtual +2025-02-20T20:34:55.6459547Z CPU(s): 14 +2025-02-20T20:34:55.6460062Z On-line CPU(s) list: 0-13 +2025-02-20T20:34:55.6460709Z Thread(s) per core: 1 +2025-02-20T20:34:55.6461310Z Core(s) per socket: 1 +2025-02-20T20:34:55.6461831Z Socket(s): 14 +2025-02-20T20:34:55.6462337Z NUMA node(s): 2 +2025-02-20T20:34:55.6462835Z Vendor ID: AuthenticAMD +2025-02-20T20:34:55.6463354Z CPU family: 23 +2025-02-20T20:34:55.6463865Z Model: 49 +2025-02-20T20:34:55.6464470Z Model name: AMD EPYC-Rome Processor +2025-02-20T20:34:55.6465019Z Stepping: 0 +2025-02-20T20:34:55.6465574Z CPU MHz: 2299.978 +2025-02-20T20:34:55.6466146Z BogoMIPS: 4599.95 +2025-02-20T20:34:55.6466726Z Virtualization: AMD-V +2025-02-20T20:34:55.6467279Z Hypervisor vendor: KVM +2025-02-20T20:34:55.6467840Z Virtualization type: full +2025-02-20T20:34:55.6468372Z L1d cache: 448 KiB +2025-02-20T20:34:55.6468979Z L1i cache: 448 KiB +2025-02-20T20:34:55.6469513Z L2 cache: 7 MiB +2025-02-20T20:34:55.6470036Z L3 cache: 224 MiB +2025-02-20T20:34:55.6470574Z NUMA node0 CPU(s): 0-6 +2025-02-20T20:34:55.6471096Z NUMA node1 CPU(s): 7-13 +2025-02-20T20:34:55.6471697Z Vulnerability Gather data sampling: Not affected +2025-02-20T20:34:55.6472381Z Vulnerability Itlb multihit: Not affected +2025-02-20T20:34:55.6472975Z Vulnerability L1tf: Not affected +2025-02-20T20:34:55.6473607Z Vulnerability Mds: Not affected +2025-02-20T20:34:55.6474497Z Vulnerability Meltdown: Not affected +2025-02-20T20:34:55.6475157Z Vulnerability Mmio stale data: Not affected +2025-02-20T20:34:55.6476147Z Vulnerability Retbleed: Vulnerable +2025-02-20T20:34:55.6477205Z Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +2025-02-20T20:34:55.6478600Z Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +2025-02-20T20:34:55.6480162Z Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +2025-02-20T20:34:55.6481412Z Vulnerability Srbds: Not affected +2025-02-20T20:34:55.6482025Z Vulnerability Tsx async abort: Not affected +2025-02-20T20:34:55.6485636Z Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr wbnoinvd arat npt nrip_save umip rdpid +2025-02-20T20:34:55.6762745Z ##[group]Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main +2025-02-20T20:34:55.6763383Z with: +2025-02-20T20:34:55.6763851Z token: *** +2025-02-20T20:34:55.6764178Z fetch-depth: 1 +2025-02-20T20:34:55.6764515Z env: +2025-02-20T20:34:55.6764825Z ARCH_NAME: grayskull +2025-02-20T20:34:55.6765166Z LOGURU_LEVEL: INFO +2025-02-20T20:34:55.6765520Z ##[endgroup] +2025-02-20T20:34:55.6858266Z ##[group]Run set -x +2025-02-20T20:34:55.6858656Z set -x 
+2025-02-20T20:34:55.6858981Z ls -al +2025-02-20T20:34:55.6859388Z if [ -f "semicolon_delimited_script" ]; then +2025-02-20T20:34:55.6859891Z  file semicolon_delimited_script +2025-02-20T20:34:55.6860430Z  head semicolon_delimited_script +2025-02-20T20:34:55.6860832Z fi +2025-02-20T20:34:55.6861161Z sudo rm -rf deleteme +2025-02-20T20:34:55.6861558Z sudo rm -rf docker-job +2025-02-20T20:34:55.6861956Z if [ -d ".git" ]; then +2025-02-20T20:34:55.6862428Z  echo 'Cleaning repo' +2025-02-20T20:34:55.6862845Z  git clean -xffd +2025-02-20T20:34:55.6863227Z  echo 'Done git clean -xffd' +2025-02-20T20:34:55.6863722Z  echo 'Attempting to delete any lock files' +2025-02-20T20:34:55.6864251Z  find .git -type f -iname '*.lock' -delete +2025-02-20T20:34:55.6864750Z  echo 'Done deleting lock files' +2025-02-20T20:34:55.6865229Z  echo 'De-init-ing submodules' +2025-02-20T20:34:55.6865702Z  git submodule deinit -f --all +2025-02-20T20:34:55.6866163Z  echo 'Done de-initing submodules' +2025-02-20T20:34:55.6866583Z fi +2025-02-20T20:34:55.6885529Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:34:55.6886061Z env: +2025-02-20T20:34:55.6886400Z ARCH_NAME: grayskull +2025-02-20T20:34:55.6886758Z LOGURU_LEVEL: INFO +2025-02-20T20:34:55.6887122Z ##[endgroup] +2025-02-20T20:34:55.6931366Z + ls -al +2025-02-20T20:34:55.6947339Z total 360064 +2025-02-20T20:34:55.6949751Z + '[' -f semicolon_delimited_script ']' +2025-02-20T20:34:55.6950324Z + sudo rm -rf deleteme +2025-02-20T20:34:55.6950789Z drwxr-xr-x 24 ubuntu ubuntu 4096 Feb 20 20:33 . +2025-02-20T20:34:55.6951323Z drwxr-xr-x 3 ubuntu ubuntu 4096 Jan 9 22:06 .. +2025-02-20T20:34:55.6951871Z drwxr-xr-x 5 ubuntu ubuntu 4096 Feb 20 20:19 .cache +2025-02-20T20:34:55.6952444Z -rw-r--r-- 1 ubuntu ubuntu 3966 Jan 9 22:08 .clang-format +2025-02-20T20:34:55.6953067Z -rw-r--r-- 1 ubuntu ubuntu 6268 Jan 26 16:00 .clang-format-ignore +2025-02-20T20:34:55.6953679Z -rw-r--r-- 1 ubuntu ubuntu 6374 Jan 26 16:00 .clang-tidy +2025-02-20T20:34:55.6954396Z -rw-r--r-- 1 ubuntu ubuntu 43 Jan 9 22:08 .clangd +2025-02-20T20:34:55.6955362Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 20 20:19 .config +2025-02-20T20:34:55.6955933Z -rw-r--r-- 1 ubuntu ubuntu 222 Jan 9 22:08 .gersemirc +2025-02-20T20:34:55.6956516Z drwxr-xr-x 10 ubuntu ubuntu 4096 Feb 20 20:33 .git +2025-02-20T20:34:55.6957156Z -rw-r--r-- 1 ubuntu ubuntu 239 Jan 9 22:08 .git-blame-ignore-revs +2025-02-20T20:34:55.6957779Z -rw-r--r-- 1 ubuntu ubuntu 35 Jan 9 22:08 .gitattributes +2025-02-20T20:34:55.6958314Z drwxr-xr-x 6 ubuntu ubuntu 4096 Feb 13 05:33 .github +2025-02-20T20:34:55.6958867Z -rw-r--r-- 1 ubuntu ubuntu 1730 Jan 22 04:17 .gitignore +2025-02-20T20:34:55.6959435Z -rw-r--r-- 1 ubuntu ubuntu 991 Feb 5 15:35 .gitmodules +2025-02-20T20:34:55.6959965Z drwx------ 6 ubuntu ubuntu 4096 Feb 20 20:19 .local +2025-02-20T20:34:55.6960550Z -rw-r--r-- 1 ubuntu ubuntu 932 Jan 9 22:08 .pre-commit-config.yaml +2025-02-20T20:34:55.6961185Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 20 20:31 .pytest_cache +2025-02-20T20:34:55.6961814Z -rw-r--r-- 1 ubuntu ubuntu 15813574 Feb 13 05:33 .test_durations +2025-02-20T20:34:55.6962471Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 20 20:19 .ttnn_runtime_artifacts +2025-02-20T20:34:55.6963381Z -rw-r--r-- 1 ubuntu ubuntu 213 Jan 9 22:08 .yamllint +2025-02-20T20:34:55.6963968Z -rw-r--r-- 1 ubuntu ubuntu 11526 Feb 20 18:37 CMakeLists.txt +2025-02-20T20:34:55.6964565Z -rw-r--r-- 1 ubuntu ubuntu 2231 Feb 5 15:35 CMakePresets.json +2025-02-20T20:34:55.6965181Z -rw-r--r-- 1 ubuntu ubuntu 11666 
Feb 20 19:59 CODEOWNERS +2025-02-20T20:34:55.6965782Z -rw-r--r-- 1 ubuntu ubuntu 5253 Jan 9 22:08 CODE_OF_CONDUCT.md +2025-02-20T20:34:55.6966408Z -rw-r--r-- 1 ubuntu ubuntu 36527 Jan 15 02:14 CONTRIBUTING.md +2025-02-20T20:34:55.6967002Z -rw-r--r-- 1 ubuntu ubuntu 126373 Jan 26 16:00 Doxyfile +2025-02-20T20:34:55.6967610Z -rw-r--r-- 1 ubuntu ubuntu 6046 Feb 5 15:35 INSTALLING.md +2025-02-20T20:34:55.6968257Z -rw-r--r-- 1 ubuntu ubuntu 11825 Jan 9 22:08 LICENSE +2025-02-20T20:34:55.6968838Z -rw-r--r-- 1 ubuntu ubuntu 1562 Jan 27 05:31 MANIFEST.in +2025-02-20T20:34:55.6969465Z -rw-r--r-- 1 ubuntu ubuntu 18478 Feb 20 16:06 METALIUM_GUIDE.md +2025-02-20T20:34:55.6970077Z -rw-r--r-- 1 ubuntu ubuntu 15526 Feb 19 08:11 README.md +2025-02-20T20:34:55.6970634Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 20 20:19 __pycache__ +2025-02-20T20:34:55.6971240Z -rwxr-xr-x 1 ubuntu ubuntu 11801 Feb 20 17:08 build_metal.sh +2025-02-20T20:34:55.6971899Z -rw-r--r-- 1 ubuntu ubuntu 1438 Jan 9 22:08 check_copyright_config.yaml +2025-02-20T20:34:55.6972528Z -rw-r--r-- 1 ubuntu ubuntu 1821 Jan 9 22:08 cloc.sh +2025-02-20T20:34:55.6973084Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 20 08:05 cmake +2025-02-20T20:34:55.6973669Z -rw-r--r-- 1 ubuntu ubuntu 23178 Feb 20 06:22 conftest.py +2025-02-20T20:34:55.6974283Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 20 19:59 contributing +2025-02-20T20:34:55.6974908Z -rwxr-xr-x 1 ubuntu ubuntu 1420 Jan 9 22:08 create_venv.sh +2025-02-20T20:34:55.6975544Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 20 01:27 dependencies +2025-02-20T20:34:55.6976154Z drwxr-xr-x 2 ubuntu ubuntu 4096 Feb 20 08:05 dockerfile +2025-02-20T20:34:55.6976710Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 7 16:37 docs +2025-02-20T20:34:55.6977366Z drwxr-xr-x 3 ubuntu ubuntu 4096 Feb 20 20:31 generated +2025-02-20T20:34:55.6978122Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 5 15:35 infra +2025-02-20T20:34:55.6978819Z -rwxr-xr-x 1 ubuntu ubuntu 6885 Feb 13 05:33 install_dependencies.sh +2025-02-20T20:34:55.6979484Z drwxr-xr-x 10 ubuntu ubuntu 4096 Feb 20 20:19 models +2025-02-20T20:34:55.6980051Z -rw-r--r-- 1 ubuntu ubuntu 1042 Jan 9 22:08 pyproject.toml +2025-02-20T20:34:55.6980667Z -rw-r--r-- 1 ubuntu ubuntu 1200 Jan 9 22:08 pytest.ini +2025-02-20T20:34:55.6981257Z drwxr-xr-x 4 ubuntu ubuntu 4096 Feb 13 05:33 scripts +2025-02-20T20:34:55.6981835Z -rw-r--r-- 1 ubuntu ubuntu 7551 Feb 5 15:35 setup.py +2025-02-20T20:34:55.6982555Z drwxr-xr-x 24 ubuntu ubuntu 4096 Jan 15 02:14 tech_reports +2025-02-20T20:34:55.6983145Z drwxr-xr-x 11 ubuntu ubuntu 4096 Feb 20 01:27 tests +2025-02-20T20:34:55.6983722Z drwxr-xr-x 11 ubuntu ubuntu 4096 Feb 13 05:33 tt-train +2025-02-20T20:34:55.6984316Z drwxr-xr-x 23 ubuntu ubuntu 4096 Feb 20 18:37 tt_metal +2025-02-20T20:34:55.6984892Z drwxr-xr-x 9 ubuntu ubuntu 4096 Feb 20 20:03 ttnn +2025-02-20T20:34:55.6985599Z -rw-r--r-- 1 ubuntu ubuntu 137847969 Feb 20 20:18 ttnn-0.56.0rc36.dev16+any-cp38-cp38-linux_x86_64.whl +2025-02-20T20:34:55.6986434Z -rw-r--r-- 1 ubuntu ubuntu 214541539 Feb 20 20:18 ttnn-0.56.0rc36.dev16+any.tar.gz +2025-02-20T20:34:55.7160842Z + sudo rm -rf docker-job +2025-02-20T20:34:55.7371670Z + '[' -d .git ']' +2025-02-20T20:34:55.7372078Z Cleaning repo +2025-02-20T20:34:55.7372697Z + echo 'Cleaning repo' +2025-02-20T20:34:55.7373211Z + git clean -xffd +2025-02-20T20:34:56.6687877Z Removing .cache/ +2025-02-20T20:34:56.6688525Z Removing .config/ +2025-02-20T20:34:56.6689007Z Removing .local/ +2025-02-20T20:34:56.6689465Z Removing .pytest_cache/ +2025-02-20T20:34:56.6689955Z Removing 
.ttnn_runtime_artifacts/ +2025-02-20T20:34:56.6690405Z Removing __pycache__/ +2025-02-20T20:34:56.6690952Z Removing generated/ +2025-02-20T20:34:56.6691756Z Removing models/__pycache__/ +2025-02-20T20:34:56.6692224Z Removing models/common/__pycache__/ +2025-02-20T20:34:56.6692712Z Removing models/demos/bert/tt/__pycache__/ +2025-02-20T20:34:56.6693339Z Removing models/demos/metal_BERT_large_11/tt/__pycache__/ +2025-02-20T20:34:56.6694011Z Removing models/experimental/functional_common/__pycache__/ +2025-02-20T20:34:56.6694588Z Removing models/perf/__pycache__/ +2025-02-20T20:34:56.6695080Z Removing tests/scripts/__pycache__/ +2025-02-20T20:34:56.6695593Z Removing tests/sweep_framework/sweep_utils/__pycache__/ +2025-02-20T20:34:56.6696249Z Removing tests/tt_eager/python_api_testing/sweep_tests/__pycache__/ +2025-02-20T20:34:56.6697158Z Removing tests/tt_eager/python_api_testing/unit_testing/backward_ops/__pycache__/ +2025-02-20T20:34:56.6697963Z Removing tests/tt_eager/python_api_testing/unit_testing/misc/__pycache__/ +2025-02-20T20:34:56.6698626Z Removing tests/ttnn/__pycache__/ +2025-02-20T20:34:56.6699276Z Removing tests/ttnn/python_api_testing/sweep_tests/__pycache__/ +2025-02-20T20:34:56.6699960Z Removing tests/ttnn/unit_tests/__pycache__/ +2025-02-20T20:34:56.6700607Z Removing tests/ttnn/unit_tests/benchmarks/__pycache__/ +2025-02-20T20:34:56.6701240Z Removing tests/ttnn/unit_tests/operations/__pycache__/ +2025-02-20T20:34:56.6701891Z Removing tests/ttnn/unit_tests/operations/ccl/__pycache__/ +2025-02-20T20:34:56.6702557Z Removing tests/ttnn/unit_tests/operations/ccl/perf/__pycache__/ +2025-02-20T20:34:56.6703201Z Removing tests/ttnn/unit_tests/operations/eltwise/__pycache__/ +2025-02-20T20:34:56.6703900Z Removing tests/ttnn/unit_tests/operations/eltwise/backward/__pycache__/ +2025-02-20T20:34:56.6704833Z Removing tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/__pycache__/ +2025-02-20T20:34:56.6705660Z Removing tests/ttnn/unit_tests/operations/eltwise/complex/__pycache__/ +2025-02-20T20:34:56.6706341Z Removing tests/ttnn/unit_tests/tensor/__pycache__/ +2025-02-20T20:34:56.6706892Z Removing tt_metal/tools/profiler/__pycache__/ +2025-02-20T20:34:56.6707484Z Removing ttnn-0.56.0rc36.dev16+any-cp38-cp38-linux_x86_64.whl +2025-02-20T20:34:56.6708106Z Removing ttnn-0.56.0rc36.dev16+any.tar.gz +2025-02-20T20:34:56.6708622Z + echo 'Done git clean -xffd' +2025-02-20T20:34:56.6709133Z + echo 'Attempting to delete any lock files' +2025-02-20T20:34:56.6709626Z + find .git -type f -iname '*.lock' -delete +2025-02-20T20:34:56.6710116Z Done git clean -xffd +2025-02-20T20:34:56.6710544Z Attempting to delete any lock files +2025-02-20T20:34:56.7234234Z + echo 'Done deleting lock files' +2025-02-20T20:34:56.7234806Z Done deleting lock files +2025-02-20T20:34:56.7235371Z + echo 'De-init-ing submodules' +2025-02-20T20:34:56.7235825Z + git submodule deinit -f --all +2025-02-20T20:34:56.7236291Z De-init-ing submodules +2025-02-20T20:34:56.7494873Z Cleared directory 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:34:56.7527477Z Submodule 'models/demos/t3000/llama2_70b/reference/llama' (https://github.com/tenstorrent-metal/llama.git) unregistered for path 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:34:56.7528990Z Cleared directory 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:34:56.7674565Z Submodule '3rd_party/wandb-cpp' (https://github.com/yhisaki/wandb-cpp) unregistered for path 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:34:56.7675526Z Cleared directory 
'tt_metal/third_party/tracy' +2025-02-20T20:34:56.7708716Z Submodule 'tt_metal/third_party/tracy' (https://github.com/tenstorrent-metal/tracy.git) unregistered for path 'tt_metal/third_party/tracy' +2025-02-20T20:34:56.7709850Z Cleared directory 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:34:56.7742768Z Submodule 'tt_metal/third_party/tt_llk_blackhole' (https://github.com/tenstorrent/tt-llk-bh.git) unregistered for path 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:34:56.7744065Z Cleared directory 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:34:56.7779485Z Submodule 'tt_metal/third_party/tt_llk_grayskull' (https://github.com/tenstorrent/tt-llk-gs.git) unregistered for path 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:34:56.7780672Z Cleared directory 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:34:56.7944860Z Submodule 'tt_metal/third_party/tt_llk_wormhole_b0' (https://github.com/tenstorrent/tt-llk-wh-b0.git) unregistered for path 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:34:56.7946053Z Cleared directory 'tt_metal/third_party/umd' +2025-02-20T20:34:56.7965149Z Submodule 'tt_metal/third_party/umd' (https://github.com/tenstorrent/tt-umd.git) unregistered for path 'tt_metal/third_party/umd' +2025-02-20T20:34:56.7974809Z + echo 'Done de-initing submodules' +2025-02-20T20:34:56.7975318Z Done de-initing submodules +2025-02-20T20:34:56.8107258Z ##[group]Run actions/checkout@v4 +2025-02-20T20:34:56.8107682Z with: +2025-02-20T20:34:56.8108224Z token: *** +2025-02-20T20:34:56.8108522Z fetch-depth: 1 +2025-02-20T20:34:56.8108836Z lfs: false +2025-02-20T20:34:56.8109140Z submodules: recursive +2025-02-20T20:34:56.8109466Z clean: true +2025-02-20T20:34:56.8109795Z repository: tenstorrent/tt-metal +2025-02-20T20:34:56.8110181Z ssh-strict: true +2025-02-20T20:34:56.8110477Z ssh-user: git +2025-02-20T20:34:56.8110805Z persist-credentials: true +2025-02-20T20:34:56.8111190Z sparse-checkout-cone-mode: true +2025-02-20T20:34:56.8111588Z fetch-tags: false +2025-02-20T20:34:56.8111929Z show-progress: true +2025-02-20T20:34:56.8112289Z set-safe-directory: true +2025-02-20T20:34:56.8112646Z env: +2025-02-20T20:34:56.8112944Z ARCH_NAME: grayskull +2025-02-20T20:34:56.8113270Z LOGURU_LEVEL: INFO +2025-02-20T20:34:56.8113612Z ##[endgroup] +2025-02-20T20:34:56.9344385Z Syncing repository: tenstorrent/tt-metal +2025-02-20T20:34:56.9346086Z ##[group]Getting Git version info +2025-02-20T20:34:56.9346693Z Working directory is '/home/ubuntu/actions-runner/_work/tt-metal/tt-metal' +2025-02-20T20:34:56.9347538Z [command]/usr/bin/git version +2025-02-20T20:34:56.9347927Z git version 2.25.1 +2025-02-20T20:34:56.9374719Z ##[endgroup] +2025-02-20T20:34:56.9387282Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/bb6c1416-a5e0-4632-b1e9-a9fcfd2a21c0/.gitconfig' +2025-02-20T20:34:56.9402091Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/bb6c1416-a5e0-4632-b1e9-a9fcfd2a21c0' before making global git config changes +2025-02-20T20:34:56.9403634Z Adding repository directory to the temporary git global config as a safe directory +2025-02-20T20:34:56.9418518Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-20T20:34:56.9465301Z [command]/usr/bin/git config --local --get remote.origin.url +2025-02-20T20:34:56.9488630Z https://github.com/tenstorrent/tt-metal +2025-02-20T20:34:56.9506158Z ##[group]Removing previously created refs, to avoid conflicts 
+2025-02-20T20:34:56.9510196Z [command]/usr/bin/git rev-parse --symbolic-full-name --verify --quiet HEAD +2025-02-20T20:34:56.9531035Z refs/heads/main +2025-02-20T20:34:56.9540186Z [command]/usr/bin/git checkout --detach +2025-02-20T20:34:57.0130279Z HEAD is now at ed29888f #17999: Fixing invalid barrier test (#18103) +2025-02-20T20:34:57.0693850Z [command]/usr/bin/git branch --delete --force main +2025-02-20T20:34:57.0731542Z Deleted branch main (was ed29888f). +2025-02-20T20:34:57.0933070Z ##[endgroup] +2025-02-20T20:34:57.0937170Z [command]/usr/bin/git submodule status +2025-02-20T20:34:57.1198264Z -29125b7ad8b5513eeaa4417ed92892bf39c8bd74 models/demos/t3000/llama2_70b/reference/llama +2025-02-20T20:34:57.1199529Z -368cd07f89f497df20a66936fbfae3956f151af4 tt-train/3rd_party/wandb-cpp +2025-02-20T20:34:57.1200652Z -71d4c8d378b52af7da7012b9b595a61e9304f0bb tt_metal/third_party/tracy +2025-02-20T20:34:57.1201698Z -8c25441b351646046d8de3fd6b8d895b7c87135d tt_metal/third_party/tt_llk_blackhole +2025-02-20T20:34:57.1202891Z -0c04db64275a4bd36a7e14d3c533855cb33f6a20 tt_metal/third_party/tt_llk_grayskull +2025-02-20T20:34:57.1204002Z -a34e1966683c478d575d5ea79413004955c8a57f tt_metal/third_party/tt_llk_wormhole_b0 +2025-02-20T20:34:57.1205404Z -ebb0f945ed8d3c05e043158978201ed6fab884ec tt_metal/third_party/umd +2025-02-20T20:34:57.1209455Z ##[group]Cleaning the repository +2025-02-20T20:34:57.1213993Z [command]/usr/bin/git clean -ffdx +2025-02-20T20:34:57.1464157Z [command]/usr/bin/git reset --hard HEAD +2025-02-20T20:34:57.1987095Z HEAD is now at ed29888f #17999: Fixing invalid barrier test (#18103) +2025-02-20T20:34:57.1998178Z ##[endgroup] +2025-02-20T20:34:57.1999899Z ##[group]Disabling automatic garbage collection +2025-02-20T20:34:57.2004052Z [command]/usr/bin/git config --local gc.auto 0 +2025-02-20T20:34:57.2029459Z ##[endgroup] +2025-02-20T20:34:57.2030009Z ##[group]Setting up auth +2025-02-20T20:34:57.2036832Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-20T20:34:57.2068393Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-20T20:34:57.2319351Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-20T20:34:57.2346523Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-20T20:34:57.2610379Z [command]/usr/bin/git config --local http.https://github.com/.extraheader AUTHORIZATION: basic *** +2025-02-20T20:34:57.2649761Z ##[endgroup] +2025-02-20T20:34:57.2650405Z ##[group]Fetching the repository +2025-02-20T20:34:57.2658379Z [command]/usr/bin/git -c protocol.version=2 fetch --no-tags --prune --no-recurse-submodules --depth=1 origin +94429171440755ffe7c62085c4807d447dd369dc:refs/remotes/origin/williamly/test-failure-annotations +2025-02-20T20:34:57.7135656Z From https://github.com/tenstorrent/tt-metal +2025-02-20T20:34:57.7136588Z + 8c56f554...94429171 94429171440755ffe7c62085c4807d447dd369dc -> origin/williamly/test-failure-annotations (forced update) +2025-02-20T20:34:57.7159088Z ##[endgroup] +2025-02-20T20:34:57.7159658Z ##[group]Determining the checkout info +2025-02-20T20:34:57.7160897Z ##[endgroup] +2025-02-20T20:34:57.7161513Z ##[group]Checking out the ref 
+2025-02-20T20:34:57.7166437Z [command]/usr/bin/git checkout --progress --force -B williamly/test-failure-annotations refs/remotes/origin/williamly/test-failure-annotations +2025-02-20T20:34:57.7926077Z Previous HEAD position was ed29888f #17999: Fixing invalid barrier test (#18103) +2025-02-20T20:34:57.8091742Z Switched to a new branch 'williamly/test-failure-annotations' +2025-02-20T20:34:57.8092789Z Branch 'williamly/test-failure-annotations' set up to track remote branch 'williamly/test-failure-annotations' from 'origin'. +2025-02-20T20:34:57.8746252Z ##[endgroup] +2025-02-20T20:34:57.8747208Z ##[group]Setting up auth for fetching submodules +2025-02-20T20:34:57.8754780Z [command]/usr/bin/git config --global http.https://github.com/.extraheader AUTHORIZATION: basic *** +2025-02-20T20:34:57.8794423Z [command]/usr/bin/git config --global --unset-all url.https://github.com/.insteadOf +2025-02-20T20:34:57.8824952Z [command]/usr/bin/git config --global --add url.https://github.com/.insteadOf git@github.com: +2025-02-20T20:34:57.8854097Z [command]/usr/bin/git config --global --add url.https://github.com/.insteadOf org-64161552@github.com: +2025-02-20T20:34:57.8878493Z ##[endgroup] +2025-02-20T20:34:57.8879050Z ##[group]Fetching submodules +2025-02-20T20:34:57.8881864Z [command]/usr/bin/git submodule sync --recursive +2025-02-20T20:34:57.9128810Z [command]/usr/bin/git -c protocol.version=2 submodule update --init --force --depth=1 --recursive +2025-02-20T20:34:57.9370766Z Submodule 'models/demos/t3000/llama2_70b/reference/llama' (https://github.com/tenstorrent-metal/llama.git) registered for path 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:34:57.9373569Z Submodule '3rd_party/wandb-cpp' (https://github.com/yhisaki/wandb-cpp) registered for path 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:34:57.9377252Z Submodule 'tt_metal/third_party/tracy' (https://github.com/tenstorrent-metal/tracy.git) registered for path 'tt_metal/third_party/tracy' +2025-02-20T20:34:57.9380520Z Submodule 'tt_metal/third_party/tt_llk_blackhole' (https://github.com/tenstorrent/tt-llk-bh.git) registered for path 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:34:57.9383960Z Submodule 'tt_metal/third_party/tt_llk_grayskull' (https://github.com/tenstorrent/tt-llk-gs.git) registered for path 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:34:57.9387460Z Submodule 'tt_metal/third_party/tt_llk_wormhole_b0' (https://github.com/tenstorrent/tt-llk-wh-b0.git) registered for path 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:34:57.9390975Z Submodule 'tt_metal/third_party/umd' (https://github.com/tenstorrent/tt-umd.git) registered for path 'tt_metal/third_party/umd' +2025-02-20T20:34:57.9851860Z Submodule path 'models/demos/t3000/llama2_70b/reference/llama': checked out '29125b7ad8b5513eeaa4417ed92892bf39c8bd74' +2025-02-20T20:34:58.0209018Z Submodule path 'tt-train/3rd_party/wandb-cpp': checked out '368cd07f89f497df20a66936fbfae3956f151af4' +2025-02-20T20:34:58.1651989Z Submodule path 'tt_metal/third_party/tracy': checked out '71d4c8d378b52af7da7012b9b595a61e9304f0bb' +2025-02-20T20:34:58.1971144Z Submodule path 'tt_metal/third_party/tt_llk_blackhole': checked out '8c25441b351646046d8de3fd6b8d895b7c87135d' +2025-02-20T20:34:58.2263621Z Submodule path 'tt_metal/third_party/tt_llk_grayskull': checked out '0c04db64275a4bd36a7e14d3c533855cb33f6a20' +2025-02-20T20:34:58.2574514Z Submodule path 'tt_metal/third_party/tt_llk_wormhole_b0': checked out 'a34e1966683c478d575d5ea79413004955c8a57f' 
+2025-02-20T20:34:58.5358683Z WARNING: Multiple 'url.*..insteadof' keys with the same alias: "git@github.com:" +2025-02-20T20:34:58.5359545Z WARNING: Multiple 'url.*..insteadof' keys with the same alias: "org-64161552@github.com:" +2025-02-20T20:34:58.5418255Z Submodule path 'tt_metal/third_party/umd': checked out 'ebb0f945ed8d3c05e043158978201ed6fab884ec' +2025-02-20T20:34:58.5489303Z [command]/usr/bin/git submodule foreach --recursive git config --local gc.auto 0 +2025-02-20T20:34:58.5729379Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:34:58.5769447Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:34:58.5810203Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:34:58.5851202Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:34:58.5888587Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:34:58.5926167Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:34:58.5963123Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:34:58.6018974Z ##[endgroup] +2025-02-20T20:34:58.6019567Z ##[group]Persisting credentials for submodules +2025-02-20T20:34:58.6026460Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'url\.https\:\/\/github\.com\/\.insteadOf' && git config --local --unset-all 'url.https://github.com/.insteadOf' || :" +2025-02-20T20:34:58.6266076Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:34:58.6290226Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6290735Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6321154Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:34:58.6347063Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6347565Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6378887Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:34:58.6405268Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6405783Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6438434Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:34:58.6461434Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6461945Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6493953Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:34:58.6521162Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6521718Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6556440Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:34:58.6581109Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6581631Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6610941Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:34:58.6635089Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6635597Z url.https://github.com/.insteadof +2025-02-20T20:34:58.6678381Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local 'http.https://github.com/.extraheader' 'AUTHORIZATION: basic ***' && git config --local --show-origin --name-only --get-regexp remote.origin.url" +2025-02-20T20:34:58.6920033Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:34:58.6957796Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/models/demos/t3000/llama2_70b/reference/llama/config remote.origin.url +2025-02-20T20:34:58.6975919Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:34:58.7018491Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/3rd_party/wandb-cpp/config remote.origin.url +2025-02-20T20:34:58.7037503Z 
Entering 'tt_metal/third_party/tracy' +2025-02-20T20:34:58.7075594Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/tracy/config remote.origin.url +2025-02-20T20:34:58.7096330Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:34:58.7135894Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/tt_llk_blackhole/config remote.origin.url +2025-02-20T20:34:58.7154615Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:34:58.7196302Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/tt_llk_grayskull/config remote.origin.url +2025-02-20T20:34:58.7215633Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:34:58.7255174Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/tt_llk_wormhole_b0/config remote.origin.url +2025-02-20T20:34:58.7274411Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:34:58.7315333Z file:/home/ubuntu/actions-runner/_work/tt-metal/tt-metal/.git/modules/tt_metal/third_party/umd/config remote.origin.url +2025-02-20T20:34:58.7393743Z [command]/usr/bin/git submodule foreach --recursive git config --local --add 'url.https://github.com/.insteadOf' 'git@github.com:' +2025-02-20T20:34:58.7631026Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:34:58.7669480Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:34:58.7717034Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:34:58.7753343Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:34:58.7793605Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:34:58.7834993Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:34:58.7875228Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:34:58.7930107Z [command]/usr/bin/git submodule foreach --recursive git config --local --add 'url.https://github.com/.insteadOf' 'org-64161552@github.com:' +2025-02-20T20:34:58.8171963Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:34:58.8209164Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:34:58.8248112Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:34:58.8288964Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:34:58.8325324Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:34:58.8363182Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:34:58.8405434Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:34:58.8458346Z ##[endgroup] +2025-02-20T20:37:58.9974751Z +2025-02-20T20:37:58.9975114Z [----------] 4 tests from CompileProgramWithKernelPathEnvVarFixture +2025-02-20T20:37:58.9976078Z [ RUN ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDir +2025-02-20T20:37:58.9977415Z  Test | INFO  | Skipping test: TT_METAL_KERNEL_PATH must be set +2025-02-20T20:37:58.9978570Z /work/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp:17: Skipped +2025-02-20T20:37:58.9979232Z +2025-02-20T20:37:58.9979840Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDir (0 ms) +2025-02-20T20:37:58.9981000Z [ RUN ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderKernelRootDir +2025-02-20T20:37:58.9982689Z  Test | INFO  | Skipping test: TT_METAL_KERNEL_PATH must be set +2025-02-20T20:37:58.9983810Z /work/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp:17: Skipped +2025-02-20T20:37:58.9984526Z 
+2025-02-20T20:37:58.9985123Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderKernelRootDir (0 ms) +2025-02-20T20:37:58.9986559Z [ RUN ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDirAndKernelRootDir +2025-02-20T20:37:58.9988032Z  Test | INFO  | Skipping test: TT_METAL_KERNEL_PATH must be set +2025-02-20T20:37:58.9989124Z /work/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp:17: Skipped +2025-02-20T20:37:58.9989777Z +2025-02-20T20:37:58.9990751Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDirAndKernelRootDir (0 ms) +2025-02-20T20:37:58.9992107Z [ RUN ] CompileProgramWithKernelPathEnvVarFixture.TensixNonExistentKernel +2025-02-20T20:37:58.9993441Z  Test | INFO  | Skipping test: TT_METAL_KERNEL_PATH must be set +2025-02-20T20:37:58.9994702Z /work/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp:17: Skipped +2025-02-20T20:37:58.9995362Z +2025-02-20T20:37:58.9995891Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixNonExistentKernel (0 ms) +2025-02-20T20:37:58.9996950Z [----------] 4 tests from CompileProgramWithKernelPathEnvVarFixture (0 ms total) +2025-02-20T20:37:58.9997491Z +2025-02-20T20:37:58.9997913Z [----------] 18 tests from CoreCoordFixture +2025-02-20T20:37:58.9998543Z [ RUN ] CoreCoordFixture.TestCoreRangeIntersects +2025-02-20T20:37:58.9999228Z [ OK ] CoreCoordFixture.TestCoreRangeIntersects (0 ms) +2025-02-20T20:37:59.0000013Z [ RUN ] CoreCoordFixture.TestCoreRangeNotIntersects +2025-02-20T20:37:59.0000825Z [ OK ] CoreCoordFixture.TestCoreRangeNotIntersects (0 ms) +2025-02-20T20:37:59.0001557Z [ RUN ] CoreCoordFixture.TestCoreRangeIterator +2025-02-20T20:37:59.0002288Z [ OK ] CoreCoordFixture.TestCoreRangeIterator (0 ms) +2025-02-20T20:37:59.0002989Z [ RUN ] CoreCoordFixture.TestCoreRangeMerge +2025-02-20T20:37:59.0003685Z [ OK ] CoreCoordFixture.TestCoreRangeMerge (0 ms) +2025-02-20T20:37:59.0004360Z [ RUN ] CoreCoordFixture.TestCoreRangeNotMergeable +2025-02-20T20:37:59.0005108Z [ OK ] CoreCoordFixture.TestCoreRangeNotMergeable (0 ms) +2025-02-20T20:37:59.0005862Z [ RUN ] CoreCoordFixture.TestCoreRangeSetValidConstruct +2025-02-20T20:37:59.0006708Z [ OK ] CoreCoordFixture.TestCoreRangeSetValidConstruct (0 ms) +2025-02-20T20:37:59.0007537Z [ RUN ] CoreCoordFixture.TestCoreRangeSetInvalidConstruct +2025-02-20T20:37:59.0009261Z  Always | FATAL  | Cannot create CoreRangeSet with specified core ranges because core ranges [(x=3,y=3) - (x=5,y=4)] and [(x=1,y=2) - (x=3,y=3)] overlap! +2025-02-20T20:37:59.0011561Z  Always | FATAL  | Cannot create CoreRangeSet with specified core ranges because core ranges [(x=1,y=1) - (x=1,y=1)] and [(x=0,y=0) - (x=1,y=1)] overlap! 
+2025-02-20T20:37:59.0012886Z [ OK ] CoreCoordFixture.TestCoreRangeSetInvalidConstruct (0 ms) +2025-02-20T20:37:59.0013700Z [ RUN ] CoreCoordFixture.TestCoreRangeSetContains +2025-02-20T20:37:59.0014417Z [ OK ] CoreCoordFixture.TestCoreRangeSetContains (0 ms) +2025-02-20T20:37:59.0015107Z [ RUN ] CoreCoordFixture.TestCoreRangeSetNotContains +2025-02-20T20:37:59.0015819Z [ OK ] CoreCoordFixture.TestCoreRangeSetNotContains (0 ms) +2025-02-20T20:37:59.0016522Z [ RUN ] CoreCoordFixture.TestCoreRangeSetIntersects +2025-02-20T20:37:59.0017143Z [ OK ] CoreCoordFixture.TestCoreRangeSetIntersects (0 ms) +2025-02-20T20:37:59.0017752Z [ RUN ] CoreCoordFixture.TestCoreRangeSetNotIntersects +2025-02-20T20:37:59.0018377Z [ OK ] CoreCoordFixture.TestCoreRangeSetNotIntersects (0 ms) +2025-02-20T20:37:59.0019006Z [ RUN ] CoreCoordFixture.TestCoreRangeSetMergeNoSolution +2025-02-20T20:37:59.0019634Z [ OK ] CoreCoordFixture.TestCoreRangeSetMergeNoSolution (0 ms) +2025-02-20T20:37:59.0020275Z [ RUN ] CoreCoordFixture.TestCoreRangeSetMergeCoreCoord +2025-02-20T20:37:59.0020905Z [ OK ] CoreCoordFixture.TestCoreRangeSetMergeCoreCoord (0 ms) +2025-02-20T20:37:59.0021542Z [ RUN ] CoreCoordFixture.TestCoreRangeSetMergeCoreRange +2025-02-20T20:37:59.0022174Z [ OK ] CoreCoordFixture.TestCoreRangeSetMergeCoreRange (0 ms) +2025-02-20T20:37:59.0022787Z [ RUN ] CoreCoordFixture.TestCoreRangeAdjacent +2025-02-20T20:37:59.0023341Z [ OK ] CoreCoordFixture.TestCoreRangeAdjacent (0 ms) +2025-02-20T20:37:59.0024075Z [ RUN ] CoreCoordFixture.TestCoreRangeNotAdjacent +2025-02-20T20:37:59.0024654Z [ OK ] CoreCoordFixture.TestCoreRangeNotAdjacent (0 ms) +2025-02-20T20:37:59.0025205Z [ RUN ] CoreCoordFixture.TestCoreRangeContains +2025-02-20T20:37:59.0025851Z [ OK ] CoreCoordFixture.TestCoreRangeContains (0 ms) +2025-02-20T20:37:59.0026420Z [ RUN ] CoreCoordFixture.TestCoreRangeNotContains +2025-02-20T20:37:59.0027002Z [ OK ] CoreCoordFixture.TestCoreRangeNotContains (0 ms) +2025-02-20T20:37:59.0027571Z [----------] 18 tests from CoreCoordFixture (1 ms total) +2025-02-20T20:37:59.0027909Z +2025-02-20T20:37:59.0028081Z [----------] 3 tests from FreeListAllocator +2025-02-20T20:37:59.0028635Z [ RUN ] FreeListAllocator.TestDirectedSeriesOfAllocDealloc +2025-02-20T20:37:59.0029401Z [ OK ] FreeListAllocator.TestDirectedSeriesOfAllocDealloc (0 ms) +2025-02-20T20:37:59.0030035Z [ RUN ] FreeListAllocator.TestResizeAllocator +2025-02-20T20:37:59.0030639Z [ OK ] FreeListAllocator.TestResizeAllocator (0 ms) +2025-02-20T20:37:59.0031240Z [ RUN ] FreeListAllocator.TestDirectedResizeAllocator +2025-02-20T20:37:59.0031869Z [ OK ] FreeListAllocator.TestDirectedResizeAllocator (0 ms) +2025-02-20T20:37:59.0032476Z [----------] 3 tests from FreeListAllocator (0 ms total) +2025-02-20T20:37:59.0032817Z +2025-02-20T20:37:59.0032992Z [----------] 18 tests from FreeListOptTest +2025-02-20T20:37:59.0033467Z [ RUN ] FreeListOptTest.Allocation +2025-02-20T20:37:59.0034099Z [ OK ] FreeListOptTest.Allocation (0 ms) +2025-02-20T20:37:59.0034593Z [ RUN ] FreeListOptTest.Alignment +2025-02-20T20:37:59.0035068Z [ OK ] FreeListOptTest.Alignment (0 ms) +2025-02-20T20:37:59.0035580Z [ RUN ] FreeListOptTest.MinAllocationSize +2025-02-20T20:37:59.0036093Z [ OK ] FreeListOptTest.MinAllocationSize (0 ms) +2025-02-20T20:37:59.0036616Z [ RUN ] FreeListOptTest.Clear +2025-02-20T20:37:59.0037068Z [ OK ] FreeListOptTest.Clear (0 ms) +2025-02-20T20:37:59.0037583Z [ RUN ] FreeListOptTest.AllocationAndDeallocation +2025-02-20T20:37:59.0038177Z [ OK ] 
FreeListOptTest.AllocationAndDeallocation (0 ms) +2025-02-20T20:37:59.0038741Z [ RUN ] FreeListOptTest.AllocateAtAddress +2025-02-20T20:37:59.0039291Z [ OK ] FreeListOptTest.AllocateAtAddress (0 ms) +2025-02-20T20:37:59.0039877Z [ RUN ] FreeListOptTest.AllocateAtAddressInteractions +2025-02-20T20:37:59.0040496Z [ OK ] FreeListOptTest.AllocateAtAddressInteractions (0 ms) +2025-02-20T20:37:59.0041062Z [ RUN ] FreeListOptTest.ShrinkAndReset +2025-02-20T20:37:59.0041551Z [ OK ] FreeListOptTest.ShrinkAndReset (0 ms) +2025-02-20T20:37:59.0042042Z [ RUN ] FreeListOptTest.Statistics +2025-02-20T20:37:59.0042523Z [ OK ] FreeListOptTest.Statistics (0 ms) +2025-02-20T20:37:59.0043020Z [ RUN ] FreeListOptTest.AllocateFromTop +2025-02-20T20:37:59.0043519Z [ OK ] FreeListOptTest.AllocateFromTop (0 ms) +2025-02-20T20:37:59.0044007Z [ RUN ] FreeListOptTest.Coalescing +2025-02-20T20:37:59.0044512Z [ OK ] FreeListOptTest.Coalescing (0 ms) +2025-02-20T20:37:59.0045127Z [ RUN ] FreeListOptTest.CoalescingAfterResetShrink +2025-02-20T20:37:59.0045761Z [ OK ] FreeListOptTest.CoalescingAfterResetShrink (0 ms) +2025-02-20T20:37:59.0046314Z [ RUN ] FreeListOptTest.OutOfMemory +2025-02-20T20:37:59.0046899Z [ OK ] FreeListOptTest.OutOfMemory (0 ms) +2025-02-20T20:37:59.0047502Z [ RUN ] FreeListOptTest.AvailableAddresses +2025-02-20T20:37:59.0048139Z [ OK ] FreeListOptTest.AvailableAddresses (0 ms) +2025-02-20T20:37:59.0048724Z [ RUN ] FreeListOptTest.LowestOccupiedAddress +2025-02-20T20:37:59.0049345Z [ OK ] FreeListOptTest.LowestOccupiedAddress (0 ms) +2025-02-20T20:37:59.0050046Z [ RUN ] FreeListOptTest.LowestOccupiedAddressWithAllocateAt +2025-02-20T20:37:59.0050969Z [ OK ] FreeListOptTest.LowestOccupiedAddressWithAllocateAt (0 ms) +2025-02-20T20:37:59.0051599Z [ RUN ] FreeListOptTest.FirstFit +2025-02-20T20:37:59.0052158Z [ OK ] FreeListOptTest.FirstFit (0 ms) +2025-02-20T20:37:59.0052998Z [ RUN ] FreeListOptTest.FirstFitAllocateAtAddressInteractions +2025-02-20T20:37:59.0053737Z [ OK ] FreeListOptTest.FirstFitAllocateAtAddressInteractions (0 ms) +2025-02-20T20:37:59.0054357Z [----------] 18 tests from FreeListOptTest (3 ms total) +2025-02-20T20:37:59.0054662Z +2025-02-20T20:37:59.0054923Z [----------] 8 tests from BlockfloatCommonTests/ConvertU32ToBfpTests +2025-02-20T20:37:59.0055697Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/0 +2025-02-20T20:37:59.0056727Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/0 (0 ms) +2025-02-20T20:37:59.0057763Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/1 +2025-02-20T20:37:59.0058975Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/1 (0 ms) +2025-02-20T20:37:59.0060086Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/2 +2025-02-20T20:37:59.0061173Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/2 (0 ms) +2025-02-20T20:37:59.0062133Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/3 +2025-02-20T20:37:59.0063101Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithPositiveFloat/3 (0 ms) +2025-02-20T20:37:59.0064081Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/0 +2025-02-20T20:37:59.0065061Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/0 (0 ms) +2025-02-20T20:37:59.0066116Z [ RUN ] 
BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/1 +2025-02-20T20:37:59.0067097Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/1 (0 ms) +2025-02-20T20:37:59.0068068Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/2 +2025-02-20T20:37:59.0069045Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/2 (0 ms) +2025-02-20T20:37:59.0069989Z [ RUN ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/3 +2025-02-20T20:37:59.0071043Z [ OK ] BlockfloatCommonTests/ConvertU32ToBfpTests.MantissaRoundingWithNegativeFloat/3 (0 ms) +2025-02-20T20:37:59.0071910Z [----------] 8 tests from BlockfloatCommonTests/ConvertU32ToBfpTests (0 ms total) +2025-02-20T20:37:59.0072359Z +2025-02-20T20:37:59.0072539Z [----------] Global test environment tear-down +2025-02-20T20:37:59.0073052Z [==========] 167 tests from 14 test suites ran. (25450 ms total) +2025-02-20T20:37:59.0073539Z [ PASSED ] 157 tests. +2025-02-20T20:37:59.0074088Z [ SKIPPED ] 9 tests, listed below: +2025-02-20T20:37:59.0074622Z [ SKIPPED ] NOC.TensixVerifyNocIdentityTranslationTable +2025-02-20T20:37:59.0075321Z [ SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionUpLeft +2025-02-20T20:37:59.0076133Z [ SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionUpRight +2025-02-20T20:37:59.0076943Z [ SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionDownLeft +2025-02-20T20:37:59.0077771Z [ SKIPPED ] DispatchFixture.TensixDRAMtoL1MulticastExcludeRegionDownRight +2025-02-20T20:37:59.0078684Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDir +2025-02-20T20:37:59.0079704Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderKernelRootDir +2025-02-20T20:37:59.0080796Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixKernelUnderMetalRootDirAndKernelRootDir +2025-02-20T20:37:59.0081830Z [ SKIPPED ] CompileProgramWithKernelPathEnvVarFixture.TensixNonExistentKernel +2025-02-20T20:37:59.0082490Z [ FAILED ] 1 test, listed below: +2025-02-20T20:37:59.0082952Z [ FAILED ] TensorShapeBaseTests.DoNotSubmit +2025-02-20T20:37:59.0083252Z +2025-02-20T20:37:59.0083401Z 1 FAILED TEST +2025-02-20T20:37:59.0083741Z YOU HAVE 2 DISABLED TESTS +2025-02-20T20:37:59.0084138Z +2025-02-20T20:37:59.0084674Z  Device | INFO  | Closing user mode device drivers +2025-02-20T20:38:00.5341548Z Prepare all required actions +2025-02-20T20:38:00.5342074Z Getting action download info +2025-02-20T20:38:00.8089935Z Download action repository 'slackapi/slack-github-action@v1.26.0' (SHA:70cd7be8e40a46e8b0eced40b0de447bdb42f68e) +2025-02-20T20:38:01.4412957Z ##[group]Run ./.github/actions/slack-report +2025-02-20T20:38:01.4413393Z with: +2025-02-20T20:38:01.4414219Z slack_webhook_url: *** +2025-02-20T20:38:01.4414582Z owner: U06CXU895AP +2025-02-20T20:38:01.4414914Z env: +2025-02-20T20:38:01.4415214Z ARCH_NAME: grayskull +2025-02-20T20:38:01.4415562Z LOGURU_LEVEL: INFO +2025-02-20T20:38:01.4416071Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:01.4416841Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:01.4417610Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:01.4418456Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:01.4419183Z Python3_ROOT_DIR: 
/home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:01.4419896Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:01.4420628Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:01.4421225Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:01.4422064Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:01.4422846Z RUNNER_UID: 1000 +2025-02-20T20:38:01.4423170Z RUNNER_GID: 1000 +2025-02-20T20:38:01.4423493Z ##[endgroup] +2025-02-20T20:38:01.4461407Z Prepare all required actions +2025-02-20T20:38:01.4461929Z Getting action download info +2025-02-20T20:38:01.5828407Z Download action repository 'actions/upload-artifact@v4' (SHA:65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08) +2025-02-20T20:38:02.3425795Z ##[group]Run ./.github/actions/upload-artifact-with-job-uuid +2025-02-20T20:38:02.3426308Z with: +2025-02-20T20:38:02.3426644Z path: generated/test_reports/ + +2025-02-20T20:38:02.3427068Z prefix: test_reports_ +2025-02-20T20:38:02.3427438Z env: +2025-02-20T20:38:02.3427734Z ARCH_NAME: grayskull +2025-02-20T20:38:02.3428099Z LOGURU_LEVEL: INFO +2025-02-20T20:38:02.3428649Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3429474Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:02.3430279Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3431135Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3431895Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3432693Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:02.3433449Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:02.3434268Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:02.3435157Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:02.3435982Z RUNNER_UID: 1000 +2025-02-20T20:38:02.3436333Z RUNNER_GID: 1000 +2025-02-20T20:38:02.3436680Z ##[endgroup] +2025-02-20T20:38:02.3467373Z ##[group]Run uuid=$(uuidgen) +2025-02-20T20:38:02.3467765Z uuid=$(uuidgen) +2025-02-20T20:38:02.3468184Z artifact_name="test_reports_$uuid" +2025-02-20T20:38:02.3468694Z echo "[UPLOAD-ARTIFACT-UUID] $artifact_name" +2025-02-20T20:38:02.3469284Z echo "artifact-name=$artifact_name" >> "$GITHUB_OUTPUT" +2025-02-20T20:38:02.3490733Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:38:02.3491483Z env: +2025-02-20T20:38:02.3491794Z ARCH_NAME: grayskull +2025-02-20T20:38:02.3492174Z LOGURU_LEVEL: INFO +2025-02-20T20:38:02.3492703Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3493585Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:02.3494365Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3495089Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3495820Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3496565Z LD_LIBRARY_PATH: 
/home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:02.3497322Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:02.3497965Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:02.3498825Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:02.3499612Z RUNNER_UID: 1000 +2025-02-20T20:38:02.3499990Z RUNNER_GID: 1000 +2025-02-20T20:38:02.3500321Z ##[endgroup] +2025-02-20T20:38:02.3636604Z [UPLOAD-ARTIFACT-UUID] test_reports_0c4930ff-041a-4c44-ad7d-7f38b72b304a +2025-02-20T20:38:02.3695514Z ##[group]Run actions/upload-artifact@v4 +2025-02-20T20:38:02.3696018Z with: +2025-02-20T20:38:02.3696475Z name: test_reports_0c4930ff-041a-4c44-ad7d-7f38b72b304a +2025-02-20T20:38:02.3697061Z path: generated/test_reports/ + +2025-02-20T20:38:02.3697551Z if-no-files-found: warn +2025-02-20T20:38:02.3698223Z compression-level: 6 +2025-02-20T20:38:02.3698630Z overwrite: false +2025-02-20T20:38:02.3699035Z include-hidden-files: false +2025-02-20T20:38:02.3699737Z env: +2025-02-20T20:38:02.3700133Z ARCH_NAME: grayskull +2025-02-20T20:38:02.3700558Z LOGURU_LEVEL: INFO +2025-02-20T20:38:02.3701188Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3702060Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:02.3702914Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3703704Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3704493Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:02.3705291Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:02.3706101Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:02.3706802Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:02.3707744Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:02.3708563Z RUNNER_UID: 1000 +2025-02-20T20:38:02.3709160Z RUNNER_GID: 1000 +2025-02-20T20:38:02.3709576Z ##[endgroup] +2025-02-20T20:38:02.6324840Z With the provided path, there will be 1 file uploaded +2025-02-20T20:38:02.6329358Z Artifact name is valid! +2025-02-20T20:38:02.6330429Z Root directory input is valid! +2025-02-20T20:38:02.8447111Z Beginning upload of artifact content to blob storage +2025-02-20T20:38:03.1216962Z Uploaded bytes 4624 +2025-02-20T20:38:03.1817779Z Finished uploading artifact content to blob storage! +2025-02-20T20:38:03.1820919Z SHA256 hash of uploaded artifact zip is 7d237dab5ee87dd118d90396def8f92c8696f3153eae2beb89b2a11f541bd67d +2025-02-20T20:38:03.1823310Z Finalizing artifact upload +2025-02-20T20:38:03.2873511Z Artifact test_reports_0c4930ff-041a-4c44-ad7d-7f38b72b304a.zip successfully finalized. Artifact ID 2626388536 +2025-02-20T20:38:03.2875082Z Artifact test_reports_0c4930ff-041a-4c44-ad7d-7f38b72b304a has been successfully uploaded! Final size is 4624 bytes. 
Artifact ID is 2626388536 +2025-02-20T20:38:03.2880525Z Artifact download URL: https://github.com/tenstorrent/tt-metal/actions/runs/13443325356/artifacts/2626388536 +2025-02-20T20:38:03.3011480Z Prepare all required actions +2025-02-20T20:38:03.3013081Z Getting action download info +2025-02-20T20:38:03.4399283Z ##[group]Run ./.github/actions/generate-system-logs +2025-02-20T20:38:03.4399788Z with: +2025-02-20T20:38:03.4400113Z env: +2025-02-20T20:38:03.4400446Z ARCH_NAME: grayskull +2025-02-20T20:38:03.4400820Z LOGURU_LEVEL: INFO +2025-02-20T20:38:03.4401356Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4402167Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:03.4402966Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4403712Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4404448Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4405194Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:03.4406026Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:03.4406670Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:03.4407519Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:03.4408323Z RUNNER_UID: 1000 +2025-02-20T20:38:03.4408852Z RUNNER_GID: 1000 +2025-02-20T20:38:03.4409197Z ##[endgroup] +2025-02-20T20:38:03.4438449Z ##[group]Run echo "HOSTNAME=$(hostname)" >> $GITHUB_ENV +2025-02-20T20:38:03.4439007Z echo "HOSTNAME=$(hostname)" >> $GITHUB_ENV +2025-02-20T20:38:03.4439576Z echo "TIMESTAMP=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_ENV +2025-02-20T20:38:03.4462022Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:38:03.4462538Z env: +2025-02-20T20:38:03.4462882Z ARCH_NAME: grayskull +2025-02-20T20:38:03.4463275Z LOGURU_LEVEL: INFO +2025-02-20T20:38:03.4463851Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4464692Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:03.4465494Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4466292Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4467040Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4467807Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:03.4468572Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:03.4469225Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:03.4470099Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:03.4470918Z RUNNER_UID: 1000 +2025-02-20T20:38:03.4471278Z RUNNER_GID: 1000 +2025-02-20T20:38:03.4471633Z ##[endgroup] +2025-02-20T20:38:03.4578976Z ##[group]Run rm -rf ~/run-log +2025-02-20T20:38:03.4579393Z rm -rf ~/run-log +2025-02-20T20:38:03.4579781Z mkdir -p ~/run-log/ +2025-02-20T20:38:03.4580330Z sudo dmesg > ~/run-log/20250220203803_tt-metal-ci-vm-105_dmesg.log +2025-02-20T20:38:03.4581044Z 
sudo lspci > ~/run-log/20250220203803_tt-metal-ci-vm-105_lspci.log +2025-02-20T20:38:03.4581708Z sudo lshw > ~/run-log/20250220203803_tt-metal-ci-vm-105_lshw.log +2025-02-20T20:38:03.4596534Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:38:03.4597014Z env: +2025-02-20T20:38:03.4597299Z ARCH_NAME: grayskull +2025-02-20T20:38:03.4597646Z LOGURU_LEVEL: INFO +2025-02-20T20:38:03.4598150Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4599155Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:03.4600053Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4600777Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4601530Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:03.4602250Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:03.4602982Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:03.4603601Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:03.4604428Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:03.4605214Z RUNNER_UID: 1000 +2025-02-20T20:38:03.4605539Z RUNNER_GID: 1000 +2025-02-20T20:38:03.4605895Z HOSTNAME: tt-metal-ci-vm-105 +2025-02-20T20:38:03.4606303Z TIMESTAMP: 20250220203803 +2025-02-20T20:38:03.4606660Z ##[endgroup] +2025-02-20T20:38:04.9919339Z ##[group]Run tar -cvf ~/run-log/sys_logs.tar ~/run-log/20250220203803_tt-metal-ci-vm-105_* +2025-02-20T20:38:04.9920205Z tar -cvf ~/run-log/sys_logs.tar ~/run-log/20250220203803_tt-metal-ci-vm-105_* +2025-02-20T20:38:04.9940600Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:38:04.9941106Z env: +2025-02-20T20:38:04.9941434Z ARCH_NAME: grayskull +2025-02-20T20:38:04.9941800Z LOGURU_LEVEL: INFO +2025-02-20T20:38:04.9942306Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:04.9943115Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:04.9943911Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:04.9946308Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:04.9947069Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:04.9947828Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:04.9948589Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:04.9949217Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:04.9950103Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:04.9950975Z RUNNER_UID: 1000 +2025-02-20T20:38:04.9951331Z RUNNER_GID: 1000 +2025-02-20T20:38:04.9951702Z HOSTNAME: tt-metal-ci-vm-105 +2025-02-20T20:38:04.9952109Z TIMESTAMP: 20250220203803 +2025-02-20T20:38:04.9952493Z ##[endgroup] +2025-02-20T20:38:05.0005878Z tar: Removing leading `/' from member names +2025-02-20T20:38:05.0008560Z /home/ubuntu/run-log/20250220203803_tt-metal-ci-vm-105_dmesg.log +2025-02-20T20:38:05.0009140Z tar: Removing leading 
`/' from hard link targets +2025-02-20T20:38:05.0009699Z /home/ubuntu/run-log/20250220203803_tt-metal-ci-vm-105_lshw.log +2025-02-20T20:38:05.0010339Z /home/ubuntu/run-log/20250220203803_tt-metal-ci-vm-105_lspci.log +2025-02-20T20:38:05.0060811Z ##[group]Run actions/upload-artifact@v4 +2025-02-20T20:38:05.0061244Z with: +2025-02-20T20:38:05.0061591Z name: 20250220203803_tt-metal-ci-vm-105_sys_logs +2025-02-20T20:38:05.0062078Z path: ~/run-log/20250220203803_sys_logs.tar +2025-02-20T20:38:05.0062524Z if-no-files-found: warn +2025-02-20T20:38:05.0062912Z compression-level: 6 +2025-02-20T20:38:05.0063288Z overwrite: false +2025-02-20T20:38:05.0063662Z include-hidden-files: false +2025-02-20T20:38:05.0064047Z env: +2025-02-20T20:38:05.0064364Z ARCH_NAME: grayskull +2025-02-20T20:38:05.0064697Z LOGURU_LEVEL: INFO +2025-02-20T20:38:05.0065227Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.0066033Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:05.0067267Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.0068002Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.0068741Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.0069531Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:05.0070303Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:05.0070994Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:05.0071851Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:05.0072646Z RUNNER_UID: 1000 +2025-02-20T20:38:05.0072997Z RUNNER_GID: 1000 +2025-02-20T20:38:05.0073339Z HOSTNAME: tt-metal-ci-vm-105 +2025-02-20T20:38:05.0073752Z TIMESTAMP: 20250220203803 +2025-02-20T20:38:05.0074442Z ##[endgroup] +2025-02-20T20:38:05.2708735Z ##[warning]No files were found with the provided path: ~/run-log/20250220203803_sys_logs.tar. No artifacts will be uploaded. 
+2025-02-20T20:38:05.2841182Z Prepare all required actions +2025-02-20T20:38:05.2902320Z ##[group]Run ./.github/actions/generate-gtest-failure-message +2025-02-20T20:38:05.2902861Z with: +2025-02-20T20:38:05.2903237Z path: generated/test_reports/ + +2025-02-20T20:38:05.2903664Z env: +2025-02-20T20:38:05.2903997Z ARCH_NAME: grayskull +2025-02-20T20:38:05.2904356Z LOGURU_LEVEL: INFO +2025-02-20T20:38:05.2904904Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.2905744Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:05.2906565Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.2907318Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.2908111Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.2908941Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:05.2909709Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:05.2910385Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:05.2911252Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:05.2912033Z RUNNER_UID: 1000 +2025-02-20T20:38:05.2912391Z RUNNER_GID: 1000 +2025-02-20T20:38:05.2912770Z HOSTNAME: tt-metal-ci-vm-105 +2025-02-20T20:38:05.2913189Z TIMESTAMP: 20250220203803 +2025-02-20T20:38:05.2913579Z ##[endgroup] +2025-02-20T20:38:05.2941323Z ##[group]Run set +e +2025-02-20T20:38:05.2941686Z set +e +2025-02-20T20:38:05.2942303Z python3 .github/scripts/data_analysis/print_gtest_annotations.py generated/test_reports/ +2025-02-20T20:38:05.2964656Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:38:05.2965169Z env: +2025-02-20T20:38:05.2965476Z ARCH_NAME: grayskull +2025-02-20T20:38:05.2965857Z LOGURU_LEVEL: INFO +2025-02-20T20:38:05.2966400Z pythonLocation: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.2967214Z PKG_CONFIG_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib/pkgconfig +2025-02-20T20:38:05.2968023Z Python_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.2968769Z Python2_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.2969515Z Python3_ROOT_DIR: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64 +2025-02-20T20:38:05.2970293Z LD_LIBRARY_PATH: /home/ubuntu/actions-runner/_work/_tool/Python/3.8.18/x64/lib +2025-02-20T20:38:05.2971057Z VIRTUAL_ENV: /home/ubuntu/actions-runner/_work/tt-metal/tt-metal/python_env +2025-02-20T20:38:05.2971960Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:05.2972823Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:38:05.2973605Z RUNNER_UID: 1000 +2025-02-20T20:38:05.2973969Z RUNNER_GID: 1000 +2025-02-20T20:38:05.2974433Z HOSTNAME: tt-metal-ci-vm-105 +2025-02-20T20:38:05.2974870Z TIMESTAMP: 20250220203803 +2025-02-20T20:38:05.2975282Z ##[endgroup] +2025-02-20T20:38:05.4449758Z ##[error]/work/tests/tt_metal/tt_metal/api/test_shape_base.cpp:56 +Expected equality of these values: + 0 + 1 +2025-02-20T20:38:05.4615776Z Post job cleanup. +2025-02-20T20:38:05.4673113Z Post job cleanup. 
+2025-02-20T20:38:05.5523515Z [command]/usr/bin/git version +2025-02-20T20:38:05.5561400Z git version 2.25.1 +2025-02-20T20:38:05.5599983Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/d02a0d02-ccce-488a-a35f-0ddfdc3912e7/.gitconfig' +2025-02-20T20:38:05.5611250Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/d02a0d02-ccce-488a-a35f-0ddfdc3912e7' before making global git config changes +2025-02-20T20:38:05.5612486Z Adding repository directory to the temporary git global config as a safe directory +2025-02-20T20:38:05.5615562Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-20T20:38:05.5641710Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-20T20:38:05.5677309Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-20T20:38:05.5934216Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:38:05.5979441Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:38:05.6028355Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:38:05.6074407Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:38:05.6117321Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:38:05.6165531Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:38:05.6210871Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:38:05.6269153Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-20T20:38:05.6294739Z http.https://github.com/.extraheader +2025-02-20T20:38:05.6303510Z [command]/usr/bin/git config --local --unset-all http.https://github.com/.extraheader +2025-02-20T20:38:05.6330398Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-20T20:38:05.6575172Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:38:05.6618730Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:38:05.6663948Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:38:05.6708418Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:38:05.6755084Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:38:05.6801450Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:38:05.6846780Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:38:05.7040389Z Post job cleanup. +2025-02-20T20:38:06.0502463Z [command]/usr/bin/docker logout https://ghcr.io +2025-02-20T20:38:06.0645902Z Removing login credentials for ghcr.io +2025-02-20T20:38:06.0686467Z ##[group]Post cache +2025-02-20T20:38:06.0687128Z State not set +2025-02-20T20:38:06.0688361Z ##[endgroup] +2025-02-20T20:38:06.0835860Z Post job cleanup. +2025-02-20T20:38:06.0899781Z Post job cleanup. +2025-02-20T20:38:06.0974421Z Post job cleanup. +2025-02-20T20:38:06.1056765Z Post job cleanup. 
+2025-02-20T20:38:06.2166669Z [command]/usr/bin/git version +2025-02-20T20:38:06.2207700Z git version 2.25.1 +2025-02-20T20:38:06.2246941Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/56995755-d909-428f-9b0b-c28912765da1/.gitconfig' +2025-02-20T20:38:06.2258220Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/56995755-d909-428f-9b0b-c28912765da1' before making global git config changes +2025-02-20T20:38:06.2259392Z Adding repository directory to the temporary git global config as a safe directory +2025-02-20T20:38:06.2264751Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-20T20:38:06.2312189Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-20T20:38:06.2343772Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-20T20:38:06.2610275Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:38:06.2656288Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:38:06.2702399Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:38:06.2745528Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:38:06.2788917Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:38:06.2833225Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:38:06.2874388Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:38:06.2938058Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-20T20:38:06.2968925Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-20T20:38:06.3210532Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:38:06.3257643Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:38:06.3304285Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:38:06.3353531Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:38:06.3402385Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:38:06.3453626Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:38:06.3501150Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:38:06.3649006Z A job completed hook has been configured by the self-hosted runner administrator +2025-02-20T20:38:06.3681436Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/cleanup.sh' +2025-02-20T20:38:06.3696146Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:38:06.3696706Z ##[endgroup] +2025-02-20T20:38:06.3762191Z Current date / time is Thu Feb 20 20:38:06 UTC 2025 +2025-02-20T20:38:06.5655208Z Cleaning up orphan processes diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563095078_annotations.json b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563095078_annotations.json new file mode 100644 index 00000000000..04a12b0a55d --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563095078_annotations.json @@ -0,0 +1 @@ 
+[{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":102,"start_column":null,"end_line":102,"end_column":null,"annotation_level":"notice","title":"","message":"[DEPRECATION] This action is deprecated. Please migrate to reading the Docker image from the pipeline.","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":113,"start_column":null,"end_line":113,"end_column":null,"annotation_level":"warning","title":"","message":"No files were found with the provided path: ~/run-log/20250220203803_sys_logs.tar. No artifacts will be uploaded.","raw_details":""},{"path":"tests/tt_metal/tt_metal/api/test_shape_base.cpp","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/tests/tt_metal/tt_metal/api/test_shape_base.cpp","start_line":56,"start_column":null,"end_line":56,"end_column":null,"annotation_level":"failure","title":"","message":"/work/tests/tt_metal/tt_metal/api/test_shape_base.cpp:56\nExpected equality of these values:\n 0\n 1","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":32,"start_column":null,"end_line":32,"end_column":null,"annotation_level":"notice","title":"disk-usage-after-startup","message":"Disk usage is 60 %","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":136,"start_column":null,"end_line":136,"end_column":null,"annotation_level":"notice","title":"printing-smi-info-startup","message":"Touching and printing out SMI info","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":286,"start_column":null,"end_line":286,"end_column":null,"annotation_level":"notice","title":"reset-successful-startup","message":"tt-smi reset was successful","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":315,"start_column":null,"end_line":315,"end_column":null,"annotation_level":"warning","title":"hugepages-service-not-found-startup","message":"Hugepages service not found. 
Using old rc.local method","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":321,"start_column":null,"end_line":321,"end_column":null,"annotation_level":"notice","title":"hugepages-setup-success-startup","message":"Hugepages is now setup.","raw_details":""}] diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563108566.log b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563108566.log new file mode 100644 index 00000000000..53a5f820e4d --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563108566.log @@ -0,0 +1,570 @@ +2025-02-20T20:37:44.6266323Z Current runner version: '2.322.0' +2025-02-20T20:37:44.6274296Z Runner name: 'tt-metal-ci-vm-2' +2025-02-20T20:37:44.6275331Z Runner group name: 'Default' +2025-02-20T20:37:44.6276747Z Machine name: 'tt-metal-ci-vm-2' +2025-02-20T20:37:44.6280978Z ##[group]GITHUB_TOKEN Permissions +2025-02-20T20:37:44.6283785Z Actions: read +2025-02-20T20:37:44.6284504Z Contents: write +2025-02-20T20:37:44.6285301Z Metadata: read +2025-02-20T20:37:44.6286017Z Packages: write +2025-02-20T20:37:44.6286688Z Pages: write +2025-02-20T20:37:44.6287425Z PullRequests: write +2025-02-20T20:37:44.6288240Z ##[endgroup] +2025-02-20T20:37:44.6291978Z Secret source: Actions +2025-02-20T20:37:44.6292976Z Prepare workflow directory +2025-02-20T20:37:44.8730117Z Prepare all required actions +2025-02-20T20:37:44.8788627Z Getting action download info +2025-02-20T20:37:45.0485348Z Download action repository 'tenstorrent/tt-metal@main' (SHA:fd3ed75e96eb5b555f2f39cdefd37d8698ff8418) +2025-02-20T20:37:50.8079115Z Download action repository 'actions/download-artifact@v4' (SHA:fa0a91b85d4f404e444e00e005971372dc801d16) +2025-02-20T20:37:51.5785010Z Getting action download info +2025-02-20T20:37:51.7118975Z Download action repository 'actions/checkout@v4' (SHA:11bd71901bbe5b1630ceea73d27597364c9af683) +2025-02-20T20:37:52.3318696Z Uses: tenstorrent/tt-metal/.github/workflows/ttnn-post-commit.yaml@refs/heads/williamly/test-failure-annotations (94429171440755ffe7c62085c4807d447dd369dc) +2025-02-20T20:37:52.3321404Z ##[group] Inputs +2025-02-20T20:37:52.3321830Z build-type: Release +2025-02-20T20:37:52.3322572Z with-retries: false +2025-02-20T20:37:52.3322973Z arch: grayskull +2025-02-20T20:37:52.3323321Z runner-label: E150 +2025-02-20T20:37:52.3324168Z timeout: 45 +2025-02-20T20:37:52.3324575Z num-groups: 12 +2025-02-20T20:37:52.3324909Z ##[endgroup] +2025-02-20T20:37:52.3325445Z Complete job name: ttnn-unit-tests (grayskull, E150) / ttnn group 4 grayskull E150 +2025-02-20T20:37:52.4028036Z A job started hook has been configured by the self-hosted runner administrator +2025-02-20T20:37:52.4178100Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/reset.sh' +2025-02-20T20:37:52.4197119Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:37:52.4197899Z ##[endgroup] +2025-02-20T20:37:52.4364102Z ++ date +2025-02-20T20:37:52.4364682Z Current date / time is Thu Feb 20 20:37:52 UTC 2025 +2025-02-20T20:37:52.4365615Z + echo Current date / time is Thu Feb 20 20:37:52 UTC 2025 +2025-02-20T20:37:52.4366507Z + sudo find /home/ubuntu/actions-runner/_work/tt-metal/tt-metal -user root -exec rm -rf '{}' + +2025-02-20T20:37:52.9022858Z + set_e_was_enabled=false 
+2025-02-20T20:37:52.9023452Z + [[ ehxB == *e* ]] +2025-02-20T20:37:52.9023785Z + set_e_was_enabled=true +2025-02-20T20:37:52.9024109Z + set +e +2025-02-20T20:37:52.9024387Z + docker image prune +2025-02-20T20:37:52.9156607Z WARNING! This will remove all dangling images. +2025-02-20T20:37:52.9192951Z ++ df +2025-02-20T20:37:52.9195290Z ++ awk '{print $5}' +2025-02-20T20:37:52.9198300Z +++ findmnt -n -o SOURCE / +2025-02-20T20:37:52.9198783Z ++ sed s/%// +2025-02-20T20:37:52.9219550Z ++ grep -w '^/dev/vda1' +2025-02-20T20:37:52.9236544Z + disk_usage_before=86 +2025-02-20T20:37:52.9250924Z Are you sure you want to continue? [y/N] ::notice title=disk-usage-before-startup::Disk usage is 86 % +2025-02-20T20:37:52.9252083Z + echo '::notice title=disk-usage-before-startup::Disk usage is 86 %' +2025-02-20T20:37:52.9252618Z + '[' 86 -ge 90 ']' +2025-02-20T20:37:52.9252925Z ++ df +2025-02-20T20:37:52.9253215Z ++ awk '{print $5}' +2025-02-20T20:37:52.9253525Z ++ sed s/%// +2025-02-20T20:37:52.9256928Z +++ findmnt -n -o SOURCE / +2025-02-20T20:37:52.9281301Z ++ grep -w '^/dev/vda1' +2025-02-20T20:37:52.9296776Z + disk_usage_after=86 +2025-02-20T20:37:52.9297274Z + echo '::notice title=disk-usage-after-startup::Disk usage is 86 %' +2025-02-20T20:37:52.9297796Z + '[' 86 -ge 90 ']' +2025-02-20T20:37:52.9325387Z ##[notice]Disk usage is 86 % +2025-02-20T20:37:52.9333262Z ++ lsmod +2025-02-20T20:37:52.9333630Z + lsmod_output='Module Size Used by +2025-02-20T20:37:52.9334499Z wekafsio 70086656 1 +2025-02-20T20:37:52.9334885Z wekafsgw 40960 4 wekafsio +2025-02-20T20:37:52.9335280Z veth 28672 0 +2025-02-20T20:37:52.9335701Z uio_pci_generic 16384 0 +2025-02-20T20:37:52.9336084Z igb_uio 20480 0 +2025-02-20T20:37:52.9336478Z uio 20480 2 igb_uio,uio_pci_generic +2025-02-20T20:37:52.9336893Z xt_conntrack 16384 1 +2025-02-20T20:37:52.9337358Z xt_MASQUERADE 20480 1 +2025-02-20T20:37:52.9337742Z nf_conntrack_netlink 45056 0 +2025-02-20T20:37:52.9338167Z nfnetlink 16384 2 nf_conntrack_netlink +2025-02-20T20:37:52.9338611Z xfrm_user 36864 1 +2025-02-20T20:37:52.9338990Z xfrm_algo 16384 1 xfrm_user +2025-02-20T20:37:52.9339392Z iptable_nat 16384 1 +2025-02-20T20:37:52.9339815Z nf_nat 45056 2 iptable_nat,xt_MASQUERADE +2025-02-20T20:37:52.9340412Z nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE +2025-02-20T20:37:52.9341008Z nf_defrag_ipv6 24576 1 nf_conntrack +2025-02-20T20:37:52.9341433Z nf_defrag_ipv4 16384 1 nf_conntrack +2025-02-20T20:37:52.9341840Z xt_addrtype 16384 2 +2025-02-20T20:37:52.9342206Z iptable_filter 16384 1 +2025-02-20T20:37:52.9342596Z bpfilter 32768 0 +2025-02-20T20:37:52.9342961Z br_netfilter 28672 0 +2025-02-20T20:37:52.9343352Z bridge 176128 1 br_netfilter +2025-02-20T20:37:52.9343790Z stp 16384 1 bridge +2025-02-20T20:37:52.9344188Z llc 16384 2 bridge,stp +2025-02-20T20:37:52.9344592Z aufs 262144 0 +2025-02-20T20:37:52.9344933Z xfs 1286144 2 +2025-02-20T20:37:52.9345282Z overlay 118784 0 +2025-02-20T20:37:52.9345632Z rdma_ucm 28672 0 +2025-02-20T20:37:52.9346150Z rdma_cm 110592 1 rdma_ucm +2025-02-20T20:37:52.9346549Z iw_cm 49152 1 rdma_cm +2025-02-20T20:37:52.9346940Z ib_ipoib 131072 0 +2025-02-20T20:37:52.9347309Z ib_cm 114688 2 rdma_cm,ib_ipoib +2025-02-20T20:37:52.9347705Z ib_umad 28672 8 +2025-02-20T20:37:52.9348039Z nls_iso8859_1 16384 1 +2025-02-20T20:37:52.9348392Z dm_multipath 32768 0 +2025-02-20T20:37:52.9348747Z scsi_dh_rdac 16384 0 +2025-02-20T20:37:52.9349093Z scsi_dh_emc 16384 0 +2025-02-20T20:37:52.9349440Z scsi_dh_alua 20480 0 
+2025-02-20T20:37:52.9349789Z kvm_amd 98304 0 +2025-02-20T20:37:52.9350134Z mlx5_ib 397312 0 +2025-02-20T20:37:52.9350650Z ccp 90112 1 kvm_amd +2025-02-20T20:37:52.9351020Z input_leds 16384 0 +2025-02-20T20:37:52.9351373Z kvm 667648 1 kvm_amd +2025-02-20T20:37:52.9351752Z joydev 24576 0 +2025-02-20T20:37:52.9352142Z serio_raw 20480 0 +2025-02-20T20:37:52.9352529Z ib_uverbs 139264 18 rdma_ucm,mlx5_ib +2025-02-20T20:37:52.9353109Z ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm +2025-02-20T20:37:52.9353659Z tenstorrent 40960 0 +2025-02-20T20:37:52.9354014Z sch_fq_codel 20480 45 +2025-02-20T20:37:52.9354360Z binfmt_misc 24576 1 +2025-02-20T20:37:52.9354699Z msr 16384 0 +2025-02-20T20:37:52.9355039Z efi_pstore 16384 0 +2025-02-20T20:37:52.9355384Z virtio_rng 16384 0 +2025-02-20T20:37:52.9355787Z ip_tables 32768 2 iptable_filter,iptable_nat +2025-02-20T20:37:52.9356420Z x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE +2025-02-20T20:37:52.9356984Z autofs4 45056 2 +2025-02-20T20:37:52.9357325Z btrfs 1269760 0 +2025-02-20T20:37:52.9357677Z zstd_compress 167936 1 btrfs +2025-02-20T20:37:52.9358051Z raid10 61440 0 +2025-02-20T20:37:52.9358388Z raid456 155648 0 +2025-02-20T20:37:52.9358890Z async_raid6_recov 24576 1 raid456 +2025-02-20T20:37:52.9359341Z async_memcpy 20480 2 raid456,async_raid6_recov +2025-02-20T20:37:52.9359838Z async_pq 24576 2 raid456,async_raid6_recov +2025-02-20T20:37:52.9360347Z async_xor 20480 3 async_pq,raid456,async_raid6_recov +2025-02-20T20:37:52.9361035Z async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov +2025-02-20T20:37:52.9361600Z xor 24576 2 async_xor,btrfs +2025-02-20T20:37:52.9362092Z raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov +2025-02-20T20:37:52.9362663Z libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 +2025-02-20T20:37:52.9363139Z raid1 45056 0 +2025-02-20T20:37:52.9363488Z raid0 24576 0 +2025-02-20T20:37:52.9363836Z multipath 20480 0 +2025-02-20T20:37:52.9364198Z linear 20480 0 +2025-02-20T20:37:52.9364545Z hid_generic 16384 0 +2025-02-20T20:37:52.9364900Z usbhid 57344 0 +2025-02-20T20:37:52.9365242Z cirrus 16384 0 +2025-02-20T20:37:52.9365709Z hid 131072 2 usbhid,hid_generic +2025-02-20T20:37:52.9366125Z mlx5_core 1626112 1 mlx5_ib +2025-02-20T20:37:52.9366509Z crct10dif_pclmul 16384 1 +2025-02-20T20:37:52.9366861Z drm_kms_helper 184320 3 cirrus +2025-02-20T20:37:52.9367245Z crc32_pclmul 16384 0 +2025-02-20T20:37:52.9367628Z syscopyarea 16384 1 drm_kms_helper +2025-02-20T20:37:52.9368068Z sysfillrect 16384 1 drm_kms_helper +2025-02-20T20:37:52.9368485Z ghash_clmulni_intel 16384 0 +2025-02-20T20:37:52.9368909Z sysimgblt 16384 1 drm_kms_helper +2025-02-20T20:37:52.9369316Z aesni_intel 372736 0 +2025-02-20T20:37:52.9369700Z pci_hyperv_intf 16384 1 mlx5_core +2025-02-20T20:37:52.9370214Z crypto_simd 16384 1 aesni_intel +2025-02-20T20:37:52.9370632Z mlxdevm 172032 1 mlx5_core +2025-02-20T20:37:52.9371061Z fb_sys_fops 16384 1 drm_kms_helper +2025-02-20T20:37:52.9371781Z cryptd 24576 2 crypto_simd,ghash_clmulni_intel +2025-02-20T20:37:52.9372270Z auxiliary 16384 2 mlx5_ib,mlx5_core +2025-02-20T20:37:52.9373017Z mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core +2025-02-20T20:37:52.9373727Z tls 73728 1 mlx5_core +2025-02-20T20:37:52.9374108Z ahci 40960 0 +2025-02-20T20:37:52.9374473Z glue_helper 16384 1 aesni_intel +2025-02-20T20:37:52.9374870Z mlxfw 32768 1 mlx5_core 
+2025-02-20T20:37:52.9375262Z psmouse 155648 0 +2025-02-20T20:37:52.9375607Z virtio_blk 20480 3 +2025-02-20T20:37:52.9375984Z drm 495616 3 drm_kms_helper,cirrus +2025-02-20T20:37:52.9376435Z libahci 36864 1 ahci +2025-02-20T20:37:52.9376841Z psample 20480 1 mlx5_core' +2025-02-20T20:37:52.9377234Z + grep -q tenstorrent +2025-02-20T20:37:52.9388495Z + echo Module Size Used by wekafsio 70086656 1 wekafsgw 40960 4 wekafsio veth 28672 0 uio_pci_generic 16384 0 igb_uio 20480 0 uio 20480 2 igb_uio,uio_pci_generic xt_conntrack 16384 1 xt_MASQUERADE 20480 1 nf_conntrack_netlink 45056 0 nfnetlink 16384 2 nf_conntrack_netlink xfrm_user 36864 1 xfrm_algo 16384 1 xfrm_user iptable_nat 16384 1 nf_nat 45056 2 iptable_nat,xt_MASQUERADE nf_conntrack 139264 4 xt_conntrack,nf_nat,nf_conntrack_netlink,xt_MASQUERADE nf_defrag_ipv6 24576 1 nf_conntrack nf_defrag_ipv4 16384 1 nf_conntrack xt_addrtype 16384 2 iptable_filter 16384 1 bpfilter 32768 0 br_netfilter 28672 0 bridge 176128 1 br_netfilter stp 16384 1 bridge llc 16384 2 bridge,stp aufs 262144 0 xfs 1286144 2 overlay 118784 0 rdma_ucm 28672 0 rdma_cm 110592 1 rdma_ucm iw_cm 49152 1 rdma_cm ib_ipoib 131072 0 ib_cm 114688 2 rdma_cm,ib_ipoib ib_umad 28672 8 nls_iso8859_1 16384 1 dm_multipath 32768 0 scsi_dh_rdac 16384 0 scsi_dh_emc 16384 0 scsi_dh_alua 20480 0 kvm_amd 98304 0 mlx5_ib 397312 0 ccp 90112 1 kvm_amd input_leds 16384 0 kvm 667648 1 kvm_amd joydev 24576 0 serio_raw 20480 0 ib_uverbs 139264 18 rdma_ucm,mlx5_ib ib_core 348160 8 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm tenstorrent 40960 0 sch_fq_codel 20480 45 binfmt_misc 24576 1 msr 16384 0 efi_pstore 16384 0 virtio_rng 16384 0 ip_tables 32768 2 iptable_filter,iptable_nat x_tables 40960 5 xt_conntrack,iptable_filter,xt_addrtype,ip_tables,xt_MASQUERADE autofs4 45056 2 btrfs 1269760 0 zstd_compress 167936 1 btrfs raid10 61440 0 raid456 155648 0 async_raid6_recov 24576 1 raid456 async_memcpy 20480 2 raid456,async_raid6_recov async_pq 24576 2 raid456,async_raid6_recov async_xor 20480 3 async_pq,raid456,async_raid6_recov async_tx 20480 5 async_pq,async_memcpy,async_xor,raid456,async_raid6_recov xor 24576 2 async_xor,btrfs raid6_pq 114688 4 async_pq,btrfs,raid456,async_raid6_recov libcrc32c 16384 5 nf_conntrack,nf_nat,btrfs,xfs,raid456 raid1 45056 0 raid0 24576 0 multipath 20480 0 linear 20480 0 hid_generic 16384 0 usbhid 57344 0 cirrus 16384 0 hid 131072 2 usbhid,hid_generic mlx5_core 1626112 1 mlx5_ib crct10dif_pclmul 16384 1 drm_kms_helper 184320 3 cirrus crc32_pclmul 16384 0 syscopyarea 16384 1 drm_kms_helper sysfillrect 16384 1 drm_kms_helper ghash_clmulni_intel 16384 0 sysimgblt 16384 1 drm_kms_helper aesni_intel 372736 0 pci_hyperv_intf 16384 1 mlx5_core crypto_simd 16384 1 aesni_intel mlxdevm 172032 1 mlx5_core fb_sys_fops 16384 1 drm_kms_helper cryptd 24576 2 crypto_simd,ghash_clmulni_intel auxiliary 16384 2 mlx5_ib,mlx5_core mlx_compat 65536 12 rdma_cm,ib_ipoib,mlxdevm,iw_cm,auxiliary,ib_umad,ib_core,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm,mlx5_core tls 73728 1 mlx5_core ahci 40960 0 glue_helper 16384 1 aesni_intel mlxfw 32768 1 mlx5_core psmouse 155648 0 virtio_blk 20480 3 drm 495616 3 drm_kms_helper,cirrus libahci 36864 1 ahci psample 20480 1 mlx5_core +2025-02-20T20:37:52.9400122Z + [[ 0 -ne 0 ]] +2025-02-20T20:37:52.9400432Z ++ lsof -w /dev/tenstorrent/0 +2025-02-20T20:37:53.0792595Z + lsof_output= +2025-02-20T20:37:53.0792967Z + '[' -n '' ']' +2025-02-20T20:37:53.0794550Z ##[notice]Touching and printing out SMI info +2025-02-20T20:37:53.0795643Z + i=0 
+2025-02-20T20:37:53.0795927Z + iter_limit=10 +2025-02-20T20:37:53.0796483Z + echo '::notice title=printing-smi-info-startup::Touching and printing out SMI info' +2025-02-20T20:37:53.0797093Z + sleep 20 +2025-02-20T20:38:13.0807996Z + sudo touch /opt/tt_metal_infra/smi.log +2025-02-20T20:38:13.1060424Z + sudo chown ubuntu /opt/tt_metal_infra/smi.log +2025-02-20T20:38:13.1266485Z + tt-smi-metal -s -f /opt/tt_metal_infra/smi.log +2025-02-20T20:38:13.5168751Z +2025-02-20T20:38:13.5170181Z  Detected Chips: 1 +2025-02-20T20:38:13.5288405Z  +2025-02-20T20:38:13.5288781Z  Detected Chips: 1 +2025-02-20T20:38:13.5289303Z +2025-02-20T20:38:13.5289872Z  Detecting ARC: | +2025-02-20T20:38:13.5290186Z +2025-02-20T20:38:13.5290388Z  Detecting DRAM: | +2025-02-20T20:38:13.5290912Z +2025-02-20T20:38:13.5291414Z [] ETH: | +2025-02-20T20:38:13.5361555Z Gathering Information ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 +2025-02-20T20:38:13.5382986Z  Saved tt-smi log to: /opt/tt_metal_infra/smi.log  +2025-02-20T20:38:13.6064747Z + cat /opt/tt_metal_infra/smi.log +2025-02-20T20:38:13.6071161Z { +2025-02-20T20:38:13.6071567Z "time": "2025-02-20T20:38:13.528945", +2025-02-20T20:38:13.6074323Z + echo '::notice title=attempting-reset-startup::Attempting to reset card(s). Sleeping first' +2025-02-20T20:38:13.6074960Z + sleep 30 +2025-02-20T20:38:13.6075257Z "host_info": { +2025-02-20T20:38:13.6075581Z "OS": "Linux", +2025-02-20T20:38:13.6075927Z "Distro": "Ubuntu 20.04.3 LTS", +2025-02-20T20:38:13.6076337Z "Kernel": "5.4.0-205-generic", +2025-02-20T20:38:13.6076755Z "Hostname": "tt-metal-ci-vm-2", +2025-02-20T20:38:13.6077195Z "Platform": "x86_64", +2025-02-20T20:38:13.6077877Z "Python": "3.8.10", +2025-02-20T20:38:13.6078791Z "Memory": "47.14 GB", +2025-02-20T20:38:13.6079190Z "Driver": "TTKMD 1.27.1" +2025-02-20T20:38:13.6079880Z }, +2025-02-20T20:38:13.6080183Z "device_info": [ +2025-02-20T20:38:13.6080513Z { +2025-02-20T20:38:13.6080812Z "smbus_telem": { +2025-02-20T20:38:13.6081184Z "BOARD_ID": "0x10000331152304d", +2025-02-20T20:38:13.6081688Z "SMBUS_TX_ENUM_VERSION": "0xba5e0001", +2025-02-20T20:38:13.6082140Z "SMBUS_TX_DEVICE_ID": "0xfaca1e52", +2025-02-20T20:38:13.6082555Z "SMBUS_TX_ASIC_RO": null, +2025-02-20T20:38:13.6082972Z "SMBUS_TX_ASIC_IDD": null, +2025-02-20T20:38:13.6083399Z "SMBUS_TX_BOARD_ID_HIGH": "0x1000033", +2025-02-20T20:38:13.6083836Z "SMBUS_TX_BOARD_ID_LOW": "0x1152304d", +2025-02-20T20:38:13.6084368Z "SMBUS_TX_ARC0_FW_VERSION": "0x1070000", +2025-02-20T20:38:13.6084820Z "SMBUS_TX_ARC1_FW_VERSION": "0x1070000", +2025-02-20T20:38:13.6085273Z "SMBUS_TX_ARC2_FW_VERSION": null, +2025-02-20T20:38:13.6085713Z "SMBUS_TX_ARC3_FW_VERSION": "0x1070000", +2025-02-20T20:38:13.6086162Z "SMBUS_TX_SPIBOOTROM_FW_VERSION": null, +2025-02-20T20:38:13.6086590Z "SMBUS_TX_ETH_FW_VERSION": null, +2025-02-20T20:38:13.6087019Z "SMBUS_TX_M3_BL_FW_VERSION": null, +2025-02-20T20:38:13.6087465Z "SMBUS_TX_M3_APP_FW_VERSION": null, +2025-02-20T20:38:13.6111868Z "SMBUS_TX_DDR_SPEED": "0xe74", +2025-02-20T20:38:13.6112330Z "SMBUS_TX_DDR_STATUS": "0x111111", +2025-02-20T20:38:13.6112784Z "SMBUS_TX_ETH_STATUS0": null, +2025-02-20T20:38:13.6113211Z "SMBUS_TX_ETH_STATUS1": null, +2025-02-20T20:38:13.6113645Z "SMBUS_TX_PCIE_STATUS": "0x11040042", +2025-02-20T20:38:13.6114340Z "SMBUS_TX_FAULTS": null, +2025-02-20T20:38:13.6114757Z "SMBUS_TX_ARC0_HEALTH": "0x91b1f6d", +2025-02-20T20:38:13.6115204Z "SMBUS_TX_ARC1_HEALTH": null, +2025-02-20T20:38:13.6115628Z "SMBUS_TX_ARC2_HEALTH": null, +2025-02-20T20:38:13.6116034Z 
"SMBUS_TX_ARC3_HEALTH": null, +2025-02-20T20:38:13.6116454Z "SMBUS_TX_FAN_SPEED": "0xff", +2025-02-20T20:38:13.6116887Z "SMBUS_TX_AICLK": "0x4b200fa", +2025-02-20T20:38:13.6117301Z "SMBUS_TX_AXICLK": "0x384", +2025-02-20T20:38:13.6117721Z "SMBUS_TX_ARCCLK": "0x21c", +2025-02-20T20:38:13.6118148Z "SMBUS_TX_THROTTLER": null, +2025-02-20T20:38:13.6118575Z "SMBUS_TX_VCORE": "0x2e4", +2025-02-20T20:38:13.6119020Z "SMBUS_TX_ASIC_TEMPERATURE": "0x2cf021f", +2025-02-20T20:38:13.6119485Z "SMBUS_TX_VREG_TEMPERATURE": null, +2025-02-20T20:38:13.6119933Z "SMBUS_TX_BOARD_TEMPERATURE": null, +2025-02-20T20:38:13.6120377Z "SMBUS_TX_TDP": "0xaa0010", +2025-02-20T20:38:13.6120791Z "SMBUS_TX_TDC": "0x12c0014", +2025-02-20T20:38:13.6121212Z "SMBUS_TX_VDD_LIMITS": "0x3a202e4", +2025-02-20T20:38:13.6121690Z "SMBUS_TX_THM_LIMITS": "0x53004b", +2025-02-20T20:38:13.6122121Z "SMBUS_TX_WH_FW_DATE": "0x45011317", +2025-02-20T20:38:13.6122545Z "SMBUS_TX_ASIC_TMON0": "0x23222222", +2025-02-20T20:38:13.6122965Z "SMBUS_TX_ASIC_TMON1": "0x2222", +2025-02-20T20:38:13.6123384Z "SMBUS_TX_MVDDQ_POWER": null, +2025-02-20T20:38:13.6123807Z "SMBUS_TX_GDDR_TRAIN_TEMP0": null, +2025-02-20T20:38:13.6124323Z "SMBUS_TX_GDDR_TRAIN_TEMP1": null, +2025-02-20T20:38:13.6124761Z "SMBUS_TX_BOOT_DATE": "0x5213170f", +2025-02-20T20:38:13.6125193Z "SMBUS_TX_RT_SECONDS": null, +2025-02-20T20:38:13.6125612Z "SMBUS_TX_AUX_STATUS": null, +2025-02-20T20:38:13.6126047Z "SMBUS_TX_ETH_DEBUG_STATUS0": null, +2025-02-20T20:38:13.6126492Z "SMBUS_TX_ETH_DEBUG_STATUS1": null, +2025-02-20T20:38:13.6127158Z "SMBUS_TX_TT_FLASH_VERSION": "0x20008" +2025-02-20T20:38:13.6127566Z }, +2025-02-20T20:38:13.6127933Z "board_info": { +2025-02-20T20:38:13.6128354Z "bus_id": "0000:07:00.0", +2025-02-20T20:38:13.6128762Z "board_type": "e150", +2025-02-20T20:38:13.6129161Z "board_id": "010000331152304d", +2025-02-20T20:38:13.6129578Z "coords": "N/A", +2025-02-20T20:38:13.6129967Z "dram_status": true, +2025-02-20T20:38:13.6130362Z "dram_speed": "3700", +2025-02-20T20:38:13.6130765Z "pcie_speed": 4, +2025-02-20T20:38:13.6131134Z "pcie_width": 16 +2025-02-20T20:38:13.6131496Z }, +2025-02-20T20:38:13.6131818Z "telemetry": { +2025-02-20T20:38:13.6132203Z "voltage": "0.74", +2025-02-20T20:38:13.6132637Z "current": " 20.0", +2025-02-20T20:38:13.6133038Z "power": " 16.0", +2025-02-20T20:38:13.6133446Z "aiclk": " 250", +2025-02-20T20:38:13.6133819Z "asic_temperature": "33.9" +2025-02-20T20:38:13.6134229Z }, +2025-02-20T20:38:13.6134562Z "firmwares": { +2025-02-20T20:38:13.6134917Z "arc_fw": "1.7.0.0", +2025-02-20T20:38:13.6135324Z "arc_fw_date": "2024-05-01", +2025-02-20T20:38:13.6135739Z "eth_fw": "N/A", +2025-02-20T20:38:13.6136109Z "m3_bl_fw": "N/A", +2025-02-20T20:38:13.6136493Z "m3_app_fw": "N/A", +2025-02-20T20:38:13.6136886Z "tt_flash_version": "0.2.0.8" +2025-02-20T20:38:13.6137299Z }, +2025-02-20T20:38:13.6137626Z "limits": { +2025-02-20T20:38:13.6137964Z "vdd_min": "0.74", +2025-02-20T20:38:13.6138358Z "vdd_max": "0.93", +2025-02-20T20:38:13.6138871Z "tdp_limit": "170", +2025-02-20T20:38:13.6139311Z "tdc_limit": "300", +2025-02-20T20:38:13.6139701Z "asic_fmax": "1202", +2025-02-20T20:38:13.6140082Z "therm_trip_l1_limit": "83", +2025-02-20T20:38:13.6140509Z "thm_limit": "75", +2025-02-20T20:38:13.6140902Z "bus_peak_limit": null +2025-02-20T20:38:13.6141291Z } +2025-02-20T20:38:13.6141590Z } +2025-02-20T20:38:13.6141865Z ] +2025-02-20T20:38:13.6142400Z }::notice title=attempting-reset-startup::Attempting to reset card(s). 
Sleeping first +2025-02-20T20:38:43.6090170Z + '[' 0 -lt 10 ']' +2025-02-20T20:38:43.6091368Z + (( i++ )) +2025-02-20T20:38:43.6091733Z ++ tt-smi-metal -r 0 +2025-02-20T20:38:44.1220353Z + reset_output=' Starting tensix reset on GS board at pci index 0  +2025-02-20T20:38:44.1221115Z  Lowering clks to safe value...  +2025-02-20T20:38:44.1221617Z  Beginning reset sequence...  +2025-02-20T20:38:44.1222134Z  Finishing reset sequence...  +2025-02-20T20:38:44.1222650Z  Returning clks to original values...  +2025-02-20T20:38:44.1223253Z  Finished tensix reset on GS board at pci index 0 +2025-02-20T20:38:44.1223736Z  +2025-02-20T20:38:44.1224268Z  Re-initializing boards after reset....  +2025-02-20T20:38:44.1224572Z +2025-02-20T20:38:44.1224793Z  Detected Chips: 1 +2025-02-20T20:38:44.1225180Z  +2025-02-20T20:38:44.1225538Z  Detected Chips: 1 +2025-02-20T20:38:44.1225785Z +2025-02-20T20:38:44.1225982Z  Detecting ARC: | +2025-02-20T20:38:44.1226222Z +2025-02-20T20:38:44.1226413Z  Detecting DRAM: | +2025-02-20T20:38:44.1226732Z +2025-02-20T20:38:44.1226922Z [] ETH: |' +2025-02-20T20:38:44.1227286Z + [[ 0 -ne 0 ]] +2025-02-20T20:38:44.1227767Z + [[  Starting tensix reset on GS board at pci index 0  +2025-02-20T20:38:44.1228337Z  Lowering clks to safe value...  +2025-02-20T20:38:44.1228830Z  Beginning reset sequence...  +2025-02-20T20:38:44.1229287Z  Finishing reset sequence...  +2025-02-20T20:38:44.1230734Z  Returning clks to original values...  +2025-02-20T20:38:44.1231422Z  Finished tensix reset on GS board at pci index 0 +2025-02-20T20:38:44.1231943Z  +2025-02-20T20:38:44.1232356Z  Re-initializing boards after reset....  +2025-02-20T20:38:44.1232655Z +2025-02-20T20:38:44.1232868Z  Detected Chips: 1 +2025-02-20T20:38:44.1233266Z  +2025-02-20T20:38:44.1233625Z  Detected Chips: 1 +2025-02-20T20:38:44.1233852Z +2025-02-20T20:38:44.1234046Z  Detecting ARC: | +2025-02-20T20:38:44.1234284Z +2025-02-20T20:38:44.1234480Z  Detecting DRAM: | +2025-02-20T20:38:44.1234722Z +2025-02-20T20:38:44.1235029Z [] ETH: | == *\N\o\ \c\h\i\p\s\ \d\e\t\e\c\t\e\d* ]] +2025-02-20T20:38:44.1235485Z + break +2025-02-20T20:38:44.1235769Z + '[' 1 -eq 10 ']' +2025-02-20T20:38:44.1236397Z + echo '::notice title=reset-successful-startup::tt-smi reset was successful' +2025-02-20T20:38:44.1236966Z + check_hugepages_service_status=0 +2025-02-20T20:38:44.1238685Z ##[notice]tt-smi reset was successful +2025-02-20T20:38:44.1242350Z + sudo systemctl status tenstorrent-hugepages.service +2025-02-20T20:38:44.1493822Z Unit tenstorrent-hugepages.service could not be found. +2025-02-20T20:38:44.1500042Z + check_hugepages_service_status=4 +2025-02-20T20:38:44.1500586Z + '[' 4 -eq 4 ']' +2025-02-20T20:38:44.1501441Z + echo '::warning title=hugepages-service-not-found-startup::Hugepages service not found. Using old rc.local method' +2025-02-20T20:38:44.1502250Z + sudo /etc/rc.local +2025-02-20T20:38:44.1506918Z ##[warning]Hugepages service not found. Using old rc.local method +2025-02-20T20:39:14.1969953Z ++ date +%s +2025-02-20T20:39:14.1977573Z + hugepages_check_start=1740083954 +2025-02-20T20:39:14.1978086Z + hugepages_check_timeout=60 +2025-02-20T20:39:14.1979731Z ++ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages +2025-02-20T20:39:14.1987777Z + [[ 1 -eq 0 ]] +2025-02-20T20:39:14.1989608Z ##[notice]Hugepages is now setup. +2025-02-20T20:39:14.1991947Z + echo '::notice title=hugepages-setup-success-startup::Hugepages is now setup.' +2025-02-20T20:39:14.1992852Z + echo 'Printing out cpu information...' 
+2025-02-20T20:39:14.1993327Z + lscpu +2025-02-20T20:39:14.1993695Z Printing out cpu information... +2025-02-20T20:39:14.2017147Z Architecture: x86_64 +2025-02-20T20:39:14.2019018Z CPU op-mode(s): 32-bit, 64-bit +2025-02-20T20:39:14.2021805Z Byte Order: Little Endian +2025-02-20T20:39:14.2022529Z Address sizes: 40 bits physical, 48 bits virtual +2025-02-20T20:39:14.2023193Z CPU(s): 14 +2025-02-20T20:39:14.2023699Z On-line CPU(s) list: 0-13 +2025-02-20T20:39:14.2024224Z Thread(s) per core: 1 +2025-02-20T20:39:14.2024724Z Core(s) per socket: 1 +2025-02-20T20:39:14.2025243Z Socket(s): 14 +2025-02-20T20:39:14.2025735Z NUMA node(s): 2 +2025-02-20T20:39:14.2026254Z Vendor ID: AuthenticAMD +2025-02-20T20:39:14.2026818Z CPU family: 23 +2025-02-20T20:39:14.2027322Z Model: 49 +2025-02-20T20:39:14.2027923Z Model name: AMD EPYC-Rome Processor +2025-02-20T20:39:14.2028494Z Stepping: 0 +2025-02-20T20:39:14.2028985Z CPU MHz: 2300.000 +2025-02-20T20:39:14.2029527Z BogoMIPS: 4600.00 +2025-02-20T20:39:14.2030077Z Virtualization: AMD-V +2025-02-20T20:39:14.2030816Z Hypervisor vendor: KVM +2025-02-20T20:39:14.2031381Z Virtualization type: full +2025-02-20T20:39:14.2031926Z L1d cache: 448 KiB +2025-02-20T20:39:14.2032534Z L1i cache: 448 KiB +2025-02-20T20:39:14.2033067Z L2 cache: 7 MiB +2025-02-20T20:39:14.2033608Z L3 cache: 224 MiB +2025-02-20T20:39:14.2034170Z NUMA node0 CPU(s): 0-6 +2025-02-20T20:39:14.2034989Z NUMA node1 CPU(s): 7-13 +2025-02-20T20:39:14.2035560Z Vulnerability Gather data sampling: Not affected +2025-02-20T20:39:14.2036176Z Vulnerability Itlb multihit: Not affected +2025-02-20T20:39:14.2036812Z Vulnerability L1tf: Not affected +2025-02-20T20:39:14.2037428Z Vulnerability Mds: Not affected +2025-02-20T20:39:14.2038047Z Vulnerability Meltdown: Not affected +2025-02-20T20:39:14.2038672Z Vulnerability Mmio stale data: Not affected +2025-02-20T20:39:14.2039277Z Vulnerability Retbleed: Vulnerable +2025-02-20T20:39:14.2040302Z Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +2025-02-20T20:39:14.2041588Z Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +2025-02-20T20:39:14.2043090Z Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +2025-02-20T20:39:14.2044294Z Vulnerability Srbds: Not affected +2025-02-20T20:39:14.2044877Z Vulnerability Tsx async abort: Not affected +2025-02-20T20:39:14.2048535Z Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr wbnoinvd arat npt nrip_save umip rdpid +2025-02-20T20:39:14.2273795Z ##[group]Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main +2025-02-20T20:39:14.2274448Z with: +2025-02-20T20:39:14.2274927Z token: *** +2025-02-20T20:39:14.2275248Z fetch-depth: 1 +2025-02-20T20:39:14.2275542Z env: +2025-02-20T20:39:14.2275842Z LOGURU_LEVEL: INFO +2025-02-20T20:39:14.2276176Z ##[endgroup] +2025-02-20T20:39:14.2364830Z ##[group]Run set -x +2025-02-20T20:39:14.2365194Z set -x 
+2025-02-20T20:39:14.2365500Z ls -al +2025-02-20T20:39:14.2365871Z if [ -f "semicolon_delimited_script" ]; then +2025-02-20T20:39:14.2366356Z  file semicolon_delimited_script +2025-02-20T20:39:14.2366802Z  head semicolon_delimited_script +2025-02-20T20:39:14.2367200Z fi +2025-02-20T20:39:14.2367496Z sudo rm -rf deleteme +2025-02-20T20:39:14.2367882Z sudo rm -rf docker-job +2025-02-20T20:39:14.2368264Z if [ -d ".git" ]; then +2025-02-20T20:39:14.2368691Z  echo 'Cleaning repo' +2025-02-20T20:39:14.2369075Z  git clean -xffd +2025-02-20T20:39:14.2369453Z  echo 'Done git clean -xffd' +2025-02-20T20:39:14.2369906Z  echo 'Attempting to delete any lock files' +2025-02-20T20:39:14.2370401Z  find .git -type f -iname '*.lock' -delete +2025-02-20T20:39:14.2370864Z  echo 'Done deleting lock files' +2025-02-20T20:39:14.2371304Z  echo 'De-init-ing submodules' +2025-02-20T20:39:14.2371731Z  git submodule deinit -f --all +2025-02-20T20:39:14.2372161Z  echo 'Done de-initing submodules' +2025-02-20T20:39:14.2372568Z fi +2025-02-20T20:39:14.2392000Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:39:14.2392541Z env: +2025-02-20T20:39:14.2392824Z LOGURU_LEVEL: INFO +2025-02-20T20:39:14.2393145Z ##[endgroup] +2025-02-20T20:42:54.9354051Z FAILED tests/ttnn/unit_tests/operations/test_examples.py::test_do_not_submit +2025-02-20T20:42:54.9354247Z !!!!!!!!!!!!!!!!!!!!!!!!!! stopping after 1 failures !!!!!!!!!!!!!!!!!!!!!!!!!!! +2025-02-20T20:42:54.9354620Z = 1 failed, 467 passed, 739 skipped, 62150 deselected, 483 warnings in 139.50s (0:02:19) = +2025-02-20T20:42:59.5491429Z  Device | INFO  | Closing user mode device drivers +2025-02-20T20:43:00.0731115Z Prepare all required actions +2025-02-20T20:43:00.0731608Z Getting action download info +2025-02-20T20:43:00.3813552Z Download action repository 'slackapi/slack-github-action@v1.26.0' (SHA:70cd7be8e40a46e8b0eced40b0de447bdb42f68e) +2025-02-20T20:43:00.9477723Z ##[group]Run ./.github/actions/slack-report +2025-02-20T20:43:00.9478128Z with: +2025-02-20T20:43:00.9478835Z slack_webhook_url: *** +2025-02-20T20:43:00.9479165Z owner: U06CXU895AP +2025-02-20T20:43:00.9479475Z env: +2025-02-20T20:43:00.9479770Z LOGURU_LEVEL: INFO +2025-02-20T20:43:00.9480154Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:43:00.9480976Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:43:00.9481861Z RUNNER_UID: 1000 +2025-02-20T20:43:00.9482161Z RUNNER_GID: 1000 +2025-02-20T20:43:00.9482453Z ##[endgroup] +2025-02-20T20:43:00.9559860Z Prepare all required actions +2025-02-20T20:43:00.9560335Z Getting action download info +2025-02-20T20:43:01.0891393Z Download action repository 'actions/upload-artifact@v4' (SHA:65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08) +2025-02-20T20:43:01.8721489Z ##[group]Run ./.github/actions/upload-artifact-with-job-uuid +2025-02-20T20:43:01.8721976Z with: +2025-02-20T20:43:01.8722276Z path: generated/test_reports/ + +2025-02-20T20:43:01.8722641Z prefix: test_reports_ +2025-02-20T20:43:01.8722955Z env: +2025-02-20T20:43:01.8723235Z LOGURU_LEVEL: INFO +2025-02-20T20:43:01.8723643Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:43:01.8724479Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:43:01.8725243Z RUNNER_UID: 1000 +2025-02-20T20:43:01.8725694Z RUNNER_GID: 1000 +2025-02-20T20:43:01.8725999Z ##[endgroup] 
+2025-02-20T20:43:01.8750888Z ##[group]Run uuid=$(uuidgen) +2025-02-20T20:43:01.8751316Z uuid=$(uuidgen) +2025-02-20T20:43:01.8751687Z artifact_name="test_reports_$uuid" +2025-02-20T20:43:01.8752156Z echo "[UPLOAD-ARTIFACT-UUID] $artifact_name" +2025-02-20T20:43:01.8752696Z echo "artifact-name=$artifact_name" >> "$GITHUB_OUTPUT" +2025-02-20T20:43:01.8774328Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:43:01.8774790Z env: +2025-02-20T20:43:01.8775060Z LOGURU_LEVEL: INFO +2025-02-20T20:43:01.8775426Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:43:01.8776396Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:43:01.8777152Z RUNNER_UID: 1000 +2025-02-20T20:43:01.8777452Z RUNNER_GID: 1000 +2025-02-20T20:43:01.8777746Z ##[endgroup] +2025-02-20T20:43:01.8833316Z [UPLOAD-ARTIFACT-UUID] test_reports_3625ce52-baf1-4c13-89e7-fc467452e238 +2025-02-20T20:43:01.8911880Z ##[group]Run actions/upload-artifact@v4 +2025-02-20T20:43:01.8912386Z with: +2025-02-20T20:43:01.8912789Z name: test_reports_3625ce52-baf1-4c13-89e7-fc467452e238 +2025-02-20T20:43:01.8913333Z path: generated/test_reports/ + +2025-02-20T20:43:01.8913753Z if-no-files-found: warn +2025-02-20T20:43:01.8914145Z compression-level: 6 +2025-02-20T20:43:01.8914508Z overwrite: false +2025-02-20T20:43:01.8914846Z include-hidden-files: false +2025-02-20T20:43:01.8915229Z env: +2025-02-20T20:43:01.8915539Z LOGURU_LEVEL: INFO +2025-02-20T20:43:01.8915945Z BUILD_TAG: 77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:43:01.8916826Z TT_METAL_DOCKER_IMAGE_TAG: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:77398908e9f0099b5a855e28a7cddf2825c70d41 +2025-02-20T20:43:01.8917629Z RUNNER_UID: 1000 +2025-02-20T20:43:01.8917968Z RUNNER_GID: 1000 +2025-02-20T20:43:01.8918323Z ##[endgroup] +2025-02-20T20:43:02.1714369Z With the provided path, there will be 1 file uploaded +2025-02-20T20:43:02.1719903Z Artifact name is valid! +2025-02-20T20:43:02.1720754Z Root directory input is valid! +2025-02-20T20:43:02.3836160Z Beginning upload of artifact content to blob storage +2025-02-20T20:43:02.6982301Z Uploaded bytes 17982 +2025-02-20T20:43:02.7656377Z Finished uploading artifact content to blob storage! +2025-02-20T20:43:02.7660144Z SHA256 hash of uploaded artifact zip is 519b36026b780d2a342790626d505c12319d86e9984f4d2ff1e3135e5eec25f3 +2025-02-20T20:43:02.7662122Z Finalizing artifact upload +2025-02-20T20:43:02.8858777Z Artifact test_reports_3625ce52-baf1-4c13-89e7-fc467452e238.zip successfully finalized. Artifact ID 2626413708 +2025-02-20T20:43:02.8860682Z Artifact test_reports_3625ce52-baf1-4c13-89e7-fc467452e238 has been successfully uploaded! Final size is 17982 bytes. Artifact ID is 2626413708 +2025-02-20T20:43:02.8867128Z Artifact download URL: https://github.com/tenstorrent/tt-metal/actions/runs/13443325356/artifacts/2626413708 +2025-02-20T20:43:02.9055947Z Post job cleanup. +2025-02-20T20:43:02.9112072Z Post job cleanup. 
+2025-02-20T20:43:03.0017076Z [command]/usr/bin/git version +2025-02-20T20:43:03.0057631Z git version 2.25.1 +2025-02-20T20:43:03.0102896Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/fd27d922-79f6-4947-82f7-5e2122bc0a31/.gitconfig' +2025-02-20T20:43:03.0114679Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/fd27d922-79f6-4947-82f7-5e2122bc0a31' before making global git config changes +2025-02-20T20:43:03.0117284Z Adding repository directory to the temporary git global config as a safe directory +2025-02-20T20:43:03.0120422Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-20T20:43:03.0151129Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-20T20:43:03.0176945Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-20T20:43:03.0449321Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:43:03.0491624Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:43:03.0546124Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:43:03.0595248Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:43:03.0642155Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:43:03.0691970Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:43:03.0741707Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:43:03.0801523Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-20T20:43:03.0822477Z http.https://github.com/.extraheader +2025-02-20T20:43:03.0832841Z [command]/usr/bin/git config --local --unset-all http.https://github.com/.extraheader +2025-02-20T20:43:03.0863000Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-20T20:43:03.1108769Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:43:03.1155131Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:43:03.1221394Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:43:03.1268817Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:43:03.1317122Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:43:03.1376110Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:43:03.1426523Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:43:03.1614391Z Post job cleanup. +2025-02-20T20:43:03.5586264Z [command]/usr/bin/docker logout https://ghcr.io +2025-02-20T20:43:03.5772141Z Removing login credentials for ghcr.io +2025-02-20T20:43:03.5817905Z ##[group]Post cache +2025-02-20T20:43:03.5818842Z State not set +2025-02-20T20:43:03.5837417Z ##[endgroup] +2025-02-20T20:43:03.6037795Z Post job cleanup. +2025-02-20T20:43:03.6092696Z Post job cleanup. 
+2025-02-20T20:43:03.7288461Z [command]/usr/bin/git version +2025-02-20T20:43:03.7331900Z git version 2.25.1 +2025-02-20T20:43:03.7376894Z Copying '/home/ubuntu/.gitconfig' to '/home/ubuntu/actions-runner/_work/_temp/7328045f-5047-4458-8702-82868611759f/.gitconfig' +2025-02-20T20:43:03.7389152Z Temporarily overriding HOME='/home/ubuntu/actions-runner/_work/_temp/7328045f-5047-4458-8702-82868611759f' before making global git config changes +2025-02-20T20:43:03.7390757Z Adding repository directory to the temporary git global config as a safe directory +2025-02-20T20:43:03.7396340Z [command]/usr/bin/git config --global --add safe.directory /home/ubuntu/actions-runner/_work/tt-metal/tt-metal +2025-02-20T20:43:03.7434546Z [command]/usr/bin/git config --local --name-only --get-regexp core\.sshCommand +2025-02-20T20:43:03.7464778Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'core\.sshCommand' && git config --local --unset-all 'core.sshCommand' || :" +2025-02-20T20:43:03.7726132Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:43:03.7769136Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:43:03.7815876Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:43:03.7859908Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:43:03.7905638Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:43:03.7950225Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:43:03.7999078Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:43:03.8064432Z [command]/usr/bin/git config --local --name-only --get-regexp http\.https\:\/\/github\.com\/\.extraheader +2025-02-20T20:43:03.8095551Z [command]/usr/bin/git submodule foreach --recursive sh -c "git config --local --name-only --get-regexp 'http\.https\:\/\/github\.com\/\.extraheader' && git config --local --unset-all 'http.https://github.com/.extraheader' || :" +2025-02-20T20:43:03.8347526Z Entering 'models/demos/t3000/llama2_70b/reference/llama' +2025-02-20T20:43:03.8395094Z Entering 'tt-train/3rd_party/wandb-cpp' +2025-02-20T20:43:03.8442108Z Entering 'tt_metal/third_party/tracy' +2025-02-20T20:43:03.8496863Z Entering 'tt_metal/third_party/tt_llk_blackhole' +2025-02-20T20:43:03.8543664Z Entering 'tt_metal/third_party/tt_llk_grayskull' +2025-02-20T20:43:03.8589761Z Entering 'tt_metal/third_party/tt_llk_wormhole_b0' +2025-02-20T20:43:03.8633668Z Entering 'tt_metal/third_party/umd' +2025-02-20T20:43:03.8789059Z A job completed hook has been configured by the self-hosted runner administrator +2025-02-20T20:43:03.8819320Z ##[group]Run '/opt/tt_metal_infra/scripts/ci/grayskull/cleanup.sh' +2025-02-20T20:43:03.8833552Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0} +2025-02-20T20:43:03.8834224Z ##[endgroup] +2025-02-20T20:43:03.8885368Z Current date / time is Thu Feb 20 20:43:03 UTC 2025 +2025-02-20T20:43:04.0987156Z Cleaning up orphan processes diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563108566_annotations.json b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563108566_annotations.json new file mode 100644 index 00000000000..a37408157d8 --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/13443325356/logs/37563108566_annotations.json @@ -0,0 +1 @@ 
+[{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":73,"start_column":null,"end_line":73,"end_column":null,"annotation_level":"notice","title":"","message":"[DEPRECATION] This action is deprecated. Please migrate to reading the Docker image from the pipeline.","raw_details":""},{"path":"tests/ttnn/unit_tests/operations/test_examples.py","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/tests/ttnn/unit_tests/operations/test_examples.py","start_line":107,"start_column":null,"end_line":107,"end_column":null,"annotation_level":"failure","title":"","message":"test_do_not_submit\n\nassert True == False","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":32,"start_column":null,"end_line":32,"end_column":null,"annotation_level":"notice","title":"disk-usage-after-startup","message":"Disk usage is 86 %","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":142,"start_column":null,"end_line":142,"end_column":null,"annotation_level":"notice","title":"printing-smi-info-startup","message":"Touching and printing out SMI info","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":313,"start_column":null,"end_line":313,"end_column":null,"annotation_level":"notice","title":"reset-successful-startup","message":"tt-smi reset was successful","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":320,"start_column":null,"end_line":320,"end_column":null,"annotation_level":"warning","title":"hugepages-service-not-found-startup","message":"Hugepages service not found. 
Using old rc.local method","raw_details":""},{"path":".github","blob_href":"https://github.com/tenstorrent/tt-metal/blob/94429171440755ffe7c62085c4807d447dd369dc/.github","start_line":326,"start_column":null,"end_line":326,"end_column":null,"annotation_level":"notice","title":"hugepages-setup-success-startup","message":"Hugepages is now setup.","raw_details":""}] diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow.json b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow.json new file mode 100644 index 00000000000..402bd004a7c --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow.json @@ -0,0 +1 @@ +{"id":13443325356,"name":"All post-commit tests","node_id":"WFR_kwLOI9Wqc88AAAADIUjdrA","head_branch":"williamly/test-failure-annotations","head_sha":"94429171440755ffe7c62085c4807d447dd369dc","path":".github/workflows/all-post-commit-workflows.yaml","display_title":"All post-commit tests","run_number":26028,"event":"workflow_dispatch","status":"completed","conclusion":"cancelled","workflow_id":67993574,"check_suite_id":34671009832,"check_suite_node_id":"CS_kwDOI9Wqc88AAAAIEo2gKA","url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13443325356","html_url":"https://github.com/tenstorrent/tt-metal/actions/runs/13443325356","pull_requests":[{"url":"https://api.github.com/repos/tenstorrent/tt-metal/pulls/18106","id":2348084103,"number":18106,"head":{"ref":"williamly/test-failure-annotations","sha":"94429171440755ffe7c62085c4807d447dd369dc","repo":{"id":601205363,"url":"https://api.github.com/repos/tenstorrent/tt-metal","name":"tt-metal"}},"base":{"ref":"main","sha":"cb84d2eb6ab96b94f2e82a1e429ef84859b3528c","repo":{"id":601205363,"url":"https://api.github.com/repos/tenstorrent/tt-metal","name":"tt-metal"}}}],"created_at":"2025-02-20T19:38:56Z","updated_at":"2025-02-20T20:59:57Z","actor":{"login":"williamlyTT","id":193945317,"node_id":"U_kgDOC49e5Q","avatar_url":"https://avatars.githubusercontent.com/u/193945317?v=4","gravatar_id":"","url":"https://api.github.com/users/williamlyTT","html_url":"https://github.com/williamlyTT","followers_url":"https://api.github.com/users/williamlyTT/followers","following_url":"https://api.github.com/users/williamlyTT/following{/other_user}","gists_url":"https://api.github.com/users/williamlyTT/gists{/gist_id}","starred_url":"https://api.github.com/users/williamlyTT/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/williamlyTT/subscriptions","organizations_url":"https://api.github.com/users/williamlyTT/orgs","repos_url":"https://api.github.com/users/williamlyTT/repos","events_url":"https://api.github.com/users/williamlyTT/events{/privacy}","received_events_url":"https://api.github.com/users/williamlyTT/received_events","type":"User","user_view_type":"public","site_admin":false},"run_attempt":1,"referenced_workflows":[{"path":"tenstorrent/tt-metal/.github/workflows/tt-train-post-commit.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/models-post-commit.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/ttnn-post-commit.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"944291714407
55ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/build-artifact.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/build-and-unit-tests.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/docs-latest-public.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/_test-wheels-impl.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/fast-dispatch-build-and-unit-tests.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/cpp-post-commit.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/all-static-checks.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/code-analysis.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/build-docker-artifact.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/run-profiler-regression.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"},{"path":"tenstorrent/tt-metal/.github/workflows/fabric-build-and-unit-tests.yaml@94429171440755ffe7c62085c4807d447dd369dc","sha":"94429171440755ffe7c62085c4807d447dd369dc","ref":"refs/heads/williamly/test-failure-annotations"}],"run_started_at":"2025-02-20T19:38:56Z","triggering_actor":{"login":"williamlyTT","id":193945317,"node_id":"U_kgDOC49e5Q","avatar_url":"https://avatars.githubusercontent.com/u/193945317?v=4","gravatar_id":"","url":"https://api.github.com/users/williamlyTT","html_url":"https://github.com/williamlyTT","followers_url":"https://api.github.com/users/williamlyTT/followers","following_url":"https://api.github.com/users/williamlyTT/following{/other_user}","gists_url":"https://api.github.com/users/williamlyTT/gists{/gist_id}","starred_url":"https://api.github.com/users/williamlyTT/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/williamlyTT/subscriptions","organizations_url":"https://api.github.com/users/williamlyTT/orgs","repos_url":"https://api.github.com/users/williamlyTT/repos","events_url":"https://api.github.com/users/williamlyTT/events{/privacy}","received_events_url":"https://api.github.com/users/williamlyTT/received_events","type":"User","user_view_type":"public","site_admin":false},"jobs_url":"https://api.git
hub.com/repos/tenstorrent/tt-metal/actions/runs/13443325356/attempts/1/jobs","logs_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13443325356/attempts/1/logs","check_suite_url":"https://api.github.com/repos/tenstorrent/tt-metal/check-suites/34671009832","artifacts_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13443325356/artifacts","cancel_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13443325356/cancel","rerun_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13443325356/rerun","previous_attempt_url":null,"workflow_url":"https://api.github.com/repos/tenstorrent/tt-metal/actions/workflows/67993574","head_commit":{"id":"94429171440755ffe7c62085c4807d447dd369dc","tree_id":"9172fc831ad1d54c7383f9d188b07e210cb29a40","message":"Update workflows","timestamp":"2025-02-20T19:36:42Z","author":{"name":"William Ly","email":"williamly@tenstorrent.com"},"committer":{"name":"William Ly","email":"williamly@tenstorrent.com"}},"repository":{"id":601205363,"node_id":"R_kgDOI9Wqcw","name":"tt-metal","full_name":"tenstorrent/tt-metal","private":false,"owner":{"login":"tenstorrent","id":64161552,"node_id":"MDEyOk9yZ2FuaXphdGlvbjY0MTYxNTUy","avatar_url":"https://avatars.githubusercontent.com/u/64161552?v=4","gravatar_id":"","url":"https://api.github.com/users/tenstorrent","html_url":"https://github.com/tenstorrent","followers_url":"https://api.github.com/users/tenstorrent/followers","following_url":"https://api.github.com/users/tenstorrent/following{/other_user}","gists_url":"https://api.github.com/users/tenstorrent/gists{/gist_id}","starred_url":"https://api.github.com/users/tenstorrent/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/tenstorrent/subscriptions","organizations_url":"https://api.github.com/users/tenstorrent/orgs","repos_url":"https://api.github.com/users/tenstorrent/repos","events_url":"https://api.github.com/users/tenstorrent/events{/privacy}","received_events_url":"https://api.github.com/users/tenstorrent/received_events","type":"Organization","user_view_type":"public","site_admin":false},"html_url":"https://github.com/tenstorrent/tt-metal","description":":metal: TT-NN operator library, and TT-Metalium low level kernel programming 
model.","fork":false,"url":"https://api.github.com/repos/tenstorrent/tt-metal","forks_url":"https://api.github.com/repos/tenstorrent/tt-metal/forks","keys_url":"https://api.github.com/repos/tenstorrent/tt-metal/keys{/key_id}","collaborators_url":"https://api.github.com/repos/tenstorrent/tt-metal/collaborators{/collaborator}","teams_url":"https://api.github.com/repos/tenstorrent/tt-metal/teams","hooks_url":"https://api.github.com/repos/tenstorrent/tt-metal/hooks","issue_events_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues/events{/number}","events_url":"https://api.github.com/repos/tenstorrent/tt-metal/events","assignees_url":"https://api.github.com/repos/tenstorrent/tt-metal/assignees{/user}","branches_url":"https://api.github.com/repos/tenstorrent/tt-metal/branches{/branch}","tags_url":"https://api.github.com/repos/tenstorrent/tt-metal/tags","blobs_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/blobs{/sha}","git_tags_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/tags{/sha}","git_refs_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/refs{/sha}","trees_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/trees{/sha}","statuses_url":"https://api.github.com/repos/tenstorrent/tt-metal/statuses/{sha}","languages_url":"https://api.github.com/repos/tenstorrent/tt-metal/languages","stargazers_url":"https://api.github.com/repos/tenstorrent/tt-metal/stargazers","contributors_url":"https://api.github.com/repos/tenstorrent/tt-metal/contributors","subscribers_url":"https://api.github.com/repos/tenstorrent/tt-metal/subscribers","subscription_url":"https://api.github.com/repos/tenstorrent/tt-metal/subscription","commits_url":"https://api.github.com/repos/tenstorrent/tt-metal/commits{/sha}","git_commits_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/commits{/sha}","comments_url":"https://api.github.com/repos/tenstorrent/tt-metal/comments{/number}","issue_comment_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues/comments{/number}","contents_url":"https://api.github.com/repos/tenstorrent/tt-metal/contents/{+path}","compare_url":"https://api.github.com/repos/tenstorrent/tt-metal/compare/{base}...{head}","merges_url":"https://api.github.com/repos/tenstorrent/tt-metal/merges","archive_url":"https://api.github.com/repos/tenstorrent/tt-metal/{archive_format}{/ref}","downloads_url":"https://api.github.com/repos/tenstorrent/tt-metal/downloads","issues_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues{/number}","pulls_url":"https://api.github.com/repos/tenstorrent/tt-metal/pulls{/number}","milestones_url":"https://api.github.com/repos/tenstorrent/tt-metal/milestones{/number}","notifications_url":"https://api.github.com/repos/tenstorrent/tt-metal/notifications{?since,all,participating}","labels_url":"https://api.github.com/repos/tenstorrent/tt-metal/labels{/name}","releases_url":"https://api.github.com/repos/tenstorrent/tt-metal/releases{/id}","deployments_url":"https://api.github.com/repos/tenstorrent/tt-metal/deployments"},"head_repository":{"id":601205363,"node_id":"R_kgDOI9Wqcw","name":"tt-metal","full_name":"tenstorrent/tt-metal","private":false,"owner":{"login":"tenstorrent","id":64161552,"node_id":"MDEyOk9yZ2FuaXphdGlvbjY0MTYxNTUy","avatar_url":"https://avatars.githubusercontent.com/u/64161552?v=4","gravatar_id":"","url":"https://api.github.com/users/tenstorrent","html_url":"https://github.com/tenstorrent","followers_url":"https://api.github.com/users/tenstorrent/followers","following_url":"https://api.github.co
m/users/tenstorrent/following{/other_user}","gists_url":"https://api.github.com/users/tenstorrent/gists{/gist_id}","starred_url":"https://api.github.com/users/tenstorrent/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/tenstorrent/subscriptions","organizations_url":"https://api.github.com/users/tenstorrent/orgs","repos_url":"https://api.github.com/users/tenstorrent/repos","events_url":"https://api.github.com/users/tenstorrent/events{/privacy}","received_events_url":"https://api.github.com/users/tenstorrent/received_events","type":"Organization","user_view_type":"public","site_admin":false},"html_url":"https://github.com/tenstorrent/tt-metal","description":":metal: TT-NN operator library, and TT-Metalium low level kernel programming model.","fork":false,"url":"https://api.github.com/repos/tenstorrent/tt-metal","forks_url":"https://api.github.com/repos/tenstorrent/tt-metal/forks","keys_url":"https://api.github.com/repos/tenstorrent/tt-metal/keys{/key_id}","collaborators_url":"https://api.github.com/repos/tenstorrent/tt-metal/collaborators{/collaborator}","teams_url":"https://api.github.com/repos/tenstorrent/tt-metal/teams","hooks_url":"https://api.github.com/repos/tenstorrent/tt-metal/hooks","issue_events_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues/events{/number}","events_url":"https://api.github.com/repos/tenstorrent/tt-metal/events","assignees_url":"https://api.github.com/repos/tenstorrent/tt-metal/assignees{/user}","branches_url":"https://api.github.com/repos/tenstorrent/tt-metal/branches{/branch}","tags_url":"https://api.github.com/repos/tenstorrent/tt-metal/tags","blobs_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/blobs{/sha}","git_tags_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/tags{/sha}","git_refs_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/refs{/sha}","trees_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/trees{/sha}","statuses_url":"https://api.github.com/repos/tenstorrent/tt-metal/statuses/{sha}","languages_url":"https://api.github.com/repos/tenstorrent/tt-metal/languages","stargazers_url":"https://api.github.com/repos/tenstorrent/tt-metal/stargazers","contributors_url":"https://api.github.com/repos/tenstorrent/tt-metal/contributors","subscribers_url":"https://api.github.com/repos/tenstorrent/tt-metal/subscribers","subscription_url":"https://api.github.com/repos/tenstorrent/tt-metal/subscription","commits_url":"https://api.github.com/repos/tenstorrent/tt-metal/commits{/sha}","git_commits_url":"https://api.github.com/repos/tenstorrent/tt-metal/git/commits{/sha}","comments_url":"https://api.github.com/repos/tenstorrent/tt-metal/comments{/number}","issue_comment_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues/comments{/number}","contents_url":"https://api.github.com/repos/tenstorrent/tt-metal/contents/{+path}","compare_url":"https://api.github.com/repos/tenstorrent/tt-metal/compare/{base}...{head}","merges_url":"https://api.github.com/repos/tenstorrent/tt-metal/merges","archive_url":"https://api.github.com/repos/tenstorrent/tt-metal/{archive_format}{/ref}","downloads_url":"https://api.github.com/repos/tenstorrent/tt-metal/downloads","issues_url":"https://api.github.com/repos/tenstorrent/tt-metal/issues{/number}","pulls_url":"https://api.github.com/repos/tenstorrent/tt-metal/pulls{/number}","milestones_url":"https://api.github.com/repos/tenstorrent/tt-metal/milestones{/number}","notifications_url":"https://api.github.com/repos/tenstorrent/tt-metal/notifications{?sinc
e,all,participating}","labels_url":"https://api.github.com/repos/tenstorrent/tt-metal/labels{/name}","releases_url":"https://api.github.com/repos/tenstorrent/tt-metal/releases{/id}","deployments_url":"https://api.github.com/repos/tenstorrent/tt-metal/deployments"}} diff --git a/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow_jobs.json b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow_jobs.json new file mode 100644 index 00000000000..a11761dd21d --- /dev/null +++ b/infra/tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow_jobs.json @@ -0,0 +1,272 @@ +{ + "total_count": 200, + "jobs": [ + { + "id": 37563095078, + "run_id": 13443325356, + "workflow_name": "All post-commit tests", + "head_branch": "williamly/test-failure-annotations", + "run_url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13443325356", + "run_attempt": 1, + "node_id": "CR_kwDOI9Wqc88AAAAIvu9YJg", + "head_sha": "94429171440755ffe7c62085c4807d447dd369dc", + "url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/jobs/37563095078", + "html_url": "https://github.com/tenstorrent/tt-metal/actions/runs/13443325356/job/37563095078", + "status": "completed", + "conclusion": "failure", + "created_at": "2025-02-20T19:46:04Z", + "started_at": "2025-02-20T20:33:26Z", + "completed_at": "2025-02-20T20:38:09Z", + "name": "sd-unit-tests (grayskull, E150) / grayskull E150 api", + "steps": [ + { + "name": "Set up job", + "status": "completed", + "conclusion": "success", + "number": 1, + "started_at": "2025-02-20T20:33:26Z", + "completed_at": "2025-02-20T20:33:34Z" + }, + { + "name": "Set up runner", + "status": "completed", + "conclusion": "success", + "number": 2, + "started_at": "2025-02-20T20:33:34Z", + "completed_at": "2025-02-20T20:34:55Z" + }, + { + "name": "Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 3, + "started_at": "2025-02-20T20:34:56Z", + "completed_at": "2025-02-20T20:34:58Z" + }, + { + "name": "Run /./.github/actions/prepare-metal-run", + "status": "completed", + "conclusion": "success", + "number": 4, + "started_at": "2025-02-20T20:34:59Z", + "completed_at": "2025-02-20T20:35:44Z" + }, + { + "name": "api tests", + "status": "completed", + "conclusion": "failure", + "number": 5, + "started_at": "2025-02-20T20:35:44Z", + "completed_at": "2025-02-20T20:38:00Z" + }, + { + "name": "Run /./.github/actions/slack-report", + "status": "completed", + "conclusion": "success", + "number": 6, + "started_at": "2025-02-20T20:38:00Z", + "completed_at": "2025-02-20T20:38:01Z" + }, + { + "name": "Run /./.github/actions/upload-artifact-with-job-uuid", + "status": "completed", + "conclusion": "success", + "number": 7, + "started_at": "2025-02-20T20:38:01Z", + "completed_at": "2025-02-20T20:38:03Z" + }, + { + "name": "Generate system logs on failure", + "status": "completed", + "conclusion": "success", + "number": 8, + "started_at": "2025-02-20T20:38:03Z", + "completed_at": "2025-02-20T20:38:05Z" + }, + { + "name": "Generate gtest annotations on failure", + "status": "completed", + "conclusion": "success", + "number": 9, + "started_at": "2025-02-20T20:38:05Z", + "completed_at": "2025-02-20T20:38:05Z" + }, + { + "name": "Post api tests", + "status": "completed", + "conclusion": "success", + "number": 15, + "started_at": "2025-02-20T20:38:05Z", + "completed_at": "2025-02-20T20:38:06Z" + }, + { + "name": 
"Post Run /./.github/actions/prepare-metal-run", + "status": "completed", + "conclusion": "success", + "number": 16, + "started_at": "2025-02-20T20:38:06Z", + "completed_at": "2025-02-20T20:38:06Z" + }, + { + "name": "Post Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 17, + "started_at": "2025-02-20T20:38:06Z", + "completed_at": "2025-02-20T20:38:06Z" + }, + { + "name": "Complete runner", + "status": "completed", + "conclusion": "success", + "number": 18, + "started_at": "2025-02-20T20:38:06Z", + "completed_at": "2025-02-20T20:38:06Z" + }, + { + "name": "Complete job", + "status": "completed", + "conclusion": "success", + "number": 19, + "started_at": "2025-02-20T20:38:06Z", + "completed_at": "2025-02-20T20:38:06Z" + } + ], + "check_run_url": "https://api.github.com/repos/tenstorrent/tt-metal/check-runs/37563095078", + "labels": [ + "E150", + "cloud-virtual-machine", + "in-service" + ], + "runner_id": 143, + "runner_name": "tt-metal-ci-vm-105", + "runner_group_id": 1, + "runner_group_name": "Default" + }, + { + "id": 37563108566, + "run_id": 13443325356, + "workflow_name": "All post-commit tests", + "head_branch": "williamly/test-failure-annotations", + "run_url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/runs/13443325356", + "run_attempt": 1, + "node_id": "CR_kwDOI9Wqc88AAAAIvu-M1g", + "head_sha": "94429171440755ffe7c62085c4807d447dd369dc", + "url": "https://api.github.com/repos/tenstorrent/tt-metal/actions/jobs/37563108566", + "html_url": "https://github.com/tenstorrent/tt-metal/actions/runs/13443325356/job/37563108566", + "status": "completed", + "conclusion": "failure", + "created_at": "2025-02-20T19:46:19Z", + "started_at": "2025-02-20T20:37:45Z", + "completed_at": "2025-02-20T20:43:06Z", + "name": "ttnn-unit-tests (grayskull, E150) / ttnn group 4 grayskull E150", + "steps": [ + { + "name": "Set up job", + "status": "completed", + "conclusion": "success", + "number": 1, + "started_at": "2025-02-20T20:37:44Z", + "completed_at": "2025-02-20T20:37:52Z" + }, + { + "name": "Set up runner", + "status": "completed", + "conclusion": "success", + "number": 2, + "started_at": "2025-02-20T20:37:53Z", + "completed_at": "2025-02-20T20:39:14Z" + }, + { + "name": "Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 3, + "started_at": "2025-02-20T20:39:14Z", + "completed_at": "2025-02-20T20:39:19Z" + }, + { + "name": "Run actions/download-artifact@v4", + "status": "completed", + "conclusion": "success", + "number": 4, + "started_at": "2025-02-20T20:39:19Z", + "completed_at": "2025-02-20T20:39:42Z" + }, + { + "name": "Set ttnn fast runtime if exists in config", + "status": "completed", + "conclusion": "skipped", + "number": 5, + "started_at": "2025-02-20T20:39:42Z", + "completed_at": "2025-02-20T20:39:42Z" + }, + { + "name": "ttnn group 4 tests", + "status": "completed", + "conclusion": "failure", + "number": 6, + "started_at": "2025-02-20T20:39:42Z", + "completed_at": "2025-02-20T20:43:00Z" + }, + { + "name": "Run /./.github/actions/slack-report", + "status": "completed", + "conclusion": "success", + "number": 7, + "started_at": "2025-02-20T20:43:00Z", + "completed_at": "2025-02-20T20:43:00Z" + }, + { + "name": "Run /./.github/actions/upload-artifact-with-job-uuid", + "status": "completed", + "conclusion": "success", + "number": 8, + "started_at": "2025-02-20T20:43:01Z", + "completed_at": 
"2025-02-20T20:43:02Z" + }, + { + "name": "Post ttnn group 4 tests", + "status": "completed", + "conclusion": "success", + "number": 14, + "started_at": "2025-02-20T20:43:03Z", + "completed_at": "2025-02-20T20:43:03Z" + }, + { + "name": "Post Run tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main", + "status": "completed", + "conclusion": "success", + "number": 15, + "started_at": "2025-02-20T20:43:03Z", + "completed_at": "2025-02-20T20:43:03Z" + }, + { + "name": "Complete runner", + "status": "completed", + "conclusion": "success", + "number": 16, + "started_at": "2025-02-20T20:43:06Z", + "completed_at": "2025-02-20T20:43:06Z" + }, + { + "name": "Complete job", + "status": "completed", + "conclusion": "success", + "number": 17, + "started_at": "2025-02-20T20:43:04Z", + "completed_at": "2025-02-20T20:43:04Z" + } + ], + "check_run_url": "https://api.github.com/repos/tenstorrent/tt-metal/check-runs/37563108566", + "labels": [ + "E150", + "in-service" + ], + "runner_id": 123, + "runner_name": "tt-metal-ci-vm-2", + "runner_group_id": 1, + "runner_group_name": "Default" + } + ] +} diff --git a/infra/tests/data_collection/test_cicd.py b/infra/tests/data_collection/test_cicd.py index 440cd4ea115..386d0aff0d0 100644 --- a/infra/tests/data_collection/test_cicd.py +++ b/infra/tests/data_collection/test_cicd.py @@ -3,7 +3,8 @@ from infra.data_collection.github import workflows from infra.data_collection.cicd import create_cicd_json_for_data_analysis -from infra.data_collection.models import InfraErrorV1 +from infra.data_collection.models import InfraErrorV1, TestErrorV1 +from infra.data_collection.pydantic_models import JobStatus def test_dummy(): @@ -76,6 +77,7 @@ def test_create_pipeline_json_to_detect_job_timeout_error_v1(workflow_run_gh_env if job.github_job_id == 30531878948: assert job.failure_signature == str(InfraErrorV1.JOB_CUMULATIVE_TIMEOUT_FAILURE) assert job.failure_description is not None + assert job.job_status == JobStatus.failure else: assert job.failure_signature is None assert job.failure_description is None @@ -114,6 +116,7 @@ def test_create_pipeline_json_to_detect_runner_comm_error_v1_among_other_failure if job.github_job_id == 30868260202: assert job.failure_signature == str(InfraErrorV1.RUNNER_COMM_FAILURE) assert job.failure_description is not None + assert job.job_status == JobStatus.failure else: assert job.failure_signature is None assert job.failure_description is None @@ -146,6 +149,7 @@ def test_create_pipeline_json_for_run_github_timed_out_job(workflow_run_gh_envir for job in pipeline.jobs: if job.github_job_id == 30868260202: assert len(job.tests) > 0 + assert job.job_status == JobStatus.failure def test_create_pipeline_json_for_timeout_bad_testcase(workflow_run_gh_environment): @@ -175,6 +179,7 @@ def test_create_pipeline_json_for_timeout_bad_testcase(workflow_run_gh_environme for job in pipeline.jobs: if job.github_job_id == 36492361640: assert len(job.tests) > 0 + assert job.job_status == JobStatus.failure def test_create_pipeline_json_for_gtest_testcases(workflow_run_gh_environment): @@ -206,22 +211,26 @@ def test_create_pipeline_json_for_gtest_testcases(workflow_run_gh_environment): if job.github_job_id == 37190230023: assert len(job.tests) > 0 assert job.job_success is True + assert job.job_status == JobStatus.success # failing gtest testcase if job.github_job_id == 37190213375: assert len(job.tests) > 0 assert job.job_success is False # check that there are failing gtests stored in the pydantic testcase list assert len([x for x in job.tests 
if not x.success]) > 0 + assert job.job_status == JobStatus.failure # passing pytest testcase if job.github_job_id == 37190252200: assert len(job.tests) > 0 assert job.job_success is True + assert job.job_status == JobStatus.success # failing pytest testcase if job.github_job_id == 37190251054: assert len(job.tests) > 0 assert job.job_success is False # check that there are failing pytests stored in the pydantic testcase list assert len([x for x in job.tests if not x.success]) > 0 + assert job.job_status == JobStatus.failure # fails validation, job is expected be skipped assert len([x for x in pipeline.jobs if x.github_job_id == 37190219113]) == 0 @@ -233,3 +242,49 @@ def test_empty_gtest_xml(workflow_run_gh_environment): assert ( workflows.get_tests_from_test_report_path(workflow_outputs_dir / "distributed_unit_tests_wormhole_b0.xml") == [] ) + + +def test_create_pipeline_json_for_testcases_with_annotations(workflow_run_gh_environment): + github_runner_environment = workflow_run_gh_environment + github_pipeline_json_filename = ( + "tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow.json" + ) + github_jobs_json_filename = ( + "tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/workflow_jobs.json" + ) + + workflow_outputs_dir = pathlib.Path( + "tests/_data/data_collection/cicd/all_post_commit_test_annotations_13443325356/" + ).resolve() + assert workflow_outputs_dir.is_dir() + assert workflow_outputs_dir.exists() + + pipeline = create_cicd_json_for_data_analysis( + workflow_outputs_dir, + github_runner_environment, + github_pipeline_json_filename, + github_jobs_json_filename, + ) + + assert pipeline.github_pipeline_id == 13443325356 + + for job in pipeline.jobs: + # failing gtest testcase + if job.github_job_id == 37563095078: + assert len(job.tests) > 0 + assert job.job_success is False + # check that there are failing gtests stored in the pydantic testcase list + assert len([x for x in job.tests if not x.success]) == 1 + # check that the job signature and description are present + assert job.failure_signature == str(TestErrorV1.CPP_TEST_FAILURE) + assert job.failure_description is not None and ".cpp" in job.failure_description + assert job.job_status == JobStatus.failure + # failing pytest testcase + if job.github_job_id == 37563108566: + assert len(job.tests) > 0 + assert job.job_success is False + # check that there are failing pytests stored in the pydantic testcase list + assert len([x for x in job.tests if not x.success]) == 1 + assert job.failure_signature == str(TestErrorV1.PY_TEST_FAILURE) + assert job.failure_description is not None and ".py" in job.failure_description + assert job.job_status == JobStatus.failure From 93dfba7013ed0c5100bb395a3a9a322b378ff1ec Mon Sep 17 00:00:00 2001 From: Yu Gao <145494740+yugaoTT@users.noreply.github.com> Date: Mon, 24 Feb 2025 16:31:04 -0500 Subject: [PATCH 270/316] Add perf bound for EDM bandwidth test (#18141) ### Ticket Add perf check for fabric edm ### Checklist - [x] [All post commit] https://github.com/tenstorrent/tt-metal/actions/runs/13465440935 - [x] ubenchmark https://github.com/tenstorrent/tt-metal/actions/runs/13465183176/job/37629375894 - [ ] T3K unit test https://github.com/tenstorrent/tt-metal/actions/runs/13465436374 --- .../workflows/metal-run-microbenchmarks.yaml | 9 + tests/scripts/run_tests.sh | 13 + .../ethernet/test_fabric_edm_bandwidth.py | 135 + tests/ttnn/unit_tests/gtests/CMakeLists.txt | 4 + .../gtests/ccl/kernels/edm_fabric_writer.cpp | 25 +- 
.../unit_tests/gtests/ccl/test_fabric_edm.cpp | 22 + .../gtests/ccl/test_fabric_edm_common.hpp | 2344 ++++++++++++++++ ...erisc_data_mover_loopback_with_workers.cpp | 2385 +---------------- 8 files changed, 2571 insertions(+), 2366 deletions(-) create mode 100644 tests/tt_metal/microbenchmarks/ethernet/test_fabric_edm_bandwidth.py create mode 100644 tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm.cpp create mode 100644 tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp diff --git a/.github/workflows/metal-run-microbenchmarks.yaml b/.github/workflows/metal-run-microbenchmarks.yaml index b5dd7892857..cf7c8bf112a 100644 --- a/.github/workflows/metal-run-microbenchmarks.yaml +++ b/.github/workflows/metal-run-microbenchmarks.yaml @@ -19,6 +19,13 @@ jobs: # N300 {arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"]}, {arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"], ccl: true}, + # T3000 + { + name: "T3000 uBenchmark tests", + arch: wormhole_b0, + runs-on: ["arch-wormhole_b0", "config-t3000", "pipeline-perf", "in-service"], + is-t3k: true + }, ] env: # Use BM for microbenchmarks @@ -40,6 +47,8 @@ jobs: PIPELINE_TYPE="microbenchmarks" if [ "${{ matrix.runner-info.ccl }}" == "true" ]; then PIPELINE_TYPE="ccl_microbenchmarks" + elif [ "${{ matrix.runner-info.is-t3k }}" == "true" ]; then + PIPELINE_TYPE="T3K_microbenchmark" else TT_METAL_SLOW_DISPATCH_MODE=1 ./tests/scripts/run_tunneler_tests.sh --machine-type ${{ matrix.runner-info.runs-on[0] }} fi diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index a048cd440c5..cfd3ee09e3f 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -203,6 +203,17 @@ run_ccl_microbenchmarks_pipeline_tests() { fi } +run_T3K_microbenchmarks_pipeline_tests() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + export TT_METAL_DEVICE_PROFILER=1 + + source python_env/bin/activate + pytest -svv tests/tt_metal/microbenchmarks/ethernet/test_fabric_edm_bandwidth.py +} + run_ttnn_sweeps_pipeline_tests() { local tt_arch=$1 local pipeline_type=$2 @@ -351,6 +362,8 @@ run_pipeline_tests() { run_microbenchmarks_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" elif [[ $pipeline_type == "ccl_microbenchmarks" ]]; then run_ccl_microbenchmarks_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == "T3K_microbenchmark" ]]; then + run_T3K_microbenchmarks_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" elif [[ $pipeline_type == "ttnn_sweeps" ]]; then run_ttnn_sweeps_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" # T3000 pipelines diff --git a/tests/tt_metal/microbenchmarks/ethernet/test_fabric_edm_bandwidth.py b/tests/tt_metal/microbenchmarks/ethernet/test_fabric_edm_bandwidth.py new file mode 100644 index 00000000000..de0e3ac5181 --- /dev/null +++ b/tests/tt_metal/microbenchmarks/ethernet/test_fabric_edm_bandwidth.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import os +import sys + +from loguru import logger +import pytest +import csv +from tt_metal.tools.profiler.process_device_log import import_log_run_stats +import tt_metal.tools.profiler.device_post_proc_config as device_post_proc_config + +from tt_metal.tools.profiler.common import PROFILER_LOGS_DIR, PROFILER_DEVICE_SIDE_LOG + +profiler_log_path = PROFILER_LOGS_DIR / PROFILER_DEVICE_SIDE_LOG + + +def get_device_freq(): + setup = device_post_proc_config.default_setup() + setup.deviceInputLog = profiler_log_path + deviceData = import_log_run_stats(setup) + freq = deviceData["deviceInfo"]["freq"] + return freq + + +def profile_results(is_unicast, num_mcasts, num_unicasts, line_size, packet_size): + freq = get_device_freq() / 1000.0 + setup = device_post_proc_config.default_setup() + setup.deviceInputLog = profiler_log_path + main_test_body_string = "MAIN-WRITE-UNICAST-ZONE" if is_unicast else "MAIN-WRITE-MCAST-ZONE" + setup.timerAnalysis = { + main_test_body_string: { + "across": "device", + "type": "session_first_last", + "start": {"core": "ANY", "risc": "ANY", "zone_name": main_test_body_string}, + "end": {"core": "ANY", "risc": "ANY", "zone_name": main_test_body_string}, + }, + } + devices_data = import_log_run_stats(setup) + devices = list(devices_data["devices"].keys()) + + # MAIN-TEST-BODY + main_loop_cycles = [] + for device in devices: + main_loop_cycle = devices_data["devices"][device]["cores"]["DEVICE"]["analysis"][main_test_body_string][ + "stats" + ]["Average"] + main_loop_cycles.append(main_loop_cycle) + + packets_per_src_chip = num_unicasts if is_unicast else num_mcasts + traffic_streams_through_boundary = line_size / 2 + total_byte_sent = packets_per_src_chip * traffic_streams_through_boundary * packet_size + bandwidth = total_byte_sent / max(main_loop_cycles) + + return bandwidth + + +def run_fabric_edm( + is_unicast, num_mcasts, num_unicasts, num_links, num_op_invocations, line_sync, line_size, packet_size, expected_bw +): + logger.warning("removing file profile_log_device.csv") + os.system(f"rm -rf {os.environ['TT_METAL_HOME']}/generated/profiler/.logs/profile_log_device.csv") + + cmd = f"TT_METAL_DEVICE_PROFILER=1 \ + {os.environ['TT_METAL_HOME']}/build/test/ttnn/unit_tests_ttnn_fabric_edm \ + {num_mcasts} \ + {num_unicasts} \ + {num_links} \ + {num_op_invocations} \ + {int(line_sync)} \ + {line_size} \ + {packet_size} " + rc = os.system(cmd) + if rc != 0: + logger.info("Error in running the test") + assert False + + bandwidth = profile_results(is_unicast, num_mcasts, num_unicasts, line_size, packet_size) + logger.info("bandwidth: {} B/c", bandwidth) + assert expected_bw - 0.2 <= bandwidth <= expected_bw + 0.2 + + +@pytest.mark.parametrize("num_mcasts", [200000]) +@pytest.mark.parametrize("num_unicasts", [0]) +@pytest.mark.parametrize("num_links", [1]) +@pytest.mark.parametrize("num_op_invocations", [1]) +@pytest.mark.parametrize("line_sync", [True]) +@pytest.mark.parametrize("line_size", [4]) +@pytest.mark.parametrize("packet_size", [4096]) +@pytest.mark.parametrize( + "expected_bw", + [5.65], +) +def test_fabric_edm_mcast_bw( + num_mcasts, num_unicasts, num_links, num_op_invocations, line_sync, line_size, packet_size, expected_bw +): + run_fabric_edm( + False, + num_mcasts, + num_unicasts, + num_links, + num_op_invocations, + line_sync, + line_size, + packet_size, + expected_bw, + ) + + +@pytest.mark.parametrize("num_mcasts", [0]) +@pytest.mark.parametrize("num_unicasts", [200000]) +@pytest.mark.parametrize("num_links", [1]) 
+@pytest.mark.parametrize("num_op_invocations", [1]) +@pytest.mark.parametrize("line_sync", [True]) +@pytest.mark.parametrize("line_size", [2]) +@pytest.mark.parametrize("packet_size", [4096]) +@pytest.mark.parametrize( + "expected_bw", + [7.13], +) +def test_fabric_edm_unicast_bw( + num_mcasts, num_unicasts, num_links, num_op_invocations, line_sync, line_size, packet_size, expected_bw +): + run_fabric_edm( + True, + num_mcasts, + num_unicasts, + num_links, + num_op_invocations, + line_sync, + line_size, + packet_size, + expected_bw, + ) diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt index 93fedd81a9f..4afd6a0cbf6 100644 --- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt +++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt @@ -23,6 +23,8 @@ set(TTNN_CCL_UNIT_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/ccl/test_ccl_reduce_scatter_host_helpers.cpp ) +set(TTNN_FABRIC_EDM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/ccl/test_fabric_edm.cpp) + set(TTNN_TENSOR_UNIT_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/tensor/common_tensor_test_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_create_tensor.cpp @@ -40,6 +42,7 @@ set(TTNN_TENSOR_UNIT_TESTS_SRC add_executable(unit_tests_ttnn ${TTNN_UNIT_TESTS_SRC}) TT_ENABLE_UNITY_BUILD(unit_tests_ttnn) add_executable(unit_tests_ttnn_ccl ${TTNN_CCL_UNIT_TESTS_SRC}) +add_executable(unit_tests_ttnn_fabric_edm ${TTNN_FABRIC_EDM_SRC}) add_executable(unit_tests_ttnn_tensor ${TTNN_TENSOR_UNIT_TESTS_SRC}) add_executable(test_multi_device ${CMAKE_CURRENT_SOURCE_DIR}/test_multi_device.cpp) add_executable(galaxy_unit_tests_ttnn ${CMAKE_CURRENT_SOURCE_DIR}/test_ccl_on_galaxy.cpp) @@ -47,6 +50,7 @@ add_executable(galaxy_unit_tests_ttnn ${CMAKE_CURRENT_SOURCE_DIR}/test_ccl_on_ga # Set up properties for all targets setup_ttnn_test_target(unit_tests_ttnn) setup_ttnn_test_target(unit_tests_ttnn_ccl) +setup_ttnn_test_target(unit_tests_ttnn_fabric_edm) setup_ttnn_test_target(unit_tests_ttnn_tensor) setup_ttnn_test_target(test_multi_device) setup_ttnn_test_target(galaxy_unit_tests_ttnn) diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp index c22ae1d57f3..fc38137a98e 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp @@ -131,7 +131,7 @@ void kernel_main() { unicast_packet_header->to_chip_unicast(static_cast(unicast_hops)); { - DeviceZoneScopedN("MAIN-WRITE-ZONE"); + DeviceZoneScopedN("MAIN-WRITE-MCAST-ZONE"); for (size_t i = 0; i < num_mcasts; i++) { auto noc0_dest_addr = safe_get_noc_addr( static_cast(dest_noc_x), static_cast(dest_noc_y), dest_bank_addr, 0); @@ -165,16 +165,19 @@ void kernel_main() { } } - for (size_t i = 0; i < num_unicasts; i++) { - auto noc0_dest_addr = - safe_get_noc_addr(static_cast(dest_noc_x), static_cast(dest_noc_y), dest_bank_addr, 0); - auto& fabric_conn = - unicast_is_fwd ? 
fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); - unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); - fabric_conn.wait_for_empty_write_slot(); - fabric_conn.send_payload_without_header_non_blocking_from_address( - source_l1_buffer_address, packet_payload_size_bytes); - fabric_conn.send_payload_blocking_from_address((uint32_t)unicast_packet_header, sizeof(PACKET_HEADER_TYPE)); + { + DeviceZoneScopedN("MAIN-WRITE-UNICAST-ZONE"); + for (size_t i = 0; i < num_unicasts; i++) { + auto noc0_dest_addr = + safe_get_noc_addr(static_cast(dest_noc_x), static_cast(dest_noc_y), dest_bank_addr, 0); + auto& fabric_conn = + unicast_is_fwd ? fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); + unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); + fabric_conn.wait_for_empty_write_slot(); + fabric_conn.send_payload_without_header_non_blocking_from_address( + source_l1_buffer_address, packet_payload_size_bytes); + fabric_conn.send_payload_blocking_from_address((uint32_t)unicast_packet_header, sizeof(PACKET_HEADER_TYPE)); + } } if (enable_finish_synchronization) { diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm.cpp new file mode 100644 index 00000000000..6563ecff3a0 --- /dev/null +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm.cpp @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp" + +int main(int argc, char** argv) { + std::size_t arg_idx = 1; + std::size_t num_mcasts = std::stoi(argv[arg_idx++]); + std::size_t num_unicasts = std::stoi(argv[arg_idx++]); + std::size_t num_links = std::stoi(argv[arg_idx++]); + std::size_t num_op_invocations = std::stoi(argv[arg_idx++]); + bool line_sync = std::stoi(argv[arg_idx++]); + std::size_t line_size = std::stoi(argv[arg_idx++]); + std::size_t packet_payload_size_bytes = std::stoi(argv[arg_idx++]); + + WriteThroughputStabilityTestWithPersistentFabricParams params; + params.line_sync = line_sync; + params.line_size = line_size; + RunWriteThroughputStabilityTestWithPersistentFabric( + num_mcasts, num_unicasts, num_links, num_op_invocations, params, packet_payload_size_bytes); +} diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp new file mode 100644 index 00000000000..1a9465f67b7 --- /dev/null +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp @@ -0,0 +1,2344 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "tt-metalium/kernel_types.hpp" +#include "tt_metal/test_utils/df/df.hpp" +#include "tt_metal/test_utils/env_vars.hpp" +#include "ttnn/common/queue_id.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp" +#include "ttnn/operations/ccl/common/uops/ccl_host_commands.hpp" +#include "ttnn/cpp/ttnn/operations/creation.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_command.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp" +#include "ttnn/cpp/ttnn/operations/ccl/common/host/ccl_command_stream_builders.hpp" + +#include +#include +#include "ttnn/cpp/ttnn/operations/experimental/reshape/view.hpp" + +#include + +#include "umd/device/types/arch.h" +#include "umd/device/types/cluster_descriptor_types.h" +#include "gtest/gtest.h" + +#include +#include +#include +#include +#include + +#include "tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp" + +using namespace tt; +using namespace tt::test_utils; +using namespace tt::test_utils::df; + +enum TwoInputReaderKernelWriteMode { LOCAL_WRITEBACK, FABRIC_UNICAST, FABRIC_MULTICAST }; + +static constexpr size_t TEST_WORKERS_SUBDEVICE_INDEX = 0; +static constexpr size_t TEST_EDM_FABRIC_SUBDEVICE_INDEX = 1; + +using subdevice_managers_t = std::unordered_map; +struct SubdeviceInfo { + std::unordered_map sub_device_managers; + std::unordered_map worker_subdevice_id; + std::unordered_map fabric_subdevice_id; +}; + +using tt::tt_metal::distributed::MeshCoordinate; +using tt::tt_metal::distributed::MeshDevice; +using tt::tt_metal::distributed::MeshDeviceConfig; +using tt::tt_metal::distributed::MeshDeviceView; +using tt::tt_metal::distributed::MeshShape; +class T3000TestDevice { +public: + T3000TestDevice() : device_open(false) { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + TT_THROW("This suite can only be run without TT_METAL_SLOW_DISPATCH_MODE set"); + } + arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + + num_devices_ = tt::tt_metal::GetNumAvailableDevices(); + if (arch_ == tt::ARCH::WORMHOLE_B0 and num_devices_ == 8 and tt::tt_metal::GetNumPCIeDevices() == 4) { + mesh_device_ = MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape{2, 4}}); + + std::vector ids(num_devices_, 0); + std::iota(ids.begin(), ids.end(), 0); + + } else { + TT_THROW("This suite can only be run on T3000 Wormhole devices"); + } + device_open = true; + } + ~T3000TestDevice() { + if (device_open) { + TearDown(); + } + } + + void TearDown() { + device_open = false; + mesh_device_->close(); + } + + tt::ARCH arch_; + size_t num_devices_; + std::shared_ptr mesh_device_; + +private: + bool device_open; +}; + +struct BankedConfig { + size_t num_pages; + size_t size_bytes; + size_t page_size_bytes; + BufferType input_buffer_type; + BufferType output_buffer_type; + tt::DataFormat l1_data_format; +}; + +struct KernelXY { + uint16_t x; + uint16_t y; + + uint32_t to_uint32() const { return y << 16 | x; } +}; + +enum Correctness { Correct, Incorrect }; + +template +Correctness run_output_check(CONTAINER_T const& inputs, CONTAINER_T 
output_buffer) { + constexpr bool debug_mode = true; + + log_info(tt::LogTest, "Checking outputs"); + bool pass = true; + + std::size_t num_printed_mismatches = 0; + for (size_t i = 0; i < inputs.size() && num_printed_mismatches < 64; i++) { + if (output_buffer[i] != inputs[i]) { + if (debug_mode) { + if (pass) { + log_error("Output mismatch"); + } + log_error("[{}]: expected {} got {}", i, inputs[i], output_buffer[i]); + num_printed_mismatches++; + } + pass = false; + } + } + if (num_printed_mismatches > 0) { + log_error("... (remaining mismatches omitted)"); + } + + log_info(tt::LogTest, "Output check: {}", pass ? "PASS" : "FAIL"); + return pass ? Correctness::Correct : Correctness::Incorrect; +}; + +static SubdeviceInfo create_subdevices(const std::vector& devices) { + SubdeviceInfo subdevice_info; + std::unordered_map sub_device_manager_ids; + for (auto device : devices) { + const auto& tensix_sub_device = + tt_metal::SubDevice(std::array{device->worker_cores(HalProgrammableCoreType::TENSIX, SubDeviceId{0})}); + const auto& eth_sub_device = tt_metal::SubDevice( + std::array{CoreRangeSet(), device->worker_cores(HalProgrammableCoreType::ACTIVE_ETH, SubDeviceId{0})}); + subdevice_info.sub_device_managers.insert( + {device->id(), device->create_sub_device_manager({tensix_sub_device, eth_sub_device}, 0)}); + device->load_sub_device_manager(subdevice_info.sub_device_managers.at(device->id())); + subdevice_info.worker_subdevice_id.insert( + {device->id(), device->get_sub_device_ids().at(TEST_WORKERS_SUBDEVICE_INDEX)}); + subdevice_info.fabric_subdevice_id.insert( + {device->id(), device->get_sub_device_ids().at(TEST_EDM_FABRIC_SUBDEVICE_INDEX)}); + device->set_sub_device_stall_group({subdevice_info.worker_subdevice_id.at(device->id())}); + } + + return subdevice_info; +} + +Correctness run_output_check( + const std::vector& all_zeros, + const std::vector& inputs, + std::shared_ptr& output_buffer) { + constexpr bool debug_mode = true; + std::vector readback_data_vec(all_zeros.size(), 0); // init to 0 data for easier debug + + tt_metal::detail::ReadFromBuffer(output_buffer, readback_data_vec); + return run_output_check(inputs, readback_data_vec); +}; + +void run_programs(std::vector& programs, const std::vector& devices) { + EXPECT_EQ(programs.size(), devices.size()); + const size_t num_programs = programs.size(); + try { + for (size_t i = 0; i < num_programs; i++) { + tt::tt_metal::detail::CompileProgram(devices.at(i), programs.at(i)); + } + } catch (std::exception& e) { + log_error("Failed compile: {}", e.what()); + throw e; + } + + log_info(tt::LogTest, "Running..."); + + std::vector threads; + threads.reserve(num_programs); + if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE")) { + for (size_t i = 0; i < num_programs; i++) { + threads.emplace_back(std::thread([&] { tt_metal::detail::LaunchProgram(devices.at(i), programs.at(i)); })); + } + + std::ranges::for_each(threads, [](std::thread& t) { t.join(); }); + } else { + for (size_t i = 0; i < num_programs; i++) { + tt_metal::EnqueueProgram(devices.at(i)->command_queue(), programs.at(i), false); + } + + log_debug(tt::LogTest, "Calling Finish"); + for (size_t i = 0; i < num_programs; i++) { + tt_metal::Finish(devices.at(i)->command_queue()); + } + } +} + +std::tuple, std::vector> build_input_buffer( + IDevice* first_device, size_t tensor_size_bytes, const BankedConfig& test_config) { + auto inputs = std::vector(tensor_size_bytes / sizeof(uint32_t), 0); + std::iota(inputs.begin(), inputs.end(), 0); + + // Input buffer + auto local_input_buffer = 
CreateBuffer(InterleavedBufferConfig{ + first_device, test_config.size_bytes, test_config.page_size_bytes, test_config.input_buffer_type}); + tt_metal::detail::WriteToBuffer(local_input_buffer, inputs); + return {local_input_buffer, inputs}; +} + +static void build_and_enqueue( + const std::vector& devices, std::vector& programs, bool enqueue_only = false) { + TT_FATAL( + devices.size() == programs.size(), + "Number of devices must match number of programs when calling build_and_enqueue in test"); + if (!enqueue_only) { + for (size_t i = 0; i < devices.size(); i++) { + tt::tt_metal::detail::CompileProgram(devices[i], programs[i]); + } + } + for (size_t i = 0; i < devices.size(); i++) { + tt_metal::EnqueueProgram(devices[i]->command_queue(), programs[i], false); + } +} + +struct EthLinkHop { + CoreCoord hop_src; + CoreCoord hop_dest; +}; + +struct ChipConnection { + std::vector links; +}; + +struct unicast_send { + size_t distance; +}; +struct mcast_send { + size_t distance; + size_t range; +}; + +using mode_variant_t = std::variant; + +static constexpr size_t PACKET_HEADER_SIZE_BYTES = sizeof(tt::fabric::PacketHeader); +void generate_sender_worker_kernels( + Program& program, + IDevice* device, + const CoreCoord& worker_core, + const ttnn::ccl::SenderWorkerAdapterSpec& worker_fabric_connection, + const mode_variant_t& mode, + std::size_t edm_buffer_size, + uint32_t page_plus_header_size, + uint32_t num_pages_total, + uint32_t num_pages_per_edm_buffer, + uint32_t local_worker_fabric_semaphore_id, + uint32_t local_worker_teardown_semaphore_id, + uint32_t local_worker_last_message_semaphore_id, + uint32_t dram_input_buffer_base_addr, + bool src_is_dram, + uint32_t dram_output_buffer_base_addr, + bool dest_is_dram, + uint32_t worker_buffer_index_semaphore_id, + // farthest to closest + const std::vector& edm_termination_infos) { + const auto& edm_noc_core = CoreCoord(worker_fabric_connection.edm_noc_x, worker_fabric_connection.edm_noc_y); + std::vector sender_worker_reader_compile_args{ + src_is_dram, // + num_pages_total, // + page_plus_header_size - PACKET_HEADER_SIZE_BYTES, + num_pages_per_edm_buffer}; + std::vector sender_worker_reader_runtime_args{dram_input_buffer_base_addr}; + + log_trace(tt::LogTest, "\tSenderReader CT Args"); + for (const auto& arg : sender_worker_reader_compile_args) { + log_trace(tt::LogTest, "\t\t{}", arg); + } + log_trace(tt::LogTest, "\tSenderReader RT Args"); + for (const auto& arg : sender_worker_reader_runtime_args) { + log_trace(tt::LogTest, "\t\t{}", arg); + } + + std::vector sender_worker_writer_compile_args{ + num_pages_per_edm_buffer, + num_pages_total, + page_plus_header_size - PACKET_HEADER_SIZE_BYTES, + worker_fabric_connection.num_buffers_per_channel, + dest_is_dram, + std::holds_alternative(mode) ? 
1 : 0}; + log_trace(tt::LogTest, "worker_fabric_connection.edm_l1_sem_addr: {}", worker_fabric_connection.edm_l1_sem_addr); + log_trace(tt::LogTest, "worker_buffer_index_semaphore_id: {}", worker_buffer_index_semaphore_id); + log_trace(tt::LogTest, "last_message_semaphore_address: {}", local_worker_last_message_semaphore_id); + log_trace( + tt::LogTest, "Sender communicating with EDM: x={}, y={}", (uint32_t)edm_noc_core.x, (uint32_t)edm_noc_core.y); + std::vector sender_worker_writer_runtime_args{ + worker_fabric_connection.edm_buffer_base_addr, + worker_fabric_connection.edm_l1_sem_addr, + local_worker_fabric_semaphore_id, + local_worker_teardown_semaphore_id, + (uint32_t)edm_noc_core.x, + (uint32_t)edm_noc_core.y, + worker_fabric_connection.num_buffers_per_channel, + + worker_fabric_connection.edm_connection_handshake_addr, + worker_fabric_connection.edm_worker_location_info_addr, + edm_buffer_size, + dram_output_buffer_base_addr, + local_worker_last_message_semaphore_id, + worker_buffer_index_semaphore_id, + worker_fabric_connection.persistent_fabric ? 1 : 0, + worker_fabric_connection.buffer_index_semaphore_id}; + + if (std::holds_alternative(mode)) { + sender_worker_writer_runtime_args.push_back(std::get(mode).distance); + sender_worker_writer_runtime_args.push_back(std::get(mode).range); + } else { + sender_worker_writer_runtime_args.push_back(std::get(mode).distance); + } + + get_runtime_args_for_edm_termination_infos(edm_termination_infos, sender_worker_writer_runtime_args); + + uint32_t src0_cb_index = CBIndex::c_0; + log_trace(tt::LogTest, "\tSenderWriter CT Args"); + for (const auto& arg : sender_worker_writer_compile_args) { + log_trace(tt::LogTest, "\t\t{}", arg); + } + log_trace(tt::LogTest, "\tSenderWriter RT Args"); + for (const auto& arg : sender_worker_writer_runtime_args) { + log_trace(tt::LogTest, "\t\t{}", arg); + } + + // Just want a dummy DF + tt::DataFormat df = (page_plus_header_size - PACKET_HEADER_SIZE_BYTES) == 1024 ? tt::DataFormat::Bfp8 + : (page_plus_header_size - PACKET_HEADER_SIZE_BYTES) == 2048 ? 
tt::DataFormat::Float16 + : tt::DataFormat::Float32; + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig(2 * num_pages_per_edm_buffer * page_plus_header_size, {{src0_cb_index, df}}) + .set_page_size(src0_cb_index, page_plus_header_size); + CBHandle sender_workers_cb = CreateCircularBuffer(program, worker_core, cb_src0_config); + auto sender_worker_reader_kernel = tt_metal::CreateKernel( + program, + "tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp", + worker_core, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = sender_worker_reader_compile_args}); + auto sender_worker_writer_kernel = tt_metal::CreateKernel( + program, + "tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp", + worker_core, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_1, + .noc = tt_metal::NOC::RISCV_1_default, + .compile_args = sender_worker_writer_compile_args}); + tt_metal::SetRuntimeArgs(program, sender_worker_reader_kernel, worker_core, sender_worker_reader_runtime_args); + tt_metal::SetRuntimeArgs(program, sender_worker_writer_kernel, worker_core, sender_worker_writer_runtime_args); +} + +bool RunLoopbackTest( + tt_metal::IDevice* sender_device, + tt_metal::IDevice* receiver_device, + + const CoreCoord& eth_sender_core, + const CoreCoord& eth_receiver_core, + + const uint32_t page_size, + const uint32_t num_pages_total, + bool src_is_dram, + bool dest_is_dram, + std::vector& programs, + ttnn::ccl::FabricEriscDatamoverBuilder& chip_0_edm_builder, + std::optional& subdevice_managers, + bool enable_persistent_fabric) { + auto& sender_program = programs.at(0); + std::size_t page_plus_header_size = page_size + sizeof(tt::fabric::PacketHeader); + std::size_t tensor_size_bytes = num_pages_total * page_size; + + std::vector worker_cores = {CoreCoord(0, 0)}; + + auto local_worker_fabric_semaphore_id = tt::tt_metal::CreateSemaphore(sender_program, worker_cores.at(0), 0); + auto local_worker_teardown_semaphore_id = tt::tt_metal::CreateSemaphore(sender_program, worker_cores.at(0), 0); + auto local_worker_last_message_semaphore_id = tt::tt_metal::CreateSemaphore(sender_program, worker_cores.at(0), 0); + auto worker_buffer_index_semaphore_id = tt::tt_metal::CreateSemaphore(sender_program, worker_cores.at(0), 0); + + // Generate inputs + //////////////////////////////////////////////////////////////////////////// + // SETUP THE INPUT CB + //////////////////////////////////////////////////////////////////////////// + + BankedConfig test_config = BankedConfig{ + .num_pages = num_pages_total, + .size_bytes = tensor_size_bytes, + .page_size_bytes = page_size, + .input_buffer_type = src_is_dram ? BufferType::DRAM : BufferType::L1, + .output_buffer_type = dest_is_dram ? 
BufferType::DRAM : BufferType::L1, + .l1_data_format = tt::DataFormat::Float16_b}; + + auto [local_input_buffer, inputs] = build_input_buffer(sender_device, tensor_size_bytes, test_config); + + std::vector all_zeros(inputs.size(), 0); + auto local_output_buffer = CreateBuffer(InterleavedBufferConfig{ + sender_device, test_config.size_bytes, test_config.page_size_bytes, test_config.output_buffer_type}); + + tt_metal::detail::WriteToBuffer(local_output_buffer, all_zeros); + + auto local_input_buffer_address = local_input_buffer->address(); + auto local_output_buffer_address = local_output_buffer->address(); + + //////////////////////////////////////////////////////////////////////////// + // EDM Builder Setup + //////////////////////////////////////////////////////////////////////////// + + static constexpr std::size_t edm_buffer_size = + ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES; + + auto chip0_worker_fabric_connection = chip_0_edm_builder.build_connection_to_worker_channel(); + //////////////////////////////////////////////////////////////////////////// + // Build Workers + //////////////////////////////////////////////////////////////////////////// + log_trace(tt::LogTest, "Generating local_sender -> remote_receiver workers"); + const std::size_t pages_per_send = + (chip0_worker_fabric_connection.buffer_size_bytes - PACKET_HEADER_SIZE_BYTES) / page_size; + const auto& worker_core = worker_cores.at(0); + log_trace(tt::LogTest, "Worker {}. On Core x={},y={}", 0, worker_core.x, worker_core.y); + + const auto& edm_config = ttnn::ccl::FabricEriscDatamoverConfig(edm_buffer_size, 1, 2); + const std::vector& edm_termination_infos = + enable_persistent_fabric ? std::vector{} + : std::vector{ + {1, + sender_device->ethernet_core_from_logical_core(eth_receiver_core).x, + sender_device->ethernet_core_from_logical_core(eth_receiver_core).y, + chip_0_edm_builder.config.termination_signal_address}, + {0, + sender_device->ethernet_core_from_logical_core(eth_sender_core).x, + sender_device->ethernet_core_from_logical_core(eth_sender_core).y, + chip_0_edm_builder.config.termination_signal_address}}; + + TT_ASSERT( + (enable_persistent_fabric && edm_termination_infos.size() == 0) || + (!enable_persistent_fabric && edm_termination_infos.size() > 0)); + generate_sender_worker_kernels( + sender_program, + sender_device, + worker_core, + chip0_worker_fabric_connection, + unicast_send{2}, // 2 hops because we are looping back to ourselves + edm_buffer_size, + page_plus_header_size, + num_pages_total, + pages_per_send, + local_worker_fabric_semaphore_id, + local_worker_teardown_semaphore_id, + local_worker_last_message_semaphore_id, + local_input_buffer_address, + src_is_dram, + local_output_buffer_address, + dest_is_dram, + worker_buffer_index_semaphore_id, + edm_termination_infos); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + std::vector devices = {sender_device}; + if (!enable_persistent_fabric) { + devices.push_back(receiver_device); + } + log_trace(tt::LogTest, "{} programs, {} devices", programs.size(), devices.size()); + run_programs(programs, devices); + log_info(tt::LogTest, "Reading back outputs"); + + bool pass = true; + constexpr bool enable_check = true; + if constexpr (enable_check) { + pass &= run_output_check(all_zeros, inputs, local_output_buffer) == Correctness::Correct; + } + return 
pass; +} + +void generate_multi_input_test_worker_reader_kernel( + Program& program, + const std::vector& cb_indices, + const std::vector& tensors, + IDevice* device, + uint32_t page_size, + const CoreRangeSet& worker_core_range, + uint32_t num_pages_per_edm_buffer, + const ttnn::ccl::v2::TensorSlice& in0_command_tensor_slice, + const ttnn::ccl::v2::TensorSlice& in1_command_tensor_slice, + ttnn::ccl::cmd::CclCommandCode command_type, + const DataMovementConfig& datamovement_kernel_config, + const std::optional& chip0_worker_forward_fabric_connection, + const std::optional& chip0_worker_backward_fabric_connection, + const std::optional& optional_teardown_sequence, + const ttnn::ccl::cmd::CclCommandDestArgs& dest_args) { + bool fabric_enabled = std::holds_alternative(dest_args) || + std::holds_alternative(dest_args); + using namespace ttnn::ccl::cmd::uops; + using namespace ttnn::ccl::cmd; + log_trace( + tt::LogTest, + "Generating multi input test worker reader kernel for command type: {}", + static_cast(command_type)); + + TT_FATAL( + command_type == ttnn::ccl::cmd::CclCommandCode::STREAM_TENSOR_TO_CB || + command_type == ttnn::ccl::cmd::CclCommandCode::STREAM_CB_TO_TENSOR, + "Unsupported tensor IO command type"); + + TT_ASSERT(tensors.size() > 0 && tensors.size() <= 2); + TT_ASSERT(cb_indices.size() == tensors.size()); + + auto sender_worker_reader_kernel = ttnn::ccl::worker_detail::generate_multi_command_stream_kernel_ct_args( + program, cb_indices, tensors, worker_core_range, datamovement_kernel_config); + + std::vector ccl_command_stream0; + std::vector ccl_command_stream1; + + // Add the main tensor slice commands + if (command_type == ttnn::ccl::cmd::CclCommandCode::STREAM_TENSOR_TO_CB) { + log_trace(tt::LogTest, "Adding local noc read"); + if (fabric_enabled) { + ccl_command_stream0.push_back( + read_tensor_slice_to_cb_for_eventual_fabric_write(in0_command_tensor_slice, cb_indices.at(0))); + ccl_command_stream1.push_back( + read_tensor_slice_to_cb_for_eventual_fabric_write(in1_command_tensor_slice, cb_indices.at(1))); + } else { + ccl_command_stream0.push_back(read_tensor_slice_to_cb(in0_command_tensor_slice, cb_indices.at(0))); + ccl_command_stream1.push_back(read_tensor_slice_to_cb(in1_command_tensor_slice, cb_indices.at(1))); + } + } else { + if (std::holds_alternative(dest_args)) { + log_trace(tt::LogTest, "Adding local noc write"); + ccl_command_stream0.push_back(local_write_cb_to_tensor_slice(in0_command_tensor_slice, cb_indices.at(0))); + ccl_command_stream1.push_back(local_write_cb_to_tensor_slice(in1_command_tensor_slice, cb_indices.at(1))); + } else { + if (std::holds_alternative(dest_args)) { + log_trace( + tt::LogTest, + "Adding fabric unicast write command. Distance: {}. Forward: {}", + std::get(dest_args).distance_in_hops, + std::get(dest_args).is_forward_direction); + ccl_command_stream0.push_back(fabric_write_cb_to_tensor_slice( + in0_command_tensor_slice, + cb_indices.at(0), + UnicastCommandDestArgs{std::get(dest_args)})); + ccl_command_stream1.push_back(fabric_write_cb_to_tensor_slice( + in1_command_tensor_slice, + cb_indices.at(1), + UnicastCommandDestArgs{std::get(dest_args)})); + } else if (std::holds_alternative(dest_args)) { + log_trace( + tt::LogTest, + "Adding fabric multicast write command. Forward: {}. 
Backward: {}", + std::get(dest_args).num_targets_forward_direction, + std::get(dest_args).num_targets_backward_direction); + ccl_command_stream0.push_back(fabric_write_cb_to_tensor_slice( + in0_command_tensor_slice, + cb_indices.at(0), + MulticastCommandDestArgs{std::get(dest_args)})); + ccl_command_stream1.push_back(fabric_write_cb_to_tensor_slice( + in1_command_tensor_slice, + cb_indices.at(1), + MulticastCommandDestArgs{std::get(dest_args)})); + } else { + log_trace(tt::LogTest, "WTF? Should have been caught earlier"); + TT_FATAL(true, "Unsupported dest args type"); + } + } + } + + // Now, because we are bringing up/tearing down the fabric per op with this program, we need to queue up the + // commands to teardown the fabric + // We need to make sure only one of the command streams is sending out the termination signals, and we + // need to make sure it only does that after the other command stream is done - so what we do is + // make the termination command stream wait for a semaphore value (locally) that the other command stream + // will set after it has finished. + if (optional_teardown_sequence.has_value()) { + std::ranges::copy(optional_teardown_sequence.value(), std::back_inserter(ccl_command_stream0)); + } + + ttnn::ccl::worker_detail::generate_multi_input_command_stream_kernel_rt_args( + program, + sender_worker_reader_kernel, + tensors, + {page_size, page_size}, + device, + num_pages_per_edm_buffer, // TODO: get from fabric + worker_core_range, + ccl_command_stream0, + ccl_command_stream1, + chip0_worker_forward_fabric_connection, + chip0_worker_backward_fabric_connection); +} + +void generate_multi_input_test_worker_kernels_for_local_tensor_write( + Program& program, + IDevice* device, + Tensor& input_tensor0, + Tensor& input_tensor1, + Tensor& output_tensor0, + Tensor& output_tensor1, + size_t first_cb_index, + size_t second_cb_index, + const CoreCoord& worker_core, + const uint32_t page_plus_header_size, + const uint32_t num_pages_per_edm_buffer, + const ttnn::ccl::v2::TensorSlice& in0_tensor_slice, + const ttnn::ccl::v2::TensorSlice& in1_tensor_slice, + const ttnn::ccl::v2::TensorSlice& out0_tensor_slice, + const ttnn::ccl::v2::TensorSlice& out1_tensor_slice, + const std::optional& optional_teardown_sequence, + std::optional& chip0_worker_forward_fabric_connection, + std::optional& chip0_worker_backward_fabric_connection, + const ttnn::ccl::cmd::CclCommandDestArgs& dest_args) { + // Just want a dummy DF + tt::DataFormat df = (page_plus_header_size - PACKET_HEADER_SIZE_BYTES) == 1024 ? tt::DataFormat::Bfp8 + : (page_plus_header_size - PACKET_HEADER_SIZE_BYTES) == 2048 ? 
tt::DataFormat::Float16 + : tt::DataFormat::Float32; + + { + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig(2 * num_pages_per_edm_buffer * page_plus_header_size, {{first_cb_index, df}}) + .set_page_size(first_cb_index, page_plus_header_size); + CBHandle cb0 = CreateCircularBuffer(program, worker_core, cb_src0_config); + } + { + tt_metal::CircularBufferConfig cb_src1_config = + tt_metal::CircularBufferConfig( + 2 * num_pages_per_edm_buffer * page_plus_header_size, {{second_cb_index, df}}) + .set_page_size(second_cb_index, page_plus_header_size); + CBHandle cb1 = CreateCircularBuffer(program, worker_core, cb_src1_config); + } + + generate_multi_input_test_worker_reader_kernel( + program, + {first_cb_index, second_cb_index}, + {&input_tensor0, &input_tensor1}, + device, + page_plus_header_size - PACKET_HEADER_SIZE_BYTES, + CoreRangeSet({CoreRange(worker_core)}), + num_pages_per_edm_buffer, + in0_tensor_slice, + in1_tensor_slice, + ttnn::ccl::cmd::CclCommandCode::STREAM_TENSOR_TO_CB, + tt_metal::ReaderDataMovementConfig{}, + std::nullopt, + std::nullopt, + std::nullopt, + dest_args); + + generate_multi_input_test_worker_reader_kernel( + program, + {first_cb_index, second_cb_index}, + {&output_tensor0, &output_tensor1}, + device, + page_plus_header_size - PACKET_HEADER_SIZE_BYTES, + CoreRangeSet({CoreRange(worker_core)}), + num_pages_per_edm_buffer, + out0_tensor_slice, + out1_tensor_slice, + ttnn::ccl::cmd::CclCommandCode::STREAM_CB_TO_TENSOR, + tt_metal::WriterDataMovementConfig{}, + chip0_worker_forward_fabric_connection, + chip0_worker_backward_fabric_connection, + optional_teardown_sequence, + dest_args); +} + +bool RunLocalTestWithMultiInputReaders( + const std::vector& devices, + std::vector& programs, + std::optional& line_fabric, + + Tensor& input_tensor0, + Tensor& input_tensor1, + Tensor& output_tensor0, + Tensor& output_tensor1, + std::vector input0_tensors, // Device + std::vector input1_tensors, // Device + std::vector output0_tensors, // Device + std::vector output1_tensors, // Device + + const ttnn::ccl::v2::TensorSlice& in0_tensor_slice, + const ttnn::ccl::v2::TensorSlice& in1_tensor_slice, + const ttnn::ccl::v2::TensorSlice& out0_tensor_slice, + const ttnn::ccl::v2::TensorSlice& out1_tensor_slice, + + const uint32_t page_size, + TwoInputReaderKernelWriteMode test_mode, + const ttnn::ccl::cmd::CclCommandDestArgs& dest_args, + std::optional& subdevice_managers, + bool enable_persistent_fabric) { + const bool fabric_enabled = test_mode != TwoInputReaderKernelWriteMode::LOCAL_WRITEBACK; + tt_metal::IDevice* device = devices.at(0); + for (size_t i = 0; i < devices.size(); i++) { + log_info(tt::LogTest, "Device[{}] ID: {}", i, devices.at(i)->id()); + } + auto program_ptrs = std::vector(); + program_ptrs.reserve(devices.size()); + std::ranges::transform(programs, std::back_inserter(program_ptrs), [](auto& p) { return &p; }); + + size_t output_tensor_dest_device_index = 0; + if (fabric_enabled) { + if (std::holds_alternative(dest_args)) { + log_info( + tt::LogTest, + "Unicast command dest args. Distance in hops: {}", + std::get(dest_args).distance_in_hops); + output_tensor_dest_device_index = + std::get(dest_args).distance_in_hops; + TT_ASSERT(output_tensor_dest_device_index != 0, "Output tensor destination device index must be non-zero"); + TT_ASSERT(test_mode == TwoInputReaderKernelWriteMode::FABRIC_UNICAST); + } else if (std::holds_alternative(dest_args)) { + log_info( + tt::LogTest, + "Multicast command dest args. 
Number of targets forward direction: {}", + std::get(dest_args).num_targets_forward_direction); + output_tensor_dest_device_index = + std::get(dest_args).num_targets_forward_direction; + TT_ASSERT(output_tensor_dest_device_index != 0, "Output tensor destination device index must be non-zero"); + TT_ASSERT(test_mode == TwoInputReaderKernelWriteMode::FABRIC_MULTICAST); + } + } else { + log_info(tt::LogTest, "No fabric enabled"); + TT_ASSERT( + std::holds_alternative(dest_args), "Local command dest args expected"); + } + + std::size_t page_plus_header_size = page_size + sizeof(tt::fabric::PacketHeader); + + auto first_cb_index = tt::CB::c_in0; + auto second_cb_index = tt::CB::c_in1; + + auto output_tensor_dest_device = devices.at(output_tensor_dest_device_index); + TT_ASSERT(input_tensor0.get_logical_shape()[-2] != 1); + + bool is_fabric_mcast = std::holds_alternative(dest_args); + + auto input_tensor0_device = input0_tensors.at(0); + auto input_tensor1_device = input1_tensors.at(0); + auto output_tensor0_device = output0_tensors.at(output_tensor_dest_device_index); + auto output_tensor1_device = output1_tensors.at(output_tensor_dest_device_index); + + log_info(tt::LogTest, "input_tensor0_device->address(): {}", input_tensor0_device.buffer()->address()); + log_info(tt::LogTest, "input_tensor1_device->address(): {}", input_tensor1_device.buffer()->address()); + log_info( + tt::LogTest, + "output_tensor0_device->address(): {} on device {}", + output_tensor0_device.buffer()->address(), + output_tensor_dest_device->id()); + log_info( + tt::LogTest, + "output_tensor1_device->address(): {} on device {}", + output_tensor1_device.buffer()->address(), + output_tensor_dest_device->id()); + + //////////////////////////////////////////////////////////////////////////// + // Build Workers + //////////////////////////////////////////////////////////////////////////// + const auto& worker_core = CoreCoord(0, 0); + + const size_t num_pages_per_edm_buffer = 2; + + std::optional chip0_worker_forward_fabric_connection = + fabric_enabled ? line_fabric->uniquely_connect_worker(devices[0], ttnn::ccl::EdmLineFabricOpInterface::FORWARD) + : std::optional{std::nullopt}; + + // always at start of line for now + std::optional> edm_termination_infos = + (!fabric_enabled || enable_persistent_fabric) + ? 
std::optional>{std::nullopt} + : line_fabric->generate_ordered_termination_info_farthest_to_nearest(); + std::optional chip0_worker_backward_fabric_connection = std::nullopt; + + std::optional sync_details; + std::optional teardown_worker_core; + std::optional teardown_command_stream; + if (fabric_enabled && !enable_persistent_fabric) { + teardown_worker_core = worker_core; + + sync_details = ttnn::ccl::SyncModeSpec{}; + sync_details->core = teardown_worker_core.value(); + sync_details->add_signal(tt::tt_metal::CreateSemaphore(programs.at(0), teardown_worker_core.value(), 0), 1); + teardown_command_stream = {ttnn::ccl::cmd::uops::local_core_semaphore_inc(sync_details->sem_ids.at(0), 1)}; + TT_FATAL(edm_termination_infos.has_value(), "EDM termination infos must be set if fabric is enabled"); + ttnn::ccl::cmd::CclHostLowLevelCommandSequence teardown_commands; + + teardown_commands = ttnn::ccl::worker_detail::build_ccl_cmd_proc_teardown_commands( + programs.at(0), + device, + nullptr, // forward device - in this test, we have a single source doing all teardown + devices.size(), + 0, + edm_termination_infos.value(), + sync_details.value(), + line_fabric.value()); + std::ranges::copy(teardown_commands, std::back_inserter(teardown_command_stream.value())); + } + + generate_multi_input_test_worker_kernels_for_local_tensor_write( + programs.at(0), + device, + input_tensor0_device, + input_tensor1_device, + output_tensor0_device, + output_tensor1_device, + first_cb_index, + second_cb_index, + worker_core, + page_plus_header_size, + num_pages_per_edm_buffer, + in0_tensor_slice, + in1_tensor_slice, + out0_tensor_slice, + out1_tensor_slice, + teardown_command_stream, + chip0_worker_forward_fabric_connection, + chip0_worker_backward_fabric_connection, + dest_args); + + if (!enable_persistent_fabric) { + log_info(tt::LogTest, "Building EDM kernels"); + line_fabric->build_kernels(); + } + + log_info(tt::LogTest, "persistent_fabric: {}", enable_persistent_fabric); + log_info(tt::LogTest, "subdevice_managers.has_value(): {}", subdevice_managers.has_value()); + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + run_programs(programs, enable_persistent_fabric ? 
std::vector{devices[0]} : devices); + log_info(tt::LogTest, "Finished"); + + bool pass = true; + constexpr bool enable_check = true; + if constexpr (enable_check) { + log_info(tt::LogTest, "Reading back outputs"); + auto output0_cpu = output_tensor0_device.cpu(true, ttnn::DefaultQueueId); + auto output1_cpu = output_tensor1_device.cpu(true, ttnn::DefaultQueueId); + + auto in0_tensor_copyback_cpu = input_tensor0_device.cpu(true, ttnn::DefaultQueueId); + auto in1_tensor_copyback_cpu = input_tensor1_device.cpu(true, ttnn::DefaultQueueId); + + auto in0_tensor_copyback = tt::tt_metal::owned_buffer::get_as(in0_tensor_copyback_cpu); + auto in1_tensor_copyback = tt::tt_metal::owned_buffer::get_as(in1_tensor_copyback_cpu); + + auto in0_tensor_data = tt::tt_metal::owned_buffer::get_as(input_tensor0); + auto in1_tensor_data = tt::tt_metal::owned_buffer::get_as(input_tensor1); + auto out0_tensor_data = tt::tt_metal::owned_buffer::get_as(output0_cpu); + auto out1_tensor_data = tt::tt_metal::owned_buffer::get_as(output1_cpu); + + bool input0_copyback_check_passed = + run_output_check(in0_tensor_data, in0_tensor_copyback) == Correctness::Correct; + bool input1_copyback_check_passed = + run_output_check(in1_tensor_data, in1_tensor_copyback) == Correctness::Correct; + TT_FATAL(input0_copyback_check_passed, "Input 0 copyback check failed"); + TT_FATAL(input1_copyback_check_passed, "Input 1 copyback check failed"); + + log_info(tt::LogTest, "Comparing outputs"); + pass &= run_output_check(in0_tensor_data, out0_tensor_data) == Correctness::Correct; + if (pass) { + log_info(tt::LogTest, "Output check passed for output 0"); + } else { + log_error(tt::LogTest, "Output check failed for output 0"); + } + pass &= run_output_check(in1_tensor_data, out1_tensor_data) == Correctness::Correct; + if (pass) { + log_info(tt::LogTest, "Output check passed for output 1"); + } else { + log_error(tt::LogTest, "Output check failed for output 1"); + } + } + + return pass; +} + +bool RunLineFabricTest( + std::vector devices, + std::vector& programs, + + const size_t mcast_first_chip, + const size_t mcast_last_chip, + + const uint32_t page_size, + const uint32_t num_pages_total, + bool src_is_dram, + bool dest_is_dram, + + std::optional& subdevice_managers, + ttnn::ccl::EdmLineFabricOpInterface& line_fabric, + bool enable_persistent_fabric) { + std::size_t page_plus_header_size = page_size + sizeof(tt::fabric::PacketHeader); + std::size_t tensor_size_bytes = num_pages_total * page_size; + + static constexpr std::size_t edm_buffer_size = + ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES; + const size_t local_chip_id = 0; + const size_t remote_chip_id = 1; + auto program_ptrs = std::vector(devices.size()); + std::transform(programs.begin(), programs.end(), program_ptrs.begin(), [](auto& program) { return &program; }); + + std::vector worker_cores = {CoreCoord(0, 0)}; + + // Generate inputs + //////////////////////////////////////////////////////////////////////////// + // SETUP THE INPUT CB + //////////////////////////////////////////////////////////////////////////// + BankedConfig test_config = BankedConfig{ + .num_pages = num_pages_total, + .size_bytes = tensor_size_bytes, + .page_size_bytes = page_size, + .input_buffer_type = src_is_dram ? BufferType::DRAM : BufferType::L1, + .output_buffer_type = dest_is_dram ? 
BufferType::DRAM : BufferType::L1, + .l1_data_format = tt::DataFormat::Float16_b}; + + // Input buffer + auto [local_input_buffer, inputs] = build_input_buffer(devices[0], tensor_size_bytes, test_config); + auto local_input_buffer_address = local_input_buffer->address(); + + std::vector all_zeros(inputs.size(), 0); + // output buffers + TT_ASSERT( + enable_persistent_fabric || mcast_first_chip <= mcast_last_chip, + "mcast_first_chip must be less than or equal to mcast_last_chip"); + TT_ASSERT( + enable_persistent_fabric || mcast_last_chip < devices.size(), + "mcast_last_chip must be less than the number of devices"); + std::vector> output_buffers; + output_buffers.reserve(devices.size()); + for (size_t i = 0; i < devices.size(); i++) { + if (i == 0) { + output_buffers.push_back(CreateBuffer(InterleavedBufferConfig{ + devices.at(i), test_config.size_bytes, test_config.page_size_bytes, test_config.output_buffer_type})); + } else { + output_buffers.push_back(CreateBuffer( + InterleavedBufferConfig{ + devices.at(i), test_config.size_bytes, test_config.page_size_bytes, test_config.output_buffer_type}, + output_buffers[0]->address())); + } + tt_metal::detail::WriteToBuffer(output_buffers.back(), all_zeros); + } + auto local_output_buffer_address = output_buffers[0]->address(); + bool all_same_addr = std::ranges::all_of(output_buffers, [local_output_buffer_address](const auto& buffer) { + return buffer->address() == local_output_buffer_address; + }); + TT_ASSERT(all_same_addr, "All output buffers must have the same address"); + + //////////////////////////////////////////////////////////////////////////// + // Setup Semaphores and Builders + //////////////////////////////////////////////////////////////////////////// + + auto local_worker_fabric_semaphore_id = tt::tt_metal::CreateSemaphore(programs[0], worker_cores.at(0), 0); + auto local_worker_teardown_semaphore_id = tt::tt_metal::CreateSemaphore(programs[0], worker_cores.at(0), 0); + auto local_worker_last_message_semaphore_id = tt::tt_metal::CreateSemaphore(programs[0], worker_cores.at(0), 0); + auto worker_buffer_index_semaphore_id = tt::tt_metal::CreateSemaphore(programs[0], worker_cores.at(0), 0); + //////////////////////////////////////////////////////////////////////////// + // Build Workers + //////////////////////////////////////////////////////////////////////////// + log_trace(tt::LogTest, "Generating local_sender -> remote_receiver workers"); + const auto& worker_core = worker_cores.at(0); + log_trace(tt::LogTest, "Worker {}. On Core x={},y={}", 0, worker_core.x, worker_core.y); + + const auto edm_termination_infos = enable_persistent_fabric + ? 
std::vector{} + : line_fabric.generate_ordered_termination_info_farthest_to_nearest(); + + auto chip0_worker_fabric_connection = + line_fabric.uniquely_connect_worker(devices[0], ttnn::ccl::EdmLineFabricOpInterface::FORWARD); + + const std::size_t pages_per_send = + (chip0_worker_fabric_connection.buffer_size_bytes - PACKET_HEADER_SIZE_BYTES) / page_size; + generate_sender_worker_kernels( + programs[0], + devices[0], + worker_core, + chip0_worker_fabric_connection, + mcast_send{mcast_first_chip, mcast_last_chip - mcast_first_chip + 1}, + edm_buffer_size, + page_plus_header_size, + num_pages_total, + pages_per_send, + local_worker_fabric_semaphore_id, + local_worker_teardown_semaphore_id, + local_worker_last_message_semaphore_id, + local_input_buffer_address, + src_is_dram, + local_output_buffer_address, + dest_is_dram, + worker_buffer_index_semaphore_id, + edm_termination_infos); + + //////////////////////////////////////////////////////////////////////////// + // Build EDM Kernels + //////////////////////////////////////////////////////////////////////////// + if (!enable_persistent_fabric) { + line_fabric.build_kernels(); + } + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + run_programs(programs, devices); + log_info(tt::LogTest, "Reading back outputs"); + + bool pass = true; + constexpr bool enable_check = true; + if constexpr (enable_check) { + // Check all output buffers. Make sure only the buffers in the mcast range are + // non-zero. All other buffers outside the range should be zero filled + TT_ASSERT( + !std::all_of(inputs.begin(), inputs.end(), [](uint32_t x) { return x == 0; }), + "Input buffer expected to not be all 0"); + for (size_t i = 0; i < output_buffers.size(); i++) { + bool compare_with_input = (mcast_first_chip <= i && i <= mcast_last_chip); + auto& golden_tensor = compare_with_input ? 
inputs : all_zeros; + pass &= run_output_check(all_zeros, golden_tensor, output_buffers.at(i)) == Correctness::Correct; + } + } + + return pass; +} + +void persistent_fabric_teardown_sequence( + const std::vector& devices, + std::optional& subdevice_managers, + ttnn::ccl::EdmLineFabricOpInterface& line_fabric, + tt::fabric::TerminationSignal termination_mode = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE) { + log_info("Tearing down fabric"); + + // Wait for workers to finish + auto d0_worker_subdevice = devices[0]->get_sub_device_ids()[TEST_WORKERS_SUBDEVICE_INDEX]; + tt_metal::Finish(devices[0]->command_queue(), {subdevice_managers->worker_subdevice_id.at(devices[0]->id())}); + + // Teardown the fabric + line_fabric.teardown_from_host(termination_mode); + + // wait for fabric teardown to finish + std::ranges::for_each(devices, [&](IDevice* d) { + tt_metal::Finish(d->command_queue(), {subdevice_managers->fabric_subdevice_id.at(d->id())}); + }); +} + +void setup_test_with_persistent_fabric( + const std::vector& devices, + std::vector& programs, + std::optional& subdevice_managers, + std::optional>& fabric_programs, + std::vector& fabric_program_ptrs, + std::optional& line_fabric, + bool enable_persistent_fabric, + std::optional num_links = std::nullopt) { + if (enable_persistent_fabric) { + log_info(tt::LogTest, "Enabling persistent fabric"); + fabric_programs = std::vector(devices.size()); + subdevice_managers = create_subdevices(devices); + std::transform( + fabric_programs->begin(), fabric_programs->end(), std::back_inserter(fabric_program_ptrs), [](auto& p) { + return &p; + }); + } else { + std::transform( + programs.begin(), programs.end(), std::back_inserter(fabric_program_ptrs), [](auto& p) { return &p; }); + } + + line_fabric = ttnn::ccl::EdmLineFabricOpInterface( + devices, fabric_program_ptrs, enable_persistent_fabric, num_links.value_or(1)); + line_fabric->set_firmware_context_switch_interval(0); + + if (enable_persistent_fabric) { + TT_FATAL(fabric_programs.has_value(), "Fabric programs must be set if fabric is enabled"); + TT_FATAL(devices.size() == fabric_programs->size(), "Number of devices must match number of programs"); + + log_info(tt::LogTest, "Building EDM kernels"); + line_fabric->build_kernels(); + build_and_enqueue(devices, *fabric_programs); + } +} + +// RESUME HERE AND IMPLEMENT MCAST TEST +int TestLineFabricEntrypoint( + const size_t mcast_first_chip, + const size_t mcast_last_chip, + const uint32_t page_size, + const uint32_t num_pages_total, + const bool src_is_dram, + const bool dest_is_dram, + bool enable_persistent_fabric) { + // argv[0]: program + // argv[1]: buffer_size_bytes + // argv[2]: num_loops + + auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + auto num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (num_devices < 4) { + log_info("This test can only be run on T3000 devices"); + return 0; + } + if (arch == tt::ARCH::GRAYSKULL) { + log_info("Test must be run on WH"); + return 0; + } + + T3000TestDevice test_fixture; + auto view = test_fixture.mesh_device_->get_view(); + + // build a line of devices + std::vector devices = { + view.get_device(MeshCoordinate(0, 0)), + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(0, 3))}; + std::vector programs(enable_persistent_fabric ? 
1 : devices.size()); + std::optional subdevice_managers = std::nullopt; + std::optional> fabric_programs; + std::vector fabric_program_ptrs; + std::optional line_fabric; + setup_test_with_persistent_fabric( + devices, + programs, + subdevice_managers, + fabric_programs, + fabric_program_ptrs, + line_fabric, + enable_persistent_fabric); + + auto launch_workers = [&](std::vector& _programs) -> bool { + bool success = false; + try { + success = RunLineFabricTest( + enable_persistent_fabric ? std::vector{devices[0]} : devices, + _programs, + // fabric_hops, + + mcast_first_chip, + mcast_last_chip, + + page_size, + num_pages_total, + src_is_dram, + dest_is_dram, + + subdevice_managers, + line_fabric.value(), + enable_persistent_fabric); + + } catch (std::exception& e) { + log_error("Caught exception: {}", e.what()); + test_fixture.TearDown(); + return false; + } + return success; + }; + bool success = launch_workers(programs); + + if (enable_persistent_fabric) { + std::vector second_run_programs(1); + success = launch_workers(second_run_programs); + persistent_fabric_teardown_sequence( + devices, subdevice_managers, line_fabric.value(), tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE); + } + + test_fixture.TearDown(); + + return success ? 0 : -1; +} + +int TestLoopbackEntrypoint( + const uint32_t page_size, + const uint32_t num_pages_total, + const bool src_is_dram, + const bool dest_is_dram, + bool enable_persistent_fabric) { + // argv[0]: program + // argv[1]: buffer_size_bytes + // argv[2]: num_loops + std::optional subdevice_managers = std::nullopt; + + auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + auto num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (num_devices < 4) { + log_info("This test can only be run on T3000 devices"); + return 0; + } + if (arch == tt::ARCH::GRAYSKULL) { + log_info("Test must be run on WH"); + return 0; + } + + T3000TestDevice test_fixture; + auto view = test_fixture.mesh_device_->get_view(); + + const auto& device_0 = view.get_device(MeshCoordinate(0, 0)); + const auto& device_1 = view.get_device(MeshCoordinate(0, 1)); + + const auto& active_eth_cores = device_0->get_active_ethernet_cores(true); + auto eth_sender_core_iter = active_eth_cores.begin(); + auto eth_sender_core_iter_end = active_eth_cores.end(); + chip_id_t device_id = std::numeric_limits::max(); + tt_xy_pair eth_receiver_core; + bool initialized = false; + tt_xy_pair eth_sender_core; + do { + TT_FATAL(eth_sender_core_iter != eth_sender_core_iter_end, "Error"); + std::tie(device_id, eth_receiver_core) = device_0->get_connected_ethernet_core(*eth_sender_core_iter); + eth_sender_core = *eth_sender_core_iter; + eth_sender_core_iter++; + } while (device_id != device_1->id()); + TT_ASSERT(device_id == device_1->id()); + // const auto& device_1 = test_fixture.mesh_device_->get_device(device_id); + + std::vector programs(enable_persistent_fabric ? 1 : 2); + std::optional> fabric_programs; + auto& sender_program = programs.at(0); + if (enable_persistent_fabric) { + log_info(tt::LogTest, "Enabling persistent fabric"); + fabric_programs = std::vector(2); + subdevice_managers = create_subdevices({device_0, device_1}); + } + + auto& fabric_sender_program = enable_persistent_fabric ? fabric_programs->at(0) : sender_program; + auto& fabric_receiver_program = enable_persistent_fabric ? 
fabric_programs->at(1) : programs.at(1); + IDevice* sender_device = device_0; + IDevice* receiver_device = device_1; + + static constexpr std::size_t edm_buffer_size = + ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES; + const chip_id_t local_chip_id = 0; + const chip_id_t remote_chip_id = 1; + const auto& edm_config = ttnn::ccl::FabricEriscDatamoverConfig(edm_buffer_size, 1, 2); + auto chip_0_edm_builder = ttnn::ccl::FabricEriscDatamoverBuilder::build( + sender_device, + fabric_sender_program, + eth_sender_core, + local_chip_id, + remote_chip_id, + edm_config, + enable_persistent_fabric); + chip_0_edm_builder.set_firmware_context_switch_interval(0); + auto chip_1_edm_builder = ttnn::ccl::FabricEriscDatamoverBuilder::build( + receiver_device, + fabric_receiver_program, + eth_receiver_core, + remote_chip_id, + local_chip_id, + edm_config, + enable_persistent_fabric); + chip_1_edm_builder.set_firmware_context_switch_interval(0); + // Create the loopback connection on the second device + chip_1_edm_builder.connect_to_downstream_edm(chip_1_edm_builder); + auto local_edm_kernel = ttnn::ccl::generate_edm_kernel( + fabric_sender_program, sender_device, chip_0_edm_builder, eth_sender_core, NOC::NOC_0); + auto remote_edm_kernel = ttnn::ccl::generate_edm_kernel( + fabric_receiver_program, receiver_device, chip_1_edm_builder, eth_receiver_core, NOC::NOC_0); + + if (enable_persistent_fabric) { + tt::tt_metal::detail::CompileProgram(sender_device, fabric_sender_program); + tt::tt_metal::detail::CompileProgram(receiver_device, fabric_receiver_program); + tt_metal::EnqueueProgram(sender_device->command_queue(), fabric_sender_program, false); + tt_metal::EnqueueProgram(receiver_device->command_queue(), fabric_receiver_program, false); + } + log_trace(tt::LogTest, "{} programs ", programs.size()); + bool success = false; + try { + success = RunLoopbackTest( + device_0, + device_1, + + eth_sender_core, + eth_receiver_core, + + page_size, + num_pages_total, + src_is_dram, + dest_is_dram, + programs, + chip_0_edm_builder, + subdevice_managers, + enable_persistent_fabric); + } catch (std::exception& e) { + log_error("Caught exception: {}", e.what()); + test_fixture.TearDown(); + return -1; + } + + if (enable_persistent_fabric) { + // Run the test twice with a single fabric invocation + + std::vector second_programs(1); + try { + success = RunLoopbackTest( + device_0, + device_1, + + eth_sender_core, + eth_receiver_core, + + page_size, + num_pages_total, + src_is_dram, + dest_is_dram, + second_programs, + chip_0_edm_builder, + subdevice_managers, + enable_persistent_fabric); + } catch (std::exception& e) { + log_error("Caught exception: {}", e.what()); + test_fixture.TearDown(); + return -1; + } + // Wait for worker programs to finish + + auto d0_worker_subdevice = device_0->get_sub_device_ids()[TEST_WORKERS_SUBDEVICE_INDEX]; + auto d1_worker_subdevice = device_1->get_sub_device_ids()[TEST_WORKERS_SUBDEVICE_INDEX]; + auto d0_fabric_subdevice = device_0->get_sub_device_ids()[TEST_EDM_FABRIC_SUBDEVICE_INDEX]; + auto d1_fabric_subdevice = device_1->get_sub_device_ids()[TEST_EDM_FABRIC_SUBDEVICE_INDEX]; + // Teardown the fabric + tt_metal::Finish(sender_device->command_queue(), {d0_worker_subdevice}); + // tt_metal::Finish(receiver_device->command_queue(), {d1_worker_subdevice}); + + // Notify fabric of teardown + chip_1_edm_builder.teardown_from_host(receiver_device); + chip_0_edm_builder.teardown_from_host(sender_device); + + // wait for fabric finish + 
tt_metal::Finish(sender_device->command_queue(), {d0_fabric_subdevice}); + tt_metal::Finish(receiver_device->command_queue(), {d1_fabric_subdevice}); + } + + test_fixture.TearDown(); + + return success ? 0 : -1; +} + +bool TestMultiInputReaderKernel( + size_t fabric_num_devices, + Tensor& input_tensor0, + const MemoryConfig& input_tensor0_mem_config, + Tensor& input_tensor1, + const MemoryConfig& input_tensor1_mem_config, + Tensor& output_tensor0, + const MemoryConfig& output_tensor0_mem_config, + Tensor& output_tensor1, + const MemoryConfig& output_tensor1_mem_config, + + const ttnn::ccl::v2::TensorSlice& in0_tensor_slice, + const ttnn::ccl::v2::TensorSlice& in1_tensor_slice, + const ttnn::ccl::v2::TensorSlice& out0_tensor_slice, + const ttnn::ccl::v2::TensorSlice& out1_tensor_slice, + + const uint32_t page_size, + + TwoInputReaderKernelWriteMode test_mode, + const ttnn::ccl::cmd::CclCommandDestArgs& dest_args, + bool enable_persistent_fabric) { + auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + auto num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (num_devices < 4) { + log_info("This test can only be run on T3000 devices"); + return true; + } + if (arch == tt::ARCH::GRAYSKULL) { + log_info("Test must be run on WH"); + return true; + } + T3000TestDevice test_fixture; + + TT_FATAL( + !enable_persistent_fabric || test_mode != TwoInputReaderKernelWriteMode::LOCAL_WRITEBACK, + "Test configuration issue. Set local writeback mode with persistent fabric"); + + auto view = test_fixture.mesh_device_->get_view(); + + std::vector devices; + devices.reserve(fabric_num_devices); + for (size_t i = 0; i < fabric_num_devices; i++) { + devices.push_back(view.get_device(MeshCoordinate(0, i))); + } + + std::vector programs(enable_persistent_fabric ? 
1 : devices.size()); + std::optional subdevice_managers = std::nullopt; + std::optional> fabric_programs; + std::vector fabric_program_ptrs; + std::optional line_fabric; + setup_test_with_persistent_fabric( + devices, + programs, + subdevice_managers, + fabric_programs, + fabric_program_ptrs, + line_fabric, + enable_persistent_fabric); + + std::vector input0_tensors_device; + std::vector input1_tensors_device; + std::vector output0_tensors_device; + std::vector output1_tensors_device; + + // All this garbage is to make sure the test sets up buffer addresses correctly so we can safely + // multicast to a consistent destination address + for (size_t i = 0; i < devices.size(); i++) { + input0_tensors_device.push_back( + input_tensor0.to_device(devices.at(i), input_tensor0_mem_config, ttnn::DefaultQueueId)); + input1_tensors_device.push_back( + input_tensor1.to_device(devices.at(i), input_tensor1_mem_config, ttnn::DefaultQueueId)); + output0_tensors_device.push_back( + output_tensor0.to_device(devices.at(i), output_tensor0_mem_config, ttnn::DefaultQueueId)); + output1_tensors_device.push_back( + output_tensor1.to_device(devices.at(i), output_tensor1_mem_config, ttnn::DefaultQueueId)); + } + TT_FATAL( + !enable_persistent_fabric || subdevice_managers.has_value(), + "Subdevice managers must be set if fabric is enabled"); + auto launch_ccl_command_interpreter_workers = [&](std::vector& _programs) { + return RunLocalTestWithMultiInputReaders( + devices, + _programs, + line_fabric, + + input_tensor0, + input_tensor1, + output_tensor0, + output_tensor1, + + input0_tensors_device, + input1_tensors_device, + output0_tensors_device, + output1_tensors_device, + + in0_tensor_slice, + in1_tensor_slice, + out0_tensor_slice, + out1_tensor_slice, + + page_size, + test_mode, + dest_args, + subdevice_managers, + enable_persistent_fabric); + }; + + auto pass = launch_ccl_command_interpreter_workers(programs); + if (enable_persistent_fabric) { + std::vector second_run_programs(1); + // It looks suspicious that we are dropping the first result but there are two reasons we do this + // 1) We really only care that we can run back to back safely + // 2) The first run will end up racing with host and copy-back because there is no + // receiver on the destination that can signal to us when we are done. We need to add this + // to the test to make it more robust but that is future work + pass = launch_ccl_command_interpreter_workers(second_run_programs); + pass = true; + + // Due to race between host and device some packets are in flight by the time host sends shutdown signals so + // some get shutdown in between any packets in the pipeline. 
This can only be fixed by having a "drainer" op to + // make sure it receives all writes before exiting + persistent_fabric_teardown_sequence( + devices, subdevice_managers, line_fabric.value(), tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE); + + log_info(tt::LogTest, "Finished"); + for (auto d : devices) { + tt_metal::Synchronize(d, *ttnn::DefaultQueueId); + } + } + return pass; +} + +//////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////// +//// LOCAL CHIP TENSOR READ?WRITE (2 INPUT) +//////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////// + +ttnn::ccl::Shape4D shape_to_shape_in_tiles(const ttnn::Shape& shape) { + auto logical_shape = shape; + logical_shape[-2] /= tt::constants::TILE_HEIGHT; + logical_shape[-1] /= tt::constants::TILE_WIDTH; + EXPECT_TRUE(logical_shape.size() == 4); + ttnn::ccl::Shape4D shape_in_tiles = { + logical_shape[0], logical_shape[1], logical_shape[2], logical_shape[3]}; + return shape_in_tiles; +} + +bool RunMultiInputReaderTestPropagateFullTensorIn( + const ttnn::Shape& tensor_shape, + const Layout& layout, + const MemoryConfig& in0_memory_config, + const MemoryConfig& in1_memory_config, + const MemoryConfig& out0_memory_config, + const MemoryConfig& out1_memory_config, + TwoInputReaderKernelWriteMode test_writeback_mode) { + auto num_elems = std::reduce(tensor_shape.cbegin(), tensor_shape.cend(), 1, std::multiplies()); + Tensor input_tensor0 = + ttnn::experimental::view(ttnn::arange(0, num_elems, 1, DataType::UINT32), tensor_shape).to_layout(layout); + Tensor input_tensor1 = + ttnn::experimental::view(ttnn::arange(num_elems, 2 * num_elems, 1, DataType::UINT32), tensor_shape) + .to_layout(layout); + Tensor output_tensor0 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); + Tensor output_tensor1 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); + input_tensor0.set_tensor_spec(TensorSpec( + tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in0_memory_config))); + input_tensor1.set_tensor_spec(TensorSpec( + tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in1_memory_config))); + output_tensor0.set_tensor_spec(TensorSpec( + tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out0_memory_config))); + output_tensor1.set_tensor_spec(TensorSpec( + tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out1_memory_config))); + + size_t page_size = tile_size(DataFormat::RawUInt32); + + ttnn::ccl::Shape4D tensor_shape_in_pages = shape_to_shape_in_tiles(tensor_shape); + ttnn::ccl::Shape4D tensor_slice_shape_in_pages = tensor_shape_in_pages; + ttnn::ccl::Shape4D tensor_slice_offset = {0, 0, 0, 0}; + ttnn::ccl::Shape4D worker_slice_shape = tensor_shape_in_pages; + ttnn::ccl::Shape4D worker_slice_offset = {0, 0, 0, 0}; + + ttnn::ccl::v2::TensorSlice tensor_slice{ + tensor_shape_in_pages, + tensor_slice_shape_in_pages, + tensor_slice_offset, + worker_slice_shape, + worker_slice_offset}; + + const auto in0_tensor_slice = tensor_slice; + const auto in1_tensor_slice = tensor_slice; + const auto out0_tensor_slice = tensor_slice; + const auto out1_tensor_slice = tensor_slice; + + auto pass = TestMultiInputReaderKernel( + 1, + input_tensor0, + in0_memory_config, + input_tensor1, + in1_memory_config, + 
output_tensor0, + out0_memory_config, + output_tensor1, + out1_memory_config, + + in0_tensor_slice, + in1_tensor_slice, + out0_tensor_slice, + out1_tensor_slice, + + page_size, + test_writeback_mode, + ttnn::ccl::cmd::LocalOnlyCommandDestArgs{}, + false); + + return pass; +} + +// //////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////// +// //// FABRIC MCAST TENSOR WRITE (2 INPUT) +// //////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////// + +void RunFabricMcastFullTensorPropagateTest( + const ttnn::Shape& tensor_shape, size_t distance_dest_device, size_t num_devices, bool enable_persistent_fabric) { + const Layout layout = Layout::TILE; + const MemoryConfig in0_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); + const MemoryConfig in1_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); + const MemoryConfig out0_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); + const MemoryConfig out1_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); + + auto num_elems = std::reduce(tensor_shape.cbegin(), tensor_shape.cend(), 1, std::multiplies()); + Tensor input_tensor1 = + ttnn::experimental::view(ttnn::arange(num_elems, 2 * num_elems, 1, DataType::UINT32), tensor_shape) + .to_layout(layout); + Tensor input_tensor0 = + ttnn::experimental::view(ttnn::arange(0, num_elems, 1, DataType::UINT32), tensor_shape).to_layout(layout); + Tensor output_tensor1 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); + Tensor output_tensor0 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); + input_tensor0.set_tensor_spec(TensorSpec( + tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in0_memory_config))); + input_tensor1.set_tensor_spec(TensorSpec( + tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in1_memory_config))); + output_tensor0.set_tensor_spec(TensorSpec( + tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out0_memory_config))); + output_tensor1.set_tensor_spec(TensorSpec( + tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out1_memory_config))); + ASSERT_EQ(input_tensor0.get_logical_shape(), tensor_shape); + ASSERT_EQ(input_tensor1.get_logical_shape(), tensor_shape); + ASSERT_EQ(output_tensor0.get_logical_shape(), tensor_shape); + ASSERT_EQ(output_tensor1.get_logical_shape(), tensor_shape); + + size_t page_size = tile_size(DataFormat::RawUInt32); + + ttnn::ccl::Shape4D tensor_shape_in_pages = shape_to_shape_in_tiles(tensor_shape); + ttnn::ccl::Shape4D tensor_slice_shape_in_pages = tensor_shape_in_pages; + ttnn::ccl::Shape4D tensor_slice_offset = {0, 0, 0, 0}; + ttnn::ccl::Shape4D worker_slice_shape = tensor_shape_in_pages; + ttnn::ccl::Shape4D worker_slice_offset = {0, 0, 0, 0}; + + ttnn::ccl::v2::TensorSlice tensor_slice{ + tensor_shape_in_pages, + tensor_slice_shape_in_pages, + tensor_slice_offset, + worker_slice_shape, + worker_slice_offset}; + + const auto in0_tensor_slice = tensor_slice; + const auto in1_tensor_slice = tensor_slice; + const auto out0_tensor_slice = tensor_slice; + const auto out1_tensor_slice = tensor_slice; + + ttnn::ccl::cmd::CclCommandDestArgs dest_args = 
ttnn::ccl::cmd::MulticastCommandDestArgs{distance_dest_device, 0}; + auto pass = TestMultiInputReaderKernel( + num_devices, + input_tensor0, + in0_memory_config, + input_tensor1, + in1_memory_config, + output_tensor0, + out0_memory_config, + output_tensor1, + out1_memory_config, + + in0_tensor_slice, + in1_tensor_slice, + out0_tensor_slice, + out1_tensor_slice, + + page_size, + TwoInputReaderKernelWriteMode::FABRIC_MULTICAST, + dest_args, + enable_persistent_fabric); + + ASSERT_TRUE(pass); +} + +bool RunPipelinedWorkersTest( + + ttnn::Shape tensor_shape, + const size_t split_dim, + + // In this test we will have n stages with anywhere from 1 to 8 workers per stage (this will be configurable) + const size_t num_stages, + std::vector num_workers_per_stage, + const size_t slices_per_stage, + const tt::DataFormat data_format, + const size_t page_size_bytes, + const size_t cb_packet_size_in_pages, + const size_t num_packets_per_cb, + auto layout, + + std::vector> worker_chunk_read_order, + std::vector mem_configs) { + auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + auto num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (num_devices < 4) { + log_info("This test can only be run on T3000 devices"); + return true; + } + if (arch == tt::ARCH::GRAYSKULL) { + log_info("Test must be run on WH"); + return true; + } + + const auto cb_index = tt::CB::c_in0; + + auto programs = std::vector(1); + Program& program = programs[0]; + + T3000TestDevice test_fixture; + auto view = test_fixture.mesh_device_->get_view(); + + IDevice* device = view.get_device(MeshCoordinate(0, 0)); + ; + + // General setup is as follows: + // Worker 1 reads input tensor as a sequence of slices - it forwards to an output tensor and after each slice, it + // writes a semaphore increment to some known semaphore address on the destination worker so the destination worker + // knows it's safe to read that slice. + // HOWEVER. the reader will be programmed to read the chunks in a different order than they were written, this way + // we can identify synchronization related bugs (e.g. 
if sender semaphore increments before writes flush) + + TT_FATAL(num_workers_per_stage.size() == num_stages, "Must have a read order for each stage"); + TT_FATAL(worker_chunk_read_order.size() == num_stages, "Must have a read order for each stage"); + for (size_t i = 0; i < num_stages; ++i) { + TT_FATAL(worker_chunk_read_order[i].size() == slices_per_stage, "Must have a read order for each slice"); + } + + // Validate the test setup + TT_FATAL(num_stages > 1, "Must have at least 2 stages"); + TT_FATAL(num_stages < 8, "Must have at most 8 stages"); + for (size_t i = 0; i < num_stages; ++i) { + TT_FATAL(num_workers_per_stage[i] > 0, "Must have at least 1 worker per stage"); + TT_FATAL(num_workers_per_stage[i] < 8, "Must have at most 8 workers per stage"); + } + + std::vector tensor_specs; + tensor_specs.reserve(num_stages + 1); + for (size_t i = 0; i < num_stages + 1; ++i) { + tensor_specs.push_back(TensorSpec( + tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), mem_configs[i]))); + } + + // Allocate the tensors - pull to function + const size_t num_tensors = num_stages + 1; + std::vector host_tensors; + std::vector device_tensors; + host_tensors.reserve(num_tensors); + device_tensors.reserve(num_tensors); + auto num_elems = std::reduce(tensor_shape.cbegin(), tensor_shape.cend(), 1, std::multiplies()); + host_tensors.push_back( + ttnn::experimental::view(ttnn::arange(0, num_elems, 1, DataType::UINT32), tensor_shape).to_layout(layout)); + for (size_t i = 1; i < num_tensors; ++i) { + host_tensors.push_back( + ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape)); + } + TT_FATAL(mem_configs.size() == num_tensors, "Must have a memory config for each tensor"); + for (size_t i = 0; i < num_tensors; i++) { + host_tensors[i].set_tensor_spec(tensor_specs[i]); + device_tensors.push_back(host_tensors[i].to_device(device, mem_configs[i])); + log_info("Tensor[{}] allocated starting at address {}", i, device_tensors[i].buffer()->address()); + } + TT_ASSERT(device_tensors.size() == num_tensors); + TT_ASSERT(device_tensors.size() == host_tensors.size()); + + // MAIN STUFF + + // Initial setup like worker core assignment, chunk read order, etc. 
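[Editor's note] The per-stage synchronization described in the comment above, and built below out of ccl command uops, can be summarized with the following illustrative sketch. It is not part of the patch: wait_for_semaphore, read_slice_to_cb, write_slice_from_cb, and signal_slice_ready are hypothetical stand-ins for local_semaphore_wait, read_tensor_slice_to_cb, local_write_cb_to_tensor_slice, and local_chip_noc_semaphore_inc as used in the real command streams.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helpers standing in for the ccl command uops used in the real code.
void wait_for_semaphore(uint32_t semaphore_id, uint32_t target_value);
void read_slice_to_cb(size_t stage, size_t slice);
void write_slice_from_cb(size_t stage, size_t slice);
void signal_slice_ready(uint32_t semaphore_id);

// Illustrative command order followed by one worker of one pipeline stage.
void run_stage_worker(
    size_t stage,
    size_t num_stages,
    size_t slices_per_stage,
    const std::vector<std::vector<size_t>>& worker_chunk_read_order,
    const std::vector<std::vector<uint32_t>>& input_tensor_semaphores,
    const std::vector<size_t>& num_workers_per_stage) {
    for (size_t slice_logical = 0; slice_logical < slices_per_stage; ++slice_logical) {
        // Slices are consumed in a permuted order to expose ordering/synchronization bugs.
        const size_t slice_actual = worker_chunk_read_order[stage][slice_logical];
        if (stage != 0) {
            // Block until every worker of the previous stage has produced this slice.
            wait_for_semaphore(input_tensor_semaphores[stage][slice_actual], num_workers_per_stage[stage - 1]);
        }
        read_slice_to_cb(stage, slice_actual);     // tensor[stage]   -> circular buffer
        write_slice_from_cb(stage, slice_actual);  // circular buffer -> tensor[stage + 1]
        if (stage != num_stages - 1) {
            // Tell every worker of the next stage that this slice is now safe to read.
            signal_slice_ready(input_tensor_semaphores[stage + 1][slice_actual]);
        }
    }
}

This mirrors the reader/writer command streams generated below; the sketch only makes the ordering (wait, read, write, signal) explicit.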
+ + std::vector pipeline_stage_worker_cores = {}; + for (size_t i = 0; i < num_stages; ++i) { + pipeline_stage_worker_cores.push_back( + CoreRangeSet(CoreRange(CoreCoord(0, i), CoreCoord(num_workers_per_stage[i] - 1, i)))); + } + CoreRangeSet all_workers_cores = CoreRangeSet(); + for (size_t i = 0; i < num_stages; ++i) { + } + + // Create circular buffers + for (size_t stage = 0; stage < num_stages; stage++) { + const size_t cb_packet_size_in_pages = 4; + const size_t num_packets_per_cb = 4; + tt_metal::CircularBufferConfig cb_config = + tt_metal::CircularBufferConfig( + cb_packet_size_in_pages * num_packets_per_cb * page_size_bytes, {{cb_index, data_format}}) + .set_page_size(cb_index, page_size_bytes); + CBHandle sender_workers_cb = CreateCircularBuffer(program, pipeline_stage_worker_cores[stage], cb_config); + } + + // Generate the reader semaphores + std::vector> input_tensor_semaphores; + input_tensor_semaphores.reserve(num_stages); + for (size_t stage = 0; stage < num_stages; stage++) { + input_tensor_semaphores.push_back({}); + for (size_t j = 0; j < slices_per_stage; j++) { + input_tensor_semaphores[stage].push_back(CreateSemaphore(program, pipeline_stage_worker_cores[stage], 0)); + } + } + + constexpr size_t num_command_streams = 1; + std::vector reader_kernels; + std::vector writer_kernels; + // Create the kernel handles for each pipeline stage + for (size_t stage = 0; stage < num_stages; stage++) { + auto reader_kernel = ttnn::ccl::worker_detail::generate_multi_command_stream_kernel_ct_args( + program, + {tt::CB::c_in0}, + {&device_tensors[stage]}, + pipeline_stage_worker_cores[stage], + tt_metal::ReaderDataMovementConfig{}, + num_command_streams); + reader_kernels.push_back(reader_kernel); + auto writer_kernel = ttnn::ccl::worker_detail::generate_multi_command_stream_kernel_ct_args( + program, + {tt::CB::c_in0}, + {&device_tensors[stage + 1]}, + pipeline_stage_worker_cores[stage], + tt_metal::WriterDataMovementConfig{}, + num_command_streams); + writer_kernels.push_back(writer_kernel); + } + + // Generate the tensor slices for each tensor/worker + std::vector> tensor_slices; + tensor_slices.reserve(num_stages + 1); + for (size_t t = 0; t < num_tensors; t++) { + tensor_slices.push_back( + ttnn::ccl::cmd::builder::generate_tensor_slices(slices_per_stage, device_tensors[t], split_dim)); + } + std::vector>> per_stage_worker_reader_tensor_slices; + std::vector>> per_stage_worker_writer_tensor_slices; + per_stage_worker_reader_tensor_slices.reserve(num_tensors); + per_stage_worker_writer_tensor_slices.reserve(num_tensors); + for (size_t stage = 0; stage < num_stages; stage++) { + per_stage_worker_reader_tensor_slices.push_back( + ttnn::ccl::cmd::builder::split_tensor_slices_across_workers_page_aligned( + num_workers_per_stage[stage], tensor_slices[stage])); + // We could compute this once and reuse it but I am generating it twice so I can have size mismatches + per_stage_worker_writer_tensor_slices.push_back( + ttnn::ccl::cmd::builder::split_tensor_slices_across_workers_page_aligned( + num_workers_per_stage[stage], tensor_slices[stage + 1])); + TT_FATAL( + per_stage_worker_reader_tensor_slices.back().size() == num_workers_per_stage[stage], + "Mismatch in tensor slices. Got {} but expected {}", + per_stage_worker_reader_tensor_slices.back().size(), + num_workers_per_stage[stage]); + TT_FATAL( + per_stage_worker_writer_tensor_slices.back().size() == num_workers_per_stage[stage], + "Mismatch in tensor slices. 
Got {} but expected {}", + per_stage_worker_writer_tensor_slices.back().size(), + num_workers_per_stage[stage]); + } + + // Build the command stream for each stage/worker + // Seminc example + // - local_core_semaphore_inc(second_command_stream_done_semaphore_id, 1); + // semwait example + // - local_semaphore_wait(second_command_stream_done_semaphore_id, 1) + // read tensor slice to cb example + // - read_tensor_slice_to_cb(in0_command_tensor_slice, cb_indices.at(0)) + // write tensor slice to cb example + // - build_write_tensor_slice_to_cb(out0_command_tensor_slice, cb_indices.at(0)) + TT_FATAL(per_stage_worker_reader_tensor_slices.size() == num_stages, "Mismatch in tensor slices"); + for (size_t stage = 0; stage < num_stages; stage++) { + bool last_stage = stage == num_stages - 1; + bool first_stage = stage == 0; + + const auto worker_cores = corerange_to_cores(pipeline_stage_worker_cores[stage]); + TT_FATAL(worker_cores.size() == num_workers_per_stage[stage], "Mismatch in worker cores"); + std::optional> next_worker_cores = + !last_stage ? corerange_to_cores(pipeline_stage_worker_cores[stage + 1]) + : std::optional>(std::nullopt); + + TT_FATAL( + per_stage_worker_reader_tensor_slices[stage].size() == num_workers_per_stage[stage], + "Mismatch in tensor slices"); + TT_FATAL( + per_stage_worker_writer_tensor_slices[stage].size() == num_workers_per_stage[stage], + "Mismatch in tensor slices"); + for (size_t worker = 0; worker < num_workers_per_stage[stage]; worker++) { + std::vector reader_cmd_stream; + std::vector writer_cmd_stream; + TT_FATAL( + per_stage_worker_reader_tensor_slices[stage][worker].size() == slices_per_stage, + "Mismatch in tensor slices"); + TT_FATAL( + per_stage_worker_writer_tensor_slices[stage][worker].size() == slices_per_stage, + "Mismatch in tensor slices"); + for (size_t slice_logical = 0; slice_logical < slices_per_stage; slice_logical++) { + const auto slice_actual = worker_chunk_read_order[stage][slice_logical]; + // reader + if (!first_stage) { + reader_cmd_stream.push_back(ttnn::ccl::cmd::uops::local_semaphore_wait( + input_tensor_semaphores[stage][slice_actual], num_workers_per_stage[stage - 1])); + } + reader_cmd_stream.push_back(ttnn::ccl::cmd::uops::read_tensor_slice_to_cb( + per_stage_worker_reader_tensor_slices[stage][worker][slice_actual], cb_index)); + log_info(tt::LogTest, "Worker {} reading/writing slice {}", worker, slice_actual); + + // writer + writer_cmd_stream.push_back(ttnn::ccl::cmd::uops::local_write_cb_to_tensor_slice( + per_stage_worker_writer_tensor_slices[stage][worker][slice_actual], cb_index)); + if (not last_stage) { + for (auto next_worker_xy : next_worker_cores.value()) { + log_info( + tt::LogTest, + "Stage {} Worker {} noc seminc to core (logical) x={},y={}", + stage, + worker, + next_worker_xy.x, + next_worker_xy.y); + writer_cmd_stream.push_back(ttnn::ccl::cmd::uops::local_chip_noc_semaphore_inc( + device->worker_core_from_logical_core(next_worker_xy).x, + device->worker_core_from_logical_core(next_worker_xy).y, + input_tensor_semaphores[stage + 1][slice_actual], + 1)); + } + } + } + ttnn::ccl::worker_detail::generate_multi_input_command_stream_kernel_rt_args( + program, + reader_kernels[stage], + {&device_tensors[stage]}, + {page_size_bytes}, + device, + cb_packet_size_in_pages, + {worker_cores.at(worker)}, + reader_cmd_stream, + std::nullopt, + std::nullopt, + std::nullopt); + ttnn::ccl::worker_detail::generate_multi_input_command_stream_kernel_rt_args( + program, + writer_kernels[stage], + {&device_tensors[stage + 1]}, + 
{page_size_bytes}, + device, + cb_packet_size_in_pages, + {worker_cores.at(worker)}, + writer_cmd_stream, + std::nullopt, + std::nullopt, + std::nullopt); + } + } + + run_programs(programs, {device}); + + bool pass = true; + constexpr bool enable_check = true; + if constexpr (enable_check) { + log_info(tt::LogTest, "Reading back outputs"); + auto input_cpu = device_tensors[0].cpu(); + auto final_out_cpu = device_tensors.back().cpu(); + + auto in_tensor_copyback = tt::tt_metal::owned_buffer::get_as(input_cpu); + auto out_tensor_copyback = tt::tt_metal::owned_buffer::get_as(final_out_cpu); + + auto in_tensor_data = tt::tt_metal::owned_buffer::get_as(host_tensors[0]); + + bool input_copyback_check_passed = run_output_check(in_tensor_data, in_tensor_copyback) == Correctness::Correct; + TT_FATAL(input_copyback_check_passed, "Input 0 copyback check failed"); + + log_info(tt::LogTest, "Comparing outputs"); + + pass &= run_output_check(in_tensor_data, out_tensor_copyback) == Correctness::Correct; + if (pass) { + log_info(tt::LogTest, "Output check passed for output 0"); + } else { + log_error(tt::LogTest, "Output check failed for output 0"); + } + } + + return pass; +} + +#include "ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp" +#include + +static void wait_for_worker_subdevice_program_completion( + const std::vector& devices, const std::optional& subdevice_managers) { + std::ranges::for_each(devices, [&](IDevice* d) { + tt_metal::Finish(d->command_queue(), {subdevice_managers->worker_subdevice_id.at(d->id())}); + }); +} + +#include "ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp" +void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_links, ttnn::Shape const& input_shape) { + log_info(tt::LogTest, "entering test"); + constexpr auto layout = Layout::TILE; + // DEVICES setup + auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + constexpr size_t test_expected_num_devices = 4; + if (tt::tt_metal::GetNumAvailableDevices() < test_expected_num_devices) { + log_info("This test can only be run on T3000 devices"); + return; + } + if (arch == tt::ARCH::GRAYSKULL) { + log_info("Test must be run on WH"); + return; + } + T3000TestDevice test_fixture; + auto view = test_fixture.mesh_device_->get_view(); + + // build a line of devices + std::vector devices = { + view.get_device(MeshCoordinate(0, 0)), + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(0, 3))}; + const size_t num_devices = devices.size(); + TT_FATAL( + test_expected_num_devices == num_devices, + "Expected {} devices but got {}", + test_expected_num_devices, + num_devices); + const MemoryConfig in_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); + const auto num_elems = input_shape.volume(); + + // INPUT TENSOR setup + log_info(tt::LogTest, "setting up input tensors"); + size_t page_size = tile_size(DataFormat::Float16); + std::vector device_input_tensors; + for (size_t i = 0; i < num_devices; i++) { + auto t = ttnn::experimental::view(ttnn::arange(0, num_elems, 1), input_shape).to_layout(layout); + t.set_tensor_spec(TensorSpec( + input_shape, TensorLayout(DataType::BFLOAT16, PageConfig(layout, tt_metal::Tile()), in_memory_config))); + + device_input_tensors.push_back(t.to_device(devices[i])); + } + // Need to make it a mesh tensor for use with the op + const Tensor input_mesh_tensor = 
ttnn::distributed::aggregate_as_tensor(device_input_tensors, AllGatherTensor{}); + + // FABRIC setup + const bool enable_persistent_fabric = true; + + std::vector dummy_worker_programs; + std::optional subdevice_managers = std::nullopt; + std::optional> fabric_programs; + std::vector fabric_program_ptrs; + std::optional fabric_handle; + setup_test_with_persistent_fabric( + devices, + dummy_worker_programs, + subdevice_managers, + fabric_programs, + fabric_program_ptrs, + fabric_handle, + enable_persistent_fabric, + num_links); + log_info(tt::LogTest, "Launching op"); + + ttnn::global_semaphore::MultiDeviceGlobalSemaphore multi_device_global_semaphore = + ttnn::global_semaphore::create_global_semaphore_with_same_address( + test_fixture.mesh_device_.get(), + devices[0]->worker_cores(HalProgrammableCoreType::TENSIX, SubDeviceId{0}), + 0, // initial value + tt::tt_metal::BufferType::L1, // buffer type + 10 // attempts + ); + + auto output_tensor = ttnn::operations::experimental::ccl::all_gather_async( + input_mesh_tensor, + dim, + multi_device_global_semaphore, + num_links, + operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + ttnn::ccl::Topology::Linear, + SubDeviceId(0), + true); + + // wait for op completion + wait_for_worker_subdevice_program_completion(devices, subdevice_managers); + log_info(tt::LogTest, "Main op done"); + + log_info(tt::LogTest, "Fabric teardown"); + persistent_fabric_teardown_sequence( + devices, subdevice_managers, fabric_handle.value(), tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE); + + log_info(tt::LogTest, "Waiting for teardown completion"); + for (auto d : devices) { + tt_metal::Synchronize(d, *ttnn::DefaultQueueId); + } + log_info(tt::LogTest, "Finished"); +} + +struct WriteThroughputStabilityTestWithPersistentFabricParams { + size_t line_size = 4; + size_t num_devices_with_workers = 0; + bool line_sync = true; +}; + +void RunWriteThroughputStabilityTestWithPersistentFabric( + size_t num_mcasts, + size_t num_unicasts, + size_t num_links, + size_t num_op_invocations, + const WriteThroughputStabilityTestWithPersistentFabricParams& params = {}, + size_t packet_payload_size_bytes = ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes) { + auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + auto num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (num_devices < 4) { + log_info("This test can only be run on T3000 devices"); + return; + } + if (arch == tt::ARCH::GRAYSKULL) { + log_info("Test must be run on WH"); + return; + } + + size_t line_size = params.line_size; + size_t num_devices_with_workers = params.num_devices_with_workers; + if (num_devices_with_workers == 0) { + num_devices_with_workers = line_size; + } + using namespace ttnn::ccl; + TT_FATAL(num_devices_with_workers <= line_size, "num_devices_with_workers must be less than or equal to line_size"); + + auto worker_core_logical = [](size_t link) { return CoreCoord(link, 0); }; + + // static constexpr size_t source_l1_buffer_address = 1000000; + static constexpr uint32_t packet_header_cb_index = tt::CB::c_in0; + static constexpr uint32_t source_payload_cb_index = tt::CB::c_in1; + static constexpr size_t packet_header_cb_size_in_headers = 4; + static constexpr bool enable_persistent_fabric_mode = true; + size_t dest_buffer_size = packet_payload_size_bytes * 4; + static constexpr tt::DataFormat cb_df = tt::DataFormat::Bfp8; + + T3000TestDevice test_fixture; + auto view = test_fixture.mesh_device_->get_view(); + + // Get the inner 4 device ring on a WH T3K device 
so that we can use both links for all devices + std::vector devices_ = { + view.get_device(MeshCoordinate(0, 1)), + view.get_device(MeshCoordinate(0, 2)), + view.get_device(MeshCoordinate(1, 2)), + view.get_device(MeshCoordinate(1, 1))}; + std::vector devices; + devices.reserve(line_size); + for (size_t i = 0; i < line_size; i++) { + devices.push_back(devices_[i]); + } + // build the mesh device + + // Persistent Fabric Setup + std::vector dummy_worker_programs; + std::optional subdevice_managers = std::nullopt; + std::optional> fabric_programs; + std::vector fabric_program_ptrs; + std::optional fabric_handle; + setup_test_with_persistent_fabric( + devices, + dummy_worker_programs, + subdevice_managers, + fabric_programs, + fabric_program_ptrs, + fabric_handle, + enable_persistent_fabric_mode, + num_links); + + // Other boiler plate setup + CoreRangeSet worker_cores = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(num_links - 1, 0))); + auto worker_cores_vec = corerange_to_cores(worker_cores, std::nullopt, false); + auto dest_core_coord = CoreCoord(2, 2); + auto sync_core_coord = CoreCoord(0, 0); + + ttnn::SmallVector> device_dest_buffers; + device_dest_buffers.reserve(line_size); + for (auto* d : devices) { + auto local_input_buffer = + CreateBuffer(InterleavedBufferConfig{d, dest_buffer_size, dest_buffer_size, BufferType::L1}); + device_dest_buffers.push_back(local_input_buffer); + } + + size_t dest_bank_addr = device_dest_buffers[0]->address(); + TT_FATAL( + std::all_of( + device_dest_buffers.begin(), + device_dest_buffers.end(), + [dest_bank_addr](const auto& buffer) { return buffer->address() == dest_bank_addr; }), + "Test setup error: all destination buffers must have the same bank address across devices"); + + std::vector global_semaphore_addrs; + global_semaphore_addrs.reserve(line_size + 1); + std::vector global_semaphore_handles; + for (size_t i = 0; i < line_size * 4; i++) { + auto global_semaphores = ttnn::global_semaphore::create_global_semaphore_with_same_address( + test_fixture.mesh_device_.get(), + devices[0]->worker_cores(HalProgrammableCoreType::TENSIX, SubDeviceId{0}), + 0, // initial value + tt::tt_metal::BufferType::L1, // buffer type + 1000 // attempts + ); + global_semaphore_handles.push_back(global_semaphores); + auto global_semaphore_addr = + ttnn::global_semaphore::get_global_semaphore_address(global_semaphores.global_semaphores.at(0)); + global_semaphore_addrs.push_back(global_semaphore_addr); + } + + std::vector worker_devices; + for (size_t i = 0; i < num_devices_with_workers; i++) { + worker_devices.push_back(devices[i]); + } + // Worker program setup + std::vector programs(num_devices_with_workers); + TT_FATAL( + programs.size() == worker_devices.size(), + "Test misconfiguration. Mismatch in line size and devices. Expected line size of {} but got {} devices " + "instead.", + line_size, + worker_devices.size()); + std::vector worker_kernel_ids; + std::vector per_device_global_sem_addr_rt_arg; + for (size_t i = 0; i < num_devices_with_workers; i++) { + const size_t line_index = i; + auto& program = programs[i]; + auto* device = devices[i]; + const size_t dest_noc_x = device->worker_core_from_logical_core(dest_core_coord).x; + const size_t dest_noc_y = device->worker_core_from_logical_core(dest_core_coord).y; + const size_t sync_core_noc_x = device->worker_core_from_logical_core(sync_core_coord).x; + const size_t sync_core_noc_y = device->worker_core_from_logical_core(sync_core_coord).y; + + IDevice* backward_device = i == 0 ? 
nullptr : devices[i - 1]; + IDevice* forward_device = i == line_size - 1 ? nullptr : devices[i + 1]; + + // Initialize the fabric handle for worker connection + bool start_of_line = line_index == 0; + bool end_of_line = line_index == line_size - 1; + bool has_forward_connection = !end_of_line; + bool has_backward_connection = !start_of_line; + bool unicast_forward = !end_of_line; + size_t mcast_fwd_hops = line_size - line_index - 1; + size_t mcast_bwd_hops = line_index; + size_t unicast_hops = unicast_forward ? mcast_fwd_hops : mcast_bwd_hops; + + auto local_device_fabric_handle = + ttnn::ccl::EdmLineFabricOpInterface::build_program_builder_worker_connection_fabric( + device, forward_device, backward_device, &program, enable_persistent_fabric_mode, num_links); + + // reserve CB + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig( + packet_header_cb_size_in_headers * sizeof(tt::fabric::PacketHeader), {{packet_header_cb_index, cb_df}}) + .set_page_size(packet_header_cb_index, sizeof(tt::fabric::PacketHeader)); + CBHandle sender_workers_cb = CreateCircularBuffer(program, worker_cores, cb_src0_config); + + tt_metal::CircularBufferConfig cb_src1_config = + tt_metal::CircularBufferConfig(packet_payload_size_bytes, {{source_payload_cb_index, cb_df}}) + .set_page_size(source_payload_cb_index, packet_payload_size_bytes); + CBHandle sender_workers_payload_cb = CreateCircularBuffer(program, worker_cores, cb_src1_config); + + TT_FATAL( + local_device_fabric_handle.get_num_links() == num_links, + "Error in test setup. Expected two links between devices but got {} links for device {}", + local_device_fabric_handle.get_num_links(), + device->id()); + + std::vector worker_ct_args = {params.line_sync, params.line_sync}; + + auto worker_kernel_id = tt_metal::CreateKernel( + program, + "tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp", + worker_cores, + tt_metal::WriterDataMovementConfig(worker_ct_args)); + worker_kernel_ids.push_back(worker_kernel_id); + for (size_t l = 0; l < num_links; l++) { + auto worker_core = worker_cores_vec[l]; + auto build_connection_args = [&local_device_fabric_handle, device, &program, &worker_core]( + bool is_connected_in_direction, + ttnn::ccl::EdmLineFabricOpInterface::Direction direction, + std::vector& rt_args_out) { + rt_args_out.push_back(is_connected_in_direction); + if (is_connected_in_direction) { + const auto connection = local_device_fabric_handle.uniquely_connect_worker(device, direction); + const auto new_rt_args = + ttnn::ccl::worker_detail::generate_edm_connection_rt_args(connection, program, {worker_core}); + log_info( + tt::LogTest, + "On device: {}, connecting to EDM fabric in {} direction. 
EDM noc_x: {}, noc_y: {}", + device->id(), + direction, + connection.edm_noc_x, + connection.edm_noc_y); + std::copy(new_rt_args.begin(), new_rt_args.end(), std::back_inserter(rt_args_out)); + } + }; + // RT ARGS + std::vector rt_args = { + dest_bank_addr, + packet_payload_size_bytes, + dest_noc_x, + dest_noc_y, + + num_mcasts, + mcast_fwd_hops, + mcast_bwd_hops, + + num_unicasts, + unicast_hops, + unicast_forward, + + source_payload_cb_index, // source_l1_buffer_address, + packet_header_cb_index, + packet_header_cb_size_in_headers, + }; + + build_connection_args(has_forward_connection, ttnn::ccl::EdmLineFabricOpInterface::FORWARD, rt_args); + build_connection_args(has_backward_connection, ttnn::ccl::EdmLineFabricOpInterface::BACKWARD, rt_args); + + if (params.line_sync) { + rt_args.push_back(sync_core_noc_x); + rt_args.push_back(sync_core_noc_y); + if (l == 0) { + per_device_global_sem_addr_rt_arg.push_back(rt_args.size()); + } + TT_FATAL(global_semaphore_addrs.at(0) != -1, "Invalid test setup. Global semaphore address is -1"); + rt_args.push_back(global_semaphore_addrs.at(0)); + rt_args.push_back(num_links * num_devices_with_workers); + } + + tt_metal::SetRuntimeArgs(program, worker_kernel_id, worker_core, rt_args); + } + } + + for (size_t i = 0; i < num_op_invocations; i++) { + log_info(tt::LogTest, "Iteration: {}", i); + if (i != 0 && params.line_sync) { + for (size_t k = 0; k < worker_kernel_ids.size(); k++) { + auto& worker_rt_args_by_core = GetRuntimeArgs(programs[k], worker_kernel_ids[k]); + auto global_sem_addr_rt_arg_idx = per_device_global_sem_addr_rt_arg[k]; + for (size_t l = 0; l < num_links; l++) { + auto& worker_rt_args = worker_rt_args_by_core[worker_cores_vec[l].x][worker_cores_vec[l].y]; + worker_rt_args.at(global_sem_addr_rt_arg_idx) = + global_semaphore_addrs[i % global_semaphore_addrs.size()]; + } + } + } + + build_and_enqueue(worker_devices, programs, i != 0); + + log_info(tt::LogTest, "Waiting for Op finish on all devices"); + wait_for_worker_subdevice_program_completion(worker_devices, subdevice_managers); + log_info(tt::LogTest, "Main op done"); + } + + TT_FATAL(fabric_programs->size() == devices.size(), "Expected fabric programs size to be same as devices size"); + log_info(tt::LogTest, "Fabric teardown"); + persistent_fabric_teardown_sequence( + devices, subdevice_managers, fabric_handle.value(), tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE); + + log_info(tt::LogTest, "Waiting for teardown completion"); + for (IDevice* d : devices) { + tt_metal::Synchronize(d, *ttnn::DefaultQueueId); + } + for (size_t i = 0; i < programs.size(); i++) { + auto d = worker_devices[i]; + auto& program = programs[i]; + tt_metal::DumpDeviceProfileResults(d, program); + } + for (size_t i = 0; i < fabric_programs->size(); i++) { + auto d = devices[i]; + auto& program = fabric_programs.value()[i]; + tt_metal::DumpDeviceProfileResults(d, program); + } + log_info(tt::LogTest, "Finished"); +} diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index 52662ba9eef..1031f80f496 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -3,1486 +3,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -#include -#include -#include -#include "tt-metalium/kernel_types.hpp" -#include 
"tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "ttnn/common/queue_id.hpp" -#include "ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp" -#include "ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp" -#include "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp" -#include "ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp" -#include "ttnn/operations/ccl/common/uops/ccl_host_commands.hpp" -#include "ttnn/cpp/ttnn/operations/creation.hpp" -#include "ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_command.hpp" -#include "ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.hpp" -#include "ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp" -#include "ttnn/cpp/ttnn/operations/ccl/common/host/ccl_command_stream_builders.hpp" - -#include -#include -#include "ttnn/cpp/ttnn/operations/experimental/reshape/view.hpp" - -#include - -#include "umd/device/types/arch.h" -#include "umd/device/types/cluster_descriptor_types.h" -#include "gtest/gtest.h" - -#include -#include -#include -#include -#include - -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -enum TwoInputReaderKernelWriteMode { LOCAL_WRITEBACK, FABRIC_UNICAST, FABRIC_MULTICAST }; - -static constexpr size_t TEST_WORKERS_SUBDEVICE_INDEX = 0; -static constexpr size_t TEST_EDM_FABRIC_SUBDEVICE_INDEX = 1; - -using subdevice_managers_t = std::unordered_map; -struct SubdeviceInfo { - std::unordered_map sub_device_managers; - std::unordered_map worker_subdevice_id; - std::unordered_map fabric_subdevice_id; -}; - -using tt::tt_metal::distributed::MeshCoordinate; -using tt::tt_metal::distributed::MeshDevice; -using tt::tt_metal::distributed::MeshDeviceConfig; -using tt::tt_metal::distributed::MeshDeviceView; -using tt::tt_metal::distributed::MeshShape; -class T3000TestDevice { -public: - T3000TestDevice() : device_open(false) { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run without TT_METAL_SLOW_DISPATCH_MODE set"); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - num_devices_ = tt::tt_metal::GetNumAvailableDevices(); - if (arch_ == tt::ARCH::WORMHOLE_B0 and num_devices_ == 8 and tt::tt_metal::GetNumPCIeDevices() == 4) { - mesh_device_ = MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape{2, 4}}); - - std::vector ids(num_devices_, 0); - std::iota(ids.begin(), ids.end(), 0); - - } else { - TT_THROW("This suite can only be run on T3000 Wormhole devices"); - } - device_open = true; - } - ~T3000TestDevice() { - if (device_open) { - TearDown(); - } - } - - void TearDown() { - device_open = false; - mesh_device_->close(); - } - - tt::ARCH arch_; - size_t num_devices_; - std::shared_ptr mesh_device_; - -private: - bool device_open; -}; - -struct BankedConfig { - size_t num_pages; - size_t size_bytes; - size_t page_size_bytes; - BufferType input_buffer_type; - BufferType output_buffer_type; - tt::DataFormat l1_data_format; -}; - -struct KernelXY { - uint16_t x; - uint16_t y; - - uint32_t to_uint32() const { return y << 16 | x; } -}; - -enum Correctness { Correct, Incorrect }; - -template -Correctness run_output_check(CONTAINER_T const& inputs, CONTAINER_T output_buffer) { - constexpr bool debug_mode = true; - - log_info(tt::LogTest, "Checking outputs"); - bool pass = true; - - std::size_t num_printed_mismatches = 0; - for (size_t i = 0; i < inputs.size() && num_printed_mismatches < 64; i++) { - if 
(output_buffer[i] != inputs[i]) { - if (debug_mode) { - if (pass) { - log_error("Output mismatch"); - } - log_error("[{}]: expected {} got {}", i, inputs[i], output_buffer[i]); - num_printed_mismatches++; - } - pass = false; - } - } - if (num_printed_mismatches > 0) { - log_error("... (remaining mismatches omitted)"); - } - - log_info(tt::LogTest, "Output check: {}", pass ? "PASS" : "FAIL"); - return pass ? Correctness::Correct : Correctness::Incorrect; -}; - -static SubdeviceInfo create_subdevices(std::vector const& devices) { - SubdeviceInfo subdevice_info; - std::unordered_map sub_device_manager_ids; - for (auto device : devices) { - const auto& tensix_sub_device = - tt_metal::SubDevice(std::array{device->worker_cores(HalProgrammableCoreType::TENSIX, SubDeviceId{0})}); - const auto& eth_sub_device = tt_metal::SubDevice( - std::array{CoreRangeSet(), device->worker_cores(HalProgrammableCoreType::ACTIVE_ETH, SubDeviceId{0})}); - subdevice_info.sub_device_managers.insert( - {device->id(), device->create_sub_device_manager({tensix_sub_device, eth_sub_device}, 0)}); - device->load_sub_device_manager(subdevice_info.sub_device_managers.at(device->id())); - subdevice_info.worker_subdevice_id.insert( - {device->id(), device->get_sub_device_ids().at(TEST_WORKERS_SUBDEVICE_INDEX)}); - subdevice_info.fabric_subdevice_id.insert( - {device->id(), device->get_sub_device_ids().at(TEST_EDM_FABRIC_SUBDEVICE_INDEX)}); - device->set_sub_device_stall_group({subdevice_info.worker_subdevice_id.at(device->id())}); - } - - return subdevice_info; -} - -Correctness run_output_check( - std::vector const& all_zeros, - std::vector const& inputs, - std::shared_ptr& output_buffer) { - constexpr bool debug_mode = true; - std::vector readback_data_vec(all_zeros.size(), 0); // init to 0 data for easier debug - - tt_metal::detail::ReadFromBuffer(output_buffer, readback_data_vec); - return run_output_check(inputs, readback_data_vec); -}; - -void run_programs(std::vector& programs, const std::vector& devices) { - EXPECT_EQ(programs.size(), devices.size()); - const size_t num_programs = programs.size(); - try { - for (size_t i = 0; i < num_programs; i++) { - tt::tt_metal::detail::CompileProgram(devices.at(i), programs.at(i)); - } - } catch (std::exception& e) { - log_error("Failed compile: {}", e.what()); - throw e; - } - - log_info(tt::LogTest, "Running..."); - - std::vector threads; - threads.reserve(num_programs); - if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE")) { - for (size_t i = 0; i < num_programs; i++) { - threads.emplace_back(std::thread([&] { tt_metal::detail::LaunchProgram(devices.at(i), programs.at(i)); })); - } - - std::ranges::for_each(threads, [](std::thread& t) { t.join(); }); - } else { - for (size_t i = 0; i < num_programs; i++) { - tt_metal::EnqueueProgram(devices.at(i)->command_queue(), programs.at(i), false); - } - - log_debug(tt::LogTest, "Calling Finish"); - for (size_t i = 0; i < num_programs; i++) { - tt_metal::Finish(devices.at(i)->command_queue()); - } - } -} - -std::tuple, std::vector> build_input_buffer( - IDevice* first_device, size_t tensor_size_bytes, BankedConfig const& test_config) { - auto inputs = std::vector(tensor_size_bytes / sizeof(uint32_t), 0); - std::iota(inputs.begin(), inputs.end(), 0); - - // Input buffer - auto local_input_buffer = CreateBuffer(InterleavedBufferConfig{ - first_device, test_config.size_bytes, test_config.page_size_bytes, test_config.input_buffer_type}); - tt_metal::detail::WriteToBuffer(local_input_buffer, inputs); - return {local_input_buffer, inputs}; -} - 
-static void build_and_enqueue( - const std::vector& devices, std::vector& programs, bool enqueue_only = false) { - TT_FATAL( - devices.size() == programs.size(), - "Number of devices must match number of programs when calling build_and_enqueue in test"); - if (!enqueue_only) { - for (size_t i = 0; i < devices.size(); i++) { - tt::tt_metal::detail::CompileProgram(devices[i], programs[i]); - } - } - for (size_t i = 0; i < devices.size(); i++) { - tt_metal::EnqueueProgram(devices[i]->command_queue(), programs[i], false); - } -} - -struct EthLinkHop { - CoreCoord hop_src; - CoreCoord hop_dest; -}; - -struct ChipConnection { - std::vector links; -}; - -struct unicast_send { - size_t distance; -}; -struct mcast_send { - size_t distance; - size_t range; -}; - -using mode_variant_t = std::variant; - -static constexpr size_t PACKET_HEADER_SIZE_BYTES = sizeof(tt::fabric::PacketHeader); -void generate_sender_worker_kernels( - Program& program, - IDevice* device, - const CoreCoord& worker_core, - const ttnn::ccl::SenderWorkerAdapterSpec& worker_fabric_connection, - const mode_variant_t& mode, - std::size_t edm_buffer_size, - uint32_t page_plus_header_size, - uint32_t num_pages_total, - uint32_t num_pages_per_edm_buffer, - uint32_t local_worker_fabric_semaphore_id, - uint32_t local_worker_teardown_semaphore_id, - uint32_t local_worker_last_message_semaphore_id, - uint32_t dram_input_buffer_base_addr, - bool src_is_dram, - uint32_t dram_output_buffer_base_addr, - bool dest_is_dram, - uint32_t worker_buffer_index_semaphore_id, - // farthest to closest - const std::vector& edm_termination_infos) { - auto const& edm_noc_core = CoreCoord(worker_fabric_connection.edm_noc_x, worker_fabric_connection.edm_noc_y); - std::vector sender_worker_reader_compile_args{ - src_is_dram, // - num_pages_total, // - page_plus_header_size - PACKET_HEADER_SIZE_BYTES, - num_pages_per_edm_buffer}; - std::vector sender_worker_reader_runtime_args{dram_input_buffer_base_addr}; - - log_trace(tt::LogTest, "\tSenderReader CT Args"); - for (auto const& arg : sender_worker_reader_compile_args) { - log_trace(tt::LogTest, "\t\t{}", arg); - } - log_trace(tt::LogTest, "\tSenderReader RT Args"); - for (auto const& arg : sender_worker_reader_runtime_args) { - log_trace(tt::LogTest, "\t\t{}", arg); - } - - std::vector sender_worker_writer_compile_args{ - num_pages_per_edm_buffer, - num_pages_total, - page_plus_header_size - PACKET_HEADER_SIZE_BYTES, - worker_fabric_connection.num_buffers_per_channel, - dest_is_dram, - std::holds_alternative(mode) ? 
1 : 0}; - log_trace(tt::LogTest, "worker_fabric_connection.edm_l1_sem_addr: {}", worker_fabric_connection.edm_l1_sem_addr); - log_trace(tt::LogTest, "worker_buffer_index_semaphore_id: {}", worker_buffer_index_semaphore_id); - log_trace(tt::LogTest, "last_message_semaphore_address: {}", local_worker_last_message_semaphore_id); - log_trace( - tt::LogTest, "Sender communicating with EDM: x={}, y={}", (uint32_t)edm_noc_core.x, (uint32_t)edm_noc_core.y); - std::vector sender_worker_writer_runtime_args{ - worker_fabric_connection.edm_buffer_base_addr, - worker_fabric_connection.edm_l1_sem_addr, - local_worker_fabric_semaphore_id, - local_worker_teardown_semaphore_id, - (uint32_t)edm_noc_core.x, - (uint32_t)edm_noc_core.y, - worker_fabric_connection.num_buffers_per_channel, - - worker_fabric_connection.edm_connection_handshake_addr, - worker_fabric_connection.edm_worker_location_info_addr, - edm_buffer_size, - dram_output_buffer_base_addr, - local_worker_last_message_semaphore_id, - worker_buffer_index_semaphore_id, - worker_fabric_connection.persistent_fabric ? 1 : 0, - worker_fabric_connection.buffer_index_semaphore_id}; - - if (std::holds_alternative(mode)) { - sender_worker_writer_runtime_args.push_back(std::get(mode).distance); - sender_worker_writer_runtime_args.push_back(std::get(mode).range); - } else { - sender_worker_writer_runtime_args.push_back(std::get(mode).distance); - } - - get_runtime_args_for_edm_termination_infos(edm_termination_infos, sender_worker_writer_runtime_args); - - uint32_t src0_cb_index = CBIndex::c_0; - log_trace(tt::LogTest, "\tSenderWriter CT Args"); - for (auto const& arg : sender_worker_writer_compile_args) { - log_trace(tt::LogTest, "\t\t{}", arg); - } - log_trace(tt::LogTest, "\tSenderWriter RT Args"); - for (auto const& arg : sender_worker_writer_runtime_args) { - log_trace(tt::LogTest, "\t\t{}", arg); - } - - // Just want a dummy DF - tt::DataFormat df = (page_plus_header_size - PACKET_HEADER_SIZE_BYTES) == 1024 ? tt::DataFormat::Bfp8 - : (page_plus_header_size - PACKET_HEADER_SIZE_BYTES) == 2048 ? 
tt::DataFormat::Float16 - : tt::DataFormat::Float32; - tt_metal::CircularBufferConfig cb_src0_config = - tt_metal::CircularBufferConfig(2 * num_pages_per_edm_buffer * page_plus_header_size, {{src0_cb_index, df}}) - .set_page_size(src0_cb_index, page_plus_header_size); - CBHandle sender_workers_cb = CreateCircularBuffer(program, worker_core, cb_src0_config); - auto sender_worker_reader_kernel = tt_metal::CreateKernel( - program, - "tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp", - worker_core, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .compile_args = sender_worker_reader_compile_args}); - auto sender_worker_writer_kernel = tt_metal::CreateKernel( - program, - "tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp", - worker_core, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_1, - .noc = tt_metal::NOC::RISCV_1_default, - .compile_args = sender_worker_writer_compile_args}); - tt_metal::SetRuntimeArgs(program, sender_worker_reader_kernel, worker_core, sender_worker_reader_runtime_args); - tt_metal::SetRuntimeArgs(program, sender_worker_writer_kernel, worker_core, sender_worker_writer_runtime_args); -} - -bool RunLoopbackTest( - tt_metal::IDevice* sender_device, - tt_metal::IDevice* receiver_device, - - const CoreCoord& eth_sender_core, - const CoreCoord& eth_receiver_core, - - const uint32_t page_size, - const uint32_t num_pages_total, - bool src_is_dram, - bool dest_is_dram, - std::vector& programs, - ttnn::ccl::FabricEriscDatamoverBuilder& chip_0_edm_builder, - std::optional& subdevice_managers, - bool enable_persistent_fabric) { - auto& sender_program = programs.at(0); - std::size_t page_plus_header_size = page_size + sizeof(tt::fabric::PacketHeader); - std::size_t tensor_size_bytes = num_pages_total * page_size; - - std::vector worker_cores = {CoreCoord(0, 0)}; - - auto local_worker_fabric_semaphore_id = tt::tt_metal::CreateSemaphore(sender_program, worker_cores.at(0), 0); - auto local_worker_teardown_semaphore_id = tt::tt_metal::CreateSemaphore(sender_program, worker_cores.at(0), 0); - auto local_worker_last_message_semaphore_id = tt::tt_metal::CreateSemaphore(sender_program, worker_cores.at(0), 0); - auto worker_buffer_index_semaphore_id = tt::tt_metal::CreateSemaphore(sender_program, worker_cores.at(0), 0); - - // Generate inputs - //////////////////////////////////////////////////////////////////////////// - // SETUP THE INPUT CB - //////////////////////////////////////////////////////////////////////////// - - BankedConfig test_config = BankedConfig{ - .num_pages = num_pages_total, - .size_bytes = tensor_size_bytes, - .page_size_bytes = page_size, - .input_buffer_type = src_is_dram ? BufferType::DRAM : BufferType::L1, - .output_buffer_type = dest_is_dram ? 
BufferType::DRAM : BufferType::L1, - .l1_data_format = tt::DataFormat::Float16_b}; - - auto [local_input_buffer, inputs] = build_input_buffer(sender_device, tensor_size_bytes, test_config); - - std::vector all_zeros(inputs.size(), 0); - auto local_output_buffer = CreateBuffer(InterleavedBufferConfig{ - sender_device, test_config.size_bytes, test_config.page_size_bytes, test_config.output_buffer_type}); - - tt_metal::detail::WriteToBuffer(local_output_buffer, all_zeros); - - auto local_input_buffer_address = local_input_buffer->address(); - auto local_output_buffer_address = local_output_buffer->address(); - - //////////////////////////////////////////////////////////////////////////// - // EDM Builder Setup - //////////////////////////////////////////////////////////////////////////// - - static constexpr std::size_t edm_buffer_size = - ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES; - - auto chip0_worker_fabric_connection = chip_0_edm_builder.build_connection_to_worker_channel(); - //////////////////////////////////////////////////////////////////////////// - // Build Workers - //////////////////////////////////////////////////////////////////////////// - log_trace(tt::LogTest, "Generating local_sender -> remote_receiver workers"); - const std::size_t pages_per_send = - (chip0_worker_fabric_connection.buffer_size_bytes - PACKET_HEADER_SIZE_BYTES) / page_size; - auto const& worker_core = worker_cores.at(0); - log_trace(tt::LogTest, "Worker {}. On Core x={},y={}", 0, worker_core.x, worker_core.y); - - const auto& edm_config = ttnn::ccl::FabricEriscDatamoverConfig(edm_buffer_size, 1, 2); - const std::vector& edm_termination_infos = - enable_persistent_fabric ? std::vector{} - : std::vector{ - {1, - sender_device->ethernet_core_from_logical_core(eth_receiver_core).x, - sender_device->ethernet_core_from_logical_core(eth_receiver_core).y, - chip_0_edm_builder.config.termination_signal_address}, - {0, - sender_device->ethernet_core_from_logical_core(eth_sender_core).x, - sender_device->ethernet_core_from_logical_core(eth_sender_core).y, - chip_0_edm_builder.config.termination_signal_address}}; - - TT_ASSERT( - (enable_persistent_fabric && edm_termination_infos.size() == 0) || - (!enable_persistent_fabric && edm_termination_infos.size() > 0)); - generate_sender_worker_kernels( - sender_program, - sender_device, - worker_core, - chip0_worker_fabric_connection, - unicast_send{2}, // 2 hops because we are looping back to ourselves - edm_buffer_size, - page_plus_header_size, - num_pages_total, - pages_per_send, - local_worker_fabric_semaphore_id, - local_worker_teardown_semaphore_id, - local_worker_last_message_semaphore_id, - local_input_buffer_address, - src_is_dram, - local_output_buffer_address, - dest_is_dram, - worker_buffer_index_semaphore_id, - edm_termination_infos); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - std::vector devices = {sender_device}; - if (!enable_persistent_fabric) { - devices.push_back(receiver_device); - } - log_trace(tt::LogTest, "{} programs, {} devices", programs.size(), devices.size()); - run_programs(programs, devices); - log_info(tt::LogTest, "Reading back outputs"); - - bool pass = true; - constexpr bool enable_check = true; - if constexpr (enable_check) { - pass &= run_output_check(all_zeros, inputs, local_output_buffer) == Correctness::Correct; - } - return 
pass; -} - -void generate_multi_input_test_worker_reader_kernel( - Program& program, - std::vector const& cb_indices, - std::vector const& tensors, - IDevice* device, - uint32_t page_size, - CoreRangeSet const& worker_core_range, - uint32_t num_pages_per_edm_buffer, - ttnn::ccl::v2::TensorSlice const& in0_command_tensor_slice, - ttnn::ccl::v2::TensorSlice const& in1_command_tensor_slice, - ttnn::ccl::cmd::CclCommandCode command_type, - DataMovementConfig const& datamovement_kernel_config, - std::optional const& chip0_worker_forward_fabric_connection, - std::optional const& chip0_worker_backward_fabric_connection, - std::optional const& optional_teardown_sequence, - ttnn::ccl::cmd::CclCommandDestArgs const& dest_args) { - bool fabric_enabled = std::holds_alternative(dest_args) || - std::holds_alternative(dest_args); - using namespace ttnn::ccl::cmd::uops; - using namespace ttnn::ccl::cmd; - log_trace( - tt::LogTest, - "Generating multi input test worker reader kernel for command type: {}", - static_cast(command_type)); - - TT_FATAL( - command_type == ttnn::ccl::cmd::CclCommandCode::STREAM_TENSOR_TO_CB || - command_type == ttnn::ccl::cmd::CclCommandCode::STREAM_CB_TO_TENSOR, - "Unsupported tensor IO command type"); - - TT_ASSERT(tensors.size() > 0 && tensors.size() <= 2); - TT_ASSERT(cb_indices.size() == tensors.size()); - - auto sender_worker_reader_kernel = ttnn::ccl::worker_detail::generate_multi_command_stream_kernel_ct_args( - program, cb_indices, tensors, worker_core_range, datamovement_kernel_config); - - std::vector ccl_command_stream0; - std::vector ccl_command_stream1; - - // Add the main tensor slice commands - if (command_type == ttnn::ccl::cmd::CclCommandCode::STREAM_TENSOR_TO_CB) { - log_trace(tt::LogTest, "Adding local noc read"); - if (fabric_enabled) { - ccl_command_stream0.push_back( - read_tensor_slice_to_cb_for_eventual_fabric_write(in0_command_tensor_slice, cb_indices.at(0))); - ccl_command_stream1.push_back( - read_tensor_slice_to_cb_for_eventual_fabric_write(in1_command_tensor_slice, cb_indices.at(1))); - } else { - ccl_command_stream0.push_back(read_tensor_slice_to_cb(in0_command_tensor_slice, cb_indices.at(0))); - ccl_command_stream1.push_back(read_tensor_slice_to_cb(in1_command_tensor_slice, cb_indices.at(1))); - } - } else { - if (std::holds_alternative(dest_args)) { - log_trace(tt::LogTest, "Adding local noc write"); - ccl_command_stream0.push_back(local_write_cb_to_tensor_slice(in0_command_tensor_slice, cb_indices.at(0))); - ccl_command_stream1.push_back(local_write_cb_to_tensor_slice(in1_command_tensor_slice, cb_indices.at(1))); - } else { - if (std::holds_alternative(dest_args)) { - log_trace( - tt::LogTest, - "Adding fabric unicast write command. Distance: {}. Forward: {}", - std::get(dest_args).distance_in_hops, - std::get(dest_args).is_forward_direction); - ccl_command_stream0.push_back(fabric_write_cb_to_tensor_slice( - in0_command_tensor_slice, - cb_indices.at(0), - UnicastCommandDestArgs{std::get(dest_args)})); - ccl_command_stream1.push_back(fabric_write_cb_to_tensor_slice( - in1_command_tensor_slice, - cb_indices.at(1), - UnicastCommandDestArgs{std::get(dest_args)})); - } else if (std::holds_alternative(dest_args)) { - log_trace( - tt::LogTest, - "Adding fabric multicast write command. Forward: {}. 
Backward: {}", - std::get(dest_args).num_targets_forward_direction, - std::get(dest_args).num_targets_backward_direction); - ccl_command_stream0.push_back(fabric_write_cb_to_tensor_slice( - in0_command_tensor_slice, - cb_indices.at(0), - MulticastCommandDestArgs{std::get(dest_args)})); - ccl_command_stream1.push_back(fabric_write_cb_to_tensor_slice( - in1_command_tensor_slice, - cb_indices.at(1), - MulticastCommandDestArgs{std::get(dest_args)})); - } else { - log_trace(tt::LogTest, "WTF? Should have been caught earlier"); - TT_FATAL(true, "Unsupported dest args type"); - } - } - } - - // Now, because we are bringing up/tearing down the fabric per op with this program, we need to queue up the - // commands to teardown the fabric - // We need to make sure only one of the command streams is sending out the termination signals, and we - // need to make sure it only does that after the other command stream is done - so what we do is - // make the termination command stream wait for a semaphore value (locally) that the other command stream - // will set after it has finished. - if (optional_teardown_sequence.has_value()) { - std::ranges::copy(optional_teardown_sequence.value(), std::back_inserter(ccl_command_stream0)); - } - - ttnn::ccl::worker_detail::generate_multi_input_command_stream_kernel_rt_args( - program, - sender_worker_reader_kernel, - tensors, - {page_size, page_size}, - device, - num_pages_per_edm_buffer, // TODO: get from fabric - worker_core_range, - ccl_command_stream0, - ccl_command_stream1, - chip0_worker_forward_fabric_connection, - chip0_worker_backward_fabric_connection); -} - -void generate_multi_input_test_worker_kernels_for_local_tensor_write( - Program& program, - IDevice* device, - Tensor& input_tensor0, - Tensor& input_tensor1, - Tensor& output_tensor0, - Tensor& output_tensor1, - size_t first_cb_index, - size_t second_cb_index, - CoreCoord const& worker_core, - const uint32_t page_plus_header_size, - const uint32_t num_pages_per_edm_buffer, - ttnn::ccl::v2::TensorSlice const& in0_tensor_slice, - ttnn::ccl::v2::TensorSlice const& in1_tensor_slice, - ttnn::ccl::v2::TensorSlice const& out0_tensor_slice, - ttnn::ccl::v2::TensorSlice const& out1_tensor_slice, - std::optional const& optional_teardown_sequence, - std::optional& chip0_worker_forward_fabric_connection, - std::optional& chip0_worker_backward_fabric_connection, - ttnn::ccl::cmd::CclCommandDestArgs const& dest_args) { - // Just want a dummy DF - tt::DataFormat df = (page_plus_header_size - PACKET_HEADER_SIZE_BYTES) == 1024 ? tt::DataFormat::Bfp8 - : (page_plus_header_size - PACKET_HEADER_SIZE_BYTES) == 2048 ? 
tt::DataFormat::Float16 - : tt::DataFormat::Float32; - - { - tt_metal::CircularBufferConfig cb_src0_config = - tt_metal::CircularBufferConfig(2 * num_pages_per_edm_buffer * page_plus_header_size, {{first_cb_index, df}}) - .set_page_size(first_cb_index, page_plus_header_size); - CBHandle cb0 = CreateCircularBuffer(program, worker_core, cb_src0_config); - } - { - tt_metal::CircularBufferConfig cb_src1_config = - tt_metal::CircularBufferConfig( - 2 * num_pages_per_edm_buffer * page_plus_header_size, {{second_cb_index, df}}) - .set_page_size(second_cb_index, page_plus_header_size); - CBHandle cb1 = CreateCircularBuffer(program, worker_core, cb_src1_config); - } - - generate_multi_input_test_worker_reader_kernel( - program, - {first_cb_index, second_cb_index}, - {&input_tensor0, &input_tensor1}, - device, - page_plus_header_size - PACKET_HEADER_SIZE_BYTES, - CoreRangeSet({CoreRange(worker_core)}), - num_pages_per_edm_buffer, - in0_tensor_slice, - in1_tensor_slice, - ttnn::ccl::cmd::CclCommandCode::STREAM_TENSOR_TO_CB, - tt_metal::ReaderDataMovementConfig{}, - std::nullopt, - std::nullopt, - std::nullopt, - dest_args); - - generate_multi_input_test_worker_reader_kernel( - program, - {first_cb_index, second_cb_index}, - {&output_tensor0, &output_tensor1}, - device, - page_plus_header_size - PACKET_HEADER_SIZE_BYTES, - CoreRangeSet({CoreRange(worker_core)}), - num_pages_per_edm_buffer, - out0_tensor_slice, - out1_tensor_slice, - ttnn::ccl::cmd::CclCommandCode::STREAM_CB_TO_TENSOR, - tt_metal::WriterDataMovementConfig{}, - chip0_worker_forward_fabric_connection, - chip0_worker_backward_fabric_connection, - optional_teardown_sequence, - dest_args); -} - -bool RunLocalTestWithMultiInputReaders( - std::vector const& devices, - std::vector& programs, - std::optional& line_fabric, - - Tensor& input_tensor0, - Tensor& input_tensor1, - Tensor& output_tensor0, - Tensor& output_tensor1, - std::vector input0_tensors, // Device - std::vector input1_tensors, // Device - std::vector output0_tensors, // Device - std::vector output1_tensors, // Device - - ttnn::ccl::v2::TensorSlice const& in0_tensor_slice, - ttnn::ccl::v2::TensorSlice const& in1_tensor_slice, - ttnn::ccl::v2::TensorSlice const& out0_tensor_slice, - ttnn::ccl::v2::TensorSlice const& out1_tensor_slice, - - const uint32_t page_size, - TwoInputReaderKernelWriteMode test_mode, - ttnn::ccl::cmd::CclCommandDestArgs const& dest_args, - std::optional& subdevice_managers, - bool enable_persistent_fabric) { - const bool fabric_enabled = test_mode != TwoInputReaderKernelWriteMode::LOCAL_WRITEBACK; - tt_metal::IDevice* device = devices.at(0); - for (size_t i = 0; i < devices.size(); i++) { - log_info(tt::LogTest, "Device[{}] ID: {}", i, devices.at(i)->id()); - } - auto program_ptrs = std::vector(); - program_ptrs.reserve(devices.size()); - std::ranges::transform(programs, std::back_inserter(program_ptrs), [](auto& p) { return &p; }); - - size_t output_tensor_dest_device_index = 0; - if (fabric_enabled) { - if (std::holds_alternative(dest_args)) { - log_info( - tt::LogTest, - "Unicast command dest args. Distance in hops: {}", - std::get(dest_args).distance_in_hops); - output_tensor_dest_device_index = - std::get(dest_args).distance_in_hops; - TT_ASSERT(output_tensor_dest_device_index != 0, "Output tensor destination device index must be non-zero"); - TT_ASSERT(test_mode == TwoInputReaderKernelWriteMode::FABRIC_UNICAST); - } else if (std::holds_alternative(dest_args)) { - log_info( - tt::LogTest, - "Multicast command dest args. 
Number of targets forward direction: {}", - std::get(dest_args).num_targets_forward_direction); - output_tensor_dest_device_index = - std::get(dest_args).num_targets_forward_direction; - TT_ASSERT(output_tensor_dest_device_index != 0, "Output tensor destination device index must be non-zero"); - TT_ASSERT(test_mode == TwoInputReaderKernelWriteMode::FABRIC_MULTICAST); - } - } else { - log_info(tt::LogTest, "No fabric enabled"); - TT_ASSERT( - std::holds_alternative(dest_args), "Local command dest args expected"); - } - - std::size_t page_plus_header_size = page_size + sizeof(tt::fabric::PacketHeader); - - auto first_cb_index = tt::CB::c_in0; - auto second_cb_index = tt::CB::c_in1; - - auto output_tensor_dest_device = devices.at(output_tensor_dest_device_index); - TT_ASSERT(input_tensor0.get_logical_shape()[-2] != 1); - - bool is_fabric_mcast = std::holds_alternative(dest_args); - - auto input_tensor0_device = input0_tensors.at(0); - auto input_tensor1_device = input1_tensors.at(0); - auto output_tensor0_device = output0_tensors.at(output_tensor_dest_device_index); - auto output_tensor1_device = output1_tensors.at(output_tensor_dest_device_index); - - log_info(tt::LogTest, "input_tensor0_device->address(): {}", input_tensor0_device.buffer()->address()); - log_info(tt::LogTest, "input_tensor1_device->address(): {}", input_tensor1_device.buffer()->address()); - log_info( - tt::LogTest, - "output_tensor0_device->address(): {} on device {}", - output_tensor0_device.buffer()->address(), - output_tensor_dest_device->id()); - log_info( - tt::LogTest, - "output_tensor1_device->address(): {} on device {}", - output_tensor1_device.buffer()->address(), - output_tensor_dest_device->id()); - - //////////////////////////////////////////////////////////////////////////// - // Build Workers - //////////////////////////////////////////////////////////////////////////// - auto const& worker_core = CoreCoord(0, 0); - - const size_t num_pages_per_edm_buffer = 2; - - std::optional chip0_worker_forward_fabric_connection = - fabric_enabled ? line_fabric->uniquely_connect_worker(devices[0], ttnn::ccl::EdmLineFabricOpInterface::FORWARD) - : std::optional{std::nullopt}; - - // always at start of line for now - std::optional> edm_termination_infos = - (!fabric_enabled || enable_persistent_fabric) - ? 
std::optional>{std::nullopt} - : line_fabric->generate_ordered_termination_info_farthest_to_nearest(); - std::optional chip0_worker_backward_fabric_connection = std::nullopt; - - std::optional sync_details; - std::optional teardown_worker_core; - std::optional teardown_command_stream; - if (fabric_enabled && !enable_persistent_fabric) { - teardown_worker_core = worker_core; - - sync_details = ttnn::ccl::SyncModeSpec{}; - sync_details->core = teardown_worker_core.value(); - sync_details->add_signal(tt::tt_metal::CreateSemaphore(programs.at(0), teardown_worker_core.value(), 0), 1); - teardown_command_stream = {ttnn::ccl::cmd::uops::local_core_semaphore_inc(sync_details->sem_ids.at(0), 1)}; - TT_FATAL(edm_termination_infos.has_value(), "EDM termination infos must be set if fabric is enabled"); - ttnn::ccl::cmd::CclHostLowLevelCommandSequence teardown_commands; - - teardown_commands = ttnn::ccl::worker_detail::build_ccl_cmd_proc_teardown_commands( - programs.at(0), - device, - nullptr, // forward device - in this test, we have a single source doing all teardown - devices.size(), - 0, - edm_termination_infos.value(), - sync_details.value(), - line_fabric.value()); - std::ranges::copy(teardown_commands, std::back_inserter(teardown_command_stream.value())); - } - - generate_multi_input_test_worker_kernels_for_local_tensor_write( - programs.at(0), - device, - input_tensor0_device, - input_tensor1_device, - output_tensor0_device, - output_tensor1_device, - first_cb_index, - second_cb_index, - worker_core, - page_plus_header_size, - num_pages_per_edm_buffer, - in0_tensor_slice, - in1_tensor_slice, - out0_tensor_slice, - out1_tensor_slice, - teardown_command_stream, - chip0_worker_forward_fabric_connection, - chip0_worker_backward_fabric_connection, - dest_args); - - if (!enable_persistent_fabric) { - log_info(tt::LogTest, "Building EDM kernels"); - line_fabric->build_kernels(); - } - - log_info(tt::LogTest, "persistent_fabric: {}", enable_persistent_fabric); - log_info(tt::LogTest, "subdevice_managers.has_value(): {}", subdevice_managers.has_value()); - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - run_programs(programs, enable_persistent_fabric ? 
std::vector{devices[0]} : devices); - log_info(tt::LogTest, "Finished"); - - bool pass = true; - constexpr bool enable_check = true; - if constexpr (enable_check) { - log_info(tt::LogTest, "Reading back outputs"); - auto output0_cpu = output_tensor0_device.cpu(true, ttnn::DefaultQueueId); - auto output1_cpu = output_tensor1_device.cpu(true, ttnn::DefaultQueueId); - - auto in0_tensor_copyback_cpu = input_tensor0_device.cpu(true, ttnn::DefaultQueueId); - auto in1_tensor_copyback_cpu = input_tensor1_device.cpu(true, ttnn::DefaultQueueId); - - auto in0_tensor_copyback = tt::tt_metal::owned_buffer::get_as(in0_tensor_copyback_cpu); - auto in1_tensor_copyback = tt::tt_metal::owned_buffer::get_as(in1_tensor_copyback_cpu); - - auto in0_tensor_data = tt::tt_metal::owned_buffer::get_as(input_tensor0); - auto in1_tensor_data = tt::tt_metal::owned_buffer::get_as(input_tensor1); - auto out0_tensor_data = tt::tt_metal::owned_buffer::get_as(output0_cpu); - auto out1_tensor_data = tt::tt_metal::owned_buffer::get_as(output1_cpu); - - bool input0_copyback_check_passed = - run_output_check(in0_tensor_data, in0_tensor_copyback) == Correctness::Correct; - bool input1_copyback_check_passed = - run_output_check(in1_tensor_data, in1_tensor_copyback) == Correctness::Correct; - TT_FATAL(input0_copyback_check_passed, "Input 0 copyback check failed"); - TT_FATAL(input1_copyback_check_passed, "Input 1 copyback check failed"); - - log_info(tt::LogTest, "Comparing outputs"); - pass &= run_output_check(in0_tensor_data, out0_tensor_data) == Correctness::Correct; - if (pass) { - log_info(tt::LogTest, "Output check passed for output 0"); - } else { - log_error(tt::LogTest, "Output check failed for output 0"); - } - pass &= run_output_check(in1_tensor_data, out1_tensor_data) == Correctness::Correct; - if (pass) { - log_info(tt::LogTest, "Output check passed for output 1"); - } else { - log_error(tt::LogTest, "Output check failed for output 1"); - } - } - - return pass; -} - -bool RunLineFabricTest( - std::vector devices, - std::vector& programs, - - const size_t mcast_first_chip, - const size_t mcast_last_chip, - - const uint32_t page_size, - const uint32_t num_pages_total, - bool src_is_dram, - bool dest_is_dram, - - std::optional& subdevice_managers, - ttnn::ccl::EdmLineFabricOpInterface& line_fabric, - bool enable_persistent_fabric) { - std::size_t page_plus_header_size = page_size + sizeof(tt::fabric::PacketHeader); - std::size_t tensor_size_bytes = num_pages_total * page_size; - - static constexpr std::size_t edm_buffer_size = - ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES; - const size_t local_chip_id = 0; - const size_t remote_chip_id = 1; - auto program_ptrs = std::vector(devices.size()); - std::transform(programs.begin(), programs.end(), program_ptrs.begin(), [](auto& program) { return &program; }); - - std::vector worker_cores = {CoreCoord(0, 0)}; - - // Generate inputs - //////////////////////////////////////////////////////////////////////////// - // SETUP THE INPUT CB - //////////////////////////////////////////////////////////////////////////// - BankedConfig test_config = BankedConfig{ - .num_pages = num_pages_total, - .size_bytes = tensor_size_bytes, - .page_size_bytes = page_size, - .input_buffer_type = src_is_dram ? BufferType::DRAM : BufferType::L1, - .output_buffer_type = dest_is_dram ? 
BufferType::DRAM : BufferType::L1, - .l1_data_format = tt::DataFormat::Float16_b}; - - // Input buffer - auto [local_input_buffer, inputs] = build_input_buffer(devices[0], tensor_size_bytes, test_config); - auto local_input_buffer_address = local_input_buffer->address(); - - std::vector all_zeros(inputs.size(), 0); - // output buffers - TT_ASSERT( - enable_persistent_fabric || mcast_first_chip <= mcast_last_chip, - "mcast_first_chip must be less than or equal to mcast_last_chip"); - TT_ASSERT( - enable_persistent_fabric || mcast_last_chip < devices.size(), - "mcast_last_chip must be less than the number of devices"); - std::vector> output_buffers; - output_buffers.reserve(devices.size()); - for (size_t i = 0; i < devices.size(); i++) { - if (i == 0) { - output_buffers.push_back(CreateBuffer(InterleavedBufferConfig{ - devices.at(i), test_config.size_bytes, test_config.page_size_bytes, test_config.output_buffer_type})); - } else { - output_buffers.push_back(CreateBuffer( - InterleavedBufferConfig{ - devices.at(i), test_config.size_bytes, test_config.page_size_bytes, test_config.output_buffer_type}, - output_buffers[0]->address())); - } - tt_metal::detail::WriteToBuffer(output_buffers.back(), all_zeros); - } - auto local_output_buffer_address = output_buffers[0]->address(); - bool all_same_addr = std::ranges::all_of(output_buffers, [local_output_buffer_address](auto const& buffer) { - return buffer->address() == local_output_buffer_address; - }); - TT_ASSERT(all_same_addr, "All output buffers must have the same address"); - - //////////////////////////////////////////////////////////////////////////// - // Setup Semaphores and Builders - //////////////////////////////////////////////////////////////////////////// - - auto local_worker_fabric_semaphore_id = tt::tt_metal::CreateSemaphore(programs[0], worker_cores.at(0), 0); - auto local_worker_teardown_semaphore_id = tt::tt_metal::CreateSemaphore(programs[0], worker_cores.at(0), 0); - auto local_worker_last_message_semaphore_id = tt::tt_metal::CreateSemaphore(programs[0], worker_cores.at(0), 0); - auto worker_buffer_index_semaphore_id = tt::tt_metal::CreateSemaphore(programs[0], worker_cores.at(0), 0); - //////////////////////////////////////////////////////////////////////////// - // Build Workers - //////////////////////////////////////////////////////////////////////////// - log_trace(tt::LogTest, "Generating local_sender -> remote_receiver workers"); - auto const& worker_core = worker_cores.at(0); - log_trace(tt::LogTest, "Worker {}. On Core x={},y={}", 0, worker_core.x, worker_core.y); - - const auto edm_termination_infos = enable_persistent_fabric - ? 
std::vector{} - : line_fabric.generate_ordered_termination_info_farthest_to_nearest(); - - auto chip0_worker_fabric_connection = - line_fabric.uniquely_connect_worker(devices[0], ttnn::ccl::EdmLineFabricOpInterface::FORWARD); - - const std::size_t pages_per_send = - (chip0_worker_fabric_connection.buffer_size_bytes - PACKET_HEADER_SIZE_BYTES) / page_size; - generate_sender_worker_kernels( - programs[0], - devices[0], - worker_core, - chip0_worker_fabric_connection, - mcast_send{mcast_first_chip, mcast_last_chip - mcast_first_chip + 1}, - edm_buffer_size, - page_plus_header_size, - num_pages_total, - pages_per_send, - local_worker_fabric_semaphore_id, - local_worker_teardown_semaphore_id, - local_worker_last_message_semaphore_id, - local_input_buffer_address, - src_is_dram, - local_output_buffer_address, - dest_is_dram, - worker_buffer_index_semaphore_id, - edm_termination_infos); - - //////////////////////////////////////////////////////////////////////////// - // Build EDM Kernels - //////////////////////////////////////////////////////////////////////////// - if (!enable_persistent_fabric) { - line_fabric.build_kernels(); - } - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - run_programs(programs, devices); - log_info(tt::LogTest, "Reading back outputs"); - - bool pass = true; - constexpr bool enable_check = true; - if constexpr (enable_check) { - // Check all output buffers. Make sure only the buffers in the mcast range are - // non-zero. All other buffers outside the range should be zero filled - TT_ASSERT( - !std::all_of(inputs.begin(), inputs.end(), [](uint32_t x) { return x == 0; }), - "Input buffer expected to not be all 0"); - for (size_t i = 0; i < output_buffers.size(); i++) { - bool compare_with_input = (mcast_first_chip <= i && i <= mcast_last_chip); - auto& golden_tensor = compare_with_input ? 
inputs : all_zeros; - pass &= run_output_check(all_zeros, golden_tensor, output_buffers.at(i)) == Correctness::Correct; - } - } - - return pass; -} - -void persistent_fabric_teardown_sequence( - std::vector const& devices, - std::optional& subdevice_managers, - ttnn::ccl::EdmLineFabricOpInterface& line_fabric, - tt::fabric::TerminationSignal termination_mode = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE) { - log_info("Tearing down fabric"); - - // Wait for workers to finish - auto d0_worker_subdevice = devices[0]->get_sub_device_ids()[TEST_WORKERS_SUBDEVICE_INDEX]; - tt_metal::Finish(devices[0]->command_queue(), {subdevice_managers->worker_subdevice_id.at(devices[0]->id())}); - - // Teardown the fabric - line_fabric.teardown_from_host(termination_mode); - - // wait for fabric teardown to finish - std::ranges::for_each(devices, [&](IDevice* d) { - tt_metal::Finish(d->command_queue(), {subdevice_managers->fabric_subdevice_id.at(d->id())}); - }); -} - -void setup_test_with_persistent_fabric( - std::vector const& devices, - std::vector& programs, - std::optional& subdevice_managers, - std::optional>& fabric_programs, - std::vector& fabric_program_ptrs, - std::optional& line_fabric, - bool enable_persistent_fabric, - std::optional num_links = std::nullopt) { - if (enable_persistent_fabric) { - log_info(tt::LogTest, "Enabling persistent fabric"); - fabric_programs = std::vector(devices.size()); - subdevice_managers = create_subdevices(devices); - std::transform( - fabric_programs->begin(), fabric_programs->end(), std::back_inserter(fabric_program_ptrs), [](auto& p) { - return &p; - }); - } else { - std::transform( - programs.begin(), programs.end(), std::back_inserter(fabric_program_ptrs), [](auto& p) { return &p; }); - } - - line_fabric = ttnn::ccl::EdmLineFabricOpInterface( - devices, fabric_program_ptrs, enable_persistent_fabric, num_links.value_or(1)); - line_fabric->set_firmware_context_switch_interval(0); - - if (enable_persistent_fabric) { - TT_FATAL(fabric_programs.has_value(), "Fabric programs must be set if fabric is enabled"); - TT_FATAL(devices.size() == fabric_programs->size(), "Number of devices must match number of programs"); - - log_info(tt::LogTest, "Building EDM kernels"); - line_fabric->build_kernels(); - build_and_enqueue(devices, *fabric_programs); - } -} - -// RESUME HERE AND IMPLEMENT MCAST TEST -int TestLineFabricEntrypoint( - const size_t mcast_first_chip, - const size_t mcast_last_chip, - const uint32_t page_size, - const uint32_t num_pages_total, - const bool src_is_dram, - const bool dest_is_dram, - bool enable_persistent_fabric) { - // argv[0]: program - // argv[1]: buffer_size_bytes - // argv[2]: num_loops - - auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - auto num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (num_devices < 4) { - log_info("This test can only be run on T3000 devices"); - return 0; - } - if (arch == tt::ARCH::GRAYSKULL) { - log_info("Test must be run on WH"); - return 0; - } - - T3000TestDevice test_fixture; - auto view = test_fixture.mesh_device_->get_view(); - - // build a line of devices - std::vector devices = { - view.get_device(MeshCoordinate(0, 0)), - view.get_device(MeshCoordinate(0, 1)), - view.get_device(MeshCoordinate(0, 2)), - view.get_device(MeshCoordinate(0, 3))}; - std::vector programs(enable_persistent_fabric ? 
1 : devices.size()); - std::optional subdevice_managers = std::nullopt; - std::optional> fabric_programs; - std::vector fabric_program_ptrs; - std::optional line_fabric; - setup_test_with_persistent_fabric( - devices, - programs, - subdevice_managers, - fabric_programs, - fabric_program_ptrs, - line_fabric, - enable_persistent_fabric); - - auto launch_workers = [&](std::vector& _programs) -> bool { - bool success = false; - try { - success = RunLineFabricTest( - enable_persistent_fabric ? std::vector{devices[0]} : devices, - _programs, - // fabric_hops, - - mcast_first_chip, - mcast_last_chip, - - page_size, - num_pages_total, - src_is_dram, - dest_is_dram, - - subdevice_managers, - line_fabric.value(), - enable_persistent_fabric); - - } catch (std::exception& e) { - log_error("Caught exception: {}", e.what()); - test_fixture.TearDown(); - return false; - } - return success; - }; - bool success = launch_workers(programs); - - if (enable_persistent_fabric) { - std::vector second_run_programs(1); - success = launch_workers(second_run_programs); - persistent_fabric_teardown_sequence( - devices, subdevice_managers, line_fabric.value(), tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE); - } - - test_fixture.TearDown(); - - return success ? 0 : -1; -} - -int TestLoopbackEntrypoint( - const uint32_t page_size, - const uint32_t num_pages_total, - const bool src_is_dram, - const bool dest_is_dram, - bool enable_persistent_fabric) { - // argv[0]: program - // argv[1]: buffer_size_bytes - // argv[2]: num_loops - std::optional subdevice_managers = std::nullopt; - - auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - auto num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (num_devices < 4) { - log_info("This test can only be run on T3000 devices"); - return 0; - } - if (arch == tt::ARCH::GRAYSKULL) { - log_info("Test must be run on WH"); - return 0; - } - - T3000TestDevice test_fixture; - auto view = test_fixture.mesh_device_->get_view(); - - const auto& device_0 = view.get_device(MeshCoordinate(0, 0)); - const auto& device_1 = view.get_device(MeshCoordinate(0, 1)); - - auto const& active_eth_cores = device_0->get_active_ethernet_cores(true); - auto eth_sender_core_iter = active_eth_cores.begin(); - auto eth_sender_core_iter_end = active_eth_cores.end(); - chip_id_t device_id = std::numeric_limits::max(); - tt_xy_pair eth_receiver_core; - bool initialized = false; - tt_xy_pair eth_sender_core; - do { - TT_FATAL(eth_sender_core_iter != eth_sender_core_iter_end, "Error"); - std::tie(device_id, eth_receiver_core) = device_0->get_connected_ethernet_core(*eth_sender_core_iter); - eth_sender_core = *eth_sender_core_iter; - eth_sender_core_iter++; - } while (device_id != device_1->id()); - TT_ASSERT(device_id == device_1->id()); - // const auto& device_1 = test_fixture.mesh_device_->get_device(device_id); - - std::vector programs(enable_persistent_fabric ? 1 : 2); - std::optional> fabric_programs; - auto& sender_program = programs.at(0); - if (enable_persistent_fabric) { - log_info(tt::LogTest, "Enabling persistent fabric"); - fabric_programs = std::vector(2); - subdevice_managers = create_subdevices({device_0, device_1}); - } - - auto& fabric_sender_program = enable_persistent_fabric ? fabric_programs->at(0) : sender_program; - auto& fabric_receiver_program = enable_persistent_fabric ? 
fabric_programs->at(1) : programs.at(1); - IDevice* sender_device = device_0; - IDevice* receiver_device = device_1; - - static constexpr std::size_t edm_buffer_size = - ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES; - const chip_id_t local_chip_id = 0; - const chip_id_t remote_chip_id = 1; - auto const& edm_config = ttnn::ccl::FabricEriscDatamoverConfig(edm_buffer_size, 1, 2); - auto chip_0_edm_builder = ttnn::ccl::FabricEriscDatamoverBuilder::build( - sender_device, - fabric_sender_program, - eth_sender_core, - local_chip_id, - remote_chip_id, - edm_config, - enable_persistent_fabric); - chip_0_edm_builder.set_firmware_context_switch_interval(0); - auto chip_1_edm_builder = ttnn::ccl::FabricEriscDatamoverBuilder::build( - receiver_device, - fabric_receiver_program, - eth_receiver_core, - remote_chip_id, - local_chip_id, - edm_config, - enable_persistent_fabric); - chip_1_edm_builder.set_firmware_context_switch_interval(0); - // Create the loopback connection on the second device - chip_1_edm_builder.connect_to_downstream_edm(chip_1_edm_builder); - auto local_edm_kernel = ttnn::ccl::generate_edm_kernel( - fabric_sender_program, sender_device, chip_0_edm_builder, eth_sender_core, NOC::NOC_0); - auto remote_edm_kernel = ttnn::ccl::generate_edm_kernel( - fabric_receiver_program, receiver_device, chip_1_edm_builder, eth_receiver_core, NOC::NOC_0); - - if (enable_persistent_fabric) { - tt::tt_metal::detail::CompileProgram(sender_device, fabric_sender_program); - tt::tt_metal::detail::CompileProgram(receiver_device, fabric_receiver_program); - tt_metal::EnqueueProgram(sender_device->command_queue(), fabric_sender_program, false); - tt_metal::EnqueueProgram(receiver_device->command_queue(), fabric_receiver_program, false); - } - log_trace(tt::LogTest, "{} programs ", programs.size()); - bool success = false; - try { - success = RunLoopbackTest( - device_0, - device_1, - - eth_sender_core, - eth_receiver_core, - - page_size, - num_pages_total, - src_is_dram, - dest_is_dram, - programs, - chip_0_edm_builder, - subdevice_managers, - enable_persistent_fabric); - } catch (std::exception& e) { - log_error("Caught exception: {}", e.what()); - test_fixture.TearDown(); - return -1; - } - - if (enable_persistent_fabric) { - // Run the test twice with a single fabric invocation - - std::vector second_programs(1); - try { - success = RunLoopbackTest( - device_0, - device_1, - - eth_sender_core, - eth_receiver_core, - - page_size, - num_pages_total, - src_is_dram, - dest_is_dram, - second_programs, - chip_0_edm_builder, - subdevice_managers, - enable_persistent_fabric); - } catch (std::exception& e) { - log_error("Caught exception: {}", e.what()); - test_fixture.TearDown(); - return -1; - } - // Wait for worker programs to finish - - auto d0_worker_subdevice = device_0->get_sub_device_ids()[TEST_WORKERS_SUBDEVICE_INDEX]; - auto d1_worker_subdevice = device_1->get_sub_device_ids()[TEST_WORKERS_SUBDEVICE_INDEX]; - auto d0_fabric_subdevice = device_0->get_sub_device_ids()[TEST_EDM_FABRIC_SUBDEVICE_INDEX]; - auto d1_fabric_subdevice = device_1->get_sub_device_ids()[TEST_EDM_FABRIC_SUBDEVICE_INDEX]; - // Teardown the fabric - tt_metal::Finish(sender_device->command_queue(), {d0_worker_subdevice}); - // tt_metal::Finish(receiver_device->command_queue(), {d1_worker_subdevice}); - - // Notify fabric of teardown - chip_1_edm_builder.teardown_from_host(receiver_device); - chip_0_edm_builder.teardown_from_host(sender_device); - - // wait for fabric finish - 
tt_metal::Finish(sender_device->command_queue(), {d0_fabric_subdevice}); - tt_metal::Finish(receiver_device->command_queue(), {d1_fabric_subdevice}); - } - - test_fixture.TearDown(); - - return success ? 0 : -1; -} - -bool TestMultiInputReaderKernel( - size_t fabric_num_devices, - Tensor& input_tensor0, - MemoryConfig const& input_tensor0_mem_config, - Tensor& input_tensor1, - MemoryConfig const& input_tensor1_mem_config, - Tensor& output_tensor0, - MemoryConfig const& output_tensor0_mem_config, - Tensor& output_tensor1, - MemoryConfig const& output_tensor1_mem_config, - - ttnn::ccl::v2::TensorSlice const& in0_tensor_slice, - ttnn::ccl::v2::TensorSlice const& in1_tensor_slice, - ttnn::ccl::v2::TensorSlice const& out0_tensor_slice, - ttnn::ccl::v2::TensorSlice const& out1_tensor_slice, - - const uint32_t page_size, - - TwoInputReaderKernelWriteMode test_mode, - ttnn::ccl::cmd::CclCommandDestArgs const& dest_args, - bool enable_persistent_fabric) { - auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - auto num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (num_devices < 4) { - log_info("This test can only be run on T3000 devices"); - return true; - } - if (arch == tt::ARCH::GRAYSKULL) { - log_info("Test must be run on WH"); - return true; - } - T3000TestDevice test_fixture; - - TT_FATAL( - !enable_persistent_fabric || test_mode != TwoInputReaderKernelWriteMode::LOCAL_WRITEBACK, - "Test configuration issue. Set local writeback mode with persistent fabric"); - - auto view = test_fixture.mesh_device_->get_view(); - - std::vector devices; - devices.reserve(fabric_num_devices); - for (size_t i = 0; i < fabric_num_devices; i++) { - devices.push_back(view.get_device(MeshCoordinate(0, i))); - } - - std::vector programs(enable_persistent_fabric ? 
1 : devices.size()); - std::optional subdevice_managers = std::nullopt; - std::optional> fabric_programs; - std::vector fabric_program_ptrs; - std::optional line_fabric; - setup_test_with_persistent_fabric( - devices, - programs, - subdevice_managers, - fabric_programs, - fabric_program_ptrs, - line_fabric, - enable_persistent_fabric); - - std::vector input0_tensors_device; - std::vector input1_tensors_device; - std::vector output0_tensors_device; - std::vector output1_tensors_device; - - // All this garbage is to make sure the test sets up buffer addresses correctly so we can safely - // multicast to a consistent destination address - for (size_t i = 0; i < devices.size(); i++) { - input0_tensors_device.push_back( - input_tensor0.to_device(devices.at(i), input_tensor0_mem_config, ttnn::DefaultQueueId)); - input1_tensors_device.push_back( - input_tensor1.to_device(devices.at(i), input_tensor1_mem_config, ttnn::DefaultQueueId)); - output0_tensors_device.push_back( - output_tensor0.to_device(devices.at(i), output_tensor0_mem_config, ttnn::DefaultQueueId)); - output1_tensors_device.push_back( - output_tensor1.to_device(devices.at(i), output_tensor1_mem_config, ttnn::DefaultQueueId)); - } - TT_FATAL( - !enable_persistent_fabric || subdevice_managers.has_value(), - "Subdevice managers must be set if fabric is enabled"); - auto launch_ccl_command_interpreter_workers = [&](std::vector& _programs) { - return RunLocalTestWithMultiInputReaders( - devices, - _programs, - line_fabric, - - input_tensor0, - input_tensor1, - output_tensor0, - output_tensor1, - - input0_tensors_device, - input1_tensors_device, - output0_tensors_device, - output1_tensors_device, - - in0_tensor_slice, - in1_tensor_slice, - out0_tensor_slice, - out1_tensor_slice, - - page_size, - test_mode, - dest_args, - subdevice_managers, - enable_persistent_fabric); - }; - - auto pass = launch_ccl_command_interpreter_workers(programs); - if (enable_persistent_fabric) { - std::vector second_run_programs(1); - // It looks suspicious that we are dropping the first result but there are two reasons we do this - // 1) We really only care that we can run back to back safely - // 2) The first run will end up racing with host and copy-back because there is no - // receiver on the destination that can signal to us when we are done. We need to add this - // to the test to make it more robust but that is future work - pass = launch_ccl_command_interpreter_workers(second_run_programs); - pass = true; - - // Due to race between host and device some packets are in flight by the time host sends shutdown signals so - // some get shutdown in between any packets in the pipeline. 
This can only be fixed by having a "drainer" op to - // make sure it receives all writes before exiting - persistent_fabric_teardown_sequence( - devices, subdevice_managers, line_fabric.value(), tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE); - - log_info(tt::LogTest, "Finished"); - for (auto d : devices) { - tt_metal::Synchronize(d, *ttnn::DefaultQueueId); - } - } - return pass; -} +#include "tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp" //////////////////////////////////////////////////////////////////// /// MESSAGE COUNT TERMINATION MODE @@ -1663,93 +184,12 @@ TEST(WorkerFabricEdmDatapath, LineFabricMcast_ManyMessages_SingleSource_Persiste ASSERT_EQ(result, 0); } -#include "ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp" - //////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////// //// LOCAL CHIP TENSOR READ?WRITE (2 INPUT) //////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////// -ttnn::ccl::Shape4D shape_to_shape_in_tiles(const ttnn::Shape& shape) { - auto logical_shape = shape; - logical_shape[-2] /= tt::constants::TILE_HEIGHT; - logical_shape[-1] /= tt::constants::TILE_WIDTH; - EXPECT_TRUE(logical_shape.size() == 4); - ttnn::ccl::Shape4D shape_in_tiles = { - logical_shape[0], logical_shape[1], logical_shape[2], logical_shape[3]}; - return shape_in_tiles; -} - -bool RunMultiInputReaderTestPropagateFullTensorIn( - const ttnn::Shape& tensor_shape, - const Layout& layout, - const MemoryConfig& in0_memory_config, - const MemoryConfig& in1_memory_config, - const MemoryConfig& out0_memory_config, - const MemoryConfig& out1_memory_config, - TwoInputReaderKernelWriteMode test_writeback_mode) { - auto num_elems = std::reduce(tensor_shape.cbegin(), tensor_shape.cend(), 1, std::multiplies()); - Tensor input_tensor0 = - ttnn::experimental::view(ttnn::arange(0, num_elems, 1, DataType::UINT32), tensor_shape).to_layout(layout); - Tensor input_tensor1 = - ttnn::experimental::view(ttnn::arange(num_elems, 2 * num_elems, 1, DataType::UINT32), tensor_shape) - .to_layout(layout); - Tensor output_tensor0 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); - Tensor output_tensor1 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); - input_tensor0.set_tensor_spec(TensorSpec( - tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in0_memory_config))); - input_tensor1.set_tensor_spec(TensorSpec( - tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in1_memory_config))); - output_tensor0.set_tensor_spec(TensorSpec( - tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out0_memory_config))); - output_tensor1.set_tensor_spec(TensorSpec( - tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out1_memory_config))); - - size_t page_size = tile_size(DataFormat::RawUInt32); - - ttnn::ccl::Shape4D tensor_shape_in_pages = shape_to_shape_in_tiles(tensor_shape); - ttnn::ccl::Shape4D tensor_slice_shape_in_pages = tensor_shape_in_pages; - ttnn::ccl::Shape4D tensor_slice_offset = {0, 0, 0, 0}; - ttnn::ccl::Shape4D worker_slice_shape = tensor_shape_in_pages; - ttnn::ccl::Shape4D worker_slice_offset = {0, 0, 0, 0}; - - ttnn::ccl::v2::TensorSlice tensor_slice{ - tensor_shape_in_pages, - tensor_slice_shape_in_pages, - 
tensor_slice_offset, - worker_slice_shape, - worker_slice_offset}; - - auto const in0_tensor_slice = tensor_slice; - auto const in1_tensor_slice = tensor_slice; - auto const out0_tensor_slice = tensor_slice; - auto const out1_tensor_slice = tensor_slice; - - auto pass = TestMultiInputReaderKernel( - 1, - input_tensor0, - in0_memory_config, - input_tensor1, - in1_memory_config, - output_tensor0, - out0_memory_config, - output_tensor1, - out1_memory_config, - - in0_tensor_slice, - in1_tensor_slice, - out0_tensor_slice, - out1_tensor_slice, - - page_size, - test_writeback_mode, - ttnn::ccl::cmd::LocalOnlyCommandDestArgs{}, - false); - - return pass; -} - TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_SinglePageTile) { auto pass = RunMultiInputReaderTestPropagateFullTensorIn( ttnn::Shape({1, 1, 32, 32}), @@ -1951,107 +391,30 @@ TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage0_Shar // that isn't under test here TEST(WorkerCclCommandProcessingKernelLocalMode, MultiInputReader_MultiPage1) { ttnn::Shape tensor_shape({1, 1, 256, 256}); - auto pass = RunMultiInputReaderTestPropagateFullTensorIn( - tensor_shape, - Layout::TILE, - MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM), - MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM), - MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM), - MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM), - TwoInputReaderKernelWriteMode::LOCAL_WRITEBACK); - ASSERT_TRUE(pass); -} - -// TODO: update the test infra to be able to properly compare tensors if we are only -// doing a slice of the larger tensor - -// //////////////////////////////////////////////////////////////////// -// //////////////////////////////////////////////////////////////////// -// //// FABRIC UNICAST TENSOR WRITE (2 INPUT) -// //////////////////////////////////////////////////////////////////// -// //////////////////////////////////////////////////////////////////// - -TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SinglePageTile_OneHop_PersistentFabric) { - ttnn::Shape tensor_shape({1, 1, 32, 32}); - constexpr size_t distance_dest_device = 1; - constexpr size_t num_devices = 4; - Layout const layout = Layout::TILE; - MemoryConfig const in0_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); - MemoryConfig const in1_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); - MemoryConfig const out0_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); - MemoryConfig const out1_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); - - auto num_elems = std::reduce(tensor_shape.cbegin(), tensor_shape.cend(), 1, std::multiplies()); - Tensor input_tensor0 = - ttnn::experimental::view(ttnn::arange(0, num_elems, 1, DataType::UINT32), tensor_shape).to_layout(layout); - Tensor input_tensor1 = - ttnn::experimental::view(ttnn::arange(num_elems, 2 * num_elems, 1, DataType::UINT32), tensor_shape) - .to_layout(layout); - Tensor output_tensor0 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); - Tensor output_tensor1 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); - - input_tensor0.set_tensor_spec(TensorSpec( - tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in0_memory_config))); - input_tensor1.set_tensor_spec(TensorSpec( - tensor_shape, 
TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in1_memory_config))); - output_tensor0.set_tensor_spec(TensorSpec( - tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out0_memory_config))); - output_tensor1.set_tensor_spec(TensorSpec( - tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out1_memory_config))); - - size_t page_size = tile_size(DataFormat::RawUInt32); - - ttnn::ccl::Shape4D tensor_shape_in_pages = shape_to_shape_in_tiles(tensor_shape); - ttnn::ccl::Shape4D tensor_slice_shape_in_pages = tensor_shape_in_pages; - ttnn::ccl::Shape4D tensor_slice_offset = {0, 0, 0, 0}; - ttnn::ccl::Shape4D worker_slice_shape = tensor_shape_in_pages; - ttnn::ccl::Shape4D worker_slice_offset = {0, 0, 0, 0}; - - ttnn::ccl::v2::TensorSlice tensor_slice{ - tensor_shape_in_pages, - tensor_slice_shape_in_pages, - tensor_slice_offset, - worker_slice_shape, - worker_slice_offset}; - - auto const in0_tensor_slice = tensor_slice; - auto const in1_tensor_slice = tensor_slice; - auto const out0_tensor_slice = tensor_slice; - auto const out1_tensor_slice = tensor_slice; - - ttnn::ccl::cmd::CclCommandDestArgs dest_args = ttnn::ccl::cmd::UnicastCommandDestArgs{distance_dest_device, true}; - auto pass = TestMultiInputReaderKernel( - num_devices, - input_tensor0, - in0_memory_config, - input_tensor1, - in1_memory_config, - output_tensor0, - out0_memory_config, - output_tensor1, - out1_memory_config, - - in0_tensor_slice, - in1_tensor_slice, - out0_tensor_slice, - out1_tensor_slice, - - page_size, - TwoInputReaderKernelWriteMode::FABRIC_UNICAST, - dest_args, - true); - + auto pass = RunMultiInputReaderTestPropagateFullTensorIn( + tensor_shape, + Layout::TILE, + MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM), + MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM), + MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM), + MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM), + TwoInputReaderKernelWriteMode::LOCAL_WRITEBACK); ASSERT_TRUE(pass); } +// TODO: update the test infra to be able to properly compare tensors if we are only +// doing a slice of the larger tensor + // //////////////////////////////////////////////////////////////////// // //////////////////////////////////////////////////////////////////// -// //// FABRIC MCAST TENSOR WRITE (2 INPUT) +// //// FABRIC UNICAST TENSOR WRITE (2 INPUT) // //////////////////////////////////////////////////////////////////// // //////////////////////////////////////////////////////////////////// -void RunFabricMcastFullTensorPropagateTest( - const ttnn::Shape& tensor_shape, size_t distance_dest_device, size_t num_devices, bool enable_persistent_fabric) { +TEST(WorkerCclCommandProcessingKernelFabricUnicastMode, MultiInputReader_SinglePageTile_OneHop_PersistentFabric) { + ttnn::Shape tensor_shape({1, 1, 32, 32}); + constexpr size_t distance_dest_device = 1; + constexpr size_t num_devices = 4; Layout const layout = Layout::TILE; MemoryConfig const in0_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); MemoryConfig const in1_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); @@ -2059,13 +422,14 @@ void RunFabricMcastFullTensorPropagateTest( MemoryConfig const out1_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); auto num_elems = std::reduce(tensor_shape.cbegin(), tensor_shape.cend(), 1, std::multiplies()); + Tensor input_tensor0 = + 
ttnn::experimental::view(ttnn::arange(0, num_elems, 1, DataType::UINT32), tensor_shape).to_layout(layout); Tensor input_tensor1 = ttnn::experimental::view(ttnn::arange(num_elems, 2 * num_elems, 1, DataType::UINT32), tensor_shape) .to_layout(layout); - Tensor input_tensor0 = - ttnn::experimental::view(ttnn::arange(0, num_elems, 1, DataType::UINT32), tensor_shape).to_layout(layout); - Tensor output_tensor1 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); Tensor output_tensor0 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); + Tensor output_tensor1 = ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape); + input_tensor0.set_tensor_spec(TensorSpec( tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), in0_memory_config))); input_tensor1.set_tensor_spec(TensorSpec( @@ -2074,10 +438,6 @@ void RunFabricMcastFullTensorPropagateTest( tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out0_memory_config))); output_tensor1.set_tensor_spec(TensorSpec( tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), out1_memory_config))); - ASSERT_EQ(input_tensor0.get_logical_shape(), tensor_shape); - ASSERT_EQ(input_tensor1.get_logical_shape(), tensor_shape); - ASSERT_EQ(output_tensor0.get_logical_shape(), tensor_shape); - ASSERT_EQ(output_tensor1.get_logical_shape(), tensor_shape); size_t page_size = tile_size(DataFormat::RawUInt32); @@ -2099,7 +459,7 @@ void RunFabricMcastFullTensorPropagateTest( auto const out0_tensor_slice = tensor_slice; auto const out1_tensor_slice = tensor_slice; - ttnn::ccl::cmd::CclCommandDestArgs dest_args = ttnn::ccl::cmd::MulticastCommandDestArgs{distance_dest_device, 0}; + ttnn::ccl::cmd::CclCommandDestArgs dest_args = ttnn::ccl::cmd::UnicastCommandDestArgs{distance_dest_device, true}; auto pass = TestMultiInputReaderKernel( num_devices, input_tensor0, @@ -2117,13 +477,19 @@ void RunFabricMcastFullTensorPropagateTest( out1_tensor_slice, page_size, - TwoInputReaderKernelWriteMode::FABRIC_MULTICAST, + TwoInputReaderKernelWriteMode::FABRIC_UNICAST, dest_args, - enable_persistent_fabric); + true); ASSERT_TRUE(pass); } +// //////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////// +// //// FABRIC MCAST TENSOR WRITE (2 INPUT) +// //////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////// + TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_SinglePageTile_SingleHop_PersistentFabric) { ttnn::Shape tensor_shape({1, 1, 32, 32}); constexpr size_t distance_dest_device = 1; @@ -2169,306 +535,6 @@ TEST(WorkerCclCommandProcessingKernelFabricMulticastMode, MultiInputReader_lotsP RunFabricMcastFullTensorPropagateTest(tensor_shape, distance_dest_device, num_devices, true); } -bool RunPipelinedWorkersTest( - - ttnn::Shape tensor_shape, - const size_t split_dim, - - // In this test we will have n stages with anywhere from 1 to 8 workers per stage (this will be configurable) - const size_t num_stages, - std::vector num_workers_per_stage, - const size_t slices_per_stage, - const tt::DataFormat data_format, - const size_t page_size_bytes, - const size_t cb_packet_size_in_pages, - const size_t num_packets_per_cb, - auto layout, - - std::vector> worker_chunk_read_order, - std::vector mem_configs) { - auto 
arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - auto num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (num_devices < 4) { - log_info("This test can only be run on T3000 devices"); - return true; - } - if (arch == tt::ARCH::GRAYSKULL) { - log_info("Test must be run on WH"); - return true; - } - - auto const cb_index = tt::CB::c_in0; - - auto programs = std::vector(1); - Program& program = programs[0]; - - T3000TestDevice test_fixture; - auto view = test_fixture.mesh_device_->get_view(); - - IDevice* device = view.get_device(MeshCoordinate(0, 0)); - ; - - // General setup is as follows: - // Worker 1 reads input tensor as a sequence of slices - it forwards to an output tensor and after each slice, it - // writes a semaphore increment to some known semaphore address on the destination worker so the destination worker - // knows it's safe to read that slice. - // HOWEVER. the reader will be programmed to read the chunks in a different order than they were written, this way - // we can identify synchronization related bugs (e.g. if sender semaphore increments before writes flush) - - TT_FATAL(num_workers_per_stage.size() == num_stages, "Must have a read order for each stage"); - TT_FATAL(worker_chunk_read_order.size() == num_stages, "Must have a read order for each stage"); - for (size_t i = 0; i < num_stages; ++i) { - TT_FATAL(worker_chunk_read_order[i].size() == slices_per_stage, "Must have a read order for each slice"); - } - - // Validate the test setup - TT_FATAL(num_stages > 1, "Must have at least 2 stages"); - TT_FATAL(num_stages < 8, "Must have at most 8 stages"); - for (size_t i = 0; i < num_stages; ++i) { - TT_FATAL(num_workers_per_stage[i] > 0, "Must have at least 1 worker per stage"); - TT_FATAL(num_workers_per_stage[i] < 8, "Must have at most 8 workers per stage"); - } - - std::vector tensor_specs; - tensor_specs.reserve(num_stages + 1); - for (size_t i = 0; i < num_stages + 1; ++i) { - tensor_specs.push_back(TensorSpec( - tensor_shape, TensorLayout(DataType::UINT32, PageConfig(layout, tt_metal::Tile()), mem_configs[i]))); - } - - // Allocate the tensors - pull to function - const size_t num_tensors = num_stages + 1; - std::vector host_tensors; - std::vector device_tensors; - host_tensors.reserve(num_tensors); - device_tensors.reserve(num_tensors); - auto num_elems = std::reduce(tensor_shape.cbegin(), tensor_shape.cend(), 1, std::multiplies()); - host_tensors.push_back( - ttnn::experimental::view(ttnn::arange(0, num_elems, 1, DataType::UINT32), tensor_shape).to_layout(layout)); - for (size_t i = 1; i < num_tensors; ++i) { - host_tensors.push_back( - ttnn::experimental::view(ttnn::ones(tensor_shape, DataType::UINT32, layout), tensor_shape)); - } - TT_FATAL(mem_configs.size() == num_tensors, "Must have a memory config for each tensor"); - for (size_t i = 0; i < num_tensors; i++) { - host_tensors[i].set_tensor_spec(tensor_specs[i]); - device_tensors.push_back(host_tensors[i].to_device(device, mem_configs[i])); - log_info("Tensor[{}] allocated starting at address {}", i, device_tensors[i].buffer()->address()); - } - TT_ASSERT(device_tensors.size() == num_tensors); - TT_ASSERT(device_tensors.size() == host_tensors.size()); - - // MAIN STUFF - - // Initial setup like worker core assignment, chunk read order, etc. 
- - std::vector pipeline_stage_worker_cores = {}; - for (size_t i = 0; i < num_stages; ++i) { - pipeline_stage_worker_cores.push_back( - CoreRangeSet(CoreRange(CoreCoord(0, i), CoreCoord(num_workers_per_stage[i] - 1, i)))); - } - CoreRangeSet all_workers_cores = CoreRangeSet(); - for (size_t i = 0; i < num_stages; ++i) { - } - - // Create circular buffers - for (size_t stage = 0; stage < num_stages; stage++) { - const size_t cb_packet_size_in_pages = 4; - const size_t num_packets_per_cb = 4; - tt_metal::CircularBufferConfig cb_config = - tt_metal::CircularBufferConfig( - cb_packet_size_in_pages * num_packets_per_cb * page_size_bytes, {{cb_index, data_format}}) - .set_page_size(cb_index, page_size_bytes); - CBHandle sender_workers_cb = CreateCircularBuffer(program, pipeline_stage_worker_cores[stage], cb_config); - } - - // Generate the reader semaphores - std::vector> input_tensor_semaphores; - input_tensor_semaphores.reserve(num_stages); - for (size_t stage = 0; stage < num_stages; stage++) { - input_tensor_semaphores.push_back({}); - for (size_t j = 0; j < slices_per_stage; j++) { - input_tensor_semaphores[stage].push_back(CreateSemaphore(program, pipeline_stage_worker_cores[stage], 0)); - } - } - - constexpr size_t num_command_streams = 1; - std::vector reader_kernels; - std::vector writer_kernels; - // Create the kernel handles for each pipeline stage - for (size_t stage = 0; stage < num_stages; stage++) { - auto reader_kernel = ttnn::ccl::worker_detail::generate_multi_command_stream_kernel_ct_args( - program, - {tt::CB::c_in0}, - {&device_tensors[stage]}, - pipeline_stage_worker_cores[stage], - tt_metal::ReaderDataMovementConfig{}, - num_command_streams); - reader_kernels.push_back(reader_kernel); - auto writer_kernel = ttnn::ccl::worker_detail::generate_multi_command_stream_kernel_ct_args( - program, - {tt::CB::c_in0}, - {&device_tensors[stage + 1]}, - pipeline_stage_worker_cores[stage], - tt_metal::WriterDataMovementConfig{}, - num_command_streams); - writer_kernels.push_back(writer_kernel); - } - - // Generate the tensor slices for each tensor/worker - std::vector> tensor_slices; - tensor_slices.reserve(num_stages + 1); - for (size_t t = 0; t < num_tensors; t++) { - tensor_slices.push_back( - ttnn::ccl::cmd::builder::generate_tensor_slices(slices_per_stage, device_tensors[t], split_dim)); - } - std::vector>> per_stage_worker_reader_tensor_slices; - std::vector>> per_stage_worker_writer_tensor_slices; - per_stage_worker_reader_tensor_slices.reserve(num_tensors); - per_stage_worker_writer_tensor_slices.reserve(num_tensors); - for (size_t stage = 0; stage < num_stages; stage++) { - per_stage_worker_reader_tensor_slices.push_back( - ttnn::ccl::cmd::builder::split_tensor_slices_across_workers_page_aligned( - num_workers_per_stage[stage], tensor_slices[stage])); - // We could compute this once and reuse it but I am generating it twice so I can have size mismatches - per_stage_worker_writer_tensor_slices.push_back( - ttnn::ccl::cmd::builder::split_tensor_slices_across_workers_page_aligned( - num_workers_per_stage[stage], tensor_slices[stage + 1])); - TT_FATAL( - per_stage_worker_reader_tensor_slices.back().size() == num_workers_per_stage[stage], - "Mismatch in tensor slices. Got {} but expected {}", - per_stage_worker_reader_tensor_slices.back().size(), - num_workers_per_stage[stage]); - TT_FATAL( - per_stage_worker_writer_tensor_slices.back().size() == num_workers_per_stage[stage], - "Mismatch in tensor slices. 
Got {} but expected {}", - per_stage_worker_writer_tensor_slices.back().size(), - num_workers_per_stage[stage]); - } - - // Build the command stream for each stage/worker - // Seminc example - // - local_core_semaphore_inc(second_command_stream_done_semaphore_id, 1); - // semwait example - // - local_semaphore_wait(second_command_stream_done_semaphore_id, 1) - // read tensor slice to cb example - // - read_tensor_slice_to_cb(in0_command_tensor_slice, cb_indices.at(0)) - // write tensor slice to cb example - // - build_write_tensor_slice_to_cb(out0_command_tensor_slice, cb_indices.at(0)) - TT_FATAL(per_stage_worker_reader_tensor_slices.size() == num_stages, "Mismatch in tensor slices"); - for (size_t stage = 0; stage < num_stages; stage++) { - bool last_stage = stage == num_stages - 1; - bool first_stage = stage == 0; - - const auto worker_cores = corerange_to_cores(pipeline_stage_worker_cores[stage]); - TT_FATAL(worker_cores.size() == num_workers_per_stage[stage], "Mismatch in worker cores"); - std::optional> next_worker_cores = - !last_stage ? corerange_to_cores(pipeline_stage_worker_cores[stage + 1]) - : std::optional>(std::nullopt); - - TT_FATAL( - per_stage_worker_reader_tensor_slices[stage].size() == num_workers_per_stage[stage], - "Mismatch in tensor slices"); - TT_FATAL( - per_stage_worker_writer_tensor_slices[stage].size() == num_workers_per_stage[stage], - "Mismatch in tensor slices"); - for (size_t worker = 0; worker < num_workers_per_stage[stage]; worker++) { - std::vector reader_cmd_stream; - std::vector writer_cmd_stream; - TT_FATAL( - per_stage_worker_reader_tensor_slices[stage][worker].size() == slices_per_stage, - "Mismatch in tensor slices"); - TT_FATAL( - per_stage_worker_writer_tensor_slices[stage][worker].size() == slices_per_stage, - "Mismatch in tensor slices"); - for (size_t slice_logical = 0; slice_logical < slices_per_stage; slice_logical++) { - const auto slice_actual = worker_chunk_read_order[stage][slice_logical]; - // reader - if (!first_stage) { - reader_cmd_stream.push_back(ttnn::ccl::cmd::uops::local_semaphore_wait( - input_tensor_semaphores[stage][slice_actual], num_workers_per_stage[stage - 1])); - } - reader_cmd_stream.push_back(ttnn::ccl::cmd::uops::read_tensor_slice_to_cb( - per_stage_worker_reader_tensor_slices[stage][worker][slice_actual], cb_index)); - log_info(tt::LogTest, "Worker {} reading/writing slice {}", worker, slice_actual); - - // writer - writer_cmd_stream.push_back(ttnn::ccl::cmd::uops::local_write_cb_to_tensor_slice( - per_stage_worker_writer_tensor_slices[stage][worker][slice_actual], cb_index)); - if (not last_stage) { - for (auto next_worker_xy : next_worker_cores.value()) { - log_info( - tt::LogTest, - "Stage {} Worker {} noc seminc to core (logical) x={},y={}", - stage, - worker, - next_worker_xy.x, - next_worker_xy.y); - writer_cmd_stream.push_back(ttnn::ccl::cmd::uops::local_chip_noc_semaphore_inc( - device->worker_core_from_logical_core(next_worker_xy).x, - device->worker_core_from_logical_core(next_worker_xy).y, - input_tensor_semaphores[stage + 1][slice_actual], - 1)); - } - } - } - ttnn::ccl::worker_detail::generate_multi_input_command_stream_kernel_rt_args( - program, - reader_kernels[stage], - {&device_tensors[stage]}, - {page_size_bytes}, - device, - cb_packet_size_in_pages, - {worker_cores.at(worker)}, - reader_cmd_stream, - std::nullopt, - std::nullopt, - std::nullopt); - ttnn::ccl::worker_detail::generate_multi_input_command_stream_kernel_rt_args( - program, - writer_kernels[stage], - {&device_tensors[stage + 1]}, - 
{page_size_bytes}, - device, - cb_packet_size_in_pages, - {worker_cores.at(worker)}, - writer_cmd_stream, - std::nullopt, - std::nullopt, - std::nullopt); - } - } - - run_programs(programs, {device}); - - bool pass = true; - constexpr bool enable_check = true; - if constexpr (enable_check) { - log_info(tt::LogTest, "Reading back outputs"); - auto input_cpu = device_tensors[0].cpu(); - auto final_out_cpu = device_tensors.back().cpu(); - - auto in_tensor_copyback = tt::tt_metal::owned_buffer::get_as(input_cpu); - auto out_tensor_copyback = tt::tt_metal::owned_buffer::get_as(final_out_cpu); - - auto in_tensor_data = tt::tt_metal::owned_buffer::get_as(host_tensors[0]); - - bool input_copyback_check_passed = run_output_check(in_tensor_data, in_tensor_copyback) == Correctness::Correct; - TT_FATAL(input_copyback_check_passed, "Input 0 copyback check failed"); - - log_info(tt::LogTest, "Comparing outputs"); - - pass &= run_output_check(in_tensor_data, out_tensor_copyback) == Correctness::Correct; - if (pass) { - log_info(tt::LogTest, "Output check passed for output 0"); - } else { - log_error(tt::LogTest, "Output check failed for output 0"); - } - } - - return pass; -} - TEST(WorkerCclCommandProcessingKernels, ChainOfCommandProcessorsWithVaryingDataReadOrders_LocalOnly0) { ttnn::Shape tensor_shape({1, 1, 64, 16384}); const size_t split_dim = 3; @@ -2723,8 +789,6 @@ TEST( } } -#include "ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp" -#include TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) { const size_t dim = 3; const size_t num_links = 1; @@ -2841,113 +905,6 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) { log_info(tt::LogTest, "Finished"); } -static void wait_for_worker_subdevice_program_completion( - const std::vector& devices, const std::optional& subdevice_managers) { - std::ranges::for_each(devices, [&](IDevice* d) { - tt_metal::Finish(d->command_queue(), {subdevice_managers->worker_subdevice_id.at(d->id())}); - }); -} - -#include "ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp" -void run_all_gather_with_persistent_fabric(const size_t dim, const size_t num_links, ttnn::Shape const& input_shape) { - log_info(tt::LogTest, "entering test"); - constexpr auto layout = Layout::TILE; - // DEVICES setuip - auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - constexpr size_t test_expected_num_devices = 4; - if (tt::tt_metal::GetNumAvailableDevices() < test_expected_num_devices) { - log_info("This test can only be run on T3000 devices"); - return; - } - if (arch == tt::ARCH::GRAYSKULL) { - log_info("Test must be run on WH"); - return; - } - T3000TestDevice test_fixture; - auto view = test_fixture.mesh_device_->get_view(); - - // build a line of devices - std::vector devices = { - view.get_device(MeshCoordinate(0, 0)), - view.get_device(MeshCoordinate(0, 1)), - view.get_device(MeshCoordinate(0, 2)), - view.get_device(MeshCoordinate(0, 3))}; - const size_t num_devices = devices.size(); - TT_FATAL( - test_expected_num_devices == num_devices, - "Expected {} devices but got {}", - test_expected_num_devices, - num_devices); - const MemoryConfig in_memory_config = MemoryConfig(TensorMemoryLayout::INTERLEAVED, BufferType::DRAM); - const auto num_elems = input_shape.volume(); - - // INPUT TENSOR setup - log_info(tt::LogTest, "setting up input tensors"); - size_t page_size = tile_size(DataFormat::Float16); - std::vector device_input_tensors; - for (size_t i = 0; i < 
num_devices; i++) { - auto t = ttnn::experimental::view(ttnn::arange(0, num_elems, 1), input_shape).to_layout(layout); - t.set_tensor_spec(TensorSpec( - input_shape, TensorLayout(DataType::BFLOAT16, PageConfig(layout, tt_metal::Tile()), in_memory_config))); - - device_input_tensors.push_back(t.to_device(devices[i])); - } - // Need to make it a mesh tensor for use with the op - const Tensor input_mesh_tensor = ttnn::distributed::aggregate_as_tensor(device_input_tensors, AllGatherTensor{}); - - // FABRIC setup - const bool enable_persistent_fabric = true; - - std::vector dummy_worker_programs; - std::optional subdevice_managers = std::nullopt; - std::optional> fabric_programs; - std::vector fabric_program_ptrs; - std::optional fabric_handle; - setup_test_with_persistent_fabric( - devices, - dummy_worker_programs, - subdevice_managers, - fabric_programs, - fabric_program_ptrs, - fabric_handle, - enable_persistent_fabric, - num_links); - log_info(tt::LogTest, "Lauching op"); - - ttnn::global_semaphore::MultiDeviceGlobalSemaphore multi_device_global_semaphore = - ttnn::global_semaphore::create_global_semaphore_with_same_address( - test_fixture.mesh_device_.get(), - devices[0]->worker_cores(HalProgrammableCoreType::TENSIX, SubDeviceId{0}), - 0, // initial value - tt::tt_metal::BufferType::L1, // buffer type - 10 // attempts - ); - - auto output_tensor = ttnn::operations::experimental::ccl::all_gather_async( - input_mesh_tensor, - dim, - multi_device_global_semaphore, - num_links, - operation::DEFAULT_OUTPUT_MEMORY_CONFIG, - ttnn::ccl::Topology::Linear, - SubDeviceId(0), - true); - - // wait for op completion - wait_for_worker_subdevice_program_completion(devices, subdevice_managers); - log_info(tt::LogTest, "Main op done"); - - log_info(tt::LogTest, "Fabric teardown"); - persistent_fabric_teardown_sequence( - devices, subdevice_managers, fabric_handle.value(), tt::fabric::TerminationSignal::IMMEDIATELY_TERMINATE); - - log_info(tt::LogTest, "Waiting for teardown completion"); - for (auto d : devices) { - tt_metal::Synchronize(d, *ttnn::DefaultQueueId); - } - log_info(tt::LogTest, "Finished"); -} - TEST(CclAsyncOp, AllGather_PersistentFabric_Dim3_Links1_Shape1_1_32_128) { run_all_gather_with_persistent_fabric(3, 1, ttnn::Shape({1, 1, 32, 128})); } @@ -2963,288 +920,6 @@ TEST(CclAsyncOp, DISABLED_AllGather_PersistentFabric_Dim3_Links2_Shape1_1_32_819 run_all_gather_with_persistent_fabric(3, 2, ttnn::Shape({1, 1, 32, 8192})); } -struct WriteThroughputStabilityTestWithPersistentFabricParams { - size_t line_size = 4; - size_t num_devices_with_workers = 0; - bool line_sync = true; -}; - -void RunWriteThroughputStabilityTestWithPersistentFabric( - size_t num_mcasts, - size_t num_unicasts, - size_t num_links, - size_t num_op_invocations, - const WriteThroughputStabilityTestWithPersistentFabricParams& params = {}) { - auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - auto num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (num_devices < 4) { - log_info("This test can only be run on T3000 devices"); - return; - } - if (arch == tt::ARCH::GRAYSKULL) { - log_info("Test must be run on WH"); - return; - } - - size_t line_size = params.line_size; - size_t num_devices_with_workers = params.num_devices_with_workers; - if (num_devices_with_workers == 0) { - num_devices_with_workers = line_size; - } - using namespace ttnn::ccl; - TT_FATAL(num_devices_with_workers <= line_size, "num_devices_with_workers must be less than or equal to num_links"); - - auto worker_core_logical = 
[](size_t link) { return CoreCoord(link, 0); }; - - // static constexpr size_t source_l1_buffer_address = 1000000; - static constexpr uint32_t packet_header_cb_index = tt::CB::c_in0; - static constexpr uint32_t source_payload_cb_index = tt::CB::c_in1; - static constexpr size_t packet_header_cb_size_in_headers = 4; - static constexpr bool enable_persistent_fabric_mode = true; - static constexpr size_t packet_payload_size_bytes = - ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes; - static constexpr size_t dest_buffer_size = packet_payload_size_bytes * 4; - static constexpr tt::DataFormat cb_df = tt::DataFormat::Bfp8; - - T3000TestDevice test_fixture; - auto view = test_fixture.mesh_device_->get_view(); - - // Get the inner 4 device ring on a WH T3K device so that we can use both links for all devices - std::vector devices_ = { - view.get_device(MeshCoordinate(0, 1)), - view.get_device(MeshCoordinate(0, 2)), - view.get_device(MeshCoordinate(1, 2)), - view.get_device(MeshCoordinate(1, 1))}; - std::vector devices; - devices.reserve(line_size); - for (size_t i = 0; i < line_size; i++) { - devices.push_back(devices_[i]); - } - // build the mesh device - - // Persistent Fabric Setup - std::vector dummy_worker_programs; - std::optional subdevice_managers = std::nullopt; - std::optional> fabric_programs; - std::vector fabric_program_ptrs; - std::optional fabric_handle; - setup_test_with_persistent_fabric( - devices, - dummy_worker_programs, - subdevice_managers, - fabric_programs, - fabric_program_ptrs, - fabric_handle, - enable_persistent_fabric_mode, - num_links); - - // Other boiler plate setup - CoreRangeSet worker_cores = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(num_links - 1, 0))); - auto worker_cores_vec = corerange_to_cores(worker_cores, std::nullopt, false); - auto dest_core_coord = CoreCoord(2, 2); - auto sync_core_coord = CoreCoord(0, 0); - - ttnn::SmallVector> device_dest_buffers; - device_dest_buffers.reserve(line_size); - for (auto* d : devices) { - auto local_input_buffer = - CreateBuffer(InterleavedBufferConfig{d, dest_buffer_size, dest_buffer_size, BufferType::L1}); - device_dest_buffers.push_back(local_input_buffer); - } - - size_t dest_bank_addr = device_dest_buffers[0]->address(); - TT_FATAL( - std::all_of( - device_dest_buffers.begin(), - device_dest_buffers.end(), - [dest_bank_addr](const auto& buffer) { return buffer->address() == dest_bank_addr; }), - "Test setup error: all destination buffers must have the same bank address across devices"); - - std::vector global_semaphore_addrs; - global_semaphore_addrs.reserve(line_size + 1); - std::vector global_semaphore_handles; - for (size_t i = 0; i < line_size * 4; i++) { - auto global_semaphores = ttnn::global_semaphore::create_global_semaphore_with_same_address( - test_fixture.mesh_device_.get(), - devices[0]->worker_cores(HalProgrammableCoreType::TENSIX, SubDeviceId{0}), - 0, // initial value - tt::tt_metal::BufferType::L1, // buffer type - 1000 // attempts - ); - global_semaphore_handles.push_back(global_semaphores); - auto global_semaphore_addr = - ttnn::global_semaphore::get_global_semaphore_address(global_semaphores.global_semaphores.at(0)); - global_semaphore_addrs.push_back(global_semaphore_addr); - } - - std::vector worker_devices; - for (size_t i = 0; i < num_devices_with_workers; i++) { - worker_devices.push_back(devices[i]); - } - // Worker program setup - std::vector programs(num_devices_with_workers); - TT_FATAL( - programs.size() == worker_devices.size(), - "Test misconfiguration. 
Mismatch in line size and devices. Expected line size of {} but got {} devices " - "instead.", - line_size, - worker_devices.size()); - std::vector worker_kernel_ids; - std::vector per_device_global_sem_addr_rt_arg; - for (size_t i = 0; i < num_devices_with_workers; i++) { - const size_t line_index = i; - auto& program = programs[i]; - auto* device = devices[i]; - const size_t dest_noc_x = device->worker_core_from_logical_core(dest_core_coord).x; - const size_t dest_noc_y = device->worker_core_from_logical_core(dest_core_coord).y; - const size_t sync_core_noc_x = device->worker_core_from_logical_core(sync_core_coord).x; - const size_t sync_core_noc_y = device->worker_core_from_logical_core(sync_core_coord).y; - - IDevice* backward_device = i == 0 ? nullptr : devices[i - 1]; - IDevice* forward_device = i == line_size - 1 ? nullptr : devices[i + 1]; - - // Initialize the fabric handle for worker connection - bool start_of_line = line_index == 0; - bool end_of_line = line_index == line_size - 1; - bool has_forward_connection = !end_of_line; - bool has_backward_connection = !start_of_line; - bool unicast_forward = !end_of_line; - size_t mcast_fwd_hops = line_size - line_index - 1; - size_t mcast_bwd_hops = line_index; - size_t unicast_hops = unicast_forward ? mcast_fwd_hops : mcast_bwd_hops; - - auto local_device_fabric_handle = - ttnn::ccl::EdmLineFabricOpInterface::build_program_builder_worker_connection_fabric( - device, forward_device, backward_device, &program, enable_persistent_fabric_mode, num_links); - - // reserve CB - tt_metal::CircularBufferConfig cb_src0_config = - tt_metal::CircularBufferConfig( - packet_header_cb_size_in_headers * sizeof(tt::fabric::PacketHeader), {{packet_header_cb_index, cb_df}}) - .set_page_size(packet_header_cb_index, sizeof(tt::fabric::PacketHeader)); - CBHandle sender_workers_cb = CreateCircularBuffer(program, worker_cores, cb_src0_config); - - tt_metal::CircularBufferConfig cb_src1_config = - tt_metal::CircularBufferConfig(packet_payload_size_bytes, {{source_payload_cb_index, cb_df}}) - .set_page_size(source_payload_cb_index, packet_payload_size_bytes); - CBHandle sender_workers_payload_cb = CreateCircularBuffer(program, worker_cores, cb_src1_config); - - TT_FATAL( - local_device_fabric_handle.get_num_links() == num_links, - "Error in test setup. Expected two links between devices but got {} links for device {}", - local_device_fabric_handle.get_num_links(), - device->id()); - - std::vector worker_ct_args = {params.line_sync, params.line_sync}; - - auto worker_kernel_id = tt_metal::CreateKernel( - program, - "tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp", - worker_cores, - tt_metal::WriterDataMovementConfig(worker_ct_args)); - worker_kernel_ids.push_back(worker_kernel_id); - for (size_t l = 0; l < num_links; l++) { - auto worker_core = worker_cores_vec[l]; - auto build_connection_args = [&local_device_fabric_handle, device, &program, &worker_core]( - bool is_connected_in_direction, - ttnn::ccl::EdmLineFabricOpInterface::Direction direction, - std::vector& rt_args_out) { - rt_args_out.push_back(is_connected_in_direction); - if (is_connected_in_direction) { - const auto connection = local_device_fabric_handle.uniquely_connect_worker(device, direction); - const auto new_rt_args = - ttnn::ccl::worker_detail::generate_edm_connection_rt_args(connection, program, {worker_core}); - log_info( - tt::LogTest, - "On device: {}, connecting to EDM fabric in {} direction. 
EDM noc_x: {}, noc_y: {}", - device->id(), - direction, - connection.edm_noc_x, - connection.edm_noc_y); - std::copy(new_rt_args.begin(), new_rt_args.end(), std::back_inserter(rt_args_out)); - } - }; - // RT ARGS - std::vector rt_args = { - dest_bank_addr, - packet_payload_size_bytes, - dest_noc_x, - dest_noc_y, - - num_mcasts, - mcast_fwd_hops, - mcast_bwd_hops, - - num_unicasts, - unicast_hops, - unicast_forward, - - source_payload_cb_index, // source_l1_buffer_address, - packet_header_cb_index, - packet_header_cb_size_in_headers, - }; - - build_connection_args(has_forward_connection, ttnn::ccl::EdmLineFabricOpInterface::FORWARD, rt_args); - build_connection_args(has_backward_connection, ttnn::ccl::EdmLineFabricOpInterface::BACKWARD, rt_args); - - if (params.line_sync) { - rt_args.push_back(sync_core_noc_x); - rt_args.push_back(sync_core_noc_y); - if (l == 0) { - per_device_global_sem_addr_rt_arg.push_back(rt_args.size()); - } - TT_FATAL(global_semaphore_addrs.at(0) != -1, "Invalid test setup. Global semaphore address is -1"); - rt_args.push_back(global_semaphore_addrs.at(0)); - rt_args.push_back(num_links * num_devices_with_workers); - } - - tt_metal::SetRuntimeArgs(program, worker_kernel_id, worker_core, rt_args); - } - } - - for (size_t i = 0; i < num_op_invocations; i++) { - log_info(tt::LogTest, "Iteration: {}", i); - if (i != 0 && params.line_sync) { - for (size_t k = 0; k < worker_kernel_ids.size(); k++) { - auto& worker_rt_args_by_core = GetRuntimeArgs(programs[k], worker_kernel_ids[k]); - auto global_sem_addr_rt_arg_idx = per_device_global_sem_addr_rt_arg[k]; - for (size_t l = 0; l < num_links; l++) { - auto& worker_rt_args = worker_rt_args_by_core[worker_cores_vec[l].x][worker_cores_vec[l].y]; - worker_rt_args.at(global_sem_addr_rt_arg_idx) = - global_semaphore_addrs[i % global_semaphore_addrs.size()]; - } - } - } - - build_and_enqueue(worker_devices, programs, i != 0); - - log_info(tt::LogTest, "Waiting for Op finish on all devices"); - wait_for_worker_subdevice_program_completion(worker_devices, subdevice_managers); - log_info(tt::LogTest, "Main op done"); - } - - TT_FATAL(fabric_programs->size() == devices.size(), "Expected fabric programs size to be same as devices size"); - log_info(tt::LogTest, "Fabric teardown"); - persistent_fabric_teardown_sequence( - devices, subdevice_managers, fabric_handle.value(), tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE); - - log_info(tt::LogTest, "Waiting for teardown completion"); - for (IDevice* d : devices) { - tt_metal::Synchronize(d, *ttnn::DefaultQueueId); - } - for (size_t i = 0; i < programs.size(); i++) { - auto d = worker_devices[i]; - auto& program = programs[i]; - tt_metal::DumpDeviceProfileResults(d, program); - } - for (size_t i = 0; i < fabric_programs->size(); i++) { - auto d = devices[i]; - auto& program = fabric_programs.value()[i]; - tt_metal::DumpDeviceProfileResults(d, program); - } - log_info(tt::LogTest, "Finished"); -} - TEST(EdmFabric, BasicMcastThroughputTest_SingleLink_LineSize2_SingleMcast) { const size_t num_mcasts = 1; const size_t num_unicasts = 2; From aa09c9f23861e4275a828d2d76ff775f268b4e05 Mon Sep 17 00:00:00 2001 From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com> Date: Mon, 24 Feb 2025 17:43:29 -0500 Subject: [PATCH 271/316] #0: [skip ci] Rename nightly L2 tests to something more sane + ping Borys on failure (#18254) ### Ticket Link to Github Issue ### Problem description Provide context for the problem. ### What's changed Describe the approach used to solve the problem. 
Summarize the changes made and its impact. ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .github/workflows/_produce-data.yaml | 1 + .github/workflows/tt-metal-l2-nightly.yaml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_produce-data.yaml b/.github/workflows/_produce-data.yaml index eca1d625272..e54fb263118 100644 --- a/.github/workflows/_produce-data.yaml +++ b/.github/workflows/_produce-data.yaml @@ -44,6 +44,7 @@ on: - "Blackhole post-commit tests" - "Custom test dispatch" - "PR Gate" + - "Nightly tt-metal L2 tests" types: - completed diff --git a/.github/workflows/tt-metal-l2-nightly.yaml b/.github/workflows/tt-metal-l2-nightly.yaml index 7bdd961431c..85aba0b214c 100644 --- a/.github/workflows/tt-metal-l2-nightly.yaml +++ b/.github/workflows/tt-metal-l2-nightly.yaml @@ -1,4 +1,4 @@ -name: "[internal] tt-metal l2 nightly tests" +name: "Nightly tt-metal L2 tests" on: workflow_call: @@ -78,4 +78,4 @@ jobs: if: ${{ failure() }} with: slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }} - owner: U07HTBQPHFG # Bryan Keith + owner: U06Q7ESTFEV # Borys Bradel From 4a0562cb607598433f35fd4582be767ef33d3b18 Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Mon, 24 Feb 2025 22:53:25 +0000 Subject: [PATCH 272/316] Fix crash if MeshDevice is deallocated before MeshBuffer (#18181) ### Ticket ### Problem description Currently there is a crash if MeshDevice is deallocated or closed before MeshBuffer. There are two semi-independent issues: 1. Lifetime issue if MeshDevice is deallocated 2. Destruction order is inconsistent between the MeshDevice destructor and its close method, because `sub_device_manager_tracker_` may perform buffer deallocation and this would call back into MeshDevice, so member destruction order actually matters here.
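For illustration, below is a minimal, self-contained sketch of the `weak_ptr` ownership pattern this fix adopts (the change stores the MeshDevice as a `weak_ptr` inside MeshBuffer, as described under "What's changed"). The `Device`/`Buffer` types here are simplified stand-ins and not the actual tt-metal classes; the point is only that `lock()` lets the buffer detect a destroyed owner and skip the device-side deallocation instead of dereferencing a dangling pointer.

```cpp
#include <iostream>
#include <memory>

// Simplified stand-in for a device that owns an allocator.
struct Device {
    void deallocate(int buffer_id) { std::cout << "device frees buffer " << buffer_id << "\n"; }
};

// Simplified stand-in for a buffer that must not call back into a dead device.
class Buffer {
public:
    Buffer(const std::shared_ptr<Device>& device, int id) : device_(device), id_(id) {}
    ~Buffer() { deallocate(); }

    void deallocate() {
        if (deallocated_) {
            return;
        }
        deallocated_ = true;
        // lock() yields nullptr once the device has been destroyed, so the
        // buffer can skip the device-side deallocation instead of crashing.
        if (auto device = device_.lock()) {
            device->deallocate(id_);
        } else {
            std::cout << "device already gone, only marking buffer " << id_ << " as deallocated\n";
        }
    }

private:
    std::weak_ptr<Device> device_;  // non-owning: the buffer must not keep the device alive
    int id_ = 0;
    bool deallocated_ = false;
};

int main() {
    auto device = std::make_shared<Device>();
    Buffer early(device, 1);
    early.deallocate();  // device still alive: normal deallocation path

    Buffer late(device, 2);
    device.reset();  // device destroyed before the buffer
    // 'late' is destroyed at end of scope: its weak_ptr is expired, so deallocation is skipped safely.
    return 0;
}
```

Holding a `weak_ptr` rather than a `shared_ptr` keeps the buffer from extending the device's lifetime while still making the dangling case detectable.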
### What's changed Added a test to reproduce the issue Stored MeshDevice as weak_ptr inside of MeshBuffer to be able to detect this case Added special handling for this case, skipping buffer deallocation call Change reset order in MeshDevice close ### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13496304356) - [x] New/Existing tests provide coverage for changes --- .../tt_metal/distributed/test_mesh_buffer.cpp | 38 +++++++++++++++++++ tt_metal/api/tt-metalium/buffer.hpp | 3 ++ tt_metal/api/tt-metalium/mesh_buffer.hpp | 10 +++-- tt_metal/distributed/mesh_buffer.cpp | 25 +++++++++++- tt_metal/distributed/mesh_device.cpp | 4 +- tt_metal/impl/buffers/buffer.cpp | 8 ++++ 6 files changed, 80 insertions(+), 8 deletions(-) diff --git a/tests/tt_metal/distributed/test_mesh_buffer.cpp b/tests/tt_metal/distributed/test_mesh_buffer.cpp index d1834c37595..364790f8984 100644 --- a/tests/tt_metal/distributed/test_mesh_buffer.cpp +++ b/tests/tt_metal/distributed/test_mesh_buffer.cpp @@ -126,6 +126,44 @@ TEST_F(MeshBufferTestT3000, Deallocation) { EXPECT_FALSE(buffer_view->is_allocated()); } +TEST(MeshBufferTest, DeallocationWithoutMeshDevice) { + for (int i = 0; i < 100; i++) { + auto config = + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 1), .offset = std::nullopt, .physical_device_ids = {}}; + auto mesh_device = + MeshDevice::create(config, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, DispatchCoreType::WORKER); + + const DeviceLocalBufferConfig device_local_config{ + .page_size = 2048, + .buffer_type = BufferType::DRAM, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = false}; + const ReplicatedBufferConfig buffer_config{.size = 2048}; + auto buffer = MeshBuffer::create(buffer_config, device_local_config, mesh_device.get()); + + mesh_device.reset(); + } +} + +TEST(MeshBufferTest, DeallocationWithMeshDeviceClosed) { + for (int i = 0; i < 100; i++) { + auto config = + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 1), .offset = std::nullopt, .physical_device_ids = {}}; + auto mesh_device = + MeshDevice::create(config, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, DispatchCoreType::WORKER); + + const DeviceLocalBufferConfig device_local_config{ + .page_size = 2048, + .buffer_type = BufferType::DRAM, + .buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = false}; + const ReplicatedBufferConfig buffer_config{.size = 2048}; + auto buffer = MeshBuffer::create(buffer_config, device_local_config, mesh_device.get()); + + mesh_device->close(); + } +} + TEST_F(MeshBufferTestT3000, GetDeviceBuffer) { const DeviceLocalBufferConfig device_local_config{ .page_size = 1024, diff --git a/tt_metal/api/tt-metalium/buffer.hpp b/tt_metal/api/tt-metalium/buffer.hpp index e52f45b2105..cf5d06cecb5 100644 --- a/tt_metal/api/tt-metalium/buffer.hpp +++ b/tt_metal/api/tt-metalium/buffer.hpp @@ -252,6 +252,9 @@ class Buffer final { size_t unique_id() const { return unique_id_; } + // Mark the buffer as deallocated, without releasing underlying device memory + void mark_as_deallocated(); + Buffer( IDevice* device, DeviceAddr size, diff --git a/tt_metal/api/tt-metalium/mesh_buffer.hpp b/tt_metal/api/tt-metalium/mesh_buffer.hpp index de14271da85..2a16355fbaa 100644 --- a/tt_metal/api/tt-metalium/mesh_buffer.hpp +++ b/tt_metal/api/tt-metalium/mesh_buffer.hpp @@ -75,6 +75,7 @@ class MeshBuffer { const DeviceLocalBufferConfig& device_local_layout, MeshDevice* mesh_device, std::optional address = std::nullopt); + ~MeshBuffer(); // Returns 
true if the MeshBuffer is allocated. Note that MeshBuffer is created in the allocated state; either the // destructor or the `deallocate` method deallocate the MeshBuffer. @@ -85,7 +86,8 @@ class MeshBuffer { // resources. void deallocate(); - MeshDevice* device() const { return mesh_device_; } + // Throws an exception if the corresponding MeshDevice is already deallocated + MeshDevice* device() const; DeviceAddr size() const; DeviceAddr device_local_size() const { return device_local_size_; } DeviceAddr address() const { return address_; }; @@ -114,7 +116,7 @@ class MeshBuffer { buffers_(SimpleMeshShape(mesh_device->shape()), nullptr), config_(config), device_local_config_(device_local_config), - mesh_device_(mesh_device), + mesh_device_(mesh_device->shared_from_this()), address_(backing_buffer->address()), device_local_size_(device_local_size), state_(OwnedBufferState{std::move(backing_buffer)}) {} @@ -129,7 +131,7 @@ class MeshBuffer { buffers_(SimpleMeshShape(mesh_device->shape()), /*fill_value=*/nullptr), config_(config), device_local_config_(device_local_config), - mesh_device_(mesh_device), + mesh_device_(mesh_device->shared_from_this()), address_(address), device_local_size_(device_local_size), state_(ExternallyOwnedState{}) {} @@ -137,7 +139,7 @@ class MeshBuffer { void initialize_device_buffers(); MeshBufferConfig config_; DeviceLocalBufferConfig device_local_config_; - MeshDevice* mesh_device_ = nullptr; + std::weak_ptr mesh_device_; DeviceAddr address_ = 0; DeviceAddr device_local_size_ = 0; diff --git a/tt_metal/distributed/mesh_buffer.cpp b/tt_metal/distributed/mesh_buffer.cpp index 9ed3f95627c..9eb540c5efd 100644 --- a/tt_metal/distributed/mesh_buffer.cpp +++ b/tt_metal/distributed/mesh_buffer.cpp @@ -114,7 +114,7 @@ std::shared_ptr MeshBuffer::create( void MeshBuffer::initialize_device_buffers() { auto init_device_buffer_at_address = [this](const MeshCoordinate& coord) { std::shared_ptr buffer = Buffer::create( - mesh_device_->get_device(coord), + device()->get_device(coord), address_, device_local_size_, device_local_config_.page_size, @@ -132,7 +132,28 @@ void MeshBuffer::initialize_device_buffers() { bool MeshBuffer::is_allocated() const { return not std::holds_alternative(state_); } -void MeshBuffer::deallocate() { state_ = DeallocatedState{}; } +MeshBuffer::~MeshBuffer() { deallocate(); } + +void MeshBuffer::deallocate() { + auto mesh_device = mesh_device_.lock(); + if (mesh_device) { + state_ = DeallocatedState{}; + return; + } + + // Special handling is required if MeshDevice is already deallocated + if (std::holds_alternative(state_)) { + auto& owned_state = std::get(state_); + owned_state.backing_buffer->mark_as_deallocated(); + } + state_ = DeallocatedState{}; +} + +MeshDevice* MeshBuffer::device() const { + auto device = mesh_device_.lock(); + TT_FATAL(device, "Can't get device from mesh buffer, already deallocated"); + return device.get(); +} std::shared_ptr MeshBuffer::get_device_buffer(const MeshCoordinate& device_coord) const { return buffers_.at(device_coord); diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 80535e32674..8ac1df381ce 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -205,7 +205,7 @@ std::vector> MeshDevice::create_submeshes(const Mesh return submeshes; } -MeshDevice::~MeshDevice() {} +MeshDevice::~MeshDevice() { close(); } IDevice* MeshDevice::get_device(chip_id_t physical_device_id) const { for (auto device : this->get_devices()) { @@ -327,12 +327,12 
@@ bool MeshDevice::close() { submesh->close(); } submeshes_.clear(); + sub_device_manager_tracker_.reset(); if (scoped_devices_) { scoped_devices_.reset(); } parent_mesh_.reset(); view_.reset(); - sub_device_manager_tracker_.reset(); return true; } diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index 29cdf05c980..0d0ef13b6f5 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -401,7 +401,15 @@ void Buffer::deallocate() { }); } +void Buffer::mark_as_deallocated() { + allocation_status_.store(AllocationStatus::DEALLOCATED, std::memory_order::relaxed); +} + void Buffer::deleter(Buffer* buffer) { + if (buffer->allocation_status_.load(std::memory_order::relaxed) == AllocationStatus::DEALLOCATED) { + delete buffer; + return; + } buffer->device_->push_work([buffer] { std::unique_ptr unique_buffer = std::unique_ptr(buffer); buffer->deallocate_impl(); From 1a243080183a821897545e538893bb4041d312f4 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Mon, 24 Feb 2025 19:11:15 -0500 Subject: [PATCH 273/316] [skip ci] Dockerize TGG frequent (#18255) Ticket #18188 Problem description This workflow was limited to the OS of the host machine. What's changed Dockerized the workflow. Checklist - [x] TGG Freq [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13509472608) - [x] CYOPipeline [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13509515137) --- .github/workflows/pipeline-select-galaxy.yaml | 12 +++- .../workflows/tgg-frequent-tests-impl.yaml | 62 +++++++++++++++---- .github/workflows/tgg-frequent-tests.yaml | 6 ++ 3 files changed, 65 insertions(+), 15 deletions(-) diff --git a/.github/workflows/pipeline-select-galaxy.yaml b/.github/workflows/pipeline-select-galaxy.yaml index 69e09c900f7..a136ae487e6 100644 --- a/.github/workflows/pipeline-select-galaxy.yaml +++ b/.github/workflows/pipeline-select-galaxy.yaml @@ -49,17 +49,25 @@ jobs: with: build-type: ${{ inputs.build-type }} tracy: ${{ inputs.build-with-tracy }} + build-wheel: true secrets: inherit tgg-unit-tests: + if: ${{ inputs.tgg-unit }} needs: build-artifact secrets: inherit uses: ./.github/workflows/tgg-unit-tests-impl.yaml - if: ${{ inputs.tgg-unit }} + with: + docker-image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} + wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }} tgg-frequent-tests: + if: ${{ inputs.tgg-frequent }} needs: build-artifact secrets: inherit uses: ./.github/workflows/tgg-frequent-tests-impl.yaml - if: ${{ inputs.tgg-frequent }} + with: + docker-image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} + wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }} + build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }} tgg-model-perf-tests: needs: build-artifact secrets: inherit diff --git a/.github/workflows/tgg-frequent-tests-impl.yaml b/.github/workflows/tgg-frequent-tests-impl.yaml index c374035b286..e31f519c010 100644 --- a/.github/workflows/tgg-frequent-tests-impl.yaml +++ b/.github/workflows/tgg-frequent-tests-impl.yaml @@ -2,6 +2,16 @@ name: "[internal] TGG frequent tests" on: workflow_call: + inputs: + docker-image: + required: true + type: string + wheel-artifact-name: + required: true + type: string + build-artifact-name: + required: true + type: string jobs: tgg-frequent-tests: @@ -17,26 +27,52 @@ jobs: }, ] name: ${{ matrix.test-group.name }} - env: - ARCH_NAME: ${{ matrix.test-group.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ 
github.workspace }}/build/lib runs-on: ${{ matrix.test-group.runs-on }} + container: + image: ${{ inputs.docker-image }} + env: + TT_METAL_HOME: /work + PYTHONPATH: /work + LD_LIBRARY_PATH: /work/build/lib + LOGURU_LEVEL: INFO + ARCH_NAME: ${{ matrix.test-group.arch }} + volumes: + - ${{ github.workspace }}/docker-job:/work # Subdir to workaround https://github.com/actions/runner/issues/691 + - /dev/hugepages-1G:/dev/hugepages-1G + - /mnt/MLPerf:/mnt/MLPerf + options: "--device /dev/tenstorrent" + defaults: + run: + shell: bash + working-directory: /work # https://github.com/actions/runner/issues/878 steps: - - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - name: ⬇️ Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end - uses: actions/download-artifact@v4 with: - name: TTMetal_build_any + name: ${{ inputs.build-artifact-name }} + path: /work - name: Extract files run: tar -xvf ttm_any.tar - - uses: ./.github/actions/install-python-deps + - name: ⬇️ Download Wheel + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.wheel-artifact-name }} + path: /work + - name: Install Wheel + run: | + WHEEL_FILENAME=$(ls -1 *.whl) + pip3 install $WHEEL_FILENAME - name: Run frequent regression tests timeout-minutes: 90 run: | - source ${{ github.workspace }}/python_env/bin/activate - cd $TT_METAL_HOME - export PYTHONPATH=$TT_METAL_HOME ${{ matrix.test-group.cmd }} + - name: Cleanup + if: always() + run: | + # We are forced to checkout the repo into a subdir of the host's workdir; this pollutes the host + # with root-owned files. Be sure to clean up after ourselves in case we're on a non-ephemeral runner. + rm -rf /__w/tt-metal/tt-metal/docker-job diff --git a/.github/workflows/tgg-frequent-tests.yaml b/.github/workflows/tgg-frequent-tests.yaml index 4c15f1c7209..9ca8a848002 100644 --- a/.github/workflows/tgg-frequent-tests.yaml +++ b/.github/workflows/tgg-frequent-tests.yaml @@ -9,7 +9,13 @@ jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml secrets: inherit + with: + build-wheel: true tgg-frequent-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/tgg-frequent-tests-impl.yaml + with: + docker-image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} + wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }} + build-artifact-name: ${{ needs.build-artifact.outputs.build-artifact-name }} From fe980b83520913910bd90047e14b1c75c7a845f2 Mon Sep 17 00:00:00 2001 From: Jay Kruer Date: Mon, 24 Feb 2025 16:11:31 -0800 Subject: [PATCH 274/316] [tt-train] add silu op + forward/backward test (#18226) ### Problem description We need a differentiable silu op to implement swiglu for llama 3 training. ### What's changed - Uses existing ttnn ops to implement ttml::ops::silu. - Adds a test which checks both forward and backward with respect to PyTorch's `torch.nn.functional.silu` against a single realistic case. 
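For reference, the math the new forward/backward test exercises against `torch.nn.functional.silu` is silu(x) = x * sigmoid(x), with derivative d/dx silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x))). The scalar sketch below is illustrative only; it is not the ttml implementation, which composes the existing ttnn ops (`ttnn::silu` / `ttnn::silu_bw`) as described above.

```cpp
#include <cmath>
#include <cstdio>

// Scalar reference for the SiLU forward/backward math.
double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// Forward: silu(x) = x * sigmoid(x)
double silu(double x) { return x * sigmoid(x); }

// Backward: d/dx silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
double silu_grad(double x) {
    double s = sigmoid(x);
    return s * (1.0 + x * (1.0 - s));
}

int main() {
    for (double x : {-2.0, -0.5, 0.0, 0.5, 2.0}) {
        std::printf("x=% .2f  silu=% .5f  dsilu/dx=% .5f\n", x, silu(x), silu_grad(x));
    }
    return 0;
}
```

Running this prints the values one would compare (up to bfloat16 tolerance) with the PyTorch reference used in the test.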
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [x] New/Existing tests provide coverage for changes --- tt-train/sources/ttml/ops/unary_ops.cpp | 14 ++ tt-train/sources/ttml/ops/unary_ops.hpp | 1 + tt-train/tests/ops/unary_ops_test.cpp | 198 ++++++++++++++++++++++-- 3 files changed, 199 insertions(+), 14 deletions(-) diff --git a/tt-train/sources/ttml/ops/unary_ops.cpp b/tt-train/sources/ttml/ops/unary_ops.cpp index a9ec11094eb..dcd86ff12ff 100644 --- a/tt-train/sources/ttml/ops/unary_ops.cpp +++ b/tt-train/sources/ttml/ops/unary_ops.cpp @@ -50,6 +50,20 @@ autograd::TensorPtr gelu(const autograd::TensorPtr& tensor) { return out; } +autograd::TensorPtr silu(const autograd::TensorPtr& tensor) { + auto out = autograd::create_tensor(ttnn::silu(tensor->get_value())); + autograd::GradFunction grad = [tensor, out]() { + auto res = ttnn::silu_bw(out->get_grad(), tensor->get_value()); + assert(res.size() == 1U && "Silu backward should return only one gradient"); + tensor->add_grad(res.front().value()); + }; + + auto links = autograd::get_links(tensor); + out->set_node(autograd::ctx().add_backward_node(std::move(grad), links)); + + return out; +} + autograd::TensorPtr log_softmax(const autograd::TensorPtr& tensor, int dim) { auto log_softmax = ttnn_fixed::log_softmax(tensor->get_value(), dim); auto out = autograd::create_tensor(log_softmax); diff --git a/tt-train/sources/ttml/ops/unary_ops.hpp b/tt-train/sources/ttml/ops/unary_ops.hpp index 669ee04233b..ba5fa36ccb2 100644 --- a/tt-train/sources/ttml/ops/unary_ops.hpp +++ b/tt-train/sources/ttml/ops/unary_ops.hpp @@ -10,6 +10,7 @@ namespace ttml::ops { autograd::TensorPtr relu(const autograd::TensorPtr& tensor); autograd::TensorPtr gelu(const autograd::TensorPtr& tensor); +autograd::TensorPtr silu(const autograd::TensorPtr& tensor); autograd::TensorPtr mean(const autograd::TensorPtr& tensor); autograd::TensorPtr sum(const autograd::TensorPtr& tensor); autograd::TensorPtr broadcast_batch(const autograd::TensorPtr& tensor, uint32_t new_batch_dim); diff --git a/tt-train/tests/ops/unary_ops_test.cpp b/tt-train/tests/ops/unary_ops_test.cpp index 90c2afeac0d..6446a84f930 100644 --- a/tt-train/tests/ops/unary_ops_test.cpp +++ b/tt-train/tests/ops/unary_ops_test.cpp @@ -11,34 +11,37 @@ #include "autograd/auto_context.hpp" #include "autograd/tensor.hpp" #include "core/tt_tensor_utils.hpp" +#include "ops/losses.hpp" + +namespace ttml::ops::tests { class UnaryOpsTest : public ::testing::Test { protected: void SetUp() override { - ttml::autograd::ctx().open_device(); + autograd::ctx().open_device(); } void TearDown() override { - ttml::autograd::ctx().close_device(); + autograd::ctx().close_device(); } }; TEST_F(UnaryOpsTest, GlobalMean) { std::vector test_data = {1.F, 2.F, 3.F, 4.F, 1.F, 2.F, 3.F, 4.F}; - auto shape = ttml::core::create_shape({2, 1, 1, 4}); - auto tensor = 
ttml::core::from_vector(test_data, shape, &ttml::autograd::ctx().get_device()); + auto shape = core::create_shape({2, 1, 1, 4}); + auto tensor = core::from_vector(test_data, shape, &autograd::ctx().get_device()); - auto tensor_ptr = ttml::autograd::create_tensor(tensor); + auto tensor_ptr = autograd::create_tensor(tensor); - auto result = ttml::ops::mean(tensor_ptr); - auto result_data = ttml::core::to_vector(result->get_value()); + auto result = mean(tensor_ptr); + auto result_data = core::to_vector(result->get_value()); ASSERT_EQ(result_data.size(), 1); EXPECT_FLOAT_EQ(result_data[0], 2.5F); result->backward(); - auto tensor_grad = ttml::core::to_vector(tensor_ptr->get_grad()); + auto tensor_grad = core::to_vector(tensor_ptr->get_grad()); ASSERT_EQ(tensor_grad.size(), test_data.size()); for (float it : tensor_grad) { EXPECT_FLOAT_EQ(it, 0.125F); @@ -46,12 +49,12 @@ TEST_F(UnaryOpsTest, GlobalMean) { } TEST_F(UnaryOpsTest, LogSoftmax) { - auto* device = &ttml::autograd::ctx().get_device(); + auto* device = &autograd::ctx().get_device(); std::vector test_data = {-0.1F, -0.2F, -0.3F, -0.4F, 0.F, -0.2F, -0.3F, -0.4F}; - auto tensor = ttml::core::from_vector(test_data, ttml::core::create_shape({2, 1, 1, 4}), device); - auto tensor_ptr = ttml::autograd::create_tensor(tensor); - auto result = ttml::ops::log_softmax_moreh(tensor_ptr, 3); - auto result_data = ttml::core::to_vector(result->get_value()); + auto tensor = core::from_vector(test_data, core::create_shape({2, 1, 1, 4}), device); + auto tensor_ptr = autograd::create_tensor(tensor); + auto result = log_softmax_moreh(tensor_ptr, 3); + auto result_data = core::to_vector(result->get_value()); std::vector expected_data = { -1.24253553F, -1.34253553F, -1.44253553F, -1.54253553F, -1.17244159F, -1.37244159F, -1.47244159F, -1.57244159F}; EXPECT_EQ(result_data.size(), expected_data.size()); @@ -60,10 +63,177 @@ TEST_F(UnaryOpsTest, LogSoftmax) { } result->backward(); - auto tensor_grad = ttml::core::to_vector(tensor_ptr->get_grad()); + auto tensor_grad = core::to_vector(tensor_ptr->get_grad()); std::vector expected_grad = {-0.156F, -0.03906F, 0.05078F, 0.1406F, -0.25F, -0.0156F, 0.07421F, 0.16406F}; EXPECT_EQ(tensor_grad.size(), expected_grad.size()); for (uint32_t idx = 0; idx < tensor_grad.size(); ++idx) { EXPECT_NEAR(tensor_grad[idx], expected_grad[idx], 2e-2F); } } + +TEST_F(UnaryOpsTest, Silu) { + auto N = 4; + auto C = 1; + auto H = 20; + auto W = 5; + auto len = static_cast(N * C * H * W); + xt::random::seed(42); + xt::xarray a = xt::random::rand({N, C, H, W}, -1.0F, 1.0F); + xt::xarray expected_silu = { + {{{-0.10980F, 0.38199F, 0.64114F, -0.21957F, 0.28487F}, + {0.35594F, 0.10836F, 0.10620F, -0.23011F, -0.05124F}, + {-0.23012F, -0.24803F, -0.25842F, -0.03909F, 0.49457F}, + {-0.13889F, 0.11130F, -0.23475F, 0.25075F, 0.17348F}, + {-0.26570F, -0.25878F, 0.67579F, 0.27049F, 0.43906F}, + {0.61943F, -0.20712F, -0.26883F, -0.22022F, 0.71665F}, + {-0.21958F, 0.13122F, -0.15792F, 0.12407F, 0.02537F}, + {-0.26789F, -0.06343F, -0.26528F, -0.16581F, 0.02539F}, + {0.12431F, -0.09014F, -0.23589F, -0.26083F, -0.16526F}, + {0.68279F, -0.11588F, -0.19747F, -0.04200F, -0.25057F}, + {0.36437F, 0.13234F, -0.21275F, -0.10379F, 0.01444F}, + {0.70012F, 0.10093F, -0.03213F, -0.26088F, 0.48418F}, + {0.11907F, 0.21247F, -0.22469F, -0.04705F, -0.25686F}, + {-0.26692F, 0.63786F, 0.62592F, 0.66803F, 0.06729F}, + {0.40060F, -0.10151F, -0.15769F, -0.26648F, -0.24866F}, + {-0.19839F, 0.21780F, -0.19337F, -0.05627F, 0.21648F}, + {-0.24154F, 0.12205F, -0.00480F, 0.44028F, 
-0.26324F}, + {-0.22358F, 0.56809F, -0.09712F, -0.18414F, -0.22006F}, + {0.18871F, 0.31919F, -0.15325F, -0.06925F, 0.02047F}, + {-0.20911F, 0.04889F, 0.07228F, -0.21899F, -0.26381F}}}, + {{{0.67520F, 0.45507F, 0.34898F, -0.04772F, 0.62111F}, + {-0.09390F, 0.54309F, 0.59840F, 0.10745F, 0.27805F}, + {0.58999F, -0.14367F, -0.25112F, 0.07540F, -0.21434F}, + {0.02127F, -0.26112F, 0.65996F, -0.14447F, 0.45875F}, + {-0.09898F, 0.30727F, -0.17726F, 0.04127F, 0.43307F}, + {0.09426F, -0.12287F, 0.66734F, -0.17183F, 0.11845F}, + {0.04452F, -0.17465F, -0.23541F, -0.16279F, 0.39084F}, + {-0.22669F, -0.25463F, -0.26653F, 0.70683F, -0.07074F}, + {0.34458F, -0.09411F, -0.21316F, -0.16446F, -0.26812F}, + {-0.26678F, 0.41180F, -0.21311F, 0.24905F, 0.25535F}, + {0.28055F, 0.37209F, 0.34310F, 0.11715F, -0.25475F}, + {0.59777F, -0.12164F, 0.17373F, -0.24343F, 0.57790F}, + {0.48944F, 0.46779F, 0.13842F, -0.04800F, -0.14078F}, + {-0.24928F, -0.25720F, -0.11259F, -0.15371F, 0.19708F}, + {-0.14456F, 0.19320F, 0.28142F, 0.09961F, 0.15636F}, + {-0.17537F, 0.53008F, 0.06499F, -0.02701F, -0.10343F}, + {-0.24230F, 0.67907F, 0.25804F, 0.46594F, 0.32729F}, + {0.27010F, 0.06503F, -0.19589F, 0.34264F, -0.18558F}, + {-0.00617F, -0.26208F, 0.02325F, 0.25440F, -0.06722F}, + {-0.24491F, -0.26487F, -0.05699F, -0.24578F, -0.21186F}}}, + {{{-0.26378F, 0.54470F, 0.15490F, -0.02402F, -0.15157F}, + {0.06727F, 0.00864F, 0.23326F, 0.56505F, -0.23595F}, + {-0.18914F, 0.11528F, -0.08161F, 0.04143F, 0.31947F}, + {-0.21127F, -0.19940F, 0.62708F, -0.25404F, 0.10861F}, + {-0.16668F, 0.23225F, -0.22821F, 0.51862F, 0.60375F}, + {0.13974F, 0.40016F, -0.16317F, 0.15110F, -0.24647F}, + {0.50343F, -0.04158F, 0.39315F, -0.20431F, -0.21829F}, + {-0.07654F, 0.53920F, 0.52339F, 0.04089F, -0.14511F}, + {0.39909F, -0.24153F, 0.54526F, -0.12319F, -0.14923F}, + {0.56377F, -0.24515F, -0.17682F, -0.19982F, 0.16935F}, + {-0.06759F, -0.26887F, 0.41587F, -0.12585F, 0.48549F}, + {-0.15759F, -0.26791F, -0.22692F, 0.01086F, 0.03525F}, + {-0.07578F, -0.01494F, -0.20260F, 0.22902F, -0.24221F}, + {-0.17834F, -0.13625F, -0.19180F, 0.62718F, -0.22554F}, + {-0.14586F, -0.20416F, 0.01914F, 0.06147F, 0.24368F}, + {-0.08694F, -0.11789F, -0.25690F, 0.67920F, -0.18672F}, + {0.66226F, -0.19039F, -0.18784F, 0.23435F, -0.00274F}, + {0.25666F, -0.15999F, -0.23294F, -0.16957F, 0.72687F}, + {-0.26276F, -0.17979F, 0.12152F, 0.68801F, 0.00269F}, + {-0.08107F, -0.25984F, -0.26348F, -0.17314F, -0.13112F}}}, + {{{0.56626F, 0.15229F, -0.19410F, 0.21301F, -0.23405F}, {0.03189F, -0.01044F, -0.04949F, 0.70456F, 0.05569F}, + {-0.19285F, 0.10126F, 0.20148F, -0.25308F, 0.32854F}, {-0.11345F, -0.19507F, -0.19279F, 0.27941F, 0.39232F}, + {-0.11484F, -0.02882F, 0.14971F, 0.70047F, 0.15125F}, {-0.09097F, 0.03705F, 0.41335F, -0.25065F, 0.38480F}, + {0.44370F, -0.23201F, -0.14744F, 0.00827F, -0.21831F}, {0.23367F, -0.26201F, 0.48155F, 0.09913F, -0.14405F}, + {0.20877F, -0.20347F, -0.26637F, 0.25508F, 0.01224F}, {0.40235F, -0.20051F, -0.12861F, 0.16610F, -0.24907F}, + {-0.22319F, 0.62293F, 0.22696F, -0.09197F, -0.10049F}, {0.01807F, 0.61620F, 0.44761F, -0.23656F, 0.20624F}, + {-0.13388F, 0.28954F, -0.24414F, -0.20860F, 0.59494F}, {0.04316F, 0.51333F, 0.23363F, -0.18458F, -0.19952F}, + {0.18536F, -0.22296F, 0.41461F, 0.69817F, 0.05825F}, {0.01691F, 0.03053F, -0.18303F, -0.19295F, 0.72412F}, + {-0.24990F, 0.66764F, 0.54719F, 0.06169F, 0.55270F}, {0.52230F, 0.15071F, -0.21740F, -0.13528F, -0.17301F}, + {-0.12822F, 0.23997F, 0.27616F, 0.46224F, 0.54701F}, {0.47818F, 0.52986F, -0.08640F, 0.35622F, 
0.53103F}}}}; + + auto a_tensor = autograd::create_tensor(core::from_xtensor(a, &autograd::ctx().get_device())); + auto computed_silu = silu(a_tensor); + auto computed_silu_xtensor = core::to_xtensor(computed_silu->get_value()); + EXPECT_TRUE(xt::allclose(computed_silu_xtensor, expected_silu, 8e-3F, 4e-2F)); + + xt::xarray expected_silu_grad_ = { + {{{-0.00021F, 0.00149F, 0.00287F, -0.00022F, 0.00103F}, + {0.00136F, 0.00032F, 0.00032F, -0.00021F, -0.00011F}, + {-0.00021F, -0.00017F, -0.00014F, -0.00009F, 0.00207F}, + {-0.00023F, 0.00033F, -0.00020F, 0.00088F, 0.00056F}, + {-0.00011F, -0.00014F, 0.00307F, 0.00097F, 0.00178F}, + {0.00275F, -0.00024F, -0.00010F, -0.00022F, 0.00331F}, + {-0.00022F, 0.00040F, -0.00024F, 0.00038F, 0.00007F}, + {-0.00010F, -0.00014F, -0.00011F, -0.00025F, 0.00007F}, + {0.00038F, -0.00018F, -0.00020F, -0.00013F, -0.00025F}, + {0.00311F, -0.00021F, -0.00024F, -0.00010F, -0.00017F}, + {0.00140F, 0.00041F, -0.00023F, -0.00020F, 0.00004F}, + {0.00321F, 0.00030F, -0.00007F, -0.00013F, 0.00201F}, + {0.00036F, 0.00072F, -0.00022F, -0.00011F, -0.00015F}, + {-0.00011F, 0.00285F, 0.00279F, 0.00303F, 0.00019F}, + {0.00158F, -0.00020F, -0.00024F, -0.00011F, -0.00017F}, + {-0.00024F, 0.00074F, -0.00024F, -0.00012F, 0.00074F}, + {-0.00019F, 0.00037F, -0.00001F, 0.00178F, -0.00012F}, + {-0.00022F, 0.00247F, -0.00019F, -0.00025F, -0.00022F}, + {0.00062F, 0.00119F, -0.00024F, -0.00015F, 0.00005F}, + {-0.00023F, 0.00013F, 0.00021F, -0.00022F, -0.00012F}}}, + {{{0.00307F, 0.00186F, 0.00133F, -0.00011F, 0.00276F}, + {-0.00019F, 0.00233F, 0.00263F, 0.00032F, 0.00100F}, + {0.00259F, -0.00024F, -0.00016F, 0.00021F, -0.00023F}, + {0.00006F, -0.00013F, 0.00298F, -0.00024F, 0.00188F}, + {-0.00019F, 0.00113F, -0.00025F, 0.00011F, 0.00175F}, + {0.00028F, -0.00022F, 0.00302F, -0.00025F, 0.00036F}, + {0.00012F, -0.00025F, -0.00020F, -0.00025F, 0.00153F}, + {-0.00021F, -0.00015F, -0.00011F, 0.00325F, -0.00015F}, + {0.00131F, -0.00019F, -0.00023F, -0.00025F, -0.00010F}, + {-0.00011F, 0.00164F, -0.00023F, 0.00087F, 0.00090F}, + {0.00101F, 0.00144F, 0.00130F, 0.00035F, -0.00015F}, + {0.00263F, -0.00022F, 0.00056F, -0.00018F, 0.00252F}, + {0.00204F, 0.00193F, 0.00043F, -0.00011F, -0.00024F}, + {-0.00017F, -0.00015F, -0.00021F, -0.00024F, 0.00066F}, + {-0.00024F, 0.00064F, 0.00102F, 0.00029F, 0.00050F}, + {-0.00025F, 0.00226F, 0.00018F, -0.00006F, -0.00020F}, + {-0.00019F, 0.00309F, 0.00091F, 0.00192F, 0.00123F}, + {0.00097F, 0.00018F, -0.00024F, 0.00130F, -0.00025F}, + {-0.00002F, -0.00013F, 0.00006F, 0.00090F, -0.00014F}, + {-0.00018F, -0.00012F, -0.00013F, -0.00018F, -0.00023F}}}, + {{{-0.00012F, 0.00234F, 0.00049F, -0.00006F, -0.00024F}, + {0.00019F, 0.00002F, 0.00081F, 0.00245F, -0.00020F}, + {-0.00025F, 0.00035F, -0.00017F, 0.00011F, 0.00119F}, + {-0.00023F, -0.00024F, 0.00279F, -0.00016F, 0.00032F}, + {-0.00025F, 0.00080F, -0.00021F, 0.00220F, 0.00266F}, + {0.00044F, 0.00158F, -0.00025F, 0.00048F, -0.00018F}, + {0.00211F, -0.00009F, 0.00155F, -0.00024F, -0.00022F}, + {-0.00016F, 0.00231F, 0.00222F, 0.00011F, -0.00024F}, + {0.00157F, -0.00019F, 0.00234F, -0.00022F, -0.00024F}, + {0.00244F, -0.00018F, -0.00025F, -0.00024F, 0.00055F}, + {-0.00014F, -0.00010F, 0.00166F, -0.00022F, 0.00202F}, + {-0.00024F, -0.00010F, -0.00021F, 0.00003F, 0.00009F}, + {-0.00016F, -0.00004F, -0.00024F, 0.00079F, -0.00019F}, + {-0.00025F, -0.00023F, -0.00024F, 0.00279F, -0.00022F}, + {-0.00024F, -0.00024F, 0.00005F, 0.00017F, 0.00085F}, + {-0.00018F, -0.00022F, -0.00015F, 0.00309F, -0.00025F}, + {0.00299F, -0.00024F, 
-0.00025F, 0.00081F, -0.00001F}, + {0.00091F, -0.00024F, -0.00020F, -0.00025F, 0.00337F}, + {-0.00013F, -0.00025F, 0.00037F, 0.00314F, 0.00001F}, + {-0.00017F, -0.00014F, -0.00012F, -0.00025F, -0.00023F}}}, + {{{0.00245F, 0.00048F, -0.00024F, 0.00072F, -0.00020F}, {0.00008F, -0.00003F, -0.00011F, 0.00324F, 0.00015F}, + {-0.00024F, 0.00030F, 0.00067F, -0.00016F, 0.00123F}, {-0.00021F, -0.00024F, -0.00024F, 0.00101F, 0.00154F}, + {-0.00021F, -0.00007F, 0.00047F, 0.00321F, 0.00048F}, {-0.00018F, 0.00010F, 0.00165F, -0.00017F, 0.00150F}, + {0.00180F, -0.00021F, -0.00024F, 0.00002F, -0.00022F}, {0.00081F, -0.00013F, 0.00200F, 0.00029F, -0.00024F}, + {0.00070F, -0.00024F, -0.00011F, 0.00090F, 0.00003F}, {0.00159F, -0.00024F, -0.00023F, 0.00053F, -0.00017F}, + {-0.00022F, 0.00277F, 0.00078F, -0.00018F, -0.00019F}, {0.00005F, 0.00273F, 0.00182F, -0.00020F, 0.00069F}, + {-0.00023F, 0.00105F, -0.00018F, -0.00023F, 0.00261F}, {0.00012F, 0.00217F, 0.00081F, -0.00025F, -0.00024F}, + {0.00061F, -0.00022F, 0.00165F, 0.00320F, 0.00016F}, {0.00004F, 0.00008F, -0.00025F, -0.00024F, 0.00335F}, + {-0.00017F, 0.00302F, 0.00235F, 0.00017F, 0.00238F}, {0.00222F, 0.00048F, -0.00023F, -0.00023F, -0.00025F}, + {-0.00023F, 0.00083F, 0.00099F, 0.00190F, 0.00235F}, {0.00198F, 0.00226F, -0.00017F, 0.00136F, 0.00226F}}}}; + xt::xarray expected_silu_grad = expected_silu_grad_.reshape({N, C, H, W}); + + auto target = autograd::create_tensor(core::zeros_like(computed_silu->get_value())); + auto result = mse_loss(computed_silu, target); + result->backward(); + auto computed_silu_grad = core::to_xtensor(computed_silu->get_grad()); + EXPECT_TRUE(xt::allclose(computed_silu_grad, expected_silu_grad, 8e-3F, 4e-2F)); +} + +} // namespace ttml::ops::tests From f0dd37751f8f03a7c8ebfce2c1238975d80ea00c Mon Sep 17 00:00:00 2001 From: Allan Liu Date: Thu, 20 Feb 2025 18:23:26 +0000 Subject: [PATCH 275/316] Decouple control plane init and configuring routing tables --- .../routing/test_tt_fabric_multi_hop_sanity.cpp | 1 + .../perf_microbenchmark/routing/test_tt_fabric_sanity.cpp | 1 + tt_metal/fabric/control_plane.cpp | 1 - tt_metal/impl/device/device_pool.cpp | 7 +++++-- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index 00761a5843a..111176b7992 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -236,6 +236,7 @@ int main(int argc, char** argv) { std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / "tt_metal/fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; auto control_plane = std::make_unique(tg_mesh_graph_desc_path.string()); + control_plane->configure_routing_tables(); int num_devices = tt_metal::GetNumAvailableDevices(); if (test_device_id_l >= num_devices) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index c6d48b3f670..83c9a5e0bfa 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -145,6 +145,7 @@ typedef struct test_board { tt::tt_metal::detail::CreateDevices(available_chip_ids, 1, 0, 0, 
DispatchCoreConfig{dispatch_core_type}); if (metal_fabric_init_level == 0) { _init_control_plane(mesh_graph_descriptor); + control_plane->configure_routing_tables(); } else { control_plane = tt::DevicePool::instance().get_control_plane(); } diff --git a/tt_metal/fabric/control_plane.cpp b/tt_metal/fabric/control_plane.cpp index b8787ba29cc..f35254590f3 100644 --- a/tt_metal/fabric/control_plane.cpp +++ b/tt_metal/fabric/control_plane.cpp @@ -52,7 +52,6 @@ ControlPlane::ControlPlane(const std::string& mesh_graph_desc_file) { this->routing_table_generator_->print_routing_tables(); this->initialize_from_mesh_graph_desc_file(mesh_graph_desc_file); - this->configure_routing_tables(); // Printing, only enabled with log_debug this->print_ethernet_channels(); diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index a9c9840a9f6..b7f1704a30b 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -403,9 +403,12 @@ void DevicePool::add_devices_to_pool(const std::vector& device_ids) { } // TODO: add handling of EDM + // Initialize control plane, does not configure kernels/routing tables + // We always need a control plane for mapping of logical devices to physical devices + _inst->initialize_control_plane(); if (this->fabric_setting == detail::FabricSetting::FABRIC) { - // Initialize control plane, which writes routing tables to all ethernet cores - _inst->initialize_control_plane(); + // write routing tables to all ethernet cores + this->control_plane->configure_routing_tables(); } this->using_fast_dispatch = (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr); if (this->using_fast_dispatch) { From acd603cd37b34e807a42132714b4a7c3c9ce8a93 Mon Sep 17 00:00:00 2001 From: Allan Liu Date: Thu, 20 Feb 2025 18:55:54 +0000 Subject: [PATCH 276/316] ControlPlane: add api to get direct routers to chip --- tt_metal/api/tt-metalium/control_plane.hpp | 4 +++ tt_metal/fabric/control_plane.cpp | 29 ++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/tt_metal/api/tt-metalium/control_plane.hpp b/tt_metal/api/tt-metalium/control_plane.hpp index 7c62a0ef9e4..fa78a7144e2 100644 --- a/tt_metal/api/tt-metalium/control_plane.hpp +++ b/tt_metal/api/tt-metalium/control_plane.hpp @@ -43,6 +43,10 @@ class ControlPlane { chip_id_t dst_chip_id, chan_id_t src_chan_id) const; + // Return routers to get to the destination chip, avoid local eth to eth routing + std::vector> get_routers_to_chip( + mesh_id_t src_mesh_id, chip_id_t src_chip_id, mesh_id_t dst_mesh_id, chip_id_t dst_chip_id) const; + std::vector get_intra_chip_neighbors( mesh_id_t src_mesh_id, chip_id_t src_chip_id, RoutingDirection routing_direction) const; diff --git a/tt_metal/fabric/control_plane.cpp b/tt_metal/fabric/control_plane.cpp index f35254590f3..c6595f0a802 100644 --- a/tt_metal/fabric/control_plane.cpp +++ b/tt_metal/fabric/control_plane.cpp @@ -581,6 +581,35 @@ std::vector> ControlPlane::get_fabric_route( return route; } +std::vector> ControlPlane::get_routers_to_chip( + mesh_id_t src_mesh_id, chip_id_t src_chip_id, mesh_id_t dst_mesh_id, chip_id_t dst_chip_id) const { + std::vector> routers; + const auto& router_direction_eth_channels = + router_port_directions_to_physical_eth_chan_map_[src_mesh_id][src_chip_id]; + for (const auto& [direction, eth_chans] : router_direction_eth_channels) { + for (const auto& src_chan_id : eth_chans) { + chan_id_t next_chan_id = 0; + if (src_mesh_id != dst_mesh_id) { + // Inter-mesh routing + next_chan_id = 
this->inter_mesh_routing_tables_[src_mesh_id][src_chip_id][src_chan_id][dst_mesh_id]; + + } else if (src_chip_id != dst_chip_id) { + // Intra-mesh routing + next_chan_id = this->intra_mesh_routing_tables_[src_mesh_id][src_chip_id][src_chan_id][dst_chip_id]; + } + if (src_chan_id != next_chan_id) { + continue; + } + const auto& physical_chip_id = + this->logical_mesh_chip_id_to_physical_chip_id_mapping_[src_mesh_id][src_chip_id]; + routers.emplace_back( + this->get_routing_plane_id(src_chan_id), + tt::Cluster::instance().get_virtual_eth_core_from_channel(physical_chip_id, src_chan_id)); + } + } + return routers; +} + std::vector ControlPlane::get_intra_chip_neighbors( mesh_id_t src_mesh_id, chip_id_t src_chip_id, RoutingDirection routing_direction) const { for (const auto& [_, routing_edge] : From 3cb663bd7cc880fa0106957fa2f4275e3be4477b Mon Sep 17 00:00:00 2001 From: Allan Liu Date: Mon, 24 Feb 2025 15:56:21 +0000 Subject: [PATCH 277/316] Initial checkin of fabric api examples, some minor cleanup --- tests/tt_metal/tt_fabric/CMakeLists.txt | 5 +- .../tt_fabric/common/fabric_fixture.hpp | 38 +- .../fabric_async_write_atomic_inc_sender.cpp | 61 ++ ...rite_multicast_multidirectional_sender.cpp | 137 ++++ .../fabric_async_write_multicast_sender.cpp | 60 ++ ...abric_async_write_routing_plane_sender.cpp | 51 ++ .../kernels/fabric_async_write_sender.cpp | 56 ++ .../kernels/fabric_atomic_inc_sender.cpp | 57 ++ .../kernels/fabric_receiver.cpp | 16 + .../test_basic_fabric_apis.cpp | 765 ++++++++++++++++++ .../routing/test_tt_fabric_sanity.cpp | 4 +- 11 files changed, 1245 insertions(+), 5 deletions(-) create mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_atomic_inc_sender.cpp create mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_multidirectional_sender.cpp create mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_sender.cpp create mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_routing_plane_sender.cpp create mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp create mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_atomic_inc_sender.cpp create mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp create mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp diff --git a/tests/tt_metal/tt_fabric/CMakeLists.txt b/tests/tt_metal/tt_fabric/CMakeLists.txt index 796577e524c..8b449020302 100644 --- a/tests/tt_metal/tt_fabric/CMakeLists.txt +++ b/tests/tt_metal/tt_fabric/CMakeLists.txt @@ -1,4 +1,7 @@ -set(UNIT_TESTS_FABRIC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/fabric_router/test_routing_tables.cpp) +set(UNIT_TESTS_FABRIC_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/fabric_router/test_routing_tables.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fabric_data_movement/test_basic_fabric_apis.cpp +) add_executable(fabric_unit_tests ${UNIT_TESTS_FABRIC_SRC}) target_link_libraries( diff --git a/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp b/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp index 23b5dcdfd79..b69e2aae769 100644 --- a/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp +++ b/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -8,6 +8,8 @@ #include "tt_metal/test_utils/env_vars.hpp" #include #include +#include +#include namespace tt::tt_fabric { namespace fabric_router_tests { @@ -29,4 +31,38 @@ class ControlPlaneFixture : public ::testing::Test { }; } // namespace fabric_router_tests + +class FabricFixture : public ::testing::Test { +protected: + tt::ARCH arch_; + std::map devices_map_; + tt::tt_fabric::ControlPlane* control_plane_; + bool slow_dispatch_; + + void SetUp() override { + auto slow_dispatch_ = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch_) { + tt::log_info( + tt::LogTest, + "Fabric test suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + GTEST_SKIP(); + } + // Set up all available devices + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + auto num_devices = tt::tt_metal::GetNumAvailableDevices(); + std::vector ids; + for (unsigned int id = 0; id < num_devices; id++) { + ids.push_back(id); + } + tt::tt_metal::detail::InitializeFabricSetting(tt::tt_metal::detail::FabricSetting::FABRIC); + devices_map_ = tt::tt_metal::detail::CreateDevices(ids); + control_plane_ = tt::DevicePool::instance().get_control_plane(); + } + + void TearDown() override { + std::cout << " TEARDOWN" << std::endl; + tt::tt_metal::detail::CloseDevices(devices_map_); + } +}; + } // namespace tt::tt_fabric diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_atomic_inc_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_atomic_inc_sender.cpp new file mode 100644 index 00000000000..c1d00e50a6d --- /dev/null +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_atomic_inc_sender.cpp @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "dataflow_api.h" +#include "debug/dprint.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" + +// clang-format on + +using namespace tt::tt_fabric; + +volatile fabric_client_interface_t* client_interface; + +uint64_t xy_local_addr; + +void kernel_main() { + uint32_t rt_args_idx = 0; + // Fabric configuration specific arguments + uint32_t client_interface_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_write_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_atomic_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t num_bytes = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t atomic_inc = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint64_t dst_write_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_write_addr); + uint64_t dst_atomic_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_atomic_addr); + uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; + fabric_async_write_atomic_inc_add_header( + src_addr, // source address in sender’s memory + dst_mesh_id, + dst_device_id, + dst_write_noc_addr, // destination write address + dst_atomic_noc_addr, // destination atomic address + packet_size_bytes, // number of bytes to write to remote destination + atomic_inc // atomic increment value + ); + + // make sure fabric node gatekeeper is available. + fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + + fabric_setup_pull_request( + src_addr, // source address in sender’s memory + packet_size_bytes // number of bytes to write to remote destination + ); + + fabric_send_pull_request(router_noc_xy, dst_mesh_id, dst_device_id); + fabric_wait_for_pull_request_flushed(); +} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_multidirectional_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_multidirectional_sender.cpp new file mode 100644 index 00000000000..42a49426d7d --- /dev/null +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_multidirectional_sender.cpp @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "dataflow_api.h" +#include "debug/dprint.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" + +// clang-format on + +using namespace tt::tt_fabric; + +volatile fabric_client_interface_t* client_interface; + +uint64_t xy_local_addr; + +void kernel_main() { + uint32_t rt_args_idx = 0; + // Fabric configuration specific arguments + uint32_t client_interface_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t num_bytes = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_depth = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t w_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t w_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t w_depth = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t w_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + // uint32_t n_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + // uint32_t n_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + // uint32_t n_depth = get_arg_val(increment_arg_idx(rt_args_idx)); + // uint32_t n_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + // uint32_t s_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + // uint32_t s_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + // uint32_t s_depth = get_arg_val(increment_arg_idx(rt_args_idx)); + // uint32_t s_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + constexpr uint32_t num_dirs = 2; // 4 + + uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr); + uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; + fabric_async_write_multicast_add_header( + src_addr, // source address in sender’s memory + e_dst_mesh_id, + e_dst_device_id, + dst_noc_addr, // destination write address + packet_size_bytes, // number of bytes to write to remote destination + e_depth, + 0, + 0, + 0); + + // make sure fabric node gatekeeper is available. 
+ fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + for (uint32_t i = 1; i < num_dirs; i++) { + copy_l1_buf((uint32_t*)client_interface, (uint32_t*)(client_interface + i), sizeof(fabric_client_interface_t)); + } + + fabric_setup_pull_request( + src_addr, // source address in sender’s memory + packet_size_bytes // number of bytes to write to remote destination + ); + + fabric_send_pull_request(e_router_noc_xy, e_dst_mesh_id, e_dst_device_id); + fabric_wait_for_pull_request_bytes_flushed(PACKET_HEADER_SIZE_BYTES); + packet_header_t* packet_header = (packet_header_t*)(src_addr); + + // West Mcast + client_interface++; + + packet_header->routing.dst_mesh_id = w_dst_mesh_id; + packet_header->routing.dst_dev_id = w_dst_device_id; + packet_header->packet_parameters.mcast_parameters.east = 0; + packet_header->packet_parameters.mcast_parameters.west = w_depth; + // make sure fabric node gatekeeper is available. + fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + + fabric_setup_pull_request( + src_addr, // source address in sender’s memory + packet_size_bytes // number of bytes to write to remote destination + ); + + fabric_send_pull_request(w_router_noc_xy, w_dst_mesh_id, w_dst_device_id); + // fabric_wait_for_pull_request_bytes_flushed(PACKET_HEADER_SIZE_BYTES); + + // // North Mcast + // client_interface++; + + // packet_header->routing.dst_mesh_id = n_dst_mesh_id; + // packet_header->routing.dst_dev_id = n_dst_device_id; + // packet_header->packet_parameters.mcast_parameters.west = 0; + // packet_header->packet_parameters.mcast_parameters.north = n_depth; + // // make sure fabric node gatekeeper is available. + // fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + + // fabric_setup_pull_request( + // src_addr, // source address in sender’s memory + // packet_size_bytes // number of bytes to write to remote destination + // ); + + // fabric_send_pull_request(n_router_noc_xy, n_dst_mesh_id, n_dst_device_id); + // fabric_wait_for_pull_request_bytes_flushed(PACKET_HEADER_SIZE_BYTES); + + // // South Mcast + // client_interface++; + + // packet_header->routing.dst_mesh_id = s_dst_mesh_id; + // packet_header->routing.dst_dev_id = s_dst_device_id; + // packet_header->packet_parameters.mcast_parameters.north = 0; + // packet_header->packet_parameters.mcast_parameters.south = s_depth; + // // make sure fabric node gatekeeper is available. + // fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + + // fabric_setup_pull_request( + // src_addr, // source address in sender’s memory + // packet_size_bytes // number of bytes to write to remote destination + // ); + + // fabric_send_pull_request(s_router_noc_xy, s_dst_mesh_id, s_dst_device_id); + + // Flush all pull requests + client_interface = (volatile fabric_client_interface_t*)client_interface_addr; + for (uint32_t i = 0; i < num_dirs; i++) { + fabric_wait_for_pull_request_flushed(); + client_interface++; + } +} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_sender.cpp new file mode 100644 index 00000000000..57ee4376fcd --- /dev/null +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_sender.cpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "dataflow_api.h" +#include "debug/dprint.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" + +// clang-format on + +using namespace tt::tt_fabric; + +volatile fabric_client_interface_t* client_interface; + +uint64_t xy_local_addr; + +void kernel_main() { + uint32_t rt_args_idx = 0; + // Fabric configuration specific arguments + uint32_t client_interface_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t num_bytes = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_depth = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr); + uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; + fabric_async_write_multicast_add_header( + src_addr, // source address in sender’s memory + e_dst_mesh_id, + e_dst_device_id, + dst_noc_addr, // destination write address + packet_size_bytes, // number of bytes to write to remote destination + e_depth, + 0, + 0, + 0); + + // make sure fabric node gatekeeper is available. + fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + + fabric_setup_pull_request( + src_addr, // source address in sender’s memory + packet_size_bytes // number of bytes to write to remote destination + ); + + fabric_send_pull_request(e_router_noc_xy, e_dst_mesh_id, e_dst_device_id); + fabric_wait_for_pull_request_flushed(); +} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_routing_plane_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_routing_plane_sender.cpp new file mode 100644 index 00000000000..4c18a71a06c --- /dev/null +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_routing_plane_sender.cpp @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "dataflow_api.h" +#include "debug/dprint.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" + +// clang-format on + +using namespace tt::tt_fabric; + +volatile fabric_client_interface_t* client_interface; + +uint64_t xy_local_addr; + +void kernel_main() { + uint32_t rt_args_idx = 0; + // Fabric configuration specific arguments + uint32_t client_interface_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t num_bytes = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t routing_plane = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr); + uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; + + // make sure fabric node gatekeeper is available. + fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + + fabric_async_write( + routing_plane, + src_addr, // source address in sender’s memory + dst_mesh_id, + dst_device_id, + dst_noc_addr, // destination write address + packet_size_bytes // number of bytes to write to remote destination + ); + fabric_wait_for_pull_request_flushed(); +} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp new file mode 100644 index 00000000000..195fb00331c --- /dev/null +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "dataflow_api.h" +#include "debug/dprint.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" + +// clang-format on + +using namespace tt::tt_fabric; + +volatile fabric_client_interface_t* client_interface; + +uint64_t xy_local_addr; + +void kernel_main() { + uint32_t rt_args_idx = 0; + // Fabric configuration specific arguments + uint32_t client_interface_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t num_bytes = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr); + uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; + fabric_async_write_add_header( + src_addr, // source address in sender’s memory + dst_mesh_id, + dst_device_id, + dst_noc_addr, // destination write address + packet_size_bytes // number of bytes to write to remote destination + ); + + // make sure fabric node gatekeeper is available. + fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + + fabric_setup_pull_request( + src_addr, // source address in sender’s memory + packet_size_bytes // number of bytes to write to remote destination + ); + + fabric_send_pull_request(router_noc_xy, dst_mesh_id, dst_device_id); + fabric_wait_for_pull_request_flushed(); +} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_atomic_inc_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_atomic_inc_sender.cpp new file mode 100644 index 00000000000..6fdd05f63aa --- /dev/null +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_atomic_inc_sender.cpp @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
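// Annotation (sketch of the address arithmetic used by the unicast tests further down in this
// file set): the host lays out the sender's L1 so the packet header sits immediately in front of
// the payload, which is why the kernel above can hand the router one contiguous region starting
// at src_addr:

    uint32_t client_interface_addr = worker_unreserved_base_addr;   // from the HAL, see the tests below
    uint32_t packet_header_addr    = tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t), l1_alignment);
    uint32_t buffer_data_addr      = packet_header_addr + PACKET_HEADER_SIZE_BYTES;  // payload right behind the 48B header
    // kernel side: src_addr == packet_header_addr, num_bytes == buffer_data_size, so
    uint32_t packet_size_bytes     = buffer_data_size + PACKET_HEADER_SIZE_BYTES;    // [header|payload] pulled as one block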
+// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "dataflow_api.h" +#include "debug/dprint.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" + +// clang-format on + +using namespace tt::tt_fabric; + +volatile fabric_client_interface_t* client_interface; + +uint64_t xy_local_addr; + +void kernel_main() { + uint32_t rt_args_idx = 0; + // Fabric configuration specific arguments + uint32_t client_interface_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t atomic_inc = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t wrap_boundary = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + + uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr); + uint32_t packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + fabric_atomic_inc_add_header( + src_addr, // source address in sender’s memory + dst_mesh_id, + dst_device_id, + dst_noc_addr, // destination write address + atomic_inc, + wrap_boundary); + + // make sure fabric node gatekeeper is available. + fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); + + fabric_setup_pull_request( + src_addr, // source address in sender’s memory + packet_size_bytes // number of bytes to write to remote destination + ); + + fabric_send_pull_request(router_noc_xy, dst_mesh_id, dst_device_id); + fabric_wait_for_pull_request_flushed(); +} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp new file mode 100644 index 00000000000..6588b336ac2 --- /dev/null +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" + +void kernel_main() { + uint32_t rt_args_idx = 0; + uint32_t address = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t size = get_arg_val(increment_arg_idx(rt_args_idx)); + + volatile tt_l1_ptr uint32_t* ptr = + reinterpret_cast(address + size - sizeof(uint32_t)); + while (*ptr == 0); +} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp new file mode 100644 index 00000000000..84d6dea5e5c --- /dev/null +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp @@ -0,0 +1,765 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
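// Annotation (sketch, names as in the atomic-increment sender above): the atomic-inc path differs
// from the write senders in one respect, the packet is header-only, so nothing beyond the 48-byte
// header crosses the fabric; the increment value and wrap boundary ride inside the header itself:

    uint32_t packet_size_bytes = PACKET_HEADER_SIZE_BYTES;   // no payload, just the header
    fabric_atomic_inc_add_header(
        src_addr,                 // header is still built in local L1
        dst_mesh_id, dst_device_id,
        dst_noc_addr,             // remote 32-bit counter to bump
        atomic_inc,               // increment value (5 in the test below)
        wrap_boundary);           // counter wraps at this value (31 in the test below)
    // ...followed by the usual fabric_setup_pull_request / fabric_send_pull_request / flush sequence.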
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "fabric_fixture.hpp" +#include "llrt.hpp" + +namespace tt::tt_fabric { + +TEST_F(FabricFixture, TestShell) { std::cout << " Test started " << std::endl; } +/* +TEST_F(FabricFixture, TestAsyncWriteRoutingPlane) { + CoreCoord sender_logical_core = {0, 0}; + CoreCoord receiver_logical_core = {1, 0}; + std::pair start_mesh_chip_id; + chip_id_t physical_start_device_id; + std::pair end_mesh_chip_id; + chip_id_t physical_end_device_id; + bool connection_found = false; + for (const auto &[id, device] : devices_map_) { + start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); + auto neighbors = control_plane_->get_intra_chip_neighbors( + start_mesh_chip_id.first, start_mesh_chip_id.second, RoutingDirection::E); + if (neighbors.size() > 0) { + physical_start_device_id = device->id(); + end_mesh_chip_id = {start_mesh_chip_id.first, neighbors[0]}; + physical_end_device_id = control_plane_->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); + connection_found = true; + break; + } + } + if (!connection_found) { + GTEST_SKIP() << "No path found between sender and receivers"; + } + auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); + auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); + CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); + CoreCoord receiver_virtual_core = receiver_device->worker_core_from_logical_core(receiver_logical_core); + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + uint32_t worker_unreserved_base_addr = + hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t client_interface_addr = worker_unreserved_base_addr; + uint32_t packet_header_addr = tt::round_up( + client_interface_addr + sizeof(fabric_client_interface_t) + 4 * sizeof(fabric_router_l1_config_t), + l1_alignment); + uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; + uint32_t buffer_data_size = tt::constants::TILE_HW * sizeof(uint32_t); + std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); + tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); + + std::iota(buffer_data.begin(), buffer_data.end(), 0); + tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); + + tt::Cluster::instance().l1_barrier(physical_end_device_id); + tt::Cluster::instance().l1_barrier(physical_start_device_id); + + auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + + auto sender_program = tt_metal::CreateProgram(); + auto sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_routing_plane_sender.cpp", + {sender_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [sender_gk_noc_offset, sender_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + + uint32_t routing_plane = 0; + std::vector sender_runtime_args = { + client_interface_addr, + sender_gk_interface_addr, + sender_gk_noc_offset, + packet_header_addr, + receiver_noc_encoding, + buffer_data_addr, + buffer_data_size, + end_mesh_chip_id.first, + end_mesh_chip_id.second, + routing_plane}; + 
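// Note on the argument vector above (annotation, not part of the test): runtime args are untyped
// uint32_t slots, so this vector must mirror, element for element, the get_arg_val() sequence in
// fabric_async_write_routing_plane_sender.cpp; a mismatch shows up as a hang or a corrupted packet
// header rather than a build error. Mapping as used here:
//
//   kernel reads              <- host writes (above)
//   client_interface_addr     <- client_interface_addr
//   gk_interface_addr_l       <- sender_gk_interface_addr
//   gk_interface_addr_h       <- sender_gk_noc_offset
//   src_addr                  <- packet_header_addr
//   dst_noc_offset            <- receiver_noc_encoding
//   dst_addr                  <- buffer_data_addr
//   num_bytes                 <- buffer_data_size
//   dst_mesh_id / device_id   <- end_mesh_chip_id.first / .second
//   routing_plane             <- routing_plane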
tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + + auto receiver_program = tt_metal::CreateProgram(); + auto receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", + {receiver_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); + std::vector receiver_runtime_args = { + buffer_data_addr, + buffer_data_size, + }; + tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); + + tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); + tt_metal::detail::LaunchProgram(sender_device, sender_program, false); + tt_metal::detail::WaitProgramDone(sender_device, sender_program); + tt_metal::detail::WaitProgramDone(receiver_device, receiver_program); + + std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( + physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); + EXPECT_EQ(buffer_data, received_buffer_data); +} + +TEST_F(FabricFixture, TestAsyncWrite) { + CoreCoord sender_logical_core = {0, 0}; + CoreCoord receiver_logical_core = {1, 0}; + std::pair start_mesh_chip_id; + chip_id_t physical_start_device_id; + std::pair end_mesh_chip_id; + chip_id_t physical_end_device_id; + bool connection_found = false; + for (const auto &[id, device] : devices_map_) { + start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); + auto neighbors = control_plane_->get_intra_chip_neighbors( + start_mesh_chip_id.first, start_mesh_chip_id.second, RoutingDirection::E); + if (neighbors.size() > 0) { + physical_start_device_id = device->id(); + end_mesh_chip_id = {start_mesh_chip_id.first, neighbors[0]}; + physical_end_device_id = control_plane_->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); + connection_found = true; + break; + } + } + auto routers = control_plane_->get_routers_to_chip( + start_mesh_chip_id.first, start_mesh_chip_id.second, end_mesh_chip_id.first, end_mesh_chip_id.second); + + if (routers.empty()) { + GTEST_SKIP() << "No path found between sender and receivers"; + } + auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); + auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); + CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); + CoreCoord receiver_virtual_core = receiver_device->worker_core_from_logical_core(receiver_logical_core); + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + uint32_t worker_unreserved_base_addr = + hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t client_interface_addr = worker_unreserved_base_addr; + uint32_t packet_header_addr = tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t), l1_alignment); + uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; + uint32_t buffer_data_size = tt::constants::TILE_HW * sizeof(uint32_t); + std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); + tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); + + std::iota(buffer_data.begin(), buffer_data.end(), 0); + 
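// Annotation (sketch): completion on the receiver side is detected by data content, not by a flag.
// fabric_receiver.cpp spin-waits on the last 32-bit word of the destination region, which is sound
// here only because the test zero-fills the receiver buffer first, the iota payload ends in a
// non-zero value (TILE_HW - 1), and the test relies on that last word being written last:

    volatile tt_l1_ptr uint32_t* last_word =
        reinterpret_cast<volatile tt_l1_ptr uint32_t*>(buffer_data_addr + buffer_data_size - sizeof(uint32_t));
    while (*last_word == 0) { }   // 0 is the pre-test fill value; the iota payload never ends in 0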
tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); + + tt::Cluster::instance().l1_barrier(physical_end_device_id); + tt::Cluster::instance().l1_barrier(physical_start_device_id); + + auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + + auto sender_program = tt_metal::CreateProgram(); + auto sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp", + {sender_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [sender_gk_noc_offset, sender_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + + auto& sender_virtual_router_coord = routers[0].second; + auto sender_router_noc_xy = + tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y); + std::vector sender_runtime_args = { + client_interface_addr, + sender_gk_interface_addr, + sender_gk_noc_offset, + packet_header_addr, + receiver_noc_encoding, + buffer_data_addr, + buffer_data_size, + end_mesh_chip_id.first, + end_mesh_chip_id.second, + sender_router_noc_xy}; + tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + + auto receiver_program = tt_metal::CreateProgram(); + auto receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", + {receiver_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); + std::vector receiver_runtime_args = { + buffer_data_addr, + buffer_data_size, + }; + tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); + + tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); + tt_metal::detail::LaunchProgram(sender_device, sender_program, false); + tt_metal::detail::WaitProgramDone(sender_device, sender_program); + tt_metal::detail::WaitProgramDone(receiver_device, receiver_program); + + std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( + physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); + EXPECT_EQ(buffer_data, received_buffer_data); +} + +TEST_F(FabricFixture, TestAtomicInc) { + CoreCoord sender_logical_core = {0, 0}; + CoreCoord receiver_logical_core = {1, 0}; + std::pair start_mesh_chip_id; + chip_id_t physical_start_device_id; + std::pair end_mesh_chip_id; + chip_id_t physical_end_device_id; + bool connection_found = false; + for (const auto &[id, device] : devices_map_) { + start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); + auto neighbors = control_plane_->get_intra_chip_neighbors( + start_mesh_chip_id.first, start_mesh_chip_id.second, RoutingDirection::E); + if (neighbors.size() > 0) { + physical_start_device_id = device->id(); + end_mesh_chip_id = {start_mesh_chip_id.first, neighbors[0]}; + physical_end_device_id = control_plane_->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); + connection_found = true; + break; + } + } + auto routers = control_plane_->get_routers_to_chip( + start_mesh_chip_id.first, 
start_mesh_chip_id.second, end_mesh_chip_id.first, end_mesh_chip_id.second); + + if (routers.empty()) { + GTEST_SKIP() << "No path found between sender and receivers"; + } + auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); + auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); + CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); + CoreCoord receiver_virtual_core = receiver_device->worker_core_from_logical_core(receiver_logical_core); + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + uint32_t worker_unreserved_base_addr = + hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t client_interface_addr = worker_unreserved_base_addr; + uint32_t packet_header_addr = tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t), l1_alignment); + uint32_t atomic_inc_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; + uint32_t atomic_inc_size = sizeof(uint32_t); + std::vector atomic_inc_data(atomic_inc_size / sizeof(uint32_t), 0); + tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, atomic_inc_data, atomic_inc_addr); + + uint32_t atomic_inc = 5; + uint32_t wrap_boundary = 31; + tt::Cluster::instance().l1_barrier(physical_end_device_id); + tt::Cluster::instance().l1_barrier(physical_start_device_id); + + auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + + auto sender_program = tt_metal::CreateProgram(); + auto sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_atomic_inc_sender.cpp", + {sender_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [sender_gk_noc_offset, sender_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + + auto& sender_virtual_router_coord = routers[0].second; + auto sender_router_noc_xy = + tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y); + std::vector sender_runtime_args = { + client_interface_addr, + sender_gk_interface_addr, + sender_gk_noc_offset, + packet_header_addr, + receiver_noc_encoding, + atomic_inc_addr, + atomic_inc, + wrap_boundary, + end_mesh_chip_id.first, + end_mesh_chip_id.second, + sender_router_noc_xy}; + tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + + auto receiver_program = tt_metal::CreateProgram(); + auto receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", + {receiver_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); + std::vector receiver_runtime_args = { + atomic_inc_addr, + sizeof(uint32_t), + }; + tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); + + tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); + tt_metal::detail::LaunchProgram(sender_device, sender_program, false); + tt_metal::detail::WaitProgramDone(sender_device, sender_program); + 
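// Annotation (sketch of the host-side pattern repeated by every test in this file): the receiver
// program is launched first with the blocking flag false, so its polling kernel is already spinning
// before any packet can arrive; the sender follows, then both are waited on before the destination
// is read back and compared. Boolean argument taken to mean "block until done", as used above:

    tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false);  // non-blocking
    tt_metal::detail::LaunchProgram(sender_device, sender_program, false);
    tt_metal::detail::WaitProgramDone(sender_device, sender_program);
    tt_metal::detail::WaitProgramDone(receiver_device, receiver_program);
    auto readback = tt::llrt::read_hex_vec_from_core(
        physical_end_device_id, receiver_virtual_core, dst_addr, dst_size);     // then EXPECT_EQ against expected data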
tt_metal::detail::WaitProgramDone(receiver_device, receiver_program); + + std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( + physical_end_device_id, receiver_virtual_core, atomic_inc_addr, atomic_inc_size); + EXPECT_EQ(atomic_inc, received_buffer_data[0]); +} + +TEST_F(FabricFixture, TestAyncWriteAtomicInc) { + CoreCoord sender_logical_core = {0, 0}; + CoreCoord receiver_logical_core = {1, 0}; + std::pair start_mesh_chip_id; + chip_id_t physical_start_device_id; + std::pair end_mesh_chip_id; + chip_id_t physical_end_device_id; + bool connection_found = false; + for (const auto &[id, device] : devices_map_) { + start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); + auto neighbors = control_plane_->get_intra_chip_neighbors( + start_mesh_chip_id.first, start_mesh_chip_id.second, RoutingDirection::E); + if (neighbors.size() > 0) { + physical_start_device_id = device->id(); + end_mesh_chip_id = {start_mesh_chip_id.first, neighbors[0]}; + physical_end_device_id = control_plane_->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); + connection_found = true; + break; + } + } + auto routers = control_plane_->get_routers_to_chip( + start_mesh_chip_id.first, start_mesh_chip_id.second, end_mesh_chip_id.first, end_mesh_chip_id.second); + + if (routers.empty()) { + GTEST_SKIP() << "No path found between sender and receivers"; + } + auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); + auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); + CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); + CoreCoord receiver_virtual_core = receiver_device->worker_core_from_logical_core(receiver_logical_core); + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + uint32_t worker_unreserved_base_addr = + hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t client_interface_addr = worker_unreserved_base_addr; + uint32_t packet_header_addr = tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t), l1_alignment); + uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; + uint32_t buffer_data_size = constants::TILE_HW; + uint32_t atomic_inc_addr = tt::round_up(buffer_data_addr + buffer_data_size, l1_alignment); + uint32_t atomic_inc_size = sizeof(uint32_t); + uint32_t atomic_inc = 5; + std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); + tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); + std::vector atomic_inc_data(atomic_inc_size / sizeof(uint32_t), 0); + tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, atomic_inc_data, atomic_inc_addr); + + uint32_t wrap_boundary = 31; + std::iota(buffer_data.begin(), buffer_data.end(), 0); + tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); + + tt::Cluster::instance().l1_barrier(physical_end_device_id); + tt::Cluster::instance().l1_barrier(physical_start_device_id); + + auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + + auto sender_program = tt_metal::CreateProgram(); + auto sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_atomic_inc_sender.cpp", + {sender_logical_core}, + tt_metal::DataMovementConfig{ + 
.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [sender_gk_noc_offset, sender_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + + auto& sender_virtual_router_coord = routers[0].second; + auto sender_router_noc_xy = + tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y); + std::vector sender_runtime_args = { + client_interface_addr, + sender_gk_interface_addr, + sender_gk_noc_offset, + packet_header_addr, + receiver_noc_encoding, + buffer_data_addr, + atomic_inc_addr, + buffer_data_size, + atomic_inc, + end_mesh_chip_id.first, + end_mesh_chip_id.second, + sender_router_noc_xy}; + tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + + auto receiver_program = tt_metal::CreateProgram(); + auto receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", + {receiver_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); + std::vector receiver_runtime_args = { + atomic_inc_addr, + sizeof(uint32_t), + }; + tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); + + tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); + tt_metal::detail::LaunchProgram(sender_device, sender_program, false); + tt_metal::detail::WaitProgramDone(sender_device, sender_program); + tt_metal::detail::WaitProgramDone(receiver_device, receiver_program); + + std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( + physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); + EXPECT_EQ(buffer_data, received_buffer_data); + received_buffer_data.clear(); + received_buffer_data = tt::llrt::read_hex_vec_from_core( + physical_end_device_id, receiver_virtual_core, atomic_inc_addr, atomic_inc_size); + EXPECT_EQ(atomic_inc, received_buffer_data[0]); +} + +TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { + CoreCoord sender_logical_core = {0, 0}; + CoreCoord receiver_logical_core = {1, 0}; + std::pair start_mesh_chip_id; + chip_id_t physical_start_device_id; + std::unordered_map>> end_mesh_chip_ids_by_dir; + std::unordered_map> physical_end_device_ids_by_dir; + uint32_t num_dirs = 2; + std::unordered_map mcast_hops; + mcast_hops[RoutingDirection::E] = 2; + for (const auto &[id, device] : devices_map_) { + start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); + std::unordered_map>> + temp_end_mesh_chip_ids_by_dir; + std::unordered_map> temp_physical_end_device_ids_by_dir; + bool connection_found = true; + for (auto [routing_direction, num_hops] : mcast_hops) { + bool direction_found = true; + auto& temp_end_mesh_chip_ids = temp_end_mesh_chip_ids_by_dir[routing_direction]; + auto& temp_physical_end_device_ids = temp_physical_end_device_ids_by_dir[routing_direction]; + uint32_t curr_mesh_id = start_mesh_chip_id.first; + uint32_t curr_chip_id = start_mesh_chip_id.second; + for (uint32_t i = 0; i < num_hops; i++) { + auto neighbors = + control_plane_->get_intra_chip_neighbors(curr_mesh_id, curr_chip_id, routing_direction); + if (neighbors.size() > 0) { + temp_end_mesh_chip_ids.emplace_back(curr_mesh_id, 
neighbors[0]); + temp_physical_end_device_ids.push_back( + control_plane_->get_physical_chip_id_from_mesh_chip_id(temp_end_mesh_chip_ids.back())); + curr_mesh_id = temp_end_mesh_chip_ids.back().first; + curr_chip_id = temp_end_mesh_chip_ids.back().second; + } else { + direction_found = false; + break; + } + } + if (!direction_found) { + connection_found = false; + break; + } + } + if (connection_found) { + physical_start_device_id = device->id(); + end_mesh_chip_ids_by_dir = std::move(temp_end_mesh_chip_ids_by_dir); + physical_end_device_ids_by_dir = std::move(temp_physical_end_device_ids_by_dir); + break; + } + } + if (end_mesh_chip_ids_by_dir.empty()) { + GTEST_SKIP() << "No path found between sender and receivers"; + } + + auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); + CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); + + CoreCoord receiver_virtual_core = sender_device->worker_core_from_logical_core(receiver_logical_core); + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + uint32_t worker_unreserved_base_addr = + hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t client_interface_addr = worker_unreserved_base_addr; + uint32_t packet_header_addr = + tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t) * num_dirs, l1_alignment); + uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; + uint32_t buffer_data_size = tt::constants::TILE_HW * sizeof(uint32_t); + std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); + std::vector receiver_programs; + for (auto& [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { + for (auto physical_end_device_id : physical_end_device_ids) { + auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); + tt::llrt::write_hex_vec_to_core( + physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); + tt::Cluster::instance().l1_barrier(physical_end_device_id); + auto receiver_program = tt_metal::CreateProgram(); + auto receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", + {receiver_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); + std::vector receiver_runtime_args = { + buffer_data_addr, + buffer_data_size, + }; + tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); + + tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); + receiver_programs.push_back(std::move(receiver_program)); + } + } + + std::iota(buffer_data.begin(), buffer_data.end(), 0); + tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); + + tt::Cluster::instance().l1_barrier(physical_start_device_id); + + auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + + auto sender_program = tt_metal::CreateProgram(); + auto sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_sender.cpp", + {sender_logical_core}, + tt_metal::DataMovementConfig{ + 
.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [sender_gk_noc_offset, sender_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + + std::unordered_map sender_router_noc_xys; + for (auto& [routing_direction, end_mesh_chip_ids] : end_mesh_chip_ids_by_dir) { + auto routers = control_plane_->get_routers_to_chip( + start_mesh_chip_id.first, + start_mesh_chip_id.second, + end_mesh_chip_ids[0].first, + end_mesh_chip_ids[0].second); + auto& sender_virtual_router_coord = routers[0].second; + sender_router_noc_xys.try_emplace( + routing_direction, + tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y)); + } + std::vector sender_runtime_args = { + client_interface_addr, + sender_gk_interface_addr, + sender_gk_noc_offset, + packet_header_addr, + receiver_noc_encoding, + buffer_data_addr, + buffer_data_size, + end_mesh_chip_ids_by_dir[RoutingDirection::E][0].first, + end_mesh_chip_ids_by_dir[RoutingDirection::E][0].second, + mcast_hops[RoutingDirection::E], + sender_router_noc_xys[RoutingDirection::E]}; + tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + + tt_metal::detail::LaunchProgram(sender_device, sender_program, false); + tt_metal::detail::WaitProgramDone(sender_device, sender_program); + for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { + for (uint32_t i = 0; i < physical_end_device_ids.size(); i++) { + auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_ids[i]); + tt_metal::detail::WaitProgramDone(receiver_device, receiver_programs[i]); + } + } + + for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { + for (auto physical_end_device_id : physical_end_device_ids) { + std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( + physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); + EXPECT_EQ(buffer_data, received_buffer_data); + } + } +} + +TEST_F(FabricFixture, TestAsyncWriteMulticast) { + CoreCoord sender_logical_core = {0, 0}; + CoreCoord receiver_logical_core = {1, 0}; + std::pair start_mesh_chip_id; + chip_id_t physical_start_device_id; + std::unordered_map>> end_mesh_chip_ids_by_dir; + std::unordered_map> physical_end_device_ids_by_dir; + uint32_t num_dirs = 2; + std::unordered_map mcast_hops; + mcast_hops[RoutingDirection::E] = 2; + mcast_hops[RoutingDirection::W] = 1; + // mcast_hops[RoutingDirection::N] = 1; + // mcast_hops[RoutingDirection::S] = 0; + for (const auto &[id, device] : devices_map_) { + start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); + std::unordered_map>> + temp_end_mesh_chip_ids_by_dir; + std::unordered_map> temp_physical_end_device_ids_by_dir; + bool connection_found = true; + for (auto [routing_direction, num_hops] : mcast_hops) { + bool direction_found = true; + auto& temp_end_mesh_chip_ids = temp_end_mesh_chip_ids_by_dir[routing_direction]; + auto& temp_physical_end_device_ids = temp_physical_end_device_ids_by_dir[routing_direction]; + uint32_t curr_mesh_id = start_mesh_chip_id.first; + uint32_t curr_chip_id = start_mesh_chip_id.second; + for (uint32_t i = 0; i < num_hops; i++) { + auto neighbors = + control_plane_->get_intra_chip_neighbors(curr_mesh_id, curr_chip_id, routing_direction); + if (neighbors.size() > 0) { + temp_end_mesh_chip_ids.emplace_back(curr_mesh_id, neighbors[0]); + 
temp_physical_end_device_ids.push_back( + control_plane_->get_physical_chip_id_from_mesh_chip_id(temp_end_mesh_chip_ids.back())); + curr_mesh_id = temp_end_mesh_chip_ids.back().first; + curr_chip_id = temp_end_mesh_chip_ids.back().second; + } else { + direction_found = false; + break; + } + } + if (!direction_found) { + connection_found = false; + break; + } + } + if (connection_found) { + physical_start_device_id = device->id(); + end_mesh_chip_ids_by_dir = std::move(temp_end_mesh_chip_ids_by_dir); + physical_end_device_ids_by_dir = std::move(temp_physical_end_device_ids_by_dir); + break; + } + } + if (end_mesh_chip_ids_by_dir.empty()) { + GTEST_SKIP() << "No path found between sender and receivers"; + } + + auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); + CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); + + CoreCoord receiver_virtual_core = sender_device->worker_core_from_logical_core(receiver_logical_core); + + uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + + uint32_t worker_unreserved_base_addr = + hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t client_interface_addr = worker_unreserved_base_addr; + uint32_t packet_header_addr = + tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t) * num_dirs, l1_alignment); + uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; + uint32_t buffer_data_size = tt::constants::TILE_HW * sizeof(uint32_t); + std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); + std::vector receiver_programs; + for (auto& [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { + for (auto physical_end_device_id : physical_end_device_ids) { + auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); + tt::llrt::write_hex_vec_to_core( + physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); + tt::Cluster::instance().l1_barrier(physical_end_device_id); + auto receiver_program = tt_metal::CreateProgram(); + auto receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", + {receiver_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); + std::vector receiver_runtime_args = { + buffer_data_addr, + buffer_data_size, + }; + tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); + + tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); + receiver_programs.push_back(std::move(receiver_program)); + } + } + + std::iota(buffer_data.begin(), buffer_data.end(), 0); + tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); + + tt::Cluster::instance().l1_barrier(physical_start_device_id); + + auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + + auto sender_program = tt_metal::CreateProgram(); + auto sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/" + "fabric_async_write_multicast_multidirectional_sender.cpp", + {sender_logical_core}, + 
tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto [sender_gk_noc_offset, sender_gk_interface_addr] = + this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + + std::unordered_map sender_router_noc_xys; + for (auto& [routing_direction, end_mesh_chip_ids] : end_mesh_chip_ids_by_dir) { + auto routers = control_plane_->get_routers_to_chip( + start_mesh_chip_id.first, + start_mesh_chip_id.second, + end_mesh_chip_ids[0].first, + end_mesh_chip_ids[0].second); + auto& sender_virtual_router_coord = routers[0].second; + sender_router_noc_xys.try_emplace( + routing_direction, + tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y)); + } + std::vector sender_runtime_args = { + client_interface_addr, + sender_gk_interface_addr, + sender_gk_noc_offset, + packet_header_addr, + receiver_noc_encoding, + buffer_data_addr, + buffer_data_size, + end_mesh_chip_ids_by_dir[RoutingDirection::E][0].first, + end_mesh_chip_ids_by_dir[RoutingDirection::E][0].second, + mcast_hops[RoutingDirection::E], + sender_router_noc_xys[RoutingDirection::E], + end_mesh_chip_ids_by_dir[RoutingDirection::W][0].first, + end_mesh_chip_ids_by_dir[RoutingDirection::W][0].second, + mcast_hops[RoutingDirection::W], + sender_router_noc_xys[RoutingDirection::W] + // end_mesh_chip_ids_by_dir[RoutingDirection::N][0].first, + // end_mesh_chip_ids_by_dir[RoutingDirection::N][0].second, + // mcast_hops[RoutingDirection::N], + // sender_router_noc_xys[RoutingDirection::N], + // end_mesh_chip_ids_by_dir[RoutingDirection::S][0].first, + // end_mesh_chip_ids_by_dir[RoutingDirection::S][0].second, + // mcast_hops[RoutingDirection::S], + // sender_router_noc_xys[RoutingDirection::S] + }; + tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + + tt_metal::detail::LaunchProgram(sender_device, sender_program, false); + tt_metal::detail::WaitProgramDone(sender_device, sender_program); + for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { + for (uint32_t i = 0; i < physical_end_device_ids.size(); i++) { + auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_ids[i]); + tt_metal::detail::WaitProgramDone(receiver_device, receiver_programs[i]); + } + } + + for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { + for (auto physical_end_device_id : physical_end_device_ids) { + std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( + physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); + EXPECT_EQ(buffer_data, received_buffer_data); + } + } +}*/ + +} // namespace tt::tt_fabric diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index 83c9a5e0bfa..1b0f40eaee9 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -96,7 +96,6 @@ typedef struct test_board { std::unique_ptr cp_owning_ptr; uint32_t num_chips_to_use; std::string mesh_graph_descriptor; - tt::tt_metal::DispatchCoreType dispatch_core_type = tt::tt_metal::DispatchCoreType::WORKER; test_board(std::string& board_type_) { if ("n300" == board_type_) { @@ -141,8 +140,7 @@ typedef struct test_board { if (metal_fabric_init_level != 0) { 
tt::tt_metal::detail::InitializeFabricSetting(tt::tt_metal::detail::FabricSetting::FABRIC); } - device_handle_map = - tt::tt_metal::detail::CreateDevices(available_chip_ids, 1, 0, 0, DispatchCoreConfig{dispatch_core_type}); + device_handle_map = tt::tt_metal::detail::CreateDevices(available_chip_ids); if (metal_fabric_init_level == 0) { _init_control_plane(mesh_graph_descriptor); control_plane->configure_routing_tables(); From e12a94953f6b6764a98f18b5df1b638f522f5790 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Mon, 24 Feb 2025 17:03:06 +0000 Subject: [PATCH 278/316] #0: Move some constants from tt_fabric_interface.h to fabric_host_interface.h --- tt_metal/api/tt-metalium/fabric_host_interface.h | 5 +++++ tt_metal/fabric/hw/inc/tt_fabric_interface.h | 5 +---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tt_metal/api/tt-metalium/fabric_host_interface.h b/tt_metal/api/tt-metalium/fabric_host_interface.h index fac0ef01765..cdfa03b7caf 100644 --- a/tt_metal/api/tt-metalium/fabric_host_interface.h +++ b/tt_metal/api/tt-metalium/fabric_host_interface.h @@ -35,6 +35,11 @@ static_assert( (sizeof(std::uint32_t) / sizeof(chan_id_t)) == NUM_CHANNELS_PER_UINT32, "LOG_BASE_2_NUM_CHANNELS_PER_UINT32 must be equal to log2(sizeof(std::uint32_t) / sizeof(chan_id_t))"); +static constexpr std::uint32_t CLIENT_INTERFACE_SIZE = 3280; +static constexpr std::uint32_t PACKET_WORD_SIZE_BYTES = 16; +static constexpr std::uint32_t PACKET_HEADER_SIZE_BYTES = 48; +static constexpr std::uint32_t PACKET_HEADER_SIZE_WORDS = PACKET_HEADER_SIZE_BYTES / PACKET_WORD_SIZE_BYTES; + enum eth_chan_magic_values { INVALID_DIRECTION = 0xDD, INVALID_ROUTING_TABLE_ENTRY = 0xFF, diff --git a/tt_metal/fabric/hw/inc/tt_fabric_interface.h b/tt_metal/fabric/hw/inc/tt_fabric_interface.h index 11cf5ebbaea..be8cefaf34a 100644 --- a/tt_metal/fabric/hw/inc/tt_fabric_interface.h +++ b/tt_metal/fabric/hw/inc/tt_fabric_interface.h @@ -17,7 +17,6 @@ typedef struct _endpoint_sync { static_assert(sizeof(endpoint_sync_t) == 4); -constexpr uint32_t PACKET_WORD_SIZE_BYTES = 16; constexpr uint32_t NUM_WR_CMD_BUFS = 4; constexpr uint32_t DEFAULT_MAX_NOC_SEND_WORDS = (NOC_MAX_BURST_WORDS * NOC_WORD_BYTES) / PACKET_WORD_SIZE_BYTES; constexpr uint32_t DEFAULT_MAX_ETH_SEND_WORDS = 2 * 1024; @@ -129,9 +128,6 @@ typedef struct _packet_header { tt_routing routing; } packet_header_t; -constexpr uint32_t PACKET_HEADER_SIZE_BYTES = 48; -constexpr uint32_t PACKET_HEADER_SIZE_WORDS = PACKET_HEADER_SIZE_BYTES / PACKET_WORD_SIZE_BYTES; - static_assert(sizeof(packet_header_t) == PACKET_HEADER_SIZE_BYTES); static_assert(offsetof(packet_header_t, routing) % 4 == 0); @@ -344,6 +340,7 @@ typedef struct _fabric_client_interface { } fabric_client_interface_t; static_assert(sizeof(fabric_client_interface_t) % 16 == 0); +static_assert(sizeof(fabric_client_interface_t) == CLIENT_INTERFACE_SIZE); constexpr uint32_t FABRIC_ROUTER_MISC_START = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; constexpr uint32_t FABRIC_ROUTER_MISC_SIZE = 256; From b131586c165d53969caa333ff0ae8596d1cf55eb Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Mon, 24 Feb 2025 17:03:49 +0000 Subject: [PATCH 279/316] #0: Update fabric api unit tests --- .../tt_fabric/common/fabric_fixture.hpp | 23 +- ...rite_multicast_multidirectional_sender.cpp | 137 --- .../kernels/fabric_async_write_sender.cpp | 56 -- ...ic_pull_async_write_atomic_inc_sender.cpp} | 42 +- ...rite_multicast_multidirectional_sender.cpp | 83 ++ ...ric_pull_async_write_multicast_sender.cpp} | 42 +- ...cpp => 
fabric_pull_async_write_sender.cpp} | 35 +- ....cpp => fabric_pull_atomic_inc_sender.cpp} | 42 +- .../test_basic_fabric_apis.cpp | 821 +++++++++++------- .../routing/kernels/tt_fabric_tx_ubench.cpp | 4 +- tt_metal/fabric/hw/inc/tt_fabric_api.h | 15 +- 11 files changed, 645 insertions(+), 655 deletions(-) delete mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_multidirectional_sender.cpp delete mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp rename tests/tt_metal/tt_fabric/fabric_data_movement/kernels/{fabric_async_write_atomic_inc_sender.cpp => fabric_pull_async_write_atomic_inc_sender.cpp} (58%) create mode 100644 tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_multidirectional_sender.cpp rename tests/tt_metal/tt_fabric/fabric_data_movement/kernels/{fabric_async_write_multicast_sender.cpp => fabric_pull_async_write_multicast_sender.cpp} (53%) rename tests/tt_metal/tt_fabric/fabric_data_movement/kernels/{fabric_async_write_routing_plane_sender.cpp => fabric_pull_async_write_sender.cpp} (55%) rename tests/tt_metal/tt_fabric/fabric_data_movement/kernels/{fabric_atomic_inc_sender.cpp => fabric_pull_atomic_inc_sender.cpp} (51%) diff --git a/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp b/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp index b69e2aae769..864b05f6918 100644 --- a/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp +++ b/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp @@ -1,15 +1,14 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 #include "gtest/gtest.h" +#include #include #include #include "tt_metal/test_utils/env_vars.hpp" #include #include -#include -#include namespace tt::tt_fabric { namespace fabric_router_tests { @@ -36,15 +35,14 @@ class FabricFixture : public ::testing::Test { protected: tt::ARCH arch_; std::map devices_map_; - tt::tt_fabric::ControlPlane* control_plane_; + std::vector devices_; bool slow_dispatch_; void SetUp() override { - auto slow_dispatch_ = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch_) { + slow_dispatch_ = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (!slow_dispatch_) { tt::log_info( - tt::LogTest, - "Fabric test suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + tt::LogTest, "Fabric test suite can only be run with slow dispatch or TT_METAL_SLOW_DISPATCH_MODE set"); GTEST_SKIP(); } // Set up all available devices @@ -56,13 +54,12 @@ class FabricFixture : public ::testing::Test { } tt::tt_metal::detail::InitializeFabricSetting(tt::tt_metal::detail::FabricSetting::FABRIC); devices_map_ = tt::tt_metal::detail::CreateDevices(ids); - control_plane_ = tt::DevicePool::instance().get_control_plane(); + for (auto& [id, device] : devices_map_) { + devices_.push_back(device); + } } - void TearDown() override { - std::cout << " TEARDOWN" << std::endl; - tt::tt_metal::detail::CloseDevices(devices_map_); - } + void TearDown() override { tt::tt_metal::detail::CloseDevices(devices_map_); } }; } // namespace tt::tt_fabric diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_multidirectional_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_multidirectional_sender.cpp deleted file mode 100644 index 42a49426d7d..00000000000 --- 
a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_multidirectional_sender.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -// clang-format off -#include "dataflow_api.h" -#include "debug/dprint.h" -#include "tt_fabric/hw/inc/tt_fabric.h" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" -#include "tt_fabric/hw/inc/tt_fabric_api.h" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" - -// clang-format on - -using namespace tt::tt_fabric; - -volatile fabric_client_interface_t* client_interface; - -uint64_t xy_local_addr; - -void kernel_main() { - uint32_t rt_args_idx = 0; - // Fabric configuration specific arguments - uint32_t client_interface_addr = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); - - uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t dst_addr = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t num_bytes = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t e_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t e_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t e_depth = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t e_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t w_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t w_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t w_depth = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t w_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); - // uint32_t n_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); - // uint32_t n_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); - // uint32_t n_depth = get_arg_val(increment_arg_idx(rt_args_idx)); - // uint32_t n_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); - // uint32_t s_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); - // uint32_t s_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); - // uint32_t s_depth = get_arg_val(increment_arg_idx(rt_args_idx)); - // uint32_t s_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); - constexpr uint32_t num_dirs = 2; // 4 - - uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr); - uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; - fabric_async_write_multicast_add_header( - src_addr, // source address in sender’s memory - e_dst_mesh_id, - e_dst_device_id, - dst_noc_addr, // destination write address - packet_size_bytes, // number of bytes to write to remote destination - e_depth, - 0, - 0, - 0); - - // make sure fabric node gatekeeper is available. 
- fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); - for (uint32_t i = 1; i < num_dirs; i++) { - copy_l1_buf((uint32_t*)client_interface, (uint32_t*)(client_interface + i), sizeof(fabric_client_interface_t)); - } - - fabric_setup_pull_request( - src_addr, // source address in sender’s memory - packet_size_bytes // number of bytes to write to remote destination - ); - - fabric_send_pull_request(e_router_noc_xy, e_dst_mesh_id, e_dst_device_id); - fabric_wait_for_pull_request_bytes_flushed(PACKET_HEADER_SIZE_BYTES); - packet_header_t* packet_header = (packet_header_t*)(src_addr); - - // West Mcast - client_interface++; - - packet_header->routing.dst_mesh_id = w_dst_mesh_id; - packet_header->routing.dst_dev_id = w_dst_device_id; - packet_header->packet_parameters.mcast_parameters.east = 0; - packet_header->packet_parameters.mcast_parameters.west = w_depth; - // make sure fabric node gatekeeper is available. - fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); - - fabric_setup_pull_request( - src_addr, // source address in sender’s memory - packet_size_bytes // number of bytes to write to remote destination - ); - - fabric_send_pull_request(w_router_noc_xy, w_dst_mesh_id, w_dst_device_id); - // fabric_wait_for_pull_request_bytes_flushed(PACKET_HEADER_SIZE_BYTES); - - // // North Mcast - // client_interface++; - - // packet_header->routing.dst_mesh_id = n_dst_mesh_id; - // packet_header->routing.dst_dev_id = n_dst_device_id; - // packet_header->packet_parameters.mcast_parameters.west = 0; - // packet_header->packet_parameters.mcast_parameters.north = n_depth; - // // make sure fabric node gatekeeper is available. - // fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); - - // fabric_setup_pull_request( - // src_addr, // source address in sender’s memory - // packet_size_bytes // number of bytes to write to remote destination - // ); - - // fabric_send_pull_request(n_router_noc_xy, n_dst_mesh_id, n_dst_device_id); - // fabric_wait_for_pull_request_bytes_flushed(PACKET_HEADER_SIZE_BYTES); - - // // South Mcast - // client_interface++; - - // packet_header->routing.dst_mesh_id = s_dst_mesh_id; - // packet_header->routing.dst_dev_id = s_dst_device_id; - // packet_header->packet_parameters.mcast_parameters.north = 0; - // packet_header->packet_parameters.mcast_parameters.south = s_depth; - // // make sure fabric node gatekeeper is available. - // fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); - - // fabric_setup_pull_request( - // src_addr, // source address in sender’s memory - // packet_size_bytes // number of bytes to write to remote destination - // ); - - // fabric_send_pull_request(s_router_noc_xy, s_dst_mesh_id, s_dst_device_id); - - // Flush all pull requests - client_interface = (volatile fabric_client_interface_t*)client_interface_addr; - for (uint32_t i = 0; i < num_dirs; i++) { - fabric_wait_for_pull_request_flushed(); - client_interface++; - } -} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp deleted file mode 100644 index 195fb00331c..00000000000 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
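// Annotation (sketch): both the deleted multidirectional kernel above and its fabric_pull_*
// replacement added later in this patch reuse a single packet header for the second multicast
// direction. The header may only be rewritten once the router has pulled the first
// PACKET_HEADER_SIZE_BYTES, hence the partial flush before the routing fields are patched:

    fabric_wait_for_pull_request_bytes_flushed(PACKET_HEADER_SIZE_BYTES);   // router has consumed the header
    packet_header_t* packet_header = (packet_header_t*)(src_addr);
    packet_header->routing.dst_mesh_id = w_dst_mesh_id;                     // retarget west
    packet_header->routing.dst_dev_id  = w_dst_device_id;
    packet_header->packet_parameters.mcast_parameters.east = 0;             // clear east hops
    packet_header->packet_parameters.mcast_parameters.west = w_depth;       // set west hops
    // ...then issue the second pull request toward the west router and flush both client interfaces.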
-// -// SPDX-License-Identifier: Apache-2.0 - -// clang-format off -#include "dataflow_api.h" -#include "debug/dprint.h" -#include "tt_fabric/hw/inc/tt_fabric.h" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" -#include "tt_fabric/hw/inc/tt_fabric_api.h" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" - -// clang-format on - -using namespace tt::tt_fabric; - -volatile fabric_client_interface_t* client_interface; - -uint64_t xy_local_addr; - -void kernel_main() { - uint32_t rt_args_idx = 0; - // Fabric configuration specific arguments - uint32_t client_interface_addr = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); - - uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t dst_addr = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t num_bytes = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); - - uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr); - uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; - fabric_async_write_add_header( - src_addr, // source address in sender’s memory - dst_mesh_id, - dst_device_id, - dst_noc_addr, // destination write address - packet_size_bytes // number of bytes to write to remote destination - ); - - // make sure fabric node gatekeeper is available. - fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); - - fabric_setup_pull_request( - src_addr, // source address in sender’s memory - packet_size_bytes // number of bytes to write to remote destination - ); - - fabric_send_pull_request(router_noc_xy, dst_mesh_id, dst_device_id); - fabric_wait_for_pull_request_flushed(); -} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_atomic_inc_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_atomic_inc_sender.cpp similarity index 58% rename from tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_atomic_inc_sender.cpp rename to tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_atomic_inc_sender.cpp index c1d00e50a6d..131c9a2fff1 100644 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_atomic_inc_sender.cpp +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_atomic_inc_sender.cpp @@ -1,30 +1,17 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
-// clang-format off
 #include "dataflow_api.h"
-#include "debug/dprint.h"
-#include "tt_fabric/hw/inc/tt_fabric.h"
-#include "tt_fabric/hw/inc/tt_fabric_interface.h"
-#include "tt_fabric/hw/inc/tt_fabric_api.h"
+#include "tt_metal/fabric/hw/inc/tt_fabric_api.h"
+#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h"
 #include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp"
 
-// clang-format on
-
 using namespace tt::tt_fabric;
 
-volatile fabric_client_interface_t* client_interface;
-
-uint64_t xy_local_addr;
-
 void kernel_main() {
+    constexpr uint32_t client_interface_cb = get_compile_time_arg_val(0);
     uint32_t rt_args_idx = 0;
-    // Fabric configuration specific arguments
-    uint32_t client_interface_addr = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-    uint32_t gk_interface_addr_l = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-    uint32_t gk_interface_addr_h = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-
     uint32_t src_addr = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
     uint32_t dst_noc_offset = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
     uint32_t dst_write_addr = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
@@ -38,7 +25,15 @@ void kernel_main() {
     uint64_t dst_write_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_write_addr);
     uint64_t dst_atomic_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_atomic_addr);
     uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES;
-    fabric_async_write_atomic_inc_add_header(
+
+    uint32_t client_interface_addr = get_write_ptr(client_interface_cb);
+    volatile tt_l1_ptr fabric_client_interface_t* client_interface =
+        reinterpret_cast<volatile tt_l1_ptr fabric_client_interface_t*>(client_interface_addr);
+    fabric_endpoint_init(client_interface, 0 /* unused */);
+
+    fabric_async_write_atomic_inc(
+        client_interface,
+        router_noc_xy,
         src_addr,  // source address in sender’s memory
         dst_mesh_id,
         dst_device_id,
@@ -48,14 +43,5 @@ void kernel_main() {
         atomic_inc  // atomic increment value
     );
 
-    // make sure fabric node gatekeeper is available.
- fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); - - fabric_setup_pull_request( - src_addr, // source address in sender’s memory - packet_size_bytes // number of bytes to write to remote destination - ); - - fabric_send_pull_request(router_noc_xy, dst_mesh_id, dst_device_id); - fabric_wait_for_pull_request_flushed(); + fabric_wait_for_pull_request_flushed(client_interface); } diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_multidirectional_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_multidirectional_sender.cpp new file mode 100644 index 00000000000..b6dab8d940f --- /dev/null +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_multidirectional_sender.cpp @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_api.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" + +using namespace tt::tt_fabric; + +void kernel_main() { + constexpr uint32_t client_interface_cb = get_compile_time_arg_val(0); + uint32_t rt_args_idx = 0; + uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t dst_addr = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t num_bytes = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_depth = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t e_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t w_dst_mesh_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t w_dst_device_id = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t w_depth = get_arg_val(increment_arg_idx(rt_args_idx)); + uint32_t w_router_noc_xy = get_arg_val(increment_arg_idx(rt_args_idx)); + constexpr uint32_t num_dirs = 2; + + uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr); + uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; + + uint32_t client_interface_addr = get_write_ptr(client_interface_cb); + volatile tt_l1_ptr fabric_client_interface_t* client_interface = + reinterpret_cast(client_interface_addr); + for (uint32_t i = 0; i < num_dirs; i++) { + fabric_endpoint_init(client_interface + i, 0 /* unused */); + } + + fabric_async_write_multicast( + client_interface, + e_router_noc_xy, + src_addr, // source address in sender’s memory + e_dst_mesh_id, + e_dst_device_id, + dst_noc_addr, // destination write address + packet_size_bytes, // number of bytes to write to remote destination + e_depth, + 0, + 0, + 0); + + // Wait for packet header to be flushed since we will reuse it for the next mcast direction + fabric_wait_for_pull_request_bytes_flushed(client_interface, PACKET_HEADER_SIZE_BYTES); + packet_header_t* packet_header = (packet_header_t*)(src_addr); + + // West Mcast + client_interface++; + + packet_header->routing.dst_mesh_id = w_dst_mesh_id; + packet_header->routing.dst_dev_id = w_dst_device_id; + packet_header->packet_parameters.mcast_parameters.east = 0; + packet_header->packet_parameters.mcast_parameters.west = w_depth; + + fabric_async_write_multicast( + client_interface, + 
w_router_noc_xy, + src_addr, // source address in sender’s memory + w_dst_mesh_id, + w_dst_device_id, + dst_noc_addr, // destination write address + packet_size_bytes, // number of bytes to write to remote destination + 0, + w_depth, + 0, + 0); + + // Flush all pull requests + client_interface = reinterpret_cast(client_interface_addr); + for (uint32_t i = 0; i < num_dirs; i++) { + fabric_wait_for_pull_request_flushed(client_interface); + client_interface++; + } +} diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_sender.cpp similarity index 53% rename from tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_sender.cpp rename to tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_sender.cpp index 57ee4376fcd..09d0384fcc9 100644 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_sender.cpp +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_sender.cpp @@ -1,30 +1,17 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 -// clang-format off #include "dataflow_api.h" -#include "debug/dprint.h" -#include "tt_fabric/hw/inc/tt_fabric.h" -#include "tt_fabric/hw/inc/tt_fabric_interface.h" -#include "tt_fabric/hw/inc/tt_fabric_api.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_api.h" +#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp" -// clang-format on - using namespace tt::tt_fabric; -volatile fabric_client_interface_t* client_interface; - -uint64_t xy_local_addr; - void kernel_main() { + constexpr uint32_t client_interface_cb = get_compile_time_arg_val(0); uint32_t rt_args_idx = 0; - // Fabric configuration specific arguments - uint32_t client_interface_addr = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t gk_interface_addr_l = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t gk_interface_addr_h = get_arg_val(increment_arg_idx(rt_args_idx)); - uint32_t src_addr = get_arg_val(increment_arg_idx(rt_args_idx)); uint32_t dst_noc_offset = get_arg_val(increment_arg_idx(rt_args_idx)); uint32_t dst_addr = get_arg_val(increment_arg_idx(rt_args_idx)); @@ -36,7 +23,15 @@ void kernel_main() { uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr); uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; - fabric_async_write_multicast_add_header( + + uint32_t client_interface_addr = get_write_ptr(client_interface_cb); + volatile tt_l1_ptr fabric_client_interface_t* client_interface = + reinterpret_cast(client_interface_addr); + fabric_endpoint_init(client_interface, 0 /* unused */); + + fabric_async_write_multicast( + client_interface, + e_router_noc_xy, src_addr, // source address in sender’s memory e_dst_mesh_id, e_dst_device_id, @@ -47,14 +42,5 @@ void kernel_main() { 0, 0); - // make sure fabric node gatekeeper is available. 
-    fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h);
-
-    fabric_setup_pull_request(
-        src_addr,          // source address in sender’s memory
-        packet_size_bytes  // number of bytes to write to remote destination
-    );
-
-    fabric_send_pull_request(e_router_noc_xy, e_dst_mesh_id, e_dst_device_id);
-    fabric_wait_for_pull_request_flushed();
+    fabric_wait_for_pull_request_flushed(client_interface);
 }
diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_routing_plane_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_sender.cpp
similarity index 55%
rename from tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_routing_plane_sender.cpp
rename to tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_sender.cpp
index 4c18a71a06c..2815a1c207b 100644
--- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_routing_plane_sender.cpp
+++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_sender.cpp
@@ -1,51 +1,42 @@
-// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
-// clang-format off
 #include "dataflow_api.h"
-#include "debug/dprint.h"
-#include "tt_fabric/hw/inc/tt_fabric.h"
-#include "tt_fabric/hw/inc/tt_fabric_interface.h"
-#include "tt_fabric/hw/inc/tt_fabric_api.h"
+#include "tt_metal/fabric/hw/inc/tt_fabric_api.h"
+#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h"
 #include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp"
 
-// clang-format on
-
 using namespace tt::tt_fabric;
 
-volatile fabric_client_interface_t* client_interface;
-
-uint64_t xy_local_addr;
-
 void kernel_main() {
+    constexpr uint32_t client_interface_cb = get_compile_time_arg_val(0);
     uint32_t rt_args_idx = 0;
-    // Fabric configuration specific arguments
-    uint32_t client_interface_addr = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-    uint32_t gk_interface_addr_l = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-    uint32_t gk_interface_addr_h = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-
     uint32_t src_addr = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
     uint32_t dst_noc_offset = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
     uint32_t dst_addr = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
     uint32_t num_bytes = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
     uint32_t dst_mesh_id = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
     uint32_t dst_device_id = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-    uint32_t routing_plane = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
+    uint32_t router_noc_xy = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
 
     uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr);
     uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES;
-    // make sure fabric node gatekeeper is available.
-    fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h);
+    uint32_t client_interface_addr = get_write_ptr(client_interface_cb);
+    volatile tt_l1_ptr fabric_client_interface_t* client_interface =
+        reinterpret_cast<volatile tt_l1_ptr fabric_client_interface_t*>(client_interface_addr);
+    fabric_endpoint_init(client_interface, 0 /* unused */);
 
     fabric_async_write(
-        routing_plane,
+        client_interface,
+        router_noc_xy,
         src_addr,  // source address in sender’s memory
         dst_mesh_id,
         dst_device_id,
         dst_noc_addr,      // destination write address
         packet_size_bytes  // number of bytes to write to remote destination
     );
-    fabric_wait_for_pull_request_flushed();
+
+    fabric_wait_for_pull_request_flushed(client_interface);
 }
diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_atomic_inc_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_atomic_inc_sender.cpp
similarity index 51%
rename from tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_atomic_inc_sender.cpp
rename to tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_atomic_inc_sender.cpp
index 6fdd05f63aa..beba0160782 100644
--- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_atomic_inc_sender.cpp
+++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_atomic_inc_sender.cpp
@@ -1,30 +1,17 @@
-// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
 
-// clang-format off
 #include "dataflow_api.h"
-#include "debug/dprint.h"
-#include "tt_fabric/hw/inc/tt_fabric.h"
-#include "tt_fabric/hw/inc/tt_fabric_interface.h"
-#include "tt_fabric/hw/inc/tt_fabric_api.h"
+#include "tt_metal/fabric/hw/inc/tt_fabric_api.h"
+#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h"
 #include "tests/tt_metal/tt_metal/perf_microbenchmark/common/kernel_utils.hpp"
 
-// clang-format on
-
 using namespace tt::tt_fabric;
 
-volatile fabric_client_interface_t* client_interface;
-
-uint64_t xy_local_addr;
-
 void kernel_main() {
+    constexpr uint32_t client_interface_cb = get_compile_time_arg_val(0);
     uint32_t rt_args_idx = 0;
-    // Fabric configuration specific arguments
-    uint32_t client_interface_addr = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-    uint32_t gk_interface_addr_l = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-    uint32_t gk_interface_addr_h = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
-
     uint32_t src_addr = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
     uint32_t dst_noc_offset = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
     uint32_t dst_addr = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
@@ -36,7 +23,15 @@ void kernel_main() {
     uint64_t dst_noc_addr = get_noc_addr_helper(dst_noc_offset, dst_addr);
     uint32_t packet_size_bytes = PACKET_HEADER_SIZE_BYTES;
-    fabric_atomic_inc_add_header(
+
+    uint32_t client_interface_addr = get_write_ptr(client_interface_cb);
+    volatile tt_l1_ptr fabric_client_interface_t* client_interface =
+        reinterpret_cast<volatile tt_l1_ptr fabric_client_interface_t*>(client_interface_addr);
+    fabric_endpoint_init(client_interface, 0 /* unused */);
+
+    fabric_atomic_inc(
+        client_interface,
+        router_noc_xy,
         src_addr,  // source address in sender’s memory
         dst_mesh_id,
         dst_device_id,
@@ -44,14 +39,5 @@ void kernel_main() {
         atomic_inc,
         wrap_boundary);
 
-    // make sure fabric node gatekeeper is available.
- fabric_endpoint_init(client_interface_addr, gk_interface_addr_l, gk_interface_addr_h); - - fabric_setup_pull_request( - src_addr, // source address in sender’s memory - packet_size_bytes // number of bytes to write to remote destination - ); - - fabric_send_pull_request(router_noc_xy, dst_mesh_id, dst_device_id); - fabric_wait_for_pull_request_flushed(); + fabric_wait_for_pull_request_flushed(client_interface); } diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp index 84d6dea5e5c..7e1e1c6a03e 100644 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp @@ -1,31 +1,40 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 #include +#include +#include +#include + #include "fabric_fixture.hpp" -#include "llrt.hpp" +#include "tt_metal/llrt/tt_cluster.hpp" namespace tt::tt_fabric { -TEST_F(FabricFixture, TestShell) { std::cout << " Test started " << std::endl; } -/* -TEST_F(FabricFixture, TestAsyncWriteRoutingPlane) { +TEST_F(FabricFixture, TestAsyncWrite) { CoreCoord sender_logical_core = {0, 0}; + CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; + CoreRangeSet receiver_logical_crs = {receiver_logical_core}; std::pair start_mesh_chip_id; chip_id_t physical_start_device_id; std::pair end_mesh_chip_id; chip_id_t physical_end_device_id; + + auto control_plane = tt::DevicePool::instance().get_control_plane(); + + // Find a device with a neighbour in the East direction bool connection_found = false; - for (const auto &[id, device] : devices_map_) { - start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); - auto neighbors = control_plane_->get_intra_chip_neighbors( + for (auto* device : devices_) { + start_mesh_chip_id = control_plane->get_mesh_chip_id_from_physical_chip_id(device->id()); + // Get neighbours within a mesh in the East direction + auto neighbors = control_plane->get_intra_chip_neighbors( start_mesh_chip_id.first, start_mesh_chip_id.second, RoutingDirection::E); if (neighbors.size() > 0) { physical_start_device_id = device->id(); end_mesh_chip_id = {start_mesh_chip_id.first, neighbors[0]}; - physical_end_device_id = control_plane_->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); + physical_end_device_id = control_plane->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); connection_found = true; break; } @@ -33,160 +42,101 @@ TEST_F(FabricFixture, TestAsyncWriteRoutingPlane) { if (!connection_found) { GTEST_SKIP() << "No path found between sender and receivers"; } - auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); - auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); - CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); - CoreCoord receiver_virtual_core = receiver_device->worker_core_from_logical_core(receiver_logical_core); - - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - - uint32_t worker_unreserved_base_addr = - hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); - uint32_t client_interface_addr = worker_unreserved_base_addr; - uint32_t packet_header_addr = tt::round_up( - client_interface_addr + 
sizeof(fabric_client_interface_t) + 4 * sizeof(fabric_router_l1_config_t), - l1_alignment); - uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; - uint32_t buffer_data_size = tt::constants::TILE_HW * sizeof(uint32_t); - std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); - tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); - - std::iota(buffer_data.begin(), buffer_data.end(), 0); - tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); - - tt::Cluster::instance().l1_barrier(physical_end_device_id); - tt::Cluster::instance().l1_barrier(physical_start_device_id); - - auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); - - auto sender_program = tt_metal::CreateProgram(); - auto sender_kernel = tt_metal::CreateKernel( - sender_program, - "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_routing_plane_sender.cpp", - {sender_logical_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - auto [sender_gk_noc_offset, sender_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); - - uint32_t routing_plane = 0; - std::vector sender_runtime_args = { - client_interface_addr, - sender_gk_interface_addr, - sender_gk_noc_offset, - packet_header_addr, - receiver_noc_encoding, - buffer_data_addr, - buffer_data_size, - end_mesh_chip_id.first, - end_mesh_chip_id.second, - routing_plane}; - tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); - - auto receiver_program = tt_metal::CreateProgram(); - auto receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", - {receiver_logical_core}, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); - std::vector receiver_runtime_args = { - buffer_data_addr, - buffer_data_size, - }; - tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); - tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); - tt_metal::detail::LaunchProgram(sender_device, sender_program, false); - tt_metal::detail::WaitProgramDone(sender_device, sender_program); - tt_metal::detail::WaitProgramDone(receiver_device, receiver_program); - - std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( - physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); - EXPECT_EQ(buffer_data, received_buffer_data); -} - -TEST_F(FabricFixture, TestAsyncWrite) { - CoreCoord sender_logical_core = {0, 0}; - CoreCoord receiver_logical_core = {1, 0}; - std::pair start_mesh_chip_id; - chip_id_t physical_start_device_id; - std::pair end_mesh_chip_id; - chip_id_t physical_end_device_id; - bool connection_found = false; - for (const auto &[id, device] : devices_map_) { - start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); - auto neighbors = control_plane_->get_intra_chip_neighbors( - start_mesh_chip_id.first, start_mesh_chip_id.second, RoutingDirection::E); - if (neighbors.size() > 0) { - 
physical_start_device_id = device->id(); - end_mesh_chip_id = {start_mesh_chip_id.first, neighbors[0]}; - physical_end_device_id = control_plane_->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); - connection_found = true; - break; - } - } - auto routers = control_plane_->get_routers_to_chip( + // Get the optimal routers (no internal hops) on the start chip that will forward in the direction of the end chip + auto routers = control_plane->get_routers_to_chip( start_mesh_chip_id.first, start_mesh_chip_id.second, end_mesh_chip_id.first, end_mesh_chip_id.second); - if (routers.empty()) { - GTEST_SKIP() << "No path found between sender and receivers"; - } auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); CoreCoord receiver_virtual_core = receiver_device->worker_core_from_logical_core(receiver_logical_core); - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - - uint32_t worker_unreserved_base_addr = - hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); - uint32_t client_interface_addr = worker_unreserved_base_addr; - uint32_t packet_header_addr = tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t), l1_alignment); - uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; - uint32_t buffer_data_size = tt::constants::TILE_HW * sizeof(uint32_t); - std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); - tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); - - std::iota(buffer_data.begin(), buffer_data.end(), 0); - tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); - + uint32_t data_size = tt::constants::TILE_HW * sizeof(uint32_t); + + auto receiver_shard_parameters = + ShardSpecBuffer(receiver_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + ShardedBufferConfig receiver_shard_config = { + .device = receiver_device, + .size = data_size, + .page_size = data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = std::move(receiver_shard_parameters), + }; + auto receiver_buffer = CreateBuffer(receiver_shard_config); + // Reset buffer space for test validation + std::vector receiver_buffer_data(data_size / sizeof(uint32_t), 0); + tt::tt_metal::detail::WriteToBuffer(receiver_buffer, receiver_buffer_data); + + // Packet header needs to be inlined with the data being sent, so this test just allocates buffer space for both + // together on the sender + uint32_t sender_packet_header_and_data_size = tt::tt_fabric::PACKET_HEADER_SIZE_BYTES + data_size; + auto sender_shard_parameters = + ShardSpecBuffer(sender_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + ShardedBufferConfig sender_shard_config = { + .device = sender_device, + .size = sender_packet_header_and_data_size, + .page_size = sender_packet_header_and_data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = std::move(sender_shard_parameters), + }; + auto sender_buffer = CreateBuffer(sender_shard_config); + // Write the data to send to the buffer + std::vector sender_buffer_data(sender_packet_header_and_data_size / sizeof(uint32_t), 0); + 
std::iota(sender_buffer_data.begin() + PACKET_HEADER_SIZE_BYTES / sizeof(uint32_t), sender_buffer_data.end(), 0); + tt::tt_metal::detail::WriteToBuffer(sender_buffer, sender_buffer_data); + + // Extract the expected data to be read from the receiver + std::copy( + sender_buffer_data.begin() + tt::tt_fabric::PACKET_HEADER_SIZE_BYTES / sizeof(uint32_t), + sender_buffer_data.end(), + receiver_buffer_data.begin()); + + // Wait for buffer data to be written to device tt::Cluster::instance().l1_barrier(physical_end_device_id); tt::Cluster::instance().l1_barrier(physical_start_device_id); auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + // Create the sender program auto sender_program = tt_metal::CreateProgram(); + + // Allocate space for the client interface + uint32_t client_interface_cb_index = tt::CBIndex::c_0; + tt::tt_metal::CircularBufferConfig client_interface_cb_config = + tt::tt_metal::CircularBufferConfig( + tt::tt_fabric::CLIENT_INTERFACE_SIZE, {{client_interface_cb_index, DataFormat::UInt32}}) + .set_page_size(client_interface_cb_index, tt::tt_fabric::CLIENT_INTERFACE_SIZE); + auto client_interface_cb = + tt::tt_metal::CreateCircularBuffer(sender_program, sender_logical_core, client_interface_cb_config); + + std::vector sender_compile_time_args = {client_interface_cb_index}; auto sender_kernel = tt_metal::CreateKernel( sender_program, - "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_sender.cpp", - {sender_logical_core}, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_sender.cpp", + sender_logical_crs, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - auto [sender_gk_noc_offset, sender_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = NOC::RISCV_0_default, + .compile_args = sender_compile_time_args}); auto& sender_virtual_router_coord = routers[0].second; auto sender_router_noc_xy = tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y); + std::vector sender_runtime_args = { - client_interface_addr, - sender_gk_interface_addr, - sender_gk_noc_offset, - packet_header_addr, + sender_buffer->address(), receiver_noc_encoding, - buffer_data_addr, - buffer_data_size, + receiver_buffer->address(), + data_size, end_mesh_chip_id.first, end_mesh_chip_id.second, sender_router_noc_xy}; tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + // Create the receiver program for validation auto receiver_program = tt_metal::CreateProgram(); auto receiver_kernel = tt_metal::CreateKernel( receiver_program, @@ -195,94 +145,140 @@ TEST_F(FabricFixture, TestAsyncWrite) { tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); std::vector receiver_runtime_args = { - buffer_data_addr, - buffer_data_size, + receiver_buffer->address(), + data_size, }; tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); + // Launch sender and receiver programs and wait for them to finish tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); 
tt_metal::detail::LaunchProgram(sender_device, sender_program, false); tt_metal::detail::WaitProgramDone(sender_device, sender_program); tt_metal::detail::WaitProgramDone(receiver_device, receiver_program); - std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( - physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); - EXPECT_EQ(buffer_data, received_buffer_data); + // Validate the data received by the receiver + std::vector received_buffer_data; + tt::tt_metal::detail::ReadFromBuffer(receiver_buffer, received_buffer_data); + EXPECT_EQ(receiver_buffer_data, received_buffer_data); } TEST_F(FabricFixture, TestAtomicInc) { CoreCoord sender_logical_core = {0, 0}; + CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; + CoreRangeSet receiver_logical_crs = {receiver_logical_core}; std::pair start_mesh_chip_id; chip_id_t physical_start_device_id; std::pair end_mesh_chip_id; chip_id_t physical_end_device_id; + + auto control_plane = tt::DevicePool::instance().get_control_plane(); + + // Find a device with a neighbour in the East direction bool connection_found = false; - for (const auto &[id, device] : devices_map_) { - start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); - auto neighbors = control_plane_->get_intra_chip_neighbors( + for (auto* device : devices_) { + start_mesh_chip_id = control_plane->get_mesh_chip_id_from_physical_chip_id(device->id()); + // Get neighbours within a mesh in the East direction + auto neighbors = control_plane->get_intra_chip_neighbors( start_mesh_chip_id.first, start_mesh_chip_id.second, RoutingDirection::E); if (neighbors.size() > 0) { physical_start_device_id = device->id(); end_mesh_chip_id = {start_mesh_chip_id.first, neighbors[0]}; - physical_end_device_id = control_plane_->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); + physical_end_device_id = control_plane->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); connection_found = true; break; } } - auto routers = control_plane_->get_routers_to_chip( - start_mesh_chip_id.first, start_mesh_chip_id.second, end_mesh_chip_id.first, end_mesh_chip_id.second); - - if (routers.empty()) { + if (!connection_found) { GTEST_SKIP() << "No path found between sender and receivers"; } + + // Get the optimal routers (no internal hops) on the start chip that will forward in the direction of the end chip + auto routers = control_plane->get_routers_to_chip( + start_mesh_chip_id.first, start_mesh_chip_id.second, end_mesh_chip_id.first, end_mesh_chip_id.second); + auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); CoreCoord receiver_virtual_core = receiver_device->worker_core_from_logical_core(receiver_logical_core); - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - - uint32_t worker_unreserved_base_addr = - hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); - uint32_t client_interface_addr = worker_unreserved_base_addr; - uint32_t packet_header_addr = tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t), l1_alignment); - uint32_t atomic_inc_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; - uint32_t atomic_inc_size = sizeof(uint32_t); - std::vector atomic_inc_data(atomic_inc_size / 
sizeof(uint32_t), 0); - tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, atomic_inc_data, atomic_inc_addr); + uint32_t data_size = sizeof(uint32_t); + + auto receiver_shard_parameters = + ShardSpecBuffer(receiver_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + ShardedBufferConfig receiver_shard_config = { + .device = receiver_device, + .size = data_size, + .page_size = data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = std::move(receiver_shard_parameters), + }; + auto receiver_buffer = CreateBuffer(receiver_shard_config); + // Reset buffer space for test validation + std::vector receiver_buffer_data(data_size / sizeof(uint32_t), 0); + tt::tt_metal::detail::WriteToBuffer(receiver_buffer, receiver_buffer_data); + + // Packet header needs to be inlined with the data being sent, so this test just allocates buffer space for both + // together on the sender + uint32_t sender_packet_header_and_data_size = tt::tt_fabric::PACKET_HEADER_SIZE_BYTES; + auto sender_shard_parameters = + ShardSpecBuffer(sender_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + ShardedBufferConfig sender_shard_config = { + .device = sender_device, + .size = sender_packet_header_and_data_size, + .page_size = sender_packet_header_and_data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = std::move(sender_shard_parameters), + }; + auto sender_buffer = CreateBuffer(sender_shard_config); + // Write the data to send to the buffer + std::vector sender_buffer_data(sender_packet_header_and_data_size / sizeof(uint32_t), 0); + tt::tt_metal::detail::WriteToBuffer(sender_buffer, sender_buffer_data); uint32_t atomic_inc = 5; uint32_t wrap_boundary = 31; + + // Extract the expected data to be read from the receiver + receiver_buffer_data[0] = atomic_inc; + + // Wait for buffer data to be written to device tt::Cluster::instance().l1_barrier(physical_end_device_id); tt::Cluster::instance().l1_barrier(physical_start_device_id); auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + // Create the sender program auto sender_program = tt_metal::CreateProgram(); + + // Allocate space for the client interface + uint32_t client_interface_cb_index = tt::CBIndex::c_0; + tt::tt_metal::CircularBufferConfig client_interface_cb_config = + tt::tt_metal::CircularBufferConfig( + tt::tt_fabric::CLIENT_INTERFACE_SIZE, {{client_interface_cb_index, DataFormat::UInt32}}) + .set_page_size(client_interface_cb_index, tt::tt_fabric::CLIENT_INTERFACE_SIZE); + auto client_interface_cb = + tt::tt_metal::CreateCircularBuffer(sender_program, sender_logical_core, client_interface_cb_config); + + std::vector sender_compile_time_args = {client_interface_cb_index}; auto sender_kernel = tt_metal::CreateKernel( sender_program, - "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_atomic_inc_sender.cpp", - {sender_logical_core}, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_atomic_inc_sender.cpp", + sender_logical_crs, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - auto [sender_gk_noc_offset, sender_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = NOC::RISCV_0_default, + .compile_args = 
sender_compile_time_args}); auto& sender_virtual_router_coord = routers[0].second; auto sender_router_noc_xy = tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y); std::vector sender_runtime_args = { - client_interface_addr, - sender_gk_interface_addr, - sender_gk_noc_offset, - packet_header_addr, + sender_buffer->address(), receiver_noc_encoding, - atomic_inc_addr, + receiver_buffer->address(), atomic_inc, wrap_boundary, end_mesh_chip_id.first, @@ -290,6 +286,7 @@ TEST_F(FabricFixture, TestAtomicInc) { sender_router_noc_xy}; tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + // Create the receiver program for validation auto receiver_program = tt_metal::CreateProgram(); auto receiver_kernel = tt_metal::CreateKernel( receiver_program, @@ -298,109 +295,165 @@ TEST_F(FabricFixture, TestAtomicInc) { tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); std::vector receiver_runtime_args = { - atomic_inc_addr, - sizeof(uint32_t), + receiver_buffer->address(), + data_size, }; tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); + // Launch sender and receiver programs and wait for them to finish tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); tt_metal::detail::LaunchProgram(sender_device, sender_program, false); tt_metal::detail::WaitProgramDone(sender_device, sender_program); tt_metal::detail::WaitProgramDone(receiver_device, receiver_program); - std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( - physical_end_device_id, receiver_virtual_core, atomic_inc_addr, atomic_inc_size); - EXPECT_EQ(atomic_inc, received_buffer_data[0]); + // Validate the data received by the receiver + std::vector received_buffer_data; + tt::tt_metal::detail::ReadFromBuffer(receiver_buffer, received_buffer_data); + EXPECT_EQ(receiver_buffer_data, received_buffer_data); } -TEST_F(FabricFixture, TestAyncWriteAtomicInc) { +TEST_F(FabricFixture, TestAsyncWriteAtomicInc) { CoreCoord sender_logical_core = {0, 0}; + CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; + CoreRangeSet receiver_logical_crs = {receiver_logical_core}; std::pair start_mesh_chip_id; chip_id_t physical_start_device_id; std::pair end_mesh_chip_id; chip_id_t physical_end_device_id; + + auto control_plane = tt::DevicePool::instance().get_control_plane(); + + // Find a device with a neighbour in the East direction bool connection_found = false; - for (const auto &[id, device] : devices_map_) { - start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); - auto neighbors = control_plane_->get_intra_chip_neighbors( + for (auto* device : devices_) { + start_mesh_chip_id = control_plane->get_mesh_chip_id_from_physical_chip_id(device->id()); + // Get neighbours within a mesh in the East direction + auto neighbors = control_plane->get_intra_chip_neighbors( start_mesh_chip_id.first, start_mesh_chip_id.second, RoutingDirection::E); if (neighbors.size() > 0) { physical_start_device_id = device->id(); end_mesh_chip_id = {start_mesh_chip_id.first, neighbors[0]}; - physical_end_device_id = control_plane_->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); + physical_end_device_id = 
control_plane->get_physical_chip_id_from_mesh_chip_id(end_mesh_chip_id); connection_found = true; break; } } - auto routers = control_plane_->get_routers_to_chip( - start_mesh_chip_id.first, start_mesh_chip_id.second, end_mesh_chip_id.first, end_mesh_chip_id.second); - - if (routers.empty()) { + if (!connection_found) { GTEST_SKIP() << "No path found between sender and receivers"; } + + // Get the optimal routers (no internal hops) on the start chip that will forward in the direction of the end chip + auto routers = control_plane->get_routers_to_chip( + start_mesh_chip_id.first, start_mesh_chip_id.second, end_mesh_chip_id.first, end_mesh_chip_id.second); + auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); CoreCoord receiver_virtual_core = receiver_device->worker_core_from_logical_core(receiver_logical_core); - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); - - uint32_t worker_unreserved_base_addr = - hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); - uint32_t client_interface_addr = worker_unreserved_base_addr; - uint32_t packet_header_addr = tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t), l1_alignment); - uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; - uint32_t buffer_data_size = constants::TILE_HW; - uint32_t atomic_inc_addr = tt::round_up(buffer_data_addr + buffer_data_size, l1_alignment); + uint32_t data_size = tt::constants::TILE_HW * sizeof(uint32_t); uint32_t atomic_inc_size = sizeof(uint32_t); + + auto receiver_shard_parameters = + ShardSpecBuffer(receiver_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + ShardedBufferConfig receiver_shard_config = { + .device = receiver_device, + .size = data_size, + .page_size = data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = receiver_shard_parameters, + }; + auto receiver_buffer = CreateBuffer(receiver_shard_config); + ShardedBufferConfig receiver_atomic_shard_config = { + .device = receiver_device, + .size = atomic_inc_size, + .page_size = atomic_inc_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = receiver_shard_parameters, + }; + auto receiver_atomic_buffer = CreateBuffer(receiver_atomic_shard_config); + // Reset buffer space for test validation + std::vector receiver_buffer_data(atomic_inc_size / sizeof(uint32_t), 0); + tt::tt_metal::detail::WriteToBuffer(receiver_atomic_buffer, receiver_buffer_data); + receiver_buffer_data.resize(data_size / sizeof(uint32_t), 0); + tt::tt_metal::detail::WriteToBuffer(receiver_buffer, receiver_buffer_data); + + // Packet header needs to be inlined with the data being sent, so this test just allocates buffer space for both + // together on the sender + uint32_t sender_packet_header_and_data_size = tt::tt_fabric::PACKET_HEADER_SIZE_BYTES + data_size; + auto sender_shard_parameters = + ShardSpecBuffer(sender_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + ShardedBufferConfig sender_shard_config = { + .device = sender_device, + .size = sender_packet_header_and_data_size, + .page_size = sender_packet_header_and_data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + 
.shard_parameters = std::move(sender_shard_parameters), + }; + auto sender_buffer = CreateBuffer(sender_shard_config); + // Write the data to send to the buffer + std::vector sender_buffer_data(sender_packet_header_and_data_size / sizeof(uint32_t), 0); + std::iota(sender_buffer_data.begin() + PACKET_HEADER_SIZE_BYTES / sizeof(uint32_t), sender_buffer_data.end(), 0); + tt::tt_metal::detail::WriteToBuffer(sender_buffer, sender_buffer_data); + uint32_t atomic_inc = 5; - std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); - tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); - std::vector atomic_inc_data(atomic_inc_size / sizeof(uint32_t), 0); - tt::llrt::write_hex_vec_to_core(physical_end_device_id, receiver_virtual_core, atomic_inc_data, atomic_inc_addr); - uint32_t wrap_boundary = 31; - std::iota(buffer_data.begin(), buffer_data.end(), 0); - tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); + // Extract the expected data to be read from the receiver + std::copy( + sender_buffer_data.begin() + tt::tt_fabric::PACKET_HEADER_SIZE_BYTES / sizeof(uint32_t), + sender_buffer_data.end(), + receiver_buffer_data.begin()); + // Wait for buffer data to be written to device tt::Cluster::instance().l1_barrier(physical_end_device_id); tt::Cluster::instance().l1_barrier(physical_start_device_id); auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + // Create the sender program auto sender_program = tt_metal::CreateProgram(); + + // Allocate space for the client interface + uint32_t client_interface_cb_index = tt::CBIndex::c_0; + tt::tt_metal::CircularBufferConfig client_interface_cb_config = + tt::tt_metal::CircularBufferConfig( + tt::tt_fabric::CLIENT_INTERFACE_SIZE, {{client_interface_cb_index, DataFormat::UInt32}}) + .set_page_size(client_interface_cb_index, tt::tt_fabric::CLIENT_INTERFACE_SIZE); + auto client_interface_cb = + tt::tt_metal::CreateCircularBuffer(sender_program, sender_logical_core, client_interface_cb_config); + + std::vector sender_compile_time_args = {client_interface_cb_index}; auto sender_kernel = tt_metal::CreateKernel( sender_program, - "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_atomic_inc_sender.cpp", - {sender_logical_core}, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_atomic_inc_sender.cpp", + sender_logical_crs, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - auto [sender_gk_noc_offset, sender_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = NOC::RISCV_0_default, + .compile_args = sender_compile_time_args}); auto& sender_virtual_router_coord = routers[0].second; auto sender_router_noc_xy = tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y); + std::vector sender_runtime_args = { - client_interface_addr, - sender_gk_interface_addr, - sender_gk_noc_offset, - packet_header_addr, + sender_buffer->address(), receiver_noc_encoding, - buffer_data_addr, - atomic_inc_addr, - buffer_data_size, + receiver_buffer->address(), + receiver_atomic_buffer->address(), + data_size, atomic_inc, end_mesh_chip_id.first, end_mesh_chip_id.second, sender_router_noc_xy}; tt_metal::SetRuntimeArgs(sender_program, 
sender_kernel, sender_logical_core, sender_runtime_args); + // Create the receiver program for validation auto receiver_program = tt_metal::CreateProgram(); auto receiver_kernel = tt_metal::CreateKernel( receiver_program, @@ -409,44 +462,50 @@ TEST_F(FabricFixture, TestAyncWriteAtomicInc) { tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); std::vector receiver_runtime_args = { - atomic_inc_addr, - sizeof(uint32_t), + receiver_buffer->address(), + data_size, }; tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); + // Launch sender and receiver programs and wait for them to finish tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); tt_metal::detail::LaunchProgram(sender_device, sender_program, false); tt_metal::detail::WaitProgramDone(sender_device, sender_program); tt_metal::detail::WaitProgramDone(receiver_device, receiver_program); - std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( - physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); - EXPECT_EQ(buffer_data, received_buffer_data); + // Validate the data received by the receiver + std::vector received_buffer_data; + tt::tt_metal::detail::ReadFromBuffer(receiver_buffer, received_buffer_data); + EXPECT_EQ(receiver_buffer_data, received_buffer_data); received_buffer_data.clear(); - received_buffer_data = tt::llrt::read_hex_vec_from_core( - physical_end_device_id, receiver_virtual_core, atomic_inc_addr, atomic_inc_size); + tt::tt_metal::detail::ReadFromBuffer(receiver_atomic_buffer, received_buffer_data); EXPECT_EQ(atomic_inc, received_buffer_data[0]); } -TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { +TEST_F(FabricFixture, TestAsyncWriteMulticast) { CoreCoord sender_logical_core = {0, 0}; + CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; + CoreRangeSet receiver_logical_crs = {receiver_logical_core}; std::pair start_mesh_chip_id; chip_id_t physical_start_device_id; std::unordered_map>> end_mesh_chip_ids_by_dir; std::unordered_map> physical_end_device_ids_by_dir; - uint32_t num_dirs = 2; std::unordered_map mcast_hops; - mcast_hops[RoutingDirection::E] = 2; - for (const auto &[id, device] : devices_map_) { - start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); + auto routing_direction = RoutingDirection::E; + mcast_hops[routing_direction] = 1; + + auto control_plane = tt::DevicePool::instance().get_control_plane(); + + // Find a device with enough neighbours in the specified direction + bool connection_found = false; + for (auto* device : devices_) { + start_mesh_chip_id = control_plane->get_mesh_chip_id_from_physical_chip_id(device->id()); std::unordered_map>> temp_end_mesh_chip_ids_by_dir; std::unordered_map> temp_physical_end_device_ids_by_dir; - bool connection_found = true; + connection_found = true; for (auto [routing_direction, num_hops] : mcast_hops) { bool direction_found = true; auto& temp_end_mesh_chip_ids = temp_end_mesh_chip_ids_by_dir[routing_direction]; @@ -454,12 +513,11 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { uint32_t curr_mesh_id = start_mesh_chip_id.first; uint32_t curr_chip_id = start_mesh_chip_id.second; for (uint32_t i = 0; i < num_hops; i++) { - auto neighbors = - 
control_plane_->get_intra_chip_neighbors(curr_mesh_id, curr_chip_id, routing_direction); + auto neighbors = control_plane->get_intra_chip_neighbors(curr_mesh_id, curr_chip_id, routing_direction); if (neighbors.size() > 0) { temp_end_mesh_chip_ids.emplace_back(curr_mesh_id, neighbors[0]); temp_physical_end_device_ids.push_back( - control_plane_->get_physical_chip_id_from_mesh_chip_id(temp_end_mesh_chip_ids.back())); + control_plane->get_physical_chip_id_from_mesh_chip_id(temp_end_mesh_chip_ids.back())); curr_mesh_id = temp_end_mesh_chip_ids.back().first; curr_chip_id = temp_end_mesh_chip_ids.back().second; } else { @@ -479,32 +537,41 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { break; } } - if (end_mesh_chip_ids_by_dir.empty()) { + + if (!connection_found) { GTEST_SKIP() << "No path found between sender and receivers"; } auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); - + // Virtual coordinate space. All devices have the same logical to virtual mapping CoreCoord receiver_virtual_core = sender_device->worker_core_from_logical_core(receiver_logical_core); - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + uint32_t data_size = tt::constants::TILE_HW * sizeof(uint32_t); + + auto receiver_shard_parameters = + ShardSpecBuffer(receiver_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + + // Reset buffer space for test validation + std::vector receiver_buffer_data(data_size / sizeof(uint32_t), 0); - uint32_t worker_unreserved_base_addr = - hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); - uint32_t client_interface_addr = worker_unreserved_base_addr; - uint32_t packet_header_addr = - tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t) * num_dirs, l1_alignment); - uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; - uint32_t buffer_data_size = tt::constants::TILE_HW * sizeof(uint32_t); - std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); std::vector receiver_programs; + std::vector> receiver_buffers; for (auto& [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { for (auto physical_end_device_id : physical_end_device_ids) { auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); - tt::llrt::write_hex_vec_to_core( - physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); + ShardedBufferConfig receiver_shard_config = { + .device = receiver_device, + .size = data_size, + .page_size = data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = receiver_shard_parameters, + }; + auto receiver_buffer = CreateBuffer(receiver_shard_config); + tt::tt_metal::detail::WriteToBuffer(receiver_buffer, receiver_buffer_data); tt::Cluster::instance().l1_barrier(physical_end_device_id); + // Create the receiver program for validation auto receiver_program = tt_metal::CreateProgram(); auto receiver_kernel = tt_metal::CreateKernel( receiver_program, @@ -513,40 +580,80 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); std::vector 
receiver_runtime_args = { - buffer_data_addr, - buffer_data_size, + receiver_buffer->address(), + data_size, }; tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); receiver_programs.push_back(std::move(receiver_program)); + receiver_buffers.push_back(std::move(receiver_buffer)); + } + } + // Assume all receiver buffers are at the same address + uint32_t receiver_buffer_addr = receiver_buffers[0]->address(); + for (const auto& receiver_buffer : receiver_buffers) { + if (receiver_buffer_addr != receiver_buffer->address()) { + GTEST_SKIP() << "Receiver buffers are not at the same address"; } } - std::iota(buffer_data.begin(), buffer_data.end(), 0); - tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); - + // Packet header needs to be inlined with the data being sent, so this test just allocates buffer space for both + // together on the sender + uint32_t sender_packet_header_and_data_size = tt::tt_fabric::PACKET_HEADER_SIZE_BYTES + data_size; + auto sender_shard_parameters = + ShardSpecBuffer(sender_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + ShardedBufferConfig sender_shard_config = { + .device = sender_device, + .size = sender_packet_header_and_data_size, + .page_size = sender_packet_header_and_data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = std::move(sender_shard_parameters), + }; + auto sender_buffer = CreateBuffer(sender_shard_config); + // Write the data to send to the buffer + std::vector sender_buffer_data(sender_packet_header_and_data_size / sizeof(uint32_t), 0); + std::iota(sender_buffer_data.begin() + PACKET_HEADER_SIZE_BYTES / sizeof(uint32_t), sender_buffer_data.end(), 0); + tt::tt_metal::detail::WriteToBuffer(sender_buffer, sender_buffer_data); + + // Extract the expected data to be read from the receiver + std::copy( + sender_buffer_data.begin() + tt::tt_fabric::PACKET_HEADER_SIZE_BYTES / sizeof(uint32_t), + sender_buffer_data.end(), + receiver_buffer_data.begin()); + + // Wait for buffer data to be written to device tt::Cluster::instance().l1_barrier(physical_start_device_id); auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + // Create the sender program auto sender_program = tt_metal::CreateProgram(); + + // Allocate space for the client interface + uint32_t client_interface_cb_index = tt::CBIndex::c_0; + tt::tt_metal::CircularBufferConfig client_interface_cb_config = + tt::tt_metal::CircularBufferConfig( + mcast_hops.size() * tt::tt_fabric::CLIENT_INTERFACE_SIZE, {{client_interface_cb_index, DataFormat::UInt32}}) + .set_page_size(client_interface_cb_index, tt::tt_fabric::CLIENT_INTERFACE_SIZE); + auto client_interface_cb = + tt::tt_metal::CreateCircularBuffer(sender_program, sender_logical_core, client_interface_cb_config); + + std::vector sender_compile_time_args = {client_interface_cb_index}; auto sender_kernel = tt_metal::CreateKernel( sender_program, - "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_async_write_multicast_sender.cpp", - {sender_logical_core}, + "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_sender.cpp", + sender_logical_crs, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - auto 
[sender_gk_noc_offset, sender_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = NOC::RISCV_0_default, + .compile_args = sender_compile_time_args}); std::unordered_map sender_router_noc_xys; for (auto& [routing_direction, end_mesh_chip_ids] : end_mesh_chip_ids_by_dir) { - auto routers = control_plane_->get_routers_to_chip( + auto routers = control_plane->get_routers_to_chip( start_mesh_chip_id.first, start_mesh_chip_id.second, end_mesh_chip_ids[0].first, @@ -556,20 +663,19 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { routing_direction, tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y)); } + std::vector sender_runtime_args = { - client_interface_addr, - sender_gk_interface_addr, - sender_gk_noc_offset, - packet_header_addr, + sender_buffer->address(), receiver_noc_encoding, - buffer_data_addr, - buffer_data_size, - end_mesh_chip_ids_by_dir[RoutingDirection::E][0].first, - end_mesh_chip_ids_by_dir[RoutingDirection::E][0].second, - mcast_hops[RoutingDirection::E], - sender_router_noc_xys[RoutingDirection::E]}; + receiver_buffer_addr, + data_size, + end_mesh_chip_ids_by_dir[routing_direction][0].first, + end_mesh_chip_ids_by_dir[routing_direction][0].second, + mcast_hops[routing_direction], + sender_router_noc_xys[routing_direction]}; tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + // Launch sender and receiver programs and wait for them to finish tt_metal::detail::LaunchProgram(sender_device, sender_program, false); tt_metal::detail::WaitProgramDone(sender_device, sender_program); for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { @@ -579,34 +685,40 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { } } + // Validate the data received by the receiver for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { - for (auto physical_end_device_id : physical_end_device_ids) { - std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( - physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); - EXPECT_EQ(buffer_data, received_buffer_data); + for (uint32_t i = 0; i < physical_end_device_ids.size(); i++) { + std::vector received_buffer_data; + tt::tt_metal::detail::ReadFromBuffer(receiver_buffers[i], received_buffer_data); + EXPECT_EQ(receiver_buffer_data, received_buffer_data); } } } -TEST_F(FabricFixture, TestAsyncWriteMulticast) { +TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { CoreCoord sender_logical_core = {0, 0}; + CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; + CoreRangeSet receiver_logical_crs = {receiver_logical_core}; std::pair start_mesh_chip_id; chip_id_t physical_start_device_id; std::unordered_map>> end_mesh_chip_ids_by_dir; std::unordered_map> physical_end_device_ids_by_dir; - uint32_t num_dirs = 2; + RoutingDirection routing_direction = RoutingDirection::E; std::unordered_map mcast_hops; - mcast_hops[RoutingDirection::E] = 2; - mcast_hops[RoutingDirection::W] = 1; - // mcast_hops[RoutingDirection::N] = 1; - // mcast_hops[RoutingDirection::S] = 0; - for (const auto &[id, device] : devices_map_) { - start_mesh_chip_id = control_plane_->get_mesh_chip_id_from_physical_chip_id(device->id()); + mcast_hops[RoutingDirection::E] = 1; + mcast_hops[RoutingDirection::W] = 2; 
+ + auto control_plane = tt::DevicePool::instance().get_control_plane(); + + // Find a device with enough neighbours in the specified direction + bool connection_found = false; + for (auto* device : devices_) { + start_mesh_chip_id = control_plane->get_mesh_chip_id_from_physical_chip_id(device->id()); std::unordered_map>> temp_end_mesh_chip_ids_by_dir; std::unordered_map> temp_physical_end_device_ids_by_dir; - bool connection_found = true; + connection_found = true; for (auto [routing_direction, num_hops] : mcast_hops) { bool direction_found = true; auto& temp_end_mesh_chip_ids = temp_end_mesh_chip_ids_by_dir[routing_direction]; @@ -614,12 +726,11 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { uint32_t curr_mesh_id = start_mesh_chip_id.first; uint32_t curr_chip_id = start_mesh_chip_id.second; for (uint32_t i = 0; i < num_hops; i++) { - auto neighbors = - control_plane_->get_intra_chip_neighbors(curr_mesh_id, curr_chip_id, routing_direction); + auto neighbors = control_plane->get_intra_chip_neighbors(curr_mesh_id, curr_chip_id, routing_direction); if (neighbors.size() > 0) { temp_end_mesh_chip_ids.emplace_back(curr_mesh_id, neighbors[0]); temp_physical_end_device_ids.push_back( - control_plane_->get_physical_chip_id_from_mesh_chip_id(temp_end_mesh_chip_ids.back())); + control_plane->get_physical_chip_id_from_mesh_chip_id(temp_end_mesh_chip_ids.back())); curr_mesh_id = temp_end_mesh_chip_ids.back().first; curr_chip_id = temp_end_mesh_chip_ids.back().second; } else { @@ -639,32 +750,41 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { break; } } - if (end_mesh_chip_ids_by_dir.empty()) { + + if (!connection_found) { GTEST_SKIP() << "No path found between sender and receivers"; } auto* sender_device = DevicePool::instance().get_active_device(physical_start_device_id); CoreCoord sender_virtual_core = sender_device->worker_core_from_logical_core(sender_logical_core); - + // Virtual coordinate space. 
All devices have the same logical to virtual mapping CoreCoord receiver_virtual_core = sender_device->worker_core_from_logical_core(receiver_logical_core); - uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); + uint32_t data_size = tt::constants::TILE_HW * sizeof(uint32_t); + + auto receiver_shard_parameters = + ShardSpecBuffer(receiver_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + + // Reset buffer space for test validation + std::vector receiver_buffer_data(data_size / sizeof(uint32_t), 0); - uint32_t worker_unreserved_base_addr = - hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); - uint32_t client_interface_addr = worker_unreserved_base_addr; - uint32_t packet_header_addr = - tt::round_up(client_interface_addr + sizeof(fabric_client_interface_t) * num_dirs, l1_alignment); - uint32_t buffer_data_addr = packet_header_addr + PACKET_HEADER_SIZE_BYTES; - uint32_t buffer_data_size = tt::constants::TILE_HW * sizeof(uint32_t); - std::vector buffer_data(buffer_data_size / sizeof(uint32_t), 0); std::vector receiver_programs; + std::vector> receiver_buffers; for (auto& [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { for (auto physical_end_device_id : physical_end_device_ids) { auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); - tt::llrt::write_hex_vec_to_core( - physical_end_device_id, receiver_virtual_core, buffer_data, buffer_data_addr); + ShardedBufferConfig receiver_shard_config = { + .device = receiver_device, + .size = data_size, + .page_size = data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = receiver_shard_parameters, + }; + auto receiver_buffer = CreateBuffer(receiver_shard_config); + tt::tt_metal::detail::WriteToBuffer(receiver_buffer, receiver_buffer_data); tt::Cluster::instance().l1_barrier(physical_end_device_id); + // Create the receiver program for validation auto receiver_program = tt_metal::CreateProgram(); auto receiver_kernel = tt_metal::CreateKernel( receiver_program, @@ -673,41 +793,81 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - auto [receiver_gk_noc_offset, receiver_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_end_device_id); std::vector receiver_runtime_args = { - buffer_data_addr, - buffer_data_size, + receiver_buffer->address(), + data_size, }; tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args); tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false); receiver_programs.push_back(std::move(receiver_program)); + receiver_buffers.push_back(std::move(receiver_buffer)); + } + } + // Assume all receiver buffers are at the same address + uint32_t receiver_buffer_addr = receiver_buffers[0]->address(); + for (const auto& receiver_buffer : receiver_buffers) { + if (receiver_buffer_addr != receiver_buffer->address()) { + GTEST_SKIP() << "Receiver buffers are not at the same address"; } } - std::iota(buffer_data.begin(), buffer_data.end(), 0); - tt::llrt::write_hex_vec_to_core(physical_start_device_id, sender_virtual_core, buffer_data, buffer_data_addr); - + // Packet header needs to be inlined with the data being sent, so this test just allocates buffer space for both + // together on the sender + uint32_t sender_packet_header_and_data_size = 
tt::tt_fabric::PACKET_HEADER_SIZE_BYTES + data_size; + auto sender_shard_parameters = + ShardSpecBuffer(sender_logical_crs, {1, 1}, ShardOrientation::ROW_MAJOR, {1, 1}, {1, 1}); + ShardedBufferConfig sender_shard_config = { + .device = sender_device, + .size = sender_packet_header_and_data_size, + .page_size = sender_packet_header_and_data_size, + .buffer_type = BufferType::L1, + .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .shard_parameters = std::move(sender_shard_parameters), + }; + auto sender_buffer = CreateBuffer(sender_shard_config); + // Write the data to send to the buffer + std::vector sender_buffer_data(sender_packet_header_and_data_size / sizeof(uint32_t), 0); + std::iota(sender_buffer_data.begin() + PACKET_HEADER_SIZE_BYTES / sizeof(uint32_t), sender_buffer_data.end(), 0); + tt::tt_metal::detail::WriteToBuffer(sender_buffer, sender_buffer_data); + + // Extract the expected data to be read from the receiver + std::copy( + sender_buffer_data.begin() + tt::tt_fabric::PACKET_HEADER_SIZE_BYTES / sizeof(uint32_t), + sender_buffer_data.end(), + receiver_buffer_data.begin()); + + // Wait for buffer data to be written to device tt::Cluster::instance().l1_barrier(physical_start_device_id); auto receiver_noc_encoding = tt::tt_metal::hal.noc_xy_encoding(receiver_virtual_core.x, receiver_virtual_core.y); + // Create the sender program auto sender_program = tt_metal::CreateProgram(); + + // Allocate space for the client interface + uint32_t client_interface_cb_index = tt::CBIndex::c_0; + tt::tt_metal::CircularBufferConfig client_interface_cb_config = + tt::tt_metal::CircularBufferConfig( + mcast_hops.size() * tt::tt_fabric::CLIENT_INTERFACE_SIZE, {{client_interface_cb_index, DataFormat::UInt32}}) + .set_page_size(client_interface_cb_index, tt::tt_fabric::CLIENT_INTERFACE_SIZE); + auto client_interface_cb = + tt::tt_metal::CreateCircularBuffer(sender_program, sender_logical_core, client_interface_cb_config); + + std::vector sender_compile_time_args = {client_interface_cb_index}; auto sender_kernel = tt_metal::CreateKernel( sender_program, "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/" - "fabric_async_write_multicast_multidirectional_sender.cpp", - {sender_logical_core}, + "fabric_pull_async_write_multicast_multidirectional_sender.cpp", + sender_logical_crs, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - auto [sender_gk_noc_offset, sender_gk_interface_addr] = - this->GetFabricData().get_gatekeeper_noc_addr(physical_start_device_id); + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = NOC::RISCV_0_default, + .compile_args = sender_compile_time_args}); std::unordered_map sender_router_noc_xys; for (auto& [routing_direction, end_mesh_chip_ids] : end_mesh_chip_ids_by_dir) { - auto routers = control_plane_->get_routers_to_chip( + auto routers = control_plane->get_routers_to_chip( start_mesh_chip_id.first, start_mesh_chip_id.second, end_mesh_chip_ids[0].first, @@ -717,14 +877,12 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { routing_direction, tt_metal::hal.noc_xy_encoding(sender_virtual_router_coord.x, sender_virtual_router_coord.y)); } + std::vector sender_runtime_args = { - client_interface_addr, - sender_gk_interface_addr, - sender_gk_noc_offset, - packet_header_addr, + sender_buffer->address(), receiver_noc_encoding, - buffer_data_addr, - buffer_data_size, + receiver_buffer_addr, + data_size, end_mesh_chip_ids_by_dir[RoutingDirection::E][0].first, 
end_mesh_chip_ids_by_dir[RoutingDirection::E][0].second, mcast_hops[RoutingDirection::E], @@ -732,18 +890,10 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { end_mesh_chip_ids_by_dir[RoutingDirection::W][0].first, end_mesh_chip_ids_by_dir[RoutingDirection::W][0].second, mcast_hops[RoutingDirection::W], - sender_router_noc_xys[RoutingDirection::W] - // end_mesh_chip_ids_by_dir[RoutingDirection::N][0].first, - // end_mesh_chip_ids_by_dir[RoutingDirection::N][0].second, - // mcast_hops[RoutingDirection::N], - // sender_router_noc_xys[RoutingDirection::N], - // end_mesh_chip_ids_by_dir[RoutingDirection::S][0].first, - // end_mesh_chip_ids_by_dir[RoutingDirection::S][0].second, - // mcast_hops[RoutingDirection::S], - // sender_router_noc_xys[RoutingDirection::S] - }; + sender_router_noc_xys[RoutingDirection::W]}; tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args); + // Launch sender and receiver programs and wait for them to finish tt_metal::detail::LaunchProgram(sender_device, sender_program, false); tt_metal::detail::WaitProgramDone(sender_device, sender_program); for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { @@ -753,13 +903,14 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { } } + // Validate the data received by the receiver for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { - for (auto physical_end_device_id : physical_end_device_ids) { - std::vector received_buffer_data = tt::llrt::read_hex_vec_from_core( - physical_end_device_id, receiver_virtual_core, buffer_data_addr, buffer_data_size); - EXPECT_EQ(buffer_data, received_buffer_data); + for (uint32_t i = 0; i < physical_end_device_ids.size(); i++) { + std::vector received_buffer_data; + tt::tt_metal::detail::ReadFromBuffer(receiver_buffers[i], received_buffer_data); + EXPECT_EQ(receiver_buffer_data, received_buffer_data); } } -}*/ +} } // namespace tt::tt_fabric diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp index a94d6185364..1acfcf915b9 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp @@ -157,7 +157,7 @@ void kernel_main() { while (true) { client_interface->local_pull_request.pull_request.words_read = 0; if constexpr (mcast_data) { - fabric_async_write_multicast( + fabric_async_write_multicast( client_interface, 0, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory @@ -170,7 +170,7 @@ void kernel_main() { n_depth, s_depth); } else { - fabric_async_write( + fabric_async_write( client_interface, 0, // the network plane to use for this transaction data_buffer_start_addr, // source address in sender’s memory diff --git a/tt_metal/fabric/hw/inc/tt_fabric_api.h b/tt_metal/fabric/hw/inc/tt_fabric_api.h index b36b5861025..e56f8e78c15 100644 --- a/tt_metal/fabric/hw/inc/tt_fabric_api.h +++ b/tt_metal/fabric/hw/inc/tt_fabric_api.h @@ -8,15 +8,18 @@ #include "dataflow_api.h" #include "noc_overlay_parameters.h" #include "ethernet/dataflow_api.h" +#include "tt_fabric.h" #include "tt_fabric_interface.h" +#include "eth_chan_noc_mapping.h" namespace tt::tt_fabric { enum AsyncWriteMode : uint8_t { ADD_PR = 0x01, - SEND = 0x02, + SEND_PR = 0x02, ADD_HEADER = 0x04, - ALL = ADD_HEADER | 
ADD_PR | SEND, + ADD_AND_SEND_PR = ADD_PR | SEND_PR, + ALL = ADD_HEADER | ADD_PR | SEND_PR, }; enum RoutingType : uint8_t { @@ -135,7 +138,7 @@ inline void fabric_async_write( fabric_setup_pull_request(client_interface, src_addr, size); } - if constexpr (mode & AsyncWriteMode::SEND) { + if constexpr (mode & AsyncWriteMode::SEND_PR) { fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } @@ -189,7 +192,7 @@ inline void fabric_async_write_multicast( fabric_setup_pull_request(client_interface, src_addr, size); } - if constexpr (mode & AsyncWriteMode::SEND) { + if constexpr (mode & AsyncWriteMode::SEND_PR) { fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } @@ -235,7 +238,7 @@ inline void fabric_atomic_inc( fabric_setup_pull_request(client_interface, src_addr, PACKET_HEADER_SIZE_BYTES); } - if constexpr (mode & AsyncWriteMode::SEND) { + if constexpr (mode & AsyncWriteMode::SEND_PR) { fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } @@ -285,7 +288,7 @@ inline void fabric_async_write_atomic_inc( fabric_setup_pull_request(client_interface, src_addr, size); } - if constexpr (mode & AsyncWriteMode::SEND) { + if constexpr (mode & AsyncWriteMode::SEND_PR) { fabric_send_pull_request(client_interface, routing, dst_mesh_id, dst_dev_id); } } From 49b9da0f170d0496bb4eb1b1de36f8551268443a Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Mon, 24 Feb 2025 17:58:42 +0000 Subject: [PATCH 280/316] #0: Add slimmed down fabric_pull_client_interface_t to use with fabric pull apis --- ...ric_pull_async_write_atomic_inc_sender.cpp | 4 +- ...rite_multicast_multidirectional_sender.cpp | 6 +- ...bric_pull_async_write_multicast_sender.cpp | 4 +- .../fabric_pull_async_write_sender.cpp | 4 +- .../kernels/fabric_pull_atomic_inc_sender.cpp | 4 +- .../tt_fabric_traffic_gen_rx_socket.cpp | 6 +- .../kernels/tt_fabric_traffic_gen_tx.cpp | 4 +- .../tt_fabric_traffic_gen_tx_socket.cpp | 4 +- .../routing/kernels/tt_fabric_tx_ubench.cpp | 4 +- .../test_tt_fabric_multi_hop_sanity.cpp | 2 +- .../routing/test_tt_fabric_sanity.cpp | 2 +- .../routing/test_tt_fabric_socket_sanity.cpp | 2 +- .../api/tt-metalium/fabric_host_interface.h | 1 + tt_metal/fabric/hw/inc/tt_fabric_api.h | 128 ++---------------- tt_metal/fabric/hw/inc/tt_fabric_interface.h | 11 ++ 15 files changed, 48 insertions(+), 138 deletions(-) diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_atomic_inc_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_atomic_inc_sender.cpp index 131c9a2fff1..1f3b72d7ecc 100644 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_atomic_inc_sender.cpp +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_atomic_inc_sender.cpp @@ -27,8 +27,8 @@ void kernel_main() { uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; uint32_t client_interface_addr = get_write_ptr(client_interface_cb); - volatile tt_l1_ptr fabric_client_interface_t* client_interface = - reinterpret_cast(client_interface_addr); + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface = + reinterpret_cast(client_interface_addr); fabric_endpoint_init(client_interface, 0 /* unused */); fabric_async_write_atomic_inc( diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_multidirectional_sender.cpp 
b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_multidirectional_sender.cpp index b6dab8d940f..301b131d88b 100644 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_multidirectional_sender.cpp +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_multidirectional_sender.cpp @@ -30,8 +30,8 @@ void kernel_main() { uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; uint32_t client_interface_addr = get_write_ptr(client_interface_cb); - volatile tt_l1_ptr fabric_client_interface_t* client_interface = - reinterpret_cast(client_interface_addr); + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface = + reinterpret_cast(client_interface_addr); for (uint32_t i = 0; i < num_dirs; i++) { fabric_endpoint_init(client_interface + i, 0 /* unused */); } @@ -75,7 +75,7 @@ void kernel_main() { 0); // Flush all pull requests - client_interface = reinterpret_cast(client_interface_addr); + client_interface = reinterpret_cast(client_interface_addr); for (uint32_t i = 0; i < num_dirs; i++) { fabric_wait_for_pull_request_flushed(client_interface); client_interface++; diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_sender.cpp index 09d0384fcc9..d8775281441 100644 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_sender.cpp +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_multicast_sender.cpp @@ -25,8 +25,8 @@ void kernel_main() { uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; uint32_t client_interface_addr = get_write_ptr(client_interface_cb); - volatile tt_l1_ptr fabric_client_interface_t* client_interface = - reinterpret_cast(client_interface_addr); + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface = + reinterpret_cast(client_interface_addr); fabric_endpoint_init(client_interface, 0 /* unused */); fabric_async_write_multicast( diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_sender.cpp index 2815a1c207b..e9e23ab932c 100644 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_sender.cpp +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_async_write_sender.cpp @@ -24,8 +24,8 @@ void kernel_main() { uint32_t packet_size_bytes = num_bytes + PACKET_HEADER_SIZE_BYTES; uint32_t client_interface_addr = get_write_ptr(client_interface_cb); - volatile tt_l1_ptr fabric_client_interface_t* client_interface = - reinterpret_cast(client_interface_addr); + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface = + reinterpret_cast(client_interface_addr); fabric_endpoint_init(client_interface, 0 /* unused */); fabric_async_write( diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_atomic_inc_sender.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_atomic_inc_sender.cpp index beba0160782..528be917ef4 100644 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_atomic_inc_sender.cpp +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_pull_atomic_inc_sender.cpp @@ -25,8 +25,8 @@ void kernel_main() { uint32_t packet_size_bytes = 
PACKET_HEADER_SIZE_BYTES; uint32_t client_interface_addr = get_write_ptr(client_interface_cb); - volatile tt_l1_ptr fabric_client_interface_t* client_interface = - reinterpret_cast(client_interface_addr); + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface = + reinterpret_cast(client_interface_addr); fabric_endpoint_init(client_interface, 0 /* unused */); fabric_atomic_inc( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp index 5232ef3fce5..2690d6bc5ca 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp @@ -44,8 +44,8 @@ constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(13); volatile tt_l1_ptr chan_req_buf* client_pull_req_buf = reinterpret_cast(client_pull_req_buf_addr); -volatile tt_l1_ptr fabric_client_interface_t* client_interface = - (volatile tt_l1_ptr fabric_client_interface_t*)client_interface_addr; +volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface = + (volatile tt_l1_ptr fabric_pull_client_interface_t*)client_interface_addr; uint64_t xy_local_addr; socket_reader_state socket_reader; @@ -70,7 +70,7 @@ void kernel_main() { zero_l1_buf( reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); test_results[TT_FABRIC_MISC_INDEX] = 0xff000001; - zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); + zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_pull_client_interface_t)); test_results[TT_FABRIC_MISC_INDEX] = 0xff000002; zero_l1_buf((uint32_t*)client_pull_req_buf, sizeof(chan_req_buf)); test_results[TT_FABRIC_MISC_INDEX] = 0xff000003; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index 9678fe4e0dc..e9f55e19ffc 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -71,8 +71,8 @@ uint32_t max_packet_size_mask; auto input_queue_state = select_input_queue(); volatile local_pull_request_t *local_pull_request = (volatile local_pull_request_t *)(data_buffer_start_addr - 1024); volatile tt_l1_ptr fabric_router_l1_config_t* routing_table; -volatile tt_l1_ptr fabric_client_interface_t* client_interface = - (volatile tt_l1_ptr fabric_client_interface_t*)client_interface_addr; +volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface = + (volatile tt_l1_ptr fabric_pull_client_interface_t*)client_interface_addr; fvc_producer_state_t test_producer __attribute__((aligned(16))); fvcc_inbound_state_t fvcc_test_producer __attribute__((aligned(16))); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp index d63197ab70b..1f37b128499 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp @@ -67,8 +67,8 @@ uint32_t max_packet_size_mask; auto input_queue_state = 
select_input_queue(); volatile local_pull_request_t* local_pull_request = (volatile local_pull_request_t*)(data_buffer_start_addr - 1024); volatile tt_l1_ptr fabric_router_l1_config_t* routing_table; -volatile tt_l1_ptr fabric_client_interface_t* client_interface = - (volatile tt_l1_ptr fabric_client_interface_t*)client_interface_addr; +volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface = + (volatile tt_l1_ptr fabric_pull_client_interface_t*)client_interface_addr; volatile tt_l1_ptr chan_req_buf* client_pull_req_buf = reinterpret_cast(client_pull_req_buf_addr); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp index 1acfcf915b9..e2d0bf6ed78 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_tx_ubench.cpp @@ -63,8 +63,8 @@ constexpr uint32_t w_depth = get_compile_time_arg_val(25); constexpr uint32_t n_depth = get_compile_time_arg_val(26); constexpr uint32_t s_depth = get_compile_time_arg_val(27); -volatile tt_l1_ptr fabric_client_interface_t* client_interface = - (volatile tt_l1_ptr fabric_client_interface_t*)client_interface_addr; +volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface = + (volatile tt_l1_ptr fabric_pull_client_interface_t*)client_interface_addr; uint32_t target_address; uint32_t noc_offset; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index 111176b7992..100a2c523fb 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -302,7 +302,7 @@ int main(int argc, char** argv) { uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); uint32_t gk_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; uint32_t client_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; - uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(fabric_client_interface_t); + uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(fabric_pull_client_interface_t); uint32_t socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); log_info(LogTest, "GK Routing Table Addr = 0x{:08X}", routing_table_addr); log_info(LogTest, "GK Info Addr = 0x{:08X}", gk_interface_addr); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index 1b0f40eaee9..5273e8d37b5 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -1584,7 +1584,7 @@ int main(int argc, char **argv) { uint32_t client_interface_addr = worker_unreserved_base_addr; uint32_t client_pull_req_buf_addr = - client_interface_addr + sizeof(fabric_client_interface_t) + sizeof(fabric_router_l1_config_t) * 4; + client_interface_addr + sizeof(fabric_pull_client_interface_t) + sizeof(fabric_router_l1_config_t) * 4; std::vector tx_compile_args = { 0, //(device->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp index 198246ce0da..f1f82a1b4da 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp @@ -300,7 +300,7 @@ int main(int argc, char** argv) { uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); uint32_t gk_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; uint32_t client_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; - uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(fabric_client_interface_t); + uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(fabric_pull_client_interface_t); uint32_t socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); log_info(LogTest, "GK Routing Table Addr = 0x{:08X}", routing_table_addr); log_info(LogTest, "GK Info Addr = 0x{:08X}", gk_interface_addr); diff --git a/tt_metal/api/tt-metalium/fabric_host_interface.h b/tt_metal/api/tt-metalium/fabric_host_interface.h index cdfa03b7caf..fbb7cf87068 100644 --- a/tt_metal/api/tt-metalium/fabric_host_interface.h +++ b/tt_metal/api/tt-metalium/fabric_host_interface.h @@ -36,6 +36,7 @@ static_assert( "LOG_BASE_2_NUM_CHANNELS_PER_UINT32 must be equal to log2(sizeof(std::uint32_t) / sizeof(chan_id_t))"); static constexpr std::uint32_t CLIENT_INTERFACE_SIZE = 3280; +static constexpr std::uint32_t PULL_CLIENT_INTERFACE_SIZE = 112; static constexpr std::uint32_t PACKET_WORD_SIZE_BYTES = 16; static constexpr std::uint32_t PACKET_HEADER_SIZE_BYTES = 48; static constexpr std::uint32_t PACKET_HEADER_SIZE_WORDS = PACKET_HEADER_SIZE_BYTES / PACKET_WORD_SIZE_BYTES; diff --git a/tt_metal/fabric/hw/inc/tt_fabric_api.h b/tt_metal/fabric/hw/inc/tt_fabric_api.h index e56f8e78c15..b14fcf94d5a 100644 --- a/tt_metal/fabric/hw/inc/tt_fabric_api.h +++ b/tt_metal/fabric/hw/inc/tt_fabric_api.h @@ -28,7 +28,7 @@ enum RoutingType : uint8_t { }; inline uint32_t get_next_hop_router_noc_xy( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t routing_plane, uint32_t dst_mesh_id, uint32_t dst_dev_id) { @@ -44,7 +44,7 @@ inline uint32_t get_next_hop_router_noc_xy( } inline void fabric_setup_pull_request( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t src_addr, uint32_t size) { + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t src_addr, uint32_t size) { uint32_t size_in_words = (size + PACKET_WORD_SIZE_BYTES - 1) >> 4; // TODO: Could return this value to the user and take this as an arg to avoid repeated lookup // Added here to avoid user having to declare globals @@ -63,7 +63,7 @@ inline void fabric_setup_pull_request( template inline void fabric_send_pull_request( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, // and the routing plane to use when using ROUTING_TABLE uint16_t dst_mesh_id, @@ -79,7 +79,7 @@ inline void fabric_send_pull_request( } FORCE_INLINE void fabric_wait_for_pull_request_words_flushed( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, 
uint32_t words) { + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t words) { while (client_interface->local_pull_request.pull_request.words_read < words) { #pragma GCC unroll 4 for (int i = 0; i < 4; i++) { @@ -89,12 +89,12 @@ FORCE_INLINE void fabric_wait_for_pull_request_words_flushed( } inline void fabric_wait_for_pull_request_bytes_flushed( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t size) { + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t size) { uint32_t size_in_words = (size + PACKET_WORD_SIZE_BYTES - 1) >> 4; fabric_wait_for_pull_request_words_flushed(client_interface, size_in_words); } -inline void fabric_wait_for_pull_request_flushed(volatile tt_l1_ptr fabric_client_interface_t* client_interface) { +inline void fabric_wait_for_pull_request_flushed(volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface) { uint32_t words_written = client_interface->local_pull_request.pull_request.words_written; fabric_wait_for_pull_request_words_flushed(client_interface, words_written); } @@ -121,7 +121,7 @@ inline void fabric_async_write_add_header( // Packet is at src_addr in sender L1. template inline void fabric_async_write( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, // and the routing plane to use when using ROUTING_TABLE uint32_t src_addr, // source address in sender’s memory @@ -171,7 +171,7 @@ inline void fabric_async_write_multicast_add_header( // Packet is at src_addr in sender L1. template inline void fabric_async_write_multicast( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, // and the routing plane to use when using ROUTING_TABLE uint32_t src_addr, // source address in sender’s memory @@ -221,7 +221,7 @@ inline void fabric_atomic_inc_add_header( // Packet is at src_addr in sender L1. template inline void fabric_atomic_inc( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, // and the routing plane to use when using ROUTING_TABLE uint32_t src_addr, // source address in sender’s memory @@ -269,7 +269,7 @@ inline void fabric_async_write_atomic_inc_add_header( // Packet is at src_addr in sender L1. 
template inline void fabric_async_write_atomic_inc( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t routing, // routing refers to the router noc xy to use when using ROUTER_XY, // and the routing plane to use when using ROUTING_TABLE uint32_t src_addr, // source address in sender’s memory @@ -293,116 +293,14 @@ inline void fabric_async_write_atomic_inc( } } -inline void send_message_to_gk(volatile tt_l1_ptr fabric_client_interface_t* client_interface) { - uint64_t gk_noc_base = client_interface->gk_msg_buf_addr; - uint64_t noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, wrptr); - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - noc_addr, - NOC_UNICAST_WRITE_VC, - 1, - FVCC_BUF_LOG_SIZE, - false, - false, - (uint32_t)&client_interface->wrptr.ptr); - while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); - uint32_t wrptr = client_interface->wrptr.ptr; - noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, rdptr); - while (1) { - noc_async_read_one_packet(noc_addr, (uint32_t)(&client_interface->rdptr.ptr), 4); - noc_async_read_barrier(); - if (!fvcc_buf_ptrs_full(wrptr, client_interface->rdptr.ptr)) { - break; - } - } - uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; - noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); - noc_async_write_one_packet((uint32_t)(&client_interface->gk_message), noc_addr, sizeof(packet_header_t), noc_index); - noc_async_write_barrier(); -} - -inline socket_handle_t* fabric_socket_open( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, - uint32_t routing_plane, // the network plane to use for this socket - uint16_t epoch_id, // Temporal epoch for which the socket is being opened - uint16_t socket_id, // Socket Id to open - uint8_t socket_type, // Unicast, Multicast, SSocket, DSocket - uint8_t direction, // Send or Receive - uint16_t remote_mesh_id, // Remote mesh/device that is the socket data sender/receiver. - uint16_t remote_dev_id, - uint8_t fvc // fabric virtual channel. 
-) { - uint32_t socket_count = client_interface->socket_count; - socket_handle_t* socket_handle = (socket_handle_t*)&client_interface->socket_handles[socket_count]; - socket_count++; - client_interface->socket_count = socket_count; - socket_handle->socket_state = SocketState::OPENING; - - if (direction == SOCKET_DIRECTION_SEND) { - client_interface->gk_message.packet_header.routing.dst_mesh_id = remote_mesh_id; - client_interface->gk_message.packet_header.routing.dst_dev_id = remote_dev_id; - } else { - client_interface->gk_message.packet_header.routing.src_mesh_id = remote_mesh_id; - client_interface->gk_message.packet_header.routing.src_dev_id = remote_dev_id; - } - client_interface->gk_message.packet_header.routing.flags = SYNC; - client_interface->gk_message.packet_header.session.command = SOCKET_OPEN; - client_interface->gk_message.packet_header.session.target_offset_h = client_interface->pull_req_buf_addr >> 32; - client_interface->gk_message.packet_header.session.target_offset_l = (uint32_t)client_interface->pull_req_buf_addr; - client_interface->gk_message.packet_header.session.ack_offset_h = NOC_XY_ENCODING(my_x[noc_index], my_y[noc_index]); - client_interface->gk_message.packet_header.session.ack_offset_l = (uint32_t)socket_handle; - client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_id = socket_id; - client_interface->gk_message.packet_header.packet_parameters.socket_parameters.epoch_id = epoch_id; - client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_type = socket_type; - client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_direction = direction; - client_interface->gk_message.packet_header.packet_parameters.socket_parameters.routing_plane = routing_plane; - tt_fabric_add_header_checksum((packet_header_t*)&client_interface->gk_message.packet_header); - send_message_to_gk(client_interface); - return socket_handle; -} - -inline void fabric_socket_close( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, socket_handle_t* socket_handle) { - packet_header_t* packet_header = (packet_header_t*)&client_interface->gk_message.packet_header; - uint32_t dst_mesh_id = socket_handle->rcvr_mesh_id; - uint32_t dst_dev_id = socket_handle->rcvr_dev_id; - packet_header->routing.flags = INLINE_FORWARD; - packet_header->routing.dst_mesh_id = dst_mesh_id; - packet_header->routing.dst_dev_id = dst_dev_id; - packet_header->routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; - packet_header->session.command = SOCKET_CLOSE; - packet_header->session.target_offset_l = (uint32_t)socket_handle->pull_notification_adddr; - packet_header->session.target_offset_h = socket_handle->pull_notification_adddr >> 32; - tt_fabric_add_header_checksum(packet_header); - - uint32_t* dst = (uint32_t*)&client_interface->local_pull_request.pull_request; - uint32_t* src = (uint32_t*)packet_header; - for (uint32_t i = 0; i < sizeof(pull_request_t) / 4; i++) { - dst[i] = src[i]; - } - uint64_t dest_addr = - ((uint64_t)get_next_hop_router_noc_xy(client_interface, socket_handle->routing_plane, dst_mesh_id, dst_dev_id) - << 32) | - FABRIC_ROUTER_REQ_QUEUE_START; - tt_fabric_send_pull_request(dest_addr, (volatile local_pull_request_t*)&client_interface->local_pull_request); -} - -inline void fabric_socket_connect(socket_handle_t* socket_handle) { - // wait for socket state to change to Active. - // Gatekeeper will update local socket handle when the receiver for send socket - // is ready. 
- while (((volatile socket_handle_t*)socket_handle)->socket_state != SocketState::ACTIVE); -} - template inline void fabric_endpoint_init( - volatile tt_l1_ptr fabric_client_interface_t* client_interface, uint32_t outbound_eth_chan) { + volatile tt_l1_ptr fabric_pull_client_interface_t* client_interface, uint32_t outbound_eth_chan) { // TODO: Should not assume routing tables are immediately after the client interface // This should be a separate address we take in - uint32_t routing_tables_offset = (uint32_t)client_interface + sizeof(fabric_client_interface_t); + uint32_t routing_tables_offset = (uint32_t)client_interface + sizeof(fabric_pull_client_interface_t); - zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); + zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_pull_client_interface_t)); client_interface->routing_tables_l1_offset = routing_tables_offset; client_interface->num_routing_planes = 1; diff --git a/tt_metal/fabric/hw/inc/tt_fabric_interface.h b/tt_metal/fabric/hw/inc/tt_fabric_interface.h index be8cefaf34a..a9124e7f434 100644 --- a/tt_metal/fabric/hw/inc/tt_fabric_interface.h +++ b/tt_metal/fabric/hw/inc/tt_fabric_interface.h @@ -339,9 +339,20 @@ typedef struct _fabric_client_interface { socket_handle_t socket_handles[MAX_SOCKETS]; } fabric_client_interface_t; +typedef struct _fabric_pull_client_interface { + uint64_t pull_req_buf_addr; + uint32_t num_routing_planes; + uint32_t routing_tables_l1_offset; + uint32_t return_status[3]; + local_pull_request_t local_pull_request; +} fabric_pull_client_interface_t; + static_assert(sizeof(fabric_client_interface_t) % 16 == 0); static_assert(sizeof(fabric_client_interface_t) == CLIENT_INTERFACE_SIZE); +static_assert(sizeof(fabric_pull_client_interface_t) % 16 == 0); +static_assert(sizeof(fabric_pull_client_interface_t) == PULL_CLIENT_INTERFACE_SIZE); + constexpr uint32_t FABRIC_ROUTER_MISC_START = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; constexpr uint32_t FABRIC_ROUTER_MISC_SIZE = 256; constexpr uint32_t FABRIC_ROUTER_SYNC_SEM = FABRIC_ROUTER_MISC_START; From d95a9c2d64abb3c7dfcbaa8ab0161a9ed6bdd959 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Mon, 24 Feb 2025 17:58:53 +0000 Subject: [PATCH 281/316] #0: Add fabric unit tests to CI --- tests/scripts/run_cpp_fabric_tests.sh | 11 +++++++++-- tests/scripts/t3000/run_t3000_unit_tests.sh | 1 + tests/scripts/tg/run_tg_unit_tests.sh | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/scripts/run_cpp_fabric_tests.sh b/tests/scripts/run_cpp_fabric_tests.sh index d16e10963c4..d7a03c6e015 100755 --- a/tests/scripts/run_cpp_fabric_tests.sh +++ b/tests/scripts/run_cpp_fabric_tests.sh @@ -14,13 +14,20 @@ fi export TT_METAL_CLEAR_L1=1 +cd $TT_METAL_HOME + +############################################# +# FABRIC UNIT TESTS # +############################################# +echo "Running fabric unit tests now..."; + +TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="FabricFixture.*" + ############################################# # FABRIC SANITY TESTS # ############################################# echo "Running fabric sanity tests now..."; -cd $TT_METAL_HOME - TEST_FOLDER="./build/test/tt_metal/perf_microbenchmark/routing" # Async Write diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index 7f709db3316..6bb668d01f8 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -44,6 +44,7 @@ 
run_t3000_ttfabric_tests() { echo "LOG_METAL: Running run_t3000_ttfabric_tests" TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter=ControlPlaneFixture.*T3k* + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="FabricFixture.*" # Unicast tests TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 64 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 diff --git a/tests/scripts/tg/run_tg_unit_tests.sh b/tests/scripts/tg/run_tg_unit_tests.sh index 433ba6fb784..0b6db80427b 100755 --- a/tests/scripts/tg/run_tg_unit_tests.sh +++ b/tests/scripts/tg/run_tg_unit_tests.sh @@ -114,6 +114,7 @@ run_tg_tests() { elif [[ "$1" == "fabric" ]]; then echo "LOG_FABRIC: running run_tg_fabric_tests" TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter=ControlPlaneFixture.*TG* + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="FabricFixture.*" # Unicast tests TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 64 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16 From ce2d088513a01533f23b36a509695fde7e494a1f Mon Sep 17 00:00:00 2001 From: Brian Beggs Date: Mon, 24 Feb 2025 17:03:03 -0800 Subject: [PATCH 282/316] [skip ci] Update README.md (#18266) ### Ticket Link to Github Issue ### Problem description Host location of model_bring_up.md changed. ### What's changed New path to model bring up. ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- models/bringup_testing/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/bringup_testing/README.md b/models/bringup_testing/README.md index 54f286473b9..129308a4d36 100644 --- a/models/bringup_testing/README.md +++ b/models/bringup_testing/README.md @@ -11,7 +11,7 @@ Welcome to the Model Bring-Up and Testing Landing Page! 
## Model Bring-Up and Testing -- **Model Bring-Up** - [Model Bring-Up](https://github.com/tenstorrent/tt-training/blob/main/models/Model_Bring_Up.md) +- **Model Bring-Up** - [Model Bring-Up](https://github.com/tenstorrent/tt-metal/tree/main/models/model_bring_up.md) ## Model Optimization From e963fa49d9cce94ce1df16298f6b8469c4056950 Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Mon, 24 Feb 2025 20:35:46 -0500 Subject: [PATCH 283/316] add extra guard on connection management for 1D fabric kernel (#18213) This change avoids cascading conditionals for an infrequent operation (adding or acknowledging a connection teardown request). Leads to a modest perf bump: BASELINE @ 4k packet size mcast -> 13.81 GB/s unicast -> 17 GB/s Extra guard around check_connection: mcast -> 14 GB/s unicast -> 17.5 GB/s --- .../edm_fabric/fabric_erisc_datamover.cpp | 70 +++++++++++-------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index e345fc70b8b..97cdc73d050 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -660,6 +660,44 @@ FORCE_INLINE void receiver_forward_packet( } } +template +FORCE_INLINE void check_worker_connections( + tt::fabric::EdmChannelWorkerInterface &local_sender_channel_worker_interface, + bool &channel_connection_established, + bool &did_something +) { + if (!channel_connection_established) { + // Can get rid of one of these two checks if we duplicate the logic above here in the function + // and depending on which of the two versions we are in (the connected version or disconnected version) + // We also check if the interface has a teardown request in case worker + // 1. opened connection + // 2. sent of all packets (EDM sender channel was sufficiently empty) + // 3. 
closed the connection + // + // In such a case like that, we still want to formally teardown the connection to keep things clean + bool connect_requested = local_sender_channel_worker_interface.connection_is_live() || + local_sender_channel_worker_interface.has_worker_teardown_request(); + if (connect_requested) { + // if constexpr (enable_fabric_counters) { + // sender_channel_counters->add_connection(); + // } + did_something = true; + channel_connection_established = true; + local_sender_channel_worker_interface.cache_producer_noc_addr(); + if constexpr (enable_first_level_ack) { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_ackptr.get_ptr()); + } else { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_rdptr.get_ptr()); + } + } + } else if (local_sender_channel_worker_interface.has_worker_teardown_request()) { + did_something = true; + channel_connection_established = false; + local_sender_channel_worker_interface.teardown_connection( + local_sender_channel_worker_interface.local_rdptr.get_ptr()); + } +} + //////////////////////////////////// //////////////////////////////////// // Main Control Loop @@ -733,35 +771,9 @@ FORCE_INLINE bool run_sender_channel_step( } - if (!channel_connection_established) { - // Can get rid of one of these two checks if we duplicate the logic above here in the function - // and depending on which of the two versions we are in (the connected version or disconnected version) - // We also check if the interface has a teardown request in case worker - // 1. opened connection - // 2. sent of all packets (EDM sender channel was sufficiently empty) - // 3. closed the connection - // - // In such a case like that, we still want to formally teardown the connection to keep things clean - bool connect_requested = local_sender_channel_worker_interface.connection_is_live() || - local_sender_channel_worker_interface.has_worker_teardown_request(); - if (connect_requested) { - if constexpr (enable_fabric_counters) { - sender_channel_counters->add_connection(); - } - did_something = true; - channel_connection_established = true; - local_sender_channel_worker_interface.cache_producer_noc_addr(); - if constexpr (enable_first_level_ack) { - local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_ackptr.get_ptr()); - } else { - local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_rdptr.get_ptr()); - } - } - } else if (local_sender_channel_worker_interface.has_worker_teardown_request()) { - did_something = true; - channel_connection_established = false; - local_sender_channel_worker_interface.teardown_connection( - local_sender_channel_worker_interface.local_rdptr.get_ptr()); + bool check_connection_status = !channel_connection_established || local_sender_channel_worker_interface.has_worker_teardown_request(); + if (check_connection_status) { + check_worker_connections(local_sender_channel_worker_interface, channel_connection_established, did_something); } return did_something; From bc10e86cc83a848131175382d2528bad0fdf795a Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Mon, 24 Feb 2025 20:36:23 -0500 Subject: [PATCH 284/316] add pybindings for custom 1D fabric ctx switch intervals (#18239) ### Problem description There is currently no one-size-fits-all context switch interval for 1D fabric on Wormhole. In some use cases (e.g. 
test suites with many back to back tests) we want smaller intervals so teardown is quick. In other cases (real workloads), we want a longer interval since there may be longer gaps between subsequent ops using a given fabric link. ### What's changed Added pybindings for context switch interval override. By default, if a user does not provide an override, the fabric will use the implementation default, which is more favourable to test environments and faster teardown times. To override the context switch check interval, a user can override either `create_and_load_sub_device_manager_with_fabric_interface` or `ttnn.initialize_edm_fabric`. In both cases, the kw_only arg `context_switch_interval_override` is used to override the interval. The current default is `10000`. For performance oriented workloads, it is recommended to start in the 100k-200k range and tweak from there. --- .../unit_tests/operations/ccl/test_ccl_common.py | 8 +++++++- ttnn/cpp/ttnn/operations/ccl/ccl_pybind.cpp | 3 ++- .../operations/ccl/erisc_datamover_builder.cpp | 14 +++++++++++++- .../operations/ccl/erisc_datamover_builder.hpp | 5 ++++- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/ccl/test_ccl_common.py b/tests/ttnn/unit_tests/operations/ccl/test_ccl_common.py index 65fa2a49b73..0b7ece8de6d 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_ccl_common.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_ccl_common.py @@ -13,6 +13,7 @@ def create_and_load_sub_device_manager_with_fabric_interface( local_allocator_size, enable_persistent_fabric=True, wrap_fabric_around_mesh=False, + context_switch_interval_override=None, ): assert ccl_worker_sub_device_id < len(worker_sub_devices) mesh_sub_device_manager_id, fabric_subdevice_id = mesh_device.create_sub_device_manager_with_fabric( @@ -21,11 +22,16 @@ def create_and_load_sub_device_manager_with_fabric_interface( # fabric sub-device id can also be queried from device, no need to explicitly pass it in mesh_device.load_sub_device_manager(mesh_sub_device_manager_id) if enable_persistent_fabric: - ttnn.initialize_edm_fabric(mesh_device, wrap_fabric_around_mesh=wrap_fabric_around_mesh) + ttnn.initialize_edm_fabric( + mesh_device, + wrap_fabric_around_mesh=wrap_fabric_around_mesh, + context_switch_interval_override=context_switch_interval_override, + ) return mesh_sub_device_manager_id def teardown_fabric_interface(mesh_device): + logger.debug(f"Tearing down fabric (this may take a while if context switch interval is large)") ttnn.teardown_edm_fabric(mesh_device) ttnn.synchronize_devices(mesh_device) diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_pybind.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_pybind.cpp index adbd4c341ad..8d6041dd131 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_pybind.cpp @@ -23,7 +23,8 @@ void py_bind_common(pybind11::module& module) { &ttnn::ccl::initialize_edm_fabric, py::arg("mesh_device"), py::kw_only(), - py::arg("wrap_fabric_around_mesh") = false); + py::arg("wrap_fabric_around_mesh") = false, + py::arg("context_switch_interval_override") = std::nullopt); module.def("teardown_edm_fabric", &ttnn::ccl::teardown_edm_fabric, py::arg("mesh_device"), py::kw_only()); } diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp index 2f505f41586..3c61c8c37ea 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp @@ 
-825,7 +825,10 @@ void EdmLineFabricOpInterface::set_firmware_context_switch_interval(size_t inter } } -void initialize_edm_fabric(distributed::MeshDevice* mesh_device, bool wrap_fabric_around_mesh) { +void initialize_edm_fabric( + distributed::MeshDevice* mesh_device, + bool wrap_fabric_around_mesh, + std::optional context_switch_interval_override) { if (wrap_fabric_around_mesh) { auto devices = mesh_device->get_view().get_ring_devices(); std::vector program_ptrs; @@ -835,6 +838,9 @@ void initialize_edm_fabric(distributed::MeshDevice* mesh_device, bool wrap_fabri std::transform( programs.begin(), programs.end(), std::back_inserter(program_ptrs), [](Program& p) { return &p; }); EdmLineFabricOpInterface fabric_device_builders = EdmLineFabricOpInterface(devices, program_ptrs, true); + if (context_switch_interval_override.has_value()) { + fabric_device_builders.set_firmware_context_switch_interval(context_switch_interval_override.value()); + } fabric_device_builders.build_kernels(); for (size_t i = 0; i < devices.size(); i++) { @@ -865,6 +871,9 @@ void initialize_edm_fabric(distributed::MeshDevice* mesh_device, bool wrap_fabri }); row_fabric_lines.push_back( EdmLineFabricOpInterface(mesh_device->get_view().get_row_views()[i], program_ptrs, true)); + if (context_switch_interval_override.has_value()) { + row_fabric_lines.back().set_firmware_context_switch_interval(context_switch_interval_override.value()); + } } for (size_t i = 0; i < num_cols; i++) { @@ -875,6 +884,9 @@ void initialize_edm_fabric(distributed::MeshDevice* mesh_device, bool wrap_fabri } col_fabric_lines.push_back( EdmLineFabricOpInterface(mesh_device->get_view().get_column_views()[i], program_ptrs, true)); + if (context_switch_interval_override.has_value()) { + col_fabric_lines.back().set_firmware_context_switch_interval(context_switch_interval_override.value()); + } } std::for_each(row_fabric_lines.begin(), row_fabric_lines.end(), [](auto& line) { line.build_kernels(); }); diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp index 58f369b1cd0..ce0fac4e864 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp @@ -371,7 +371,10 @@ class EdmLineFabricOpInterface { size_t firmware_context_switch_interval = FabricEriscDatamoverBuilder::default_firmware_context_switch_interval; }; -void initialize_edm_fabric(distributed::MeshDevice* mesh_device, bool wrap_fabric_around_mesh = false); +void initialize_edm_fabric( + distributed::MeshDevice* mesh_device, + bool wrap_fabric_around_mesh = false, + std::optional context_switch_interval_override = std::nullopt); void teardown_edm_fabric(distributed::MeshDevice* mesh_device); }; // namespace ccl From 7e9eda695cd3644f9e193e3948cc3bebbc333cfc Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Tue, 25 Feb 2025 01:12:03 +0000 Subject: [PATCH 285/316] #0: show the kernel name when logging size The log output is scrambled due to JITBuild using multithreads. Add the kernel name to help identify which kernel the size corresponds to. 
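As a rough illustration, here is a standalone sketch of how the old and new messages render. It uses `fmt::print` directly rather than the tt-metal logger, and the RISC id, kernel name, and size are made-up values:

```cpp
// Illustrative only: renders the old and new debug-log format strings with
// {fmt}-style placeholders, as the logger does. All values are hypothetical.
#include <cstdint>
#include <string>

#include <fmt/core.h>

int main() {
    int riscv_id = 1;                            // hypothetical RISC index
    std::string name = "example_reader_kernel";  // hypothetical kernel name
    uint32_t binary_size = 1184;                 // hypothetical packed size in bytes

    // Old format: with multithreaded JIT builds, interleaved lines like this
    // are hard to attribute to a specific kernel.
    fmt::print("RISC {} kernel binary size: {} in bytes\n", riscv_id, binary_size);

    // New format: each line carries the kernel name, so it is self-identifying.
    fmt::print("RISC={}, name={}, size={} (bytes)\n", riscv_id, name, binary_size);
    return 0;
}
```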
--- tt_metal/impl/kernels/kernel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index 6299cd38e73..7e9d18c5ea6 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -401,7 +401,7 @@ void DataMovementKernel::read_binaries(IDevice* device) { load_type); binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); - log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", riscv_id, binary_size); + log_debug(LogLoader, "RISC={}, name={}, size={} (bytes)", riscv_id, this->name(), binary_size); this->set_binaries( BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key, std::move(binaries)); } @@ -424,7 +424,7 @@ void EthernetKernel::read_binaries(IDevice* device) { load_type); binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); - log_debug(LogLoader, "ERISC {} kernel binary size: {} in bytes", erisc_id, binary_size); + log_debug(LogLoader, "ERISC={}, name={}, size={} (bytes)", erisc_id, this->name(), binary_size); this->set_binaries( BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key, std::move(binaries)); } @@ -442,7 +442,7 @@ void ComputeKernel::read_binaries(IDevice* device) { ll_api::memory::Loading::CONTIGUOUS_XIP); binaries.push_back(&binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); - log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", trisc_id + 2, binary_size); + log_debug(LogLoader, "RISC={}, name={}, size={} (bytes)", trisc_id + 2, this->name(), binary_size); } this->set_binaries( BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key, std::move(binaries)); From f3bb74d68050bf90656512a52382b6345f026cb4 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Mon, 24 Feb 2025 22:28:09 -0500 Subject: [PATCH 286/316] #17477: Finalize adoption of ND `MeshShape` in Metal and TTNN. (#18190) ### Ticket #17477 ### Problem description Continuing with adopting ND coordinate system in Metal and TTNN. ### What's changed Remove the legacy `MeshShape`, and rename the new ND `SimpleMeshShape` to `MeshShape`: * Remove last usages in `MeshDevice`, tensor libs, and tests. * Remove `MeshOffset` and instead use `MeshCoordinate`. * Add `is_line_topology`, `zero_coordinate`. * Tests, tests, tests. 
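For orientation, a minimal standalone sketch of the renamed ND API follows. It is not part of this change: the include path is an assumption, and the expected values simply mirror the updated unit tests below.

```cpp
// Sketch only: exercises the ND MeshShape / MeshCoordinate API after the rename.
// The header path is assumed; adjust to however tt-metalium headers are consumed.
#include <iostream>

#include <tt-metalium/mesh_coord.hpp>

int main() {
    using namespace tt::tt_metal::distributed;

    MeshShape line(8);        // 1D shorthand
    MeshShape t3k(2, 4);      // 2D shorthand (T3000-style 2x4 mesh)
    MeshShape cube(2, 2, 2);  // 3D shorthand

    std::cout << t3k.mesh_size() << "\n";         // 8: total devices in the mesh
    std::cout << cube.dims() << "\n";             // 3: dimensionality
    std::cout << is_line_topology(line) << "\n";  // 1: at most one non-unit dimension
    std::cout << is_line_topology(t3k) << "\n";   // 0

    // MeshCoordinate replaces MeshOffset; zero_coordinate() gives the origin.
    MeshCoordinate origin = MeshCoordinate::zero_coordinate(t3k.dims());
    std::cout << (origin == MeshCoordinate(0, 0)) << "\n";            // 1
    std::cout << to_linear_index(t3k, MeshCoordinate(1, 2)) << "\n";  // 6: row-major, 1*4 + 2
    return 0;
}
```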
### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13490385633) - [X] [T3K unit tests](https://github.com/tenstorrent/tt-metal/actions/runs/13490397728) - [X] New/Existing tests provide coverage for changes --- conftest.py | 2 +- .../tt_metal/distributed/test_mesh_buffer.cpp | 4 +- .../tt_metal/distributed/test_mesh_coord.cpp | 74 ++++++---- .../tt_metal/distributed/test_mesh_device.cpp | 45 ++++-- .../distributed/test_mesh_device_reshape.cpp | 105 +++++++------- .../gtests/tensor/test_distributed_tensor.cpp | 32 ++-- .../unit_tests/gtests/test_ccl_on_galaxy.cpp | 11 +- .../examples/linear_regression_ddp/main.cpp | 2 +- tt-train/sources/examples/mnist_mlp/main.cpp | 2 +- tt-train/sources/examples/nano_gpt/utils.cpp | 2 +- .../sources/ttml/autograd/auto_context.hpp | 2 +- .../sources/ttml/core/distributed_mapping.hpp | 20 +-- tt-train/tests/core/distributed_test.cpp | 19 +-- tt-train/tests/core/n300_utils_test.cpp | 2 +- .../model/linear_regression_ddp_test.cpp | 2 +- .../tests/modules/distributed/linear_test.cpp | 2 +- .../tests/ops/distributed/comm_ops_test.cpp | 2 +- .../distributed/distributed_ttnn_ops_test.cpp | 2 +- tt_metal/api/tt-metalium/mesh_buffer.hpp | 4 +- tt_metal/api/tt-metalium/mesh_config.hpp | 12 +- tt_metal/api/tt-metalium/mesh_coord.hpp | 49 ++++--- tt_metal/api/tt-metalium/mesh_device.hpp | 12 +- tt_metal/api/tt-metalium/mesh_device_view.hpp | 9 +- tt_metal/api/tt-metalium/system_mesh.hpp | 2 +- tt_metal/common/mesh_coord.cpp | 50 ++++--- .../distributed/coordinate_translation.cpp | 21 +-- .../distributed/coordinate_translation.hpp | 2 +- tt_metal/distributed/mesh_command_queue.cpp | 1 - tt_metal/distributed/mesh_device.cpp | 137 +++++++++--------- tt_metal/distributed/mesh_device_view.cpp | 6 +- tt_metal/distributed/system_mesh.cpp | 15 +- .../distributed_program_dispatch.cpp | 2 +- .../distributed_buffer_rw.cpp | 2 +- .../distributed_eltwise_add.cpp | 2 +- ttnn/cpp/ttnn/distributed/api.cpp | 15 +- ttnn/cpp/ttnn/distributed/api.hpp | 2 +- .../ttnn/distributed/distributed_pybind.cpp | 67 ++++++--- .../ttnn/distributed/distributed_tensor.cpp | 55 +++---- ttnn/cpp/ttnn/distributed/types.hpp | 4 - ttnn/cpp/ttnn/tensor/storage.cpp | 24 +-- ttnn/cpp/ttnn/tensor/tensor_impl.cpp | 9 +- ttnn/ttnn/__init__.py | 2 +- ttnn/ttnn/distributed/distributed.py | 4 +- ttnn/ttnn/types.py | 2 +- 44 files changed, 453 insertions(+), 387 deletions(-) diff --git a/conftest.py b/conftest.py index 4be5deca442..9e94913a18f 100644 --- a/conftest.py +++ b/conftest.py @@ -256,7 +256,7 @@ def pcie_mesh_device(request, silicon_arch_name, silicon_arch_wormhole_b0, devic mesh_device = ttnn.open_mesh_device( mesh_shape=ttnn.MeshShape(2, 2), **updated_device_params, - offset=ttnn.MeshOffset(0, 1), + offset=ttnn.MeshCoordinate(0, 1), ) mesh_device.reshape(ttnn.MeshShape(1, 4)) diff --git a/tests/tt_metal/distributed/test_mesh_buffer.cpp b/tests/tt_metal/distributed/test_mesh_buffer.cpp index 364790f8984..36a54b6914c 100644 --- a/tests/tt_metal/distributed/test_mesh_buffer.cpp +++ b/tests/tt_metal/distributed/test_mesh_buffer.cpp @@ -129,7 +129,7 @@ TEST_F(MeshBufferTestT3000, Deallocation) { TEST(MeshBufferTest, DeallocationWithoutMeshDevice) { for (int i = 0; i < 100; i++) { auto config = - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 1), .offset = std::nullopt, .physical_device_ids = {}}; + MeshDeviceConfig{.mesh_shape = MeshShape(1, 1), .offset = std::nullopt, .physical_device_ids = {}}; auto mesh_device = MeshDevice::create(config, DEFAULT_L1_SMALL_SIZE, 
DEFAULT_TRACE_REGION_SIZE, 1, DispatchCoreType::WORKER); @@ -148,7 +148,7 @@ TEST(MeshBufferTest, DeallocationWithoutMeshDevice) { TEST(MeshBufferTest, DeallocationWithMeshDeviceClosed) { for (int i = 0; i < 100; i++) { auto config = - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 1), .offset = std::nullopt, .physical_device_ids = {}}; + MeshDeviceConfig{.mesh_shape = MeshShape(1, 1), .offset = std::nullopt, .physical_device_ids = {}}; auto mesh_device = MeshDevice::create(config, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, DispatchCoreType::WORKER); diff --git a/tests/tt_metal/distributed/test_mesh_coord.cpp b/tests/tt_metal/distributed/test_mesh_coord.cpp index 16eaa7a04bd..6d87c191930 100644 --- a/tests/tt_metal/distributed/test_mesh_coord.cpp +++ b/tests/tt_metal/distributed/test_mesh_coord.cpp @@ -14,26 +14,26 @@ namespace { using ::testing::ElementsAre; using ::testing::UnorderedElementsAre; -TEST(SimpleMeshShapeTest, Construction) { - SimpleMeshShape shape_1d(3); +TEST(MeshShapeTest, Construction) { + MeshShape shape_1d(3); EXPECT_EQ(shape_1d.dims(), 1); EXPECT_EQ(shape_1d[0], 3); EXPECT_EQ(shape_1d.mesh_size(), 3); - SimpleMeshShape shape_2d(3, 4); + MeshShape shape_2d(3, 4); EXPECT_EQ(shape_2d.dims(), 2); EXPECT_EQ(shape_2d[0], 3); EXPECT_EQ(shape_2d[1], 4); EXPECT_EQ(shape_2d.mesh_size(), 12); - SimpleMeshShape shape_3d(2, 3, 4); + MeshShape shape_3d(2, 3, 4); EXPECT_EQ(shape_3d.dims(), 3); EXPECT_EQ(shape_3d[0], 2); EXPECT_EQ(shape_3d[1], 3); EXPECT_EQ(shape_3d[2], 4); EXPECT_EQ(shape_3d.mesh_size(), 24); - SimpleMeshShape shape_5d({2, 3, 4, 5, 6}); + MeshShape shape_5d({2, 3, 4, 5, 6}); EXPECT_EQ(shape_5d.dims(), 5); EXPECT_EQ(shape_5d[0], 2); EXPECT_EQ(shape_5d[1], 3); @@ -43,25 +43,41 @@ TEST(SimpleMeshShapeTest, Construction) { EXPECT_EQ(shape_5d.mesh_size(), 720); } -TEST(SimpleMeshShapeTest, ZeroShape) { - SimpleMeshShape shape({}); +TEST(MeshShapeTest, ZeroShape) { + MeshShape shape({}); EXPECT_EQ(shape.dims(), 0); EXPECT_EQ(shape.mesh_size(), 0); } -TEST(SimpleMeshShapeTest, Strides) { - SimpleMeshShape shape(2, 3, 4); +TEST(MeshShapeTest, Strides) { + MeshShape shape(2, 3, 4); EXPECT_EQ(shape.get_stride(0), 12); // 3 * 4 EXPECT_EQ(shape.get_stride(1), 4); // 4 EXPECT_EQ(shape.get_stride(2), 1); // 1 } -TEST(SimpleMeshShapeTest, Comparison) { - SimpleMeshShape shape(2, 3); +TEST(MeshShapeTest, Comparison) { + MeshShape shape(2, 3); - EXPECT_EQ(shape, SimpleMeshShape(2, 3)); - EXPECT_NE(shape, SimpleMeshShape(3, 2)); - EXPECT_NE(shape, SimpleMeshShape(1, 2, 3)); + EXPECT_EQ(shape, MeshShape(2, 3)); + EXPECT_NE(shape, MeshShape(3, 2)); + EXPECT_NE(shape, MeshShape(1, 2, 3)); +} + +TEST(MeshShapeTest, LinearTopology) { + EXPECT_TRUE(is_line_topology(MeshShape(1))); + EXPECT_TRUE(is_line_topology(MeshShape(3))); + EXPECT_TRUE(is_line_topology(MeshShape(1, 1))); + EXPECT_TRUE(is_line_topology(MeshShape(1, 3))); + EXPECT_TRUE(is_line_topology(MeshShape(3, 1))); + EXPECT_FALSE(is_line_topology(MeshShape(3, 3))); + EXPECT_TRUE(is_line_topology(MeshShape(1, 1, 1))); + EXPECT_TRUE(is_line_topology(MeshShape(1, 1, 3))); + EXPECT_TRUE(is_line_topology(MeshShape(1, 3, 1))); + EXPECT_TRUE(is_line_topology(MeshShape(3, 1, 1))); + EXPECT_FALSE(is_line_topology(MeshShape(1, 3, 3))); + EXPECT_FALSE(is_line_topology(MeshShape(3, 1, 3))); + EXPECT_FALSE(is_line_topology(MeshShape(3, 3, 3))); } TEST(MeshCoordinateTest, Construction) { @@ -117,8 +133,14 @@ TEST(MeshCoordinateTest, UnorderedSet) { MeshCoordinate(0, 0, 2))); } +TEST(MeshCoordinateTest, ZeroCoordinate) { + 
EXPECT_EQ(MeshCoordinate::zero_coordinate(1), MeshCoordinate(0)); + EXPECT_EQ(MeshCoordinate::zero_coordinate(2), MeshCoordinate(0, 0)); + EXPECT_EQ(MeshCoordinate::zero_coordinate(3), MeshCoordinate(0, 0, 0)); +} + TEST(MeshCoordinateRangeTest, FromShape) { - SimpleMeshShape shape(2, 3); + MeshShape shape(2, 3); MeshCoordinateRange range(shape); std::vector coords; @@ -211,7 +233,7 @@ TEST(MeshCoordinateRangeTest, InvalidRange) { } TEST(ToLinearIndexTest, Basic) { - SimpleMeshShape shape(2, 2, 3); + MeshShape shape(2, 2, 3); EXPECT_EQ(to_linear_index(shape, MeshCoordinate(0, 0, 0)), 0); EXPECT_EQ(to_linear_index(shape, MeshCoordinate(0, 0, 1)), 1); @@ -228,16 +250,16 @@ TEST(ToLinearIndexTest, Basic) { } TEST(ToLinearIndexTest, MismatchedDimensions) { - EXPECT_ANY_THROW(to_linear_index(SimpleMeshShape(1, 2, 3), MeshCoordinate(0, 0))); + EXPECT_ANY_THROW(to_linear_index(MeshShape(1, 2, 3), MeshCoordinate(0, 0))); } TEST(ToLinearIndexTest, OutOfBounds) { - EXPECT_ANY_THROW(to_linear_index(SimpleMeshShape(2, 3), MeshCoordinate(2, 0))); - EXPECT_ANY_THROW(to_linear_index(SimpleMeshShape(2, 3), MeshCoordinate(0, 3))); + EXPECT_ANY_THROW(to_linear_index(MeshShape(2, 3), MeshCoordinate(2, 0))); + EXPECT_ANY_THROW(to_linear_index(MeshShape(2, 3), MeshCoordinate(0, 3))); } TEST(MeshContainerTest, InitialValues) { - SimpleMeshShape shape(2, 3); + MeshShape shape(2, 3); MeshContainer container(shape, 3); std::vector initial_values; @@ -248,7 +270,7 @@ TEST(MeshContainerTest, InitialValues) { } TEST(MeshContainerTest, FromVector) { - SimpleMeshShape shape(2, 3); + MeshShape shape(2, 3); MeshContainer container(shape, std::vector{0, 1, 2, 3, 4, 5}); std::vector initial_values; @@ -259,12 +281,12 @@ TEST(MeshContainerTest, FromVector) { } TEST(MeshContainerTest, FromVectorInvalidSize) { - SimpleMeshShape shape(2, 3); + MeshShape shape(2, 3); EXPECT_ANY_THROW(MeshContainer(shape, std::vector{0, 1, 2, 3, 4})); } TEST(MeshContainerTest, ElementAccessRowMajor) { - SimpleMeshShape shape(2, 3); + MeshShape shape(2, 3); MeshContainer container(shape, 0); container.at(MeshCoordinate(0, 0)) = 0; @@ -294,7 +316,7 @@ TEST(MeshContainerTest, ElementAccessRowMajor) { } TEST(MeshContainerTest, ConstContainer) { - SimpleMeshShape shape(2, 3); + MeshShape shape(2, 3); const MeshContainer container(shape, 0); std::vector coords; @@ -317,7 +339,7 @@ TEST(MeshContainerTest, ConstContainer) { } TEST(MeshContainerTest, MutateThroughProxy) { - SimpleMeshShape shape(2, 3); + MeshShape shape(2, 3); MeshContainer container(shape, 0); // Proxy class provides access to the container value through the mutable reference. 
@@ -340,7 +362,7 @@ TEST(MeshContainerTest, MutateThroughProxy) { } TEST(MeshContainerTest, OutOfBounds) { - SimpleMeshShape shape(2, 3); + MeshShape shape(2, 3); MeshContainer container(shape, 0); EXPECT_ANY_THROW(container.at(MeshCoordinate(2, 0))); diff --git a/tests/tt_metal/distributed/test_mesh_device.cpp b/tests/tt_metal/distributed/test_mesh_device.cpp index c87c87cae35..501d2f3d874 100644 --- a/tests/tt_metal/distributed/test_mesh_device.cpp +++ b/tests/tt_metal/distributed/test_mesh_device.cpp @@ -55,30 +55,37 @@ TEST_F(MeshDeviceTest, NumDramChannels) { TEST_F(MeshDeviceTest, ViewIs2D) { std::vector devices = mesh_device_->get_devices(); - MeshContainer container_1d(SimpleMeshShape(8), devices); + MeshContainer container_1d(MeshShape(8), devices); MeshDeviceView view_1d(container_1d); EXPECT_FALSE(view_1d.is_mesh_2d()); - MeshContainer container_2d(SimpleMeshShape(2, 4), devices); + MeshContainer container_2d(MeshShape(2, 4), devices); MeshDeviceView view_2d(container_2d); EXPECT_TRUE(view_2d.is_mesh_2d()); - MeshContainer container_3d(SimpleMeshShape(2, 2, 2), devices); + MeshContainer container_3d(MeshShape(2, 2, 2), devices); MeshDeviceView view_3d(container_3d); EXPECT_FALSE(view_3d.is_mesh_2d()); } -TEST_F(MeshDeviceTest, Submesh) { - EXPECT_EQ(mesh_device_->shape().num_rows, 2); - EXPECT_EQ(mesh_device_->shape().num_cols, 4); +TEST_F(MeshDeviceTest, CreateSubmeshInvalidConfig) { + EXPECT_EQ(mesh_device_->shape(), MeshShape(2, 4)); + + EXPECT_ANY_THROW(mesh_device_->create_submesh(MeshShape{1, 3}, MeshCoordinate{1})); + EXPECT_ANY_THROW(mesh_device_->create_submesh(MeshShape{0, 3}, MeshCoordinate{0, 0})); + EXPECT_ANY_THROW(mesh_device_->create_submesh(MeshShape{2, 4}, MeshCoordinate{1, 1})); + EXPECT_ANY_THROW(mesh_device_->create_submesh(MeshShape{2, 4, 1}, MeshCoordinate{0, 0})); +} + +TEST_F(MeshDeviceTest, CreateSubmesh) { + EXPECT_EQ(mesh_device_->shape(), MeshShape(2, 4)); EXPECT_THAT(mesh_device_->get_devices(), SizeIs(8)); EXPECT_TRUE(mesh_device_->is_parent_mesh()); EXPECT_THAT(mesh_device_->get_submeshes(), IsEmpty()); - auto submesh = mesh_device_->create_submesh(MeshShape{1, 2}, MeshOffset{1, 1}); + auto submesh = mesh_device_->create_submesh(MeshShape{1, 2}, MeshCoordinate{1, 1}); EXPECT_THAT(mesh_device_->get_submeshes(), SizeIs(1)); - EXPECT_EQ(submesh->shape().num_rows, 1); - EXPECT_EQ(submesh->shape().num_cols, 2); + EXPECT_EQ(submesh->shape(), MeshShape(1, 2)); EXPECT_THAT(submesh->get_devices(), SizeIs(2)); EXPECT_FALSE(submesh->is_parent_mesh()); EXPECT_THAT(submesh->get_submeshes(), IsEmpty()); @@ -86,7 +93,25 @@ TEST_F(MeshDeviceTest, Submesh) { // Verify coordinates are correct. 
EXPECT_EQ(mesh_device_->get_device(MeshCoordinate{1, 1})->id(), submesh->get_device(MeshCoordinate{0, 0})->id()); EXPECT_EQ(mesh_device_->get_device(MeshCoordinate{1, 2})->id(), submesh->get_device(MeshCoordinate{0, 1})->id()); - EXPECT_EQ(submesh->get_device(1, 1), nullptr); + EXPECT_EQ(submesh->get_device(MeshCoordinate{1, 1}), nullptr); +} + +TEST_F(MeshDeviceTest, CreateSubmeshesNonDivisibleSubshape) { + EXPECT_EQ(mesh_device_->shape(), MeshShape(2, 4)); + EXPECT_ANY_THROW(mesh_device_->create_submeshes(MeshShape{1, 3})); +} + +TEST_F(MeshDeviceTest, CreateSubmeshes) { + EXPECT_EQ(mesh_device_->shape(), MeshShape(2, 4)); + + auto submeshes = mesh_device_->create_submeshes(MeshShape{1, 2}); + EXPECT_THAT(submeshes, SizeIs(4)); + for (const auto& submesh : submeshes) { + EXPECT_EQ(submesh->shape(), MeshShape(1, 2)); + EXPECT_THAT(submesh->get_devices(), SizeIs(2)); + } + + EXPECT_EQ(mesh_device_->get_submeshes(), submeshes); } } // namespace diff --git a/tests/tt_metal/distributed/test_mesh_device_reshape.cpp b/tests/tt_metal/distributed/test_mesh_device_reshape.cpp index 893ad9aca1a..7f858002eb7 100644 --- a/tests/tt_metal/distributed/test_mesh_device_reshape.cpp +++ b/tests/tt_metal/distributed/test_mesh_device_reshape.cpp @@ -8,6 +8,7 @@ #include #include "host_api.hpp" +#include "indestructible.hpp" #include "mesh_config.hpp" #include "mesh_device.hpp" #include "mesh_coord.hpp" @@ -43,22 +44,26 @@ class T3KTestFixture : public ::testing::Test { } }; -constexpr std::array kMeshShapes{{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, - {2, 1}, {2, 2}, {2, 3}, {2, 4}, {3, 1}, {3, 2}, {4, 1}, {4, 2}, - {8, 1}, {7, 1}, {6, 1}, {5, 1}, {4, 1}, {3, 1}, {2, 1}, {1, 1}}}; +const std::vector get_mesh_shapes() { + static tt::stl::Indestructible> kMeshShapes(std::vector{ + MeshShape{1, 1}, MeshShape{1, 2}, MeshShape{1, 3}, MeshShape{1, 4}, MeshShape{1, 5}, MeshShape{1, 6}, + MeshShape{1, 7}, MeshShape{1, 8}, MeshShape{2, 1}, MeshShape{2, 2}, MeshShape{2, 3}, MeshShape{2, 4}, + MeshShape{3, 1}, MeshShape{3, 2}, MeshShape{4, 1}, MeshShape{4, 2}, MeshShape{8, 1}, MeshShape{7, 1}, + MeshShape{6, 1}, MeshShape{5, 1}, MeshShape{4, 1}, MeshShape{3, 1}, MeshShape{2, 1}, MeshShape{1, 1}}); + return kMeshShapes.get(); +} class MeshConfigurationTest : public T3KTestFixture, public ::testing::WithParamInterface {}; TEST_P(MeshConfigurationTest, MeshConfigurations) { const auto& shape = GetParam(); auto mesh = tt::tt_metal::distributed::MeshDevice::create( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(shape.num_rows, shape.num_cols)}, + MeshDeviceConfig{.mesh_shape = MeshShape(shape)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - EXPECT_EQ(mesh->num_rows(), shape.num_rows); - EXPECT_EQ(mesh->num_cols(), shape.num_cols); + EXPECT_EQ(mesh->shape(), shape); mesh->close(); } @@ -67,12 +72,12 @@ TEST_P(MeshConfigurationTest, GetPhysicalDeviceIds) { auto& system_mesh = SystemMesh::instance(); EXPECT_THAT( - system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(shape)}), - SizeIs(shape.num_cols * shape.num_rows)); + system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = MeshShape(shape)}), + SizeIs(shape.mesh_size())); } // Test all possible mesh configurations on T3000 -INSTANTIATE_TEST_SUITE_P(AllMeshShapes, MeshConfigurationTest, ::testing::ValuesIn(kMeshShapes)); +INSTANTIATE_TEST_SUITE_P(AllMeshShapes, MeshConfigurationTest, ::testing::ValuesIn(get_mesh_shapes())); class 
MeshDeviceReshapeRoundtripTest : public T3KTestFixture, public ::testing::WithParamInterface> {}; @@ -80,42 +85,40 @@ class MeshDeviceReshapeRoundtripTest : public T3KTestFixture, TEST_P(MeshDeviceReshapeRoundtripTest, ReshapeBetweenConfigurations) { const auto& [old_shape, new_shape] = GetParam(); - if ((old_shape.num_rows * old_shape.num_cols) != (new_shape.num_rows * new_shape.num_cols)) { + if (old_shape.mesh_size() != new_shape.mesh_size()) { GTEST_SKIP() << "Device counts don't match; we test this in InvalidReshapeDimensions"; } - if (old_shape.num_rows == 1 or old_shape.num_cols == 1 or new_shape.num_rows == 1 or new_shape.num_cols == 1) { - GTEST_SKIP() << "Old shape is 1xN or Nx1; we test this in From1x4To2x2Invalid"; + if (is_line_topology(old_shape) or is_line_topology(new_shape)) { + GTEST_SKIP() << "Either old or new shape is in line configuration; we test this in From1x4To2x2Invalid"; } auto mesh = tt::tt_metal::distributed::MeshDevice::create( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(old_shape.num_rows, old_shape.num_cols)}, + MeshDeviceConfig{.mesh_shape = MeshShape(old_shape)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - EXPECT_EQ(mesh->num_rows(), old_shape.num_rows); - EXPECT_EQ(mesh->num_cols(), old_shape.num_cols); + EXPECT_EQ(mesh->shape(), old_shape); auto original_order = mesh->get_device_ids(); // Attempt reshape - mesh->reshape({new_shape.num_rows, new_shape.num_cols}); + mesh->reshape(new_shape); // Verify new shape - EXPECT_EQ(mesh->num_rows(), new_shape.num_rows); - EXPECT_EQ(mesh->num_cols(), new_shape.num_cols); + EXPECT_EQ(mesh->shape(), new_shape); // Verify device ordering is preserved EXPECT_EQ(mesh->get_device_ids(), original_order) - << "Device ordering is not preserved " << SimpleMeshShape(old_shape) << " -> " << SimpleMeshShape(new_shape); + << "Device ordering is not preserved " << MeshShape(old_shape) << " -> " << new_shape; } // Generate all possible combinations of shapes from kMeshShapes INSTANTIATE_TEST_SUITE_P( AllMeshShapes, MeshDeviceReshapeRoundtripTest, - ::testing::Combine(::testing::ValuesIn(kMeshShapes), ::testing::ValuesIn(kMeshShapes))); + ::testing::Combine(::testing::ValuesIn(get_mesh_shapes()), ::testing::ValuesIn(get_mesh_shapes()))); // Base class for non-parameterized tests using MeshDeviceReshapeTest = T3KTestFixture; @@ -124,57 +127,54 @@ TEST_F(MeshDeviceReshapeTest, InvalidRequestedShape) { auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); // Shape too big. - EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(9)})); - EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 5)})); + EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = MeshShape(9)})); + EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = MeshShape(2, 5)})); // Invalid offset. EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8), .offset = MeshCoordinate(0, 1)})); + MeshDeviceConfig{.mesh_shape = MeshShape(1, 8), .offset = MeshCoordinate(0, 1)})); EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 3), .offset = MeshCoordinate(1, 1)})); + MeshDeviceConfig{.mesh_shape = MeshShape(2, 3), .offset = MeshCoordinate(1, 1)})); // Offset dimensionality mismatch. 
EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 3), .offset = MeshCoordinate(1)})); + MeshDeviceConfig{.mesh_shape = MeshShape(2, 3), .offset = MeshCoordinate(1)})); // Mismatch system mesh shape. EXPECT_ANY_THROW(system_mesh.get_mapped_physical_device_ids( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(8), .offset = MeshCoordinate(1)})); + MeshDeviceConfig{.mesh_shape = MeshShape(8), .offset = MeshCoordinate(1)})); } TEST_F(MeshDeviceReshapeTest, InvalidReshapeDimensions) { auto mesh = tt::tt_metal::distributed::MeshDevice::create( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + MeshDeviceConfig{.mesh_shape = MeshShape(1, 8)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); // Test reshaping to dimensions that don't match total device count - EXPECT_THROW(mesh->reshape({3, 3}), std::runtime_error); // 9 devices != 8 - EXPECT_THROW(mesh->reshape({1, 9}), std::runtime_error); // 9 devices != 8 + EXPECT_THROW(mesh->reshape(MeshShape(3, 3)), std::runtime_error); // 9 devices != 8 + EXPECT_THROW(mesh->reshape(MeshShape(1, 9)), std::runtime_error); // 9 devices != 8 // Verify original shape is preserved after failed reshapes - EXPECT_EQ(mesh->num_rows(), 1); - EXPECT_EQ(mesh->num_cols(), 8); + EXPECT_EQ(mesh->shape(), MeshShape(1, 8)); } TEST_F(MeshDeviceReshapeTest, From1x8To2x4ThenBackTo1x8) { auto mesh = tt::tt_metal::distributed::MeshDevice::create( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + MeshDeviceConfig{.mesh_shape = MeshShape(1, 8)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - EXPECT_EQ(mesh->num_rows(), 1); - EXPECT_EQ(mesh->num_cols(), 8); + EXPECT_EQ(mesh->shape(), MeshShape(1, 8)); auto original_order = mesh->get_device_ids(); - mesh->reshape({2, 4}); + mesh->reshape(MeshShape(2, 4)); - EXPECT_EQ(mesh->num_rows(), 2); - EXPECT_EQ(mesh->num_cols(), 4); + EXPECT_EQ(mesh->shape(), MeshShape(2, 4)); std::vector expected_physical_device_id_order = { original_order[0], original_order[1], @@ -189,37 +189,36 @@ TEST_F(MeshDeviceReshapeTest, From1x8To2x4ThenBackTo1x8) { auto new_order = mesh->get_device_ids(); EXPECT_EQ(new_order, expected_physical_device_id_order); - mesh->reshape({1, 8}); + mesh->reshape(MeshShape(1, 8)); EXPECT_EQ(mesh->get_device_ids(), original_order); } TEST_F(MeshDeviceReshapeTest, InvalidTotalDeviceCount) { auto mesh = tt::tt_metal::distributed::MeshDevice::create( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + MeshDeviceConfig{.mesh_shape = MeshShape(1, 8)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); // Test reshaping to dimensions that don't match total device count - EXPECT_THROW(mesh->reshape({3, 3}), std::runtime_error); // 9 devices != 8 - EXPECT_THROW(mesh->reshape({1, 9}), std::runtime_error); // 9 devices != 8 + EXPECT_THROW(mesh->reshape(MeshShape(3, 3)), std::runtime_error); // 9 devices != 8 + EXPECT_THROW(mesh->reshape(MeshShape(1, 9)), std::runtime_error); // 9 devices != 8 // Verify original shape is preserved after failed reshapes - EXPECT_EQ(mesh->num_rows(), 1); - EXPECT_EQ(mesh->num_cols(), 8); + EXPECT_EQ(mesh->shape(), MeshShape(1, 8)); } TEST_F(MeshDeviceReshapeTest, From1x4To2x2Invalid) { auto mesh = tt::tt_metal::distributed::MeshDevice::create( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 4)}, + MeshDeviceConfig{.mesh_shape = MeshShape(1, 4)}, DEFAULT_L1_SMALL_SIZE, 
DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); // This is an invalid reshape because the 1x4 mesh does not fully cover the 2x2 mesh - EXPECT_THROW(mesh->reshape({2, 2}), std::runtime_error); + EXPECT_THROW(mesh->reshape(MeshShape(2, 2)), std::runtime_error); } TEST_F(MeshDeviceReshapeTest, From1x4To2x2Valid) { @@ -227,21 +226,20 @@ TEST_F(MeshDeviceReshapeTest, From1x4To2x2Valid) { // Fetch the device ids for a physically connected 2x2 mesh. auto physical_device_ids = system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{ - .mesh_shape = SimpleMeshShape(2, 2), + .mesh_shape = MeshShape(2, 2), }); // Supply the physical device ids to the mesh constructor that we know we know is 2x2 physically connected. // We will create a 1x4 mesh and then reshape it to 2x2. auto mesh = tt::tt_metal::distributed::MeshDevice::create( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 4), .physical_device_ids = physical_device_ids}, + MeshDeviceConfig{.mesh_shape = MeshShape(1, 4), .physical_device_ids = physical_device_ids}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - mesh->reshape({2, 2}); - EXPECT_EQ(mesh->num_rows(), 2); - EXPECT_EQ(mesh->num_cols(), 2); + mesh->reshape(MeshShape(2, 2)); + EXPECT_EQ(mesh->shape(), MeshShape(2, 2)); auto new_layout = mesh->get_device_ids(); for (auto physical_device_id : physical_device_ids) { EXPECT_TRUE(std::find(new_layout.begin(), new_layout.end(), physical_device_id) != new_layout.end()); @@ -250,7 +248,7 @@ TEST_F(MeshDeviceReshapeTest, From1x4To2x2Valid) { TEST_F(MeshDeviceReshapeTest, From2x2To1x4) { auto mesh = tt::tt_metal::distributed::MeshDevice::create( - MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 2)}, + MeshDeviceConfig{.mesh_shape = MeshShape(2, 2)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, @@ -258,9 +256,8 @@ TEST_F(MeshDeviceReshapeTest, From2x2To1x4) { auto mesh_2x2_device_ids = mesh->get_device_ids(); - mesh->reshape({1, 4}); - EXPECT_EQ(mesh->num_rows(), 1); - EXPECT_EQ(mesh->num_cols(), 4); + mesh->reshape(MeshShape(1, 4)); + EXPECT_EQ(mesh->shape(), MeshShape(1, 4)); auto mesh_1x4_device_ids = mesh->get_device_ids(); std::vector expected_1x4_device_ids = { diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_distributed_tensor.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_distributed_tensor.cpp index 8b753db4043..810da702d59 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_distributed_tensor.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_distributed_tensor.cpp @@ -102,15 +102,16 @@ TEST_F(TensorDistributionTest, Shard1D) { } TEST_F(TensorDistributionTest, Shard2DInvalidMeshShape) { - const auto [num_rows, num_cols] = mesh_device_->shape(); - ASSERT_EQ(num_rows, 2); - ASSERT_EQ(num_cols, 4); + ASSERT_EQ(mesh_device_->shape(), MeshShape(2, 4)); EXPECT_ANY_THROW( shard_tensor_to_2d_mesh_mapper(*mesh_device_, MeshShape{3, 1}, Shard2dConfig{.row_dim = 1, .col_dim = 2})); EXPECT_ANY_THROW( shard_tensor_to_2d_mesh_mapper(*mesh_device_, MeshShape{2, 5}, Shard2dConfig{.row_dim = 1, .col_dim = 2})); + + EXPECT_ANY_THROW( + shard_tensor_to_2d_mesh_mapper(*mesh_device_, MeshShape{1, 1, 2}, Shard2dConfig{.row_dim = 1, .col_dim = 2})); } TEST_F(TensorDistributionTest, Shard2DInvalidShardConfig) { @@ -122,19 +123,18 @@ TEST_F(TensorDistributionTest, Concat2DInvalidConfig) { } TEST_F(TensorDistributionTest, Shard2DReplicateDim) { - const auto [num_rows, num_cols] = mesh_device_->shape(); - ASSERT_EQ(num_rows, 2); - ASSERT_EQ(num_cols, 4); - const int 
num_devices = num_rows * num_cols; + constexpr size_t kNumRows = 2; + constexpr size_t kNumCols = 4; + ASSERT_EQ(mesh_device_->shape(), MeshShape(kNumRows, kNumCols)); std::vector test_data = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; Tensor input_tensor = - Tensor::from_vector(test_data, get_tensor_spec(ttnn::Shape{1, num_rows, num_cols, 1}, DataType::FLOAT32)); + Tensor::from_vector(test_data, get_tensor_spec(ttnn::Shape{1, kNumRows, kNumCols, 1}, DataType::FLOAT32)); input_tensor.print(); auto mapper = shard_tensor_to_2d_mesh_mapper( *mesh_device_, - MeshShape{num_rows, num_cols}, + MeshShape{kNumRows, kNumCols}, Shard2dConfig{ .row_dim = 1, }); @@ -154,21 +154,21 @@ TEST_F(TensorDistributionTest, Shard2DReplicateDim) { } TEST_F(TensorDistributionTest, Shard2D) { - const auto [num_rows, num_cols] = mesh_device_->shape(); - ASSERT_EQ(num_rows, 2); - ASSERT_EQ(num_cols, 4); - const int num_devices = num_rows * num_cols; + constexpr size_t kNumRows = 2; + constexpr size_t kNumCols = 4; + ASSERT_EQ(mesh_device_->shape(), MeshShape(kNumRows, kNumCols)); + const int num_devices = kNumRows * kNumCols; std::vector test_data; for (int i = 0; i < num_devices; i++) { test_data.insert(test_data.end(), {i * 1.F, i * 2.F, i * 3.F}); } Tensor input_tensor = - Tensor::from_vector(test_data, get_tensor_spec(ttnn::Shape{1, num_rows, num_cols, 3}, DataType::FLOAT32)); + Tensor::from_vector(test_data, get_tensor_spec(ttnn::Shape{1, kNumRows, kNumCols, 3}, DataType::FLOAT32)); auto mapper = shard_tensor_to_2d_mesh_mapper( *mesh_device_, - MeshShape{num_rows, num_cols}, + MeshShape{kNumRows, kNumCols}, Shard2dConfig{ .row_dim = 1, .col_dim = 2, @@ -190,7 +190,7 @@ TEST_F(TensorDistributionTest, Shard2D) { Tensor concatenated_tensor = aggregate_tensor(sharded_tensor, *composer); Tensor expected_tensor = - Tensor::from_vector(test_data, get_tensor_spec(ttnn::Shape{num_rows, 1, num_cols, 3}, DataType::FLOAT32)); + Tensor::from_vector(test_data, get_tensor_spec(ttnn::Shape{kNumRows, 1, kNumCols, 3}, DataType::FLOAT32)); EXPECT_TRUE(ttnn::allclose(concatenated_tensor, expected_tensor)); } diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp index 69ba9810227..17fdd93ee1a 100644 --- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp +++ b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp @@ -73,14 +73,13 @@ bool is_tgg_system() { } ttnn::MeshShape get_mesh_shape() { - ttnn::MeshShape shape; if (is_tg_system()) { - shape = {8, 4}; + return ttnn::MeshShape{8, 4}; + } else if (is_tgg_system()) { + return ttnn::MeshShape{8, 8}; } else { - TT_FATAL(is_tgg_system(), "Unsupported Galaxy system"); - shape = {8, 8}; + TT_THROW("Unsupported Galaxy system"); } - return shape; } void validate_num_tunnels_and_tunnel_depth() { @@ -212,7 +211,7 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) { auto view = ttnn::MeshDeviceView(*mesh); std::vector ring_devices = view.get_devices_on_row(0); // Tunnel 0 std::vector ring_devices_1 = - view.get_devices_on_column(mesh_shape.num_cols - 1); // Orthogonal to tunnel .. no deadlocks + view.get_devices_on_column(mesh_shape[1] - 1); // Orthogonal to tunnel .. no deadlocks ring_devices_1 = std::vector(ring_devices_1.begin() + 1, ring_devices_1.end()); std::vector ring_devices_2 = view.get_devices_on_row(7); // Tunnel 7 .. 
potential deadlocks with lack of buffering diff --git a/tt-train/sources/examples/linear_regression_ddp/main.cpp b/tt-train/sources/examples/linear_regression_ddp/main.cpp index af0d6d14927..309b6039559 100644 --- a/tt-train/sources/examples/linear_regression_ddp/main.cpp +++ b/tt-train/sources/examples/linear_regression_ddp/main.cpp @@ -32,7 +32,7 @@ int main() { const size_t num_targets = 32; const float noise = 0.0F; const bool bias = true; - ttml::autograd::ctx().set_mesh_shape({1, 2}); + ttml::autograd::ctx().set_mesh_shape(tt::tt_metal::distributed::MeshShape(1, 2)); auto training_params = ttml::datasets::MakeRegressionParams{ .n_samples = training_samples_count, diff --git a/tt-train/sources/examples/mnist_mlp/main.cpp b/tt-train/sources/examples/mnist_mlp/main.cpp index 649e7463c26..8d551264cfe 100644 --- a/tt-train/sources/examples/mnist_mlp/main.cpp +++ b/tt-train/sources/examples/mnist_mlp/main.cpp @@ -67,7 +67,7 @@ TrainingConfig parse_config(const YAML::Node &yaml_config) { void initialize_device(bool enable_tp) { if (enable_tp) { // we support only N300 for now - ttml::autograd::ctx().set_mesh_shape({1, 2}); + ttml::autograd::ctx().set_mesh_shape(tt::tt_metal::distributed::MeshShape(1, 2)); } } diff --git a/tt-train/sources/examples/nano_gpt/utils.cpp b/tt-train/sources/examples/nano_gpt/utils.cpp index f8e53e68042..a5526debef2 100644 --- a/tt-train/sources/examples/nano_gpt/utils.cpp +++ b/tt-train/sources/examples/nano_gpt/utils.cpp @@ -97,6 +97,6 @@ std::unique_ptr create_warmup_with_linear_sch void initialize_device(bool ddp) { if (ddp) { // currently supports only N300 device - ttml::autograd::ctx().set_mesh_shape({1, 2}); + ttml::autograd::ctx().set_mesh_shape(tt::tt_metal::distributed::MeshShape(1, 2)); } } diff --git a/tt-train/sources/ttml/autograd/auto_context.hpp b/tt-train/sources/ttml/autograd/auto_context.hpp index 8d335836ca4..03e85716709 100644 --- a/tt-train/sources/ttml/autograd/auto_context.hpp +++ b/tt-train/sources/ttml/autograd/auto_context.hpp @@ -59,7 +59,7 @@ class AutoContext { GradMode m_grads_mode = GradMode::ENABLED; Graph m_graph; - tt::tt_metal::distributed::MeshShape m_mesh_shape = {1, 1}; + tt::tt_metal::distributed::MeshShape m_mesh_shape = tt::tt_metal::distributed::MeshShape(1, 1); std::unique_ptr m_device; friend class tt::stl::Indestructible; diff --git a/tt-train/sources/ttml/core/distributed_mapping.hpp b/tt-train/sources/ttml/core/distributed_mapping.hpp index 1ba3a9e5c02..2b10c4fb4fd 100644 --- a/tt-train/sources/ttml/core/distributed_mapping.hpp +++ b/tt-train/sources/ttml/core/distributed_mapping.hpp @@ -34,7 +34,7 @@ class XTensorToMesh { tt::tt_metal::distributed::MeshShape m_mesh_shape; size_t get_num_devices() const { - return m_mesh_shape.num_rows * m_mesh_shape.num_cols; + return m_mesh_shape.mesh_size(); } }; @@ -90,8 +90,8 @@ class ShardTensor2dMesh : public XTensorToMesh, T> { throw std::invalid_argument("ShardTensor2dMesh requires at least one dimension to shard"); } - int rows = Base::m_mesh_shape.num_rows; - int cols = Base::m_mesh_shape.num_cols; + int rows = Base::m_mesh_shape[0]; + int cols = Base::m_mesh_shape[1]; auto row_dim = m_dims.first; auto col_dim = m_dims.second; @@ -138,8 +138,8 @@ class ShardTensor2dMesh : public XTensorToMesh, T> { std::unordered_map config_impl() const { return { {"strategy", "shard_2d"}, - {"mesh_shape_y", std::to_string(Base::m_mesh_shape.num_rows)}, - {"mesh_shape_x", std::to_string(Base::m_mesh_shape.num_cols)}}; + {"mesh_shape_y", std::to_string(Base::m_mesh_shape[0])}, + 
{"mesh_shape_x", std::to_string(Base::m_mesh_shape[1])}}; } private: @@ -153,16 +153,16 @@ class ConcatMesh2dToTensor : public MeshToXTensor, T> { ConcatMesh2dToTensor( tt::tt_metal::distributed::MeshShape mesh_shape, const tt::tt_metal::distributed::MeshShape& dims) : Base(std::move(mesh_shape)), m_dims(dims) { - if (m_dims.num_rows == m_dims.num_cols) { + if (m_dims[0] == m_dims[1]) { throw std::invalid_argument("Dimensions in 'dims' must be different"); } } std::vector> compose_impl(const std::vector>& tensors) const { - int rows = Base::m_mesh_shape.num_rows; - int cols = Base::m_mesh_shape.num_cols; - size_t row_dim = m_dims.num_rows; - size_t col_dim = m_dims.num_cols; + int rows = Base::m_mesh_shape[0]; + int cols = Base::m_mesh_shape[1]; + size_t row_dim = m_dims[0]; + size_t col_dim = m_dims[1]; std::vector> row_concatenated; row_concatenated.reserve(static_cast(rows)); diff --git a/tt-train/tests/core/distributed_test.cpp b/tt-train/tests/core/distributed_test.cpp index 0617c317ef3..926b671393f 100644 --- a/tt-train/tests/core/distributed_test.cpp +++ b/tt-train/tests/core/distributed_test.cpp @@ -11,6 +11,7 @@ namespace { +using MetalMeshShape = ::tt::tt_metal::distributed::MeshShape; using ::testing::SizeIs; template @@ -23,7 +24,7 @@ using TestTypes = ::testing::Types; TYPED_TEST_SUITE(MeshOpsTest, TestTypes); TYPED_TEST(MeshOpsTest, ShardXTensorToMeshBasicShard) { - tt::tt_metal::distributed::MeshShape mesh_shape = {1, 4}; + MetalMeshShape mesh_shape{1, 4}; // A simple 1D tensor to shard across 4 devices auto tensor = xt::arange(8); // [0,...,7] @@ -40,7 +41,7 @@ TYPED_TEST(MeshOpsTest, ShardXTensorToMeshBasicShard) { TYPED_TEST(MeshOpsTest, ShardTensor2dMeshTwoDimSharding) { // Mesh shape: 2x2, total 4 devices - tt::tt_metal::distributed::MeshShape mesh_shape = {2, 2}; + MetalMeshShape mesh_shape{2, 2}; // Create a 2D tensor shape: (4,4) auto tensor = xt::arange(16).reshape({4, 4}); @@ -58,8 +59,8 @@ TYPED_TEST(MeshOpsTest, ShardTensor2dMeshTwoDimSharding) { } TYPED_TEST(MeshOpsTest, ReplicateXTensorToMeshReplication) { - tt::tt_metal::distributed::MeshShape mesh_shape = {2, 2}; - int num_devices = mesh_shape.num_rows * mesh_shape.num_cols; // 4 + MetalMeshShape mesh_shape{2, 2}; + int num_devices = mesh_shape.mesh_size(); // 4 auto tensor = xt::arange(4); // [0,1,2,3] @@ -73,7 +74,7 @@ TYPED_TEST(MeshOpsTest, ReplicateXTensorToMeshReplication) { } TYPED_TEST(MeshOpsTest, ConcatMesh2dToTensorRecomposition) { - tt::tt_metal::distributed::MeshShape mesh_shape = {2, 2}; + MetalMeshShape mesh_shape{2, 2}; // Create shards that would come from a 4x4 tensor: // Expected final tensor: @@ -90,7 +91,7 @@ TYPED_TEST(MeshOpsTest, ConcatMesh2dToTensorRecomposition) { std::vector> shards = {top_left, top_right, bot_left, bot_right}; - ttml::core::ConcatMesh2dToTensor composer(mesh_shape, {0, 1}); + ttml::core::ConcatMesh2dToTensor composer(mesh_shape, MetalMeshShape{0, 1}); auto composed = composer.compose(shards); xt::xarray expected = { @@ -103,7 +104,7 @@ TYPED_TEST(MeshOpsTest, ConcatMesh2dToTensorRecomposition) { } TYPED_TEST(MeshOpsTest, ConcatMeshToXTensorOneDimConcatenation) { - tt::tt_metal::distributed::MeshShape mesh_shape = {1, 3}; + MetalMeshShape mesh_shape{1, 3}; // Create a few shards: [0,1], [2,3], [4,5] xt::xarray s1 = {TypeParam(0), TypeParam(1)}; @@ -120,7 +121,7 @@ TYPED_TEST(MeshOpsTest, ConcatMeshToXTensorOneDimConcatenation) { } TYPED_TEST(MeshOpsTest, VectorMeshToXTensorVectorReturn) { - tt::tt_metal::distributed::MeshShape mesh_shape = {2, 2}; + MetalMeshShape 
mesh_shape{2, 2}; ttml::core::VectorMeshToXTensor vectorComposer(mesh_shape); std::vector> shards = { @@ -134,7 +135,7 @@ TYPED_TEST(MeshOpsTest, VectorMeshToXTensorVectorReturn) { } TYPED_TEST(MeshOpsTest, ConcatenateSameParametersAsCompose) { - tt::tt_metal::distributed::MeshShape mesh_shape = {1, 3}; + MetalMeshShape mesh_shape{1, 3}; // Create a few shards: [0,1], [2,3], [4,5] xt::xarray s1 = {TypeParam(0), TypeParam(1)}; diff --git a/tt-train/tests/core/n300_utils_test.cpp b/tt-train/tests/core/n300_utils_test.cpp index 358c5475420..f5a9c560e81 100644 --- a/tt-train/tests/core/n300_utils_test.cpp +++ b/tt-train/tests/core/n300_utils_test.cpp @@ -23,7 +23,7 @@ class N300UtilsTest : public ::testing::Test { if (!check_board_is_n300()) { GTEST_SKIP() << "Skipping N300 specific tests"; } - ttml::autograd::ctx().set_mesh_shape({1, 2}); + ttml::autograd::ctx().set_mesh_shape(tt::tt_metal::distributed::MeshShape(1, 2)); ttml::autograd::ctx().open_device(); } diff --git a/tt-train/tests/model/linear_regression_ddp_test.cpp b/tt-train/tests/model/linear_regression_ddp_test.cpp index cb29f87b187..2758eeefd6f 100644 --- a/tt-train/tests/model/linear_regression_ddp_test.cpp +++ b/tt-train/tests/model/linear_regression_ddp_test.cpp @@ -34,7 +34,7 @@ class LinearRegressionDDPTest : public ::testing::Test { if (!check_board_is_n300()) { GTEST_SKIP() << "Skipping N300 specific tests"; } - ttml::autograd::ctx().set_mesh_shape({1, 2}); + ttml::autograd::ctx().set_mesh_shape(tt::tt_metal::distributed::MeshShape(1, 2)); ttml::autograd::ctx().open_device(); } diff --git a/tt-train/tests/modules/distributed/linear_test.cpp b/tt-train/tests/modules/distributed/linear_test.cpp index fb1c47c23be..b3e5854b1ec 100644 --- a/tt-train/tests/modules/distributed/linear_test.cpp +++ b/tt-train/tests/modules/distributed/linear_test.cpp @@ -37,7 +37,7 @@ class N300TensorParallelLinearTest : public ::testing::Test { if (!check_board_is_n300()) { GTEST_SKIP() << "Skipping N300 specific tests"; } - ttml::autograd::ctx().set_mesh_shape({1, 2}); + ttml::autograd::ctx().set_mesh_shape(tt::tt_metal::distributed::MeshShape(1, 2)); ttml::autograd::ctx().open_device(); } diff --git a/tt-train/tests/ops/distributed/comm_ops_test.cpp b/tt-train/tests/ops/distributed/comm_ops_test.cpp index e0d938d06eb..1fd23112980 100644 --- a/tt-train/tests/ops/distributed/comm_ops_test.cpp +++ b/tt-train/tests/ops/distributed/comm_ops_test.cpp @@ -29,7 +29,7 @@ class N300CommOpsTest : public ::testing::Test { if (!check_board_is_n300()) { GTEST_SKIP() << "Skipping N300 specific tests"; } - ttml::autograd::ctx().set_mesh_shape({1, 2}); + ttml::autograd::ctx().set_mesh_shape(tt::tt_metal::distributed::MeshShape(1, 2)); ttml::autograd::ctx().open_device(); } diff --git a/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp b/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp index ff3cf5f838d..48efc0e3a8a 100644 --- a/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp +++ b/tt-train/tests/ttnn_fixed/distributed/distributed_ttnn_ops_test.cpp @@ -27,7 +27,7 @@ class TrivialTnnFixedDistributedTest : public ::testing::Test { if (!check_board_is_n300()) { GTEST_SKIP() << "Skipping N300 specific tests"; } - ttml::autograd::ctx().set_mesh_shape({1, 2}); + ttml::autograd::ctx().set_mesh_shape(tt::tt_metal::distributed::MeshShape(1, 2)); ttml::autograd::ctx().open_device(); } diff --git a/tt_metal/api/tt-metalium/mesh_buffer.hpp b/tt_metal/api/tt-metalium/mesh_buffer.hpp index 2a16355fbaa..e8d8347a3ae 100644 
--- a/tt_metal/api/tt-metalium/mesh_buffer.hpp +++ b/tt_metal/api/tt-metalium/mesh_buffer.hpp @@ -113,7 +113,7 @@ class MeshBuffer { DeviceAddr device_local_size, MeshDevice* mesh_device, std::shared_ptr backing_buffer) : - buffers_(SimpleMeshShape(mesh_device->shape()), nullptr), + buffers_(MeshShape(mesh_device->shape()), nullptr), config_(config), device_local_config_(device_local_config), mesh_device_(mesh_device->shared_from_this()), @@ -128,7 +128,7 @@ class MeshBuffer { DeviceAddr address, DeviceAddr device_local_size, MeshDevice* mesh_device) : - buffers_(SimpleMeshShape(mesh_device->shape()), /*fill_value=*/nullptr), + buffers_(MeshShape(mesh_device->shape()), /*fill_value=*/nullptr), config_(config), device_local_config_(device_local_config), mesh_device_(mesh_device->shared_from_this()), diff --git a/tt_metal/api/tt-metalium/mesh_config.hpp b/tt_metal/api/tt-metalium/mesh_config.hpp index e14440da1d3..5547f4f70d2 100644 --- a/tt_metal/api/tt-metalium/mesh_config.hpp +++ b/tt_metal/api/tt-metalium/mesh_config.hpp @@ -15,16 +15,6 @@ using DeviceIds = std::vector; using MeshDeviceID = int; using chip_id_t = int; -struct MeshOffset { - size_t row = 0; - size_t col = 0; -}; - -struct MeshShape { - size_t num_rows = 0; - size_t num_cols = 0; -}; - /** * @brief Defines the organization of physical devices in a user-defined MeshDevice. * @@ -40,7 +30,7 @@ struct MeshShape { */ struct MeshDeviceConfig { - SimpleMeshShape mesh_shape{0, 0}; + MeshShape mesh_shape{0, 0}; std::optional offset; std::vector physical_device_ids{}; }; diff --git a/tt_metal/api/tt-metalium/mesh_coord.hpp b/tt_metal/api/tt-metalium/mesh_coord.hpp index 9dd3292de1d..0823ca1205d 100644 --- a/tt_metal/api/tt-metalium/mesh_coord.hpp +++ b/tt_metal/api/tt-metalium/mesh_coord.hpp @@ -14,21 +14,19 @@ namespace tt::tt_metal::distributed { -struct MeshShape; - -// TODO: #17477 - Rename to `MeshShape` when the legacy type is gone. -class SimpleMeshShape : public ShapeBase { +class MeshShape : public ShapeBase { public: - using ShapeBase::ShapeBase; using ShapeBase::operator[]; // Shorthands for constructing 1D, 2D and 3D shapes. - explicit SimpleMeshShape(uint32_t x); - SimpleMeshShape(uint32_t x, uint32_t y); - SimpleMeshShape(uint32_t x, uint32_t y, uint32_t z); + explicit MeshShape(uint32_t x); + MeshShape(uint32_t x, uint32_t y); + MeshShape(uint32_t x, uint32_t y, uint32_t z); - // Temporary constructor for transitioning to `SimpleMeshShape`. - SimpleMeshShape(const MeshShape& legacy_shape); + explicit MeshShape(const tt::stl::SmallVector& shape); + explicit MeshShape(tt::stl::SmallVector&& shape); + explicit MeshShape(std::initializer_list ilist); + explicit MeshShape(tt::stl::Span span); // Returns the dimensionality of the mesh. 
size_t dims() const; @@ -43,18 +41,22 @@ class SimpleMeshShape : public ShapeBase { static constexpr auto attribute_names = std::forward_as_tuple("value"); auto attribute_values() const { return std::forward_as_tuple(value_); } - friend bool operator==(const SimpleMeshShape& lhs, const SimpleMeshShape& rhs); - friend bool operator!=(const SimpleMeshShape& lhs, const SimpleMeshShape& rhs); - friend std::ostream& operator<<(std::ostream& os, const SimpleMeshShape& shape); + friend bool operator==(const MeshShape& lhs, const MeshShape& rhs); + friend bool operator!=(const MeshShape& lhs, const MeshShape& rhs); + friend std::ostream& operator<<(std::ostream& os, const MeshShape& shape); private: using ShapeBase::empty; + using ShapeBase::ShapeBase; using ShapeBase::size; void compute_strides(); tt::stl::SmallVector strides_; }; +// Returns true if the mesh shape is in a line topology: at most 1 dimension can be non-unit. +bool is_line_topology(const MeshShape& shape); + class MeshCoordinate { public: // Shorthands for constructing 1D, 2D and 3D coordinates. @@ -65,6 +67,9 @@ class MeshCoordinate { // Constructs a generic N-dimensional coordinate. explicit MeshCoordinate(tt::stl::Span coords); + // Returns a zero-initialized N-dimensional coordinate. + static MeshCoordinate zero_coordinate(size_t dimensions); + // Returns the dimensionality of the coordinate. size_t dims() const; @@ -88,7 +93,7 @@ class MeshCoordinate { // Converts a MeshCoordinate to a linear index. // Throws if `coord` is out of bounds of `shape`. -size_t to_linear_index(const SimpleMeshShape& shape, const MeshCoordinate& coord); +size_t to_linear_index(const MeshShape& shape, const MeshCoordinate& coord); // Represents a range of MeshCoordinates. Requires that mesh coordinates have the same dimensionality. class MeshCoordinateRange { @@ -97,7 +102,7 @@ class MeshCoordinateRange { MeshCoordinateRange(const MeshCoordinate& start, const MeshCoordinate& end); // Constructs a range that iterates over all coordinates in the mesh. - MeshCoordinateRange(const SimpleMeshShape& shape); + explicit MeshCoordinateRange(const MeshShape& shape); // Returns the dimensionality of the range. size_t dims() const; @@ -192,11 +197,11 @@ class MeshCoordinateValueProxy { template class MeshContainer { public: - MeshContainer(const SimpleMeshShape& shape, const T& fill_value); - MeshContainer(const SimpleMeshShape& shape, std::vector values); + MeshContainer(const MeshShape& shape, const T& fill_value); + MeshContainer(const MeshShape& shape, std::vector values); // Returns a shape of the container. - const SimpleMeshShape& shape() const; + const MeshShape& shape() const; // Returns (inclusive) range of coordinates in the container. 
const MeshCoordinateRange& coord_range() const; @@ -269,17 +274,17 @@ class MeshContainer { friend bool operator!=(const MeshContainer& lhs, const MeshContainer& rhs) { return !(lhs == rhs); } private: - SimpleMeshShape shape_; + MeshShape shape_; MeshCoordinateRange coord_range_; std::vector values_; }; template -MeshContainer::MeshContainer(const SimpleMeshShape& shape, const T& fill_value) : +MeshContainer::MeshContainer(const MeshShape& shape, const T& fill_value) : shape_(shape), coord_range_(shape), values_(shape.mesh_size(), fill_value) {} template -MeshContainer::MeshContainer(const SimpleMeshShape& shape, std::vector values) : +MeshContainer::MeshContainer(const MeshShape& shape, std::vector values) : shape_(shape), coord_range_(shape), values_(std::move(values)) { TT_FATAL( shape.mesh_size() == values_.size(), @@ -289,7 +294,7 @@ MeshContainer::MeshContainer(const SimpleMeshShape& shape, std::vector val } template -const SimpleMeshShape& MeshContainer::shape() const { +const MeshShape& MeshContainer::shape() const { return shape_; } diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index 9b7c6843abd..db0ebf1b7ca 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -57,7 +57,6 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this scoped_devices_; MeshDeviceID mesh_id_; - MeshShape mesh_shape_; std::unique_ptr view_; std::vector> submeshes_; // Parent owns submeshes and is responsible for their destruction @@ -75,7 +74,6 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this scoped_devices, - const MeshShape& mesh_shape, std::unique_ptr mesh_device_view, std::weak_ptr parent_mesh = {}); ~MeshDevice() override; @@ -217,15 +215,19 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this get_devices() const; IDevice* get_device(chip_id_t physical_device_id) const; - IDevice* get_device(size_t row_idx, size_t col_idx) const; IDevice* get_device(const MeshCoordinate& coord) const; const DeviceIds get_device_ids() const; size_t num_devices() const; + + // The following methods assume 2D mesh, and throw if the mesh is not 2D. + // TODO: #17477 - Remove the methods that assume 2D mesh. size_t num_rows() const; size_t num_cols() const; - MeshShape shape() const; + IDevice* get_device(size_t row_idx, size_t col_idx) const; + + const MeshShape& shape() const; // Reshapes the logical mesh and re-maps the physical devices to the new logical coordinates. 
// Reshaping Rules: @@ -251,7 +253,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this> get_submeshes() const; std::shared_ptr create_submesh( - const MeshShape& submesh_shape, const MeshOffset& offset = MeshOffset{0, 0}); + const MeshShape& submesh_shape, const std::optional& offset = std::nullopt); std::vector> create_submeshes(const MeshShape& submesh_shape); diff --git a/tt_metal/api/tt-metalium/mesh_device_view.hpp b/tt_metal/api/tt-metalium/mesh_device_view.hpp index afe2b49fb05..232bdbdd3c9 100644 --- a/tt_metal/api/tt-metalium/mesh_device_view.hpp +++ b/tt_metal/api/tt-metalium/mesh_device_view.hpp @@ -53,13 +53,13 @@ class MeshDeviceView { // Get devices spanning the region defined by `range` in row-major order with start/end coordinates inclusive [[nodiscard]] DeviceView get_devices(const MeshCoordinateRange& range) const; - [[nodiscard]] DeviceView get_devices(const SimpleMeshShape& submesh_shape) const; + [[nodiscard]] DeviceView get_devices(const MeshShape& submesh_shape) const; [[nodiscard]] DeviceView get_devices() const; [[nodiscard]] size_t num_devices() const; [[nodiscard]] bool empty() const noexcept; [[nodiscard]] size_t size() const noexcept; - [[nodiscard]] SimpleMeshShape shape() const noexcept; + [[nodiscard]] const MeshShape& shape() const noexcept; [[nodiscard]] bool contains(const MeshCoordinate& coord) const noexcept; [[nodiscard]] IDevice* get_device(const MeshCoordinate& coord) const; [[nodiscard]] const IDevice* at(const MeshCoordinate& coord) const noexcept; @@ -77,7 +77,7 @@ class MeshDeviceView { // Throws if the `coord` is out of bounds of this view. [[nodiscard]] chip_id_t find_device_id(const MeshCoordinate& coord) const; - // TODO: Remove the methods that assume 2D mesh. + // TODO: #17477 - Remove the methods that assume 2D mesh. [[nodiscard]] bool is_mesh_2d() const; [[nodiscard]] size_t num_rows() const; [[nodiscard]] size_t num_cols() const; @@ -95,6 +95,7 @@ class MeshDeviceView { // The current support only provides left-to-right and right-to-left snaking of the line. // // Important: these utilities currently only support 2D meshes. + // TODO: #17477 - Remove the methods that assume 2D mesh. [[nodiscard]] static std::vector get_line_coordinates(size_t length, const Shape2D& mesh_shape); [[nodiscard]] static std::vector get_ring_coordinates( const Shape2D& ring_shape, const Shape2D& mesh_shape); @@ -106,7 +107,7 @@ class MeshDeviceView { std::unordered_map device_coordinates_; // Set if the view is 2D to enable row/col APIs, otherwise nullopt. - // TODO: remove this? + // TODO: #17477 - Remove this? 
std::optional shape_2d_; }; diff --git a/tt_metal/api/tt-metalium/system_mesh.hpp b/tt_metal/api/tt-metalium/system_mesh.hpp index f904de46044..88df02e1a4b 100644 --- a/tt_metal/api/tt-metalium/system_mesh.hpp +++ b/tt_metal/api/tt-metalium/system_mesh.hpp @@ -31,7 +31,7 @@ class SystemMesh { SystemMesh& operator=(SystemMesh&&) = delete; // Returns the shape of the system mesh - const SimpleMeshShape& get_shape() const; + const MeshShape& get_shape() const; // Returns the physical device ID for a given logical row and column index chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; diff --git a/tt_metal/common/mesh_coord.cpp b/tt_metal/common/mesh_coord.cpp index 88f4309cd90..aefdb409642 100644 --- a/tt_metal/common/mesh_coord.cpp +++ b/tt_metal/common/mesh_coord.cpp @@ -7,18 +7,14 @@ #include #include #include -#include #include #include namespace tt::tt_metal::distributed { namespace { -// Returns a zero coordinate of dimensionality `dims`. -MeshCoordinate zero_coordinate(size_t dims) { return MeshCoordinate(tt::stl::SmallVector(dims, 0)); } - // Returns the last valid coordinate for the provided `shape`. -MeshCoordinate shape_back(const SimpleMeshShape& shape) { +MeshCoordinate shape_back(const MeshShape& shape) { tt::stl::SmallVector coords; for (int i = 0; i < shape.dims(); i++) { coords.push_back(shape[i] - 1); @@ -28,14 +24,16 @@ MeshCoordinate shape_back(const SimpleMeshShape& shape) { } // namespace -SimpleMeshShape::SimpleMeshShape(uint32_t x) : ShapeBase({x}) { compute_strides(); } -SimpleMeshShape::SimpleMeshShape(uint32_t x, uint32_t y) : ShapeBase({x, y}) { compute_strides(); } -SimpleMeshShape::SimpleMeshShape(uint32_t x, uint32_t y, uint32_t z) : ShapeBase({x, y, z}) { compute_strides(); } +MeshShape::MeshShape(uint32_t x) : MeshShape({x}) {} +MeshShape::MeshShape(uint32_t x, uint32_t y) : MeshShape({x, y}) {} +MeshShape::MeshShape(uint32_t x, uint32_t y, uint32_t z) : MeshShape({x, y, z}) {} -SimpleMeshShape::SimpleMeshShape(const MeshShape& legacy_shape) : - SimpleMeshShape(legacy_shape.num_rows, legacy_shape.num_cols) {} +MeshShape::MeshShape(const tt::stl::SmallVector& shape) : ShapeBase(shape) { compute_strides(); } +MeshShape::MeshShape(tt::stl::SmallVector&& shape) : ShapeBase(std::move(shape)) { compute_strides(); } +MeshShape::MeshShape(std::initializer_list ilist) : ShapeBase(ilist) { compute_strides(); } +MeshShape::MeshShape(tt::stl::Span span) : ShapeBase(span) { compute_strides(); } -void SimpleMeshShape::compute_strides() { +void MeshShape::compute_strides() { size_t stride = 1; strides_.resize(dims()); for (int dim = dims() - 1; dim >= 0; --dim) { @@ -44,18 +42,18 @@ void SimpleMeshShape::compute_strides() { } } -size_t SimpleMeshShape::get_stride(size_t dim) const { return strides_[dim]; } +size_t MeshShape::get_stride(size_t dim) const { return strides_[dim]; } -size_t SimpleMeshShape::dims() const { return size(); } -size_t SimpleMeshShape::mesh_size() const { +size_t MeshShape::dims() const { return size(); } +size_t MeshShape::mesh_size() const { return empty() ? 
0 : std::accumulate(value_.begin(), value_.end(), 1, std::multiplies()); } -bool operator==(const SimpleMeshShape& lhs, const SimpleMeshShape& rhs) = default; -bool operator!=(const SimpleMeshShape& lhs, const SimpleMeshShape& rhs) = default; +bool operator==(const MeshShape& lhs, const MeshShape& rhs) = default; +bool operator!=(const MeshShape& lhs, const MeshShape& rhs) = default; -std::ostream& operator<<(std::ostream& os, const SimpleMeshShape& shape) { - os << "SimpleMeshShape(["; +std::ostream& operator<<(std::ostream& os, const MeshShape& shape) { + os << "MeshShape(["; for (size_t i = 0; i < shape.dims(); ++i) { if (i > 0) { os << ", "; @@ -66,12 +64,20 @@ std::ostream& operator<<(std::ostream& os, const SimpleMeshShape& shape) { return os; } -MeshCoordinate::MeshCoordinate(uint32_t coord) : value_({coord}) {} +bool is_line_topology(const MeshShape& shape) { + return std::count_if(shape.cbegin(), shape.cend(), [](size_t dim) { return dim != 1; }) <= 1; +} + +MeshCoordinate::MeshCoordinate(uint32_t x) : value_({x}) {} MeshCoordinate::MeshCoordinate(uint32_t x, uint32_t y) : value_({x, y}) {} MeshCoordinate::MeshCoordinate(uint32_t x, uint32_t y, uint32_t z) : value_({x, y, z}) {} MeshCoordinate::MeshCoordinate(tt::stl::Span coords) : value_(coords.begin(), coords.end()) {} +MeshCoordinate MeshCoordinate::zero_coordinate(size_t dimensions) { + return MeshCoordinate(tt::stl::SmallVector(dimensions, 0)); +} + size_t MeshCoordinate::dims() const { return value_.size(); } tt::stl::Span MeshCoordinate::coords() const { return value_; } uint32_t MeshCoordinate::operator[](size_t dim) const { return value_[dim]; } @@ -105,8 +111,8 @@ MeshCoordinateRange::MeshCoordinateRange(const MeshCoordinate& start, const Mesh } } -MeshCoordinateRange::MeshCoordinateRange(const SimpleMeshShape& shape) : - MeshCoordinateRange(zero_coordinate(shape.dims()), shape_back(shape)) {} +MeshCoordinateRange::MeshCoordinateRange(const MeshShape& shape) : + MeshCoordinateRange(MeshCoordinate::zero_coordinate(shape.dims()), shape_back(shape)) {} size_t MeshCoordinateRange::dims() const { return start_.dims(); } const MeshCoordinate& MeshCoordinateRange::start_coord() const { return start_; } @@ -162,7 +168,7 @@ bool operator==(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs) } bool operator!=(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs) { return !(lhs == rhs); } -size_t to_linear_index(const SimpleMeshShape& shape, const MeshCoordinate& coord) { +size_t to_linear_index(const MeshShape& shape, const MeshCoordinate& coord) { TT_FATAL( shape.dims() == coord.dims(), "Shape and coordinate dimensions do not match: {} != {}", diff --git a/tt_metal/distributed/coordinate_translation.cpp b/tt_metal/distributed/coordinate_translation.cpp index 2070a138ed0..31f0a0e9d17 100644 --- a/tt_metal/distributed/coordinate_translation.cpp +++ b/tt_metal/distributed/coordinate_translation.cpp @@ -4,6 +4,7 @@ #include "tt_metal/distributed/coordinate_translation.hpp" +#include "indestructible.hpp" #include "tt_cluster.hpp" #include @@ -51,20 +52,20 @@ CoordinateTranslationMap load_translation_map(const std::string& filename, const } // namespace -const std::pair& get_system_mesh_coordinate_translation_map() { - static const auto* cached_translation_map = new std::pair([] { +const std::pair& get_system_mesh_coordinate_translation_map() { + static tt::stl::Indestructible> kTranslationMap([]() { const auto system_num_devices = tt::Cluster::instance().number_of_user_devices(); const bool is_qg = 
tt::Cluster::instance().number_of_pci_devices() == system_num_devices; // TODO: #17477 - This assumes shapes and coordinates are in 2D. This will be extended for 3D. // Consider if 1D can be used for single device and N300. - const std::unordered_map> system_mesh_translation_map = { - {1, std::make_pair("device.json", SimpleMeshShape(1, 1))}, - {2, std::make_pair("N300.json", SimpleMeshShape(1, 2))}, - {8, std::make_pair("T3000.json", SimpleMeshShape(2, 4))}, - {32, std::make_pair(is_qg ? "QG.json" : "TG.json", SimpleMeshShape(8, 4))}, - {64, std::make_pair("TGG.json", SimpleMeshShape(8, 8))}, + const std::unordered_map> system_mesh_translation_map = { + {1, std::make_pair("device.json", MeshShape(1, 1))}, + {2, std::make_pair("N300.json", MeshShape(1, 2))}, + {8, std::make_pair("T3000.json", MeshShape(2, 4))}, + {32, std::make_pair(is_qg ? "QG.json" : "TG.json", MeshShape(8, 4))}, + {64, std::make_pair("TGG.json", MeshShape(8, 8))}, }; TT_FATAL( system_mesh_translation_map.contains(system_num_devices), @@ -79,12 +80,12 @@ const std::pair& get_system_mesh_coor shape.mesh_size()); log_debug(LogMetal, "Logical SystemMesh Shape: {}", shape); - return std::pair{ + return std::pair{ load_translation_map(get_config_path(translation_config_file), /*key=*/"logical_to_physical_coordinates"), shape}; }()); - return *cached_translation_map; + return kTranslationMap.get(); } } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/coordinate_translation.hpp b/tt_metal/distributed/coordinate_translation.hpp index 5aa0f7242f0..363ab2762c4 100644 --- a/tt_metal/distributed/coordinate_translation.hpp +++ b/tt_metal/distributed/coordinate_translation.hpp @@ -19,6 +19,6 @@ using CoordinateTranslationMap = std::unordered_map that contains everything we need. -const std::pair& get_system_mesh_coordinate_translation_map(); +const std::pair& get_system_mesh_coordinate_translation_map(); } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index 5e971d42a51..2b5c09252a1 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -434,7 +434,6 @@ void MeshCommandQueue::enqueue_read_shards( bool blocking) { // TODO: #17215 - this API is used by TTNN, as it currently implements rich ND sharding API for multi-devices. // In the long run, the multi-device sharding API in Metal will change, and this will most likely be replaced. 
- const auto [num_rows, num_cols] = buffer->device()->shape(); for (const auto& shard_data_transfer : shard_data_transfers) { auto device_shard_view = buffer->get_device_buffer(shard_data_transfer.shard_coord); read_shard_from_device( diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 8ac1df381ce..03f73ceaed9 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -23,6 +23,8 @@ #include #include +#include +#include namespace tt::tt_metal::distributed { @@ -110,11 +112,9 @@ IDevice* MeshDevice::reference_device() const { return this->get_devices().at(0) MeshDevice::MeshDevice( std::shared_ptr mesh_handle, - const MeshShape& mesh_shape, std::unique_ptr mesh_device_view, std::weak_ptr parent_mesh) : scoped_devices_(std::move(mesh_handle)), - mesh_shape_(mesh_shape), view_(std::move(mesh_device_view)), mesh_id_(generate_unique_mesh_id()), parent_mesh_(std::move(parent_mesh)) {} @@ -126,82 +126,89 @@ std::shared_ptr MeshDevice::create( size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, tt::stl::Span l1_bank_remap) { - // TODO: #17477 Extend to ND. - TT_FATAL(config.mesh_shape.dims() == 2, "Mesh shape must be 2D"); - auto mesh_shape_2d = MeshShape{config.mesh_shape[0], config.mesh_shape[1]}; - auto scoped_devices = std::make_shared( l1_small_size, trace_region_size, num_command_queues, dispatch_core_config, config); MeshContainer devices(config.mesh_shape, scoped_devices->root_devices()); auto mesh_device = std::make_shared( - std::move(scoped_devices), - mesh_shape_2d, - std::make_unique(devices), - std::weak_ptr()); + std::move(scoped_devices), std::make_unique(devices), std::weak_ptr()); mesh_device->initialize(num_command_queues, l1_small_size, trace_region_size, l1_bank_remap); return mesh_device; } -std::shared_ptr MeshDevice::create_submesh(const MeshShape& submesh_shape, const MeshOffset& offset) { - if (submesh_shape.num_rows <= 0 || submesh_shape.num_cols <= 0) { - TT_THROW( - "Invalid submesh shape: ({}, {}). Both dimensions must be positive.", - submesh_shape.num_rows, - submesh_shape.num_cols); - } - - if (offset.row < 0 || offset.col < 0) { - TT_THROW("Invalid offset: ({}, {}). Offset must be non-negative.", offset.row, offset.col); - } +std::shared_ptr MeshDevice::create_submesh( + const MeshShape& submesh_shape, const std::optional& offset) { + TT_FATAL( + std::all_of(submesh_shape.cbegin(), submesh_shape.cend(), [](size_t dim) { return dim > 0; }), + "Invalid submesh shape: ({}). 
All dimensions must be positive.", + submesh_shape); + TT_FATAL( + submesh_shape.dims() == view_->shape().dims(), + "Submesh shape {} and mesh device shape {} must have the same number of dimensions.", + submesh_shape, + view_->shape()); + + const MeshCoordinate offset_coord = [&offset, &submesh_shape]() { + if (offset.has_value()) { + TT_FATAL( + submesh_shape.dims() == offset->dims(), + "Submesh shape {} and offset {} must have the same number of dimensions.", + submesh_shape, + *offset); + return *offset; + } else { + return MeshCoordinate::zero_coordinate(submesh_shape.dims()); + } + }(); - if (offset.row + submesh_shape.num_rows > mesh_shape_.num_rows || - offset.col + submesh_shape.num_cols > mesh_shape_.num_cols) { - TT_THROW( - "Submesh ({}x{}) with offset ({}, {}) does not fit within parent mesh ({}x{}).", - submesh_shape.num_rows, - submesh_shape.num_cols, - offset.row, - offset.col, - mesh_shape_.num_rows, - mesh_shape_.num_cols); + tt::stl::SmallVector end_coords; + for (size_t i = 0; i < submesh_shape.dims(); i++) { + TT_FATAL( + offset_coord[i] + submesh_shape[i] - 1 < view_->shape()[i], + "Submesh shape {} and offset {} does not fit within parent mesh ({}).", + submesh_shape, + offset, + view_->shape()); + end_coords.push_back(offset_coord[i] + submesh_shape[i] - 1); } - - auto start_coordinate = MeshCoordinate{offset.row, offset.col}; - auto end_coordinate = - MeshCoordinate{offset.row + submesh_shape.num_rows - 1, offset.col + submesh_shape.num_cols - 1}; + auto end_coordinate = MeshCoordinate(end_coords); MeshContainer submesh_devices_container( - submesh_shape, view_->get_devices(MeshCoordinateRange{start_coordinate, end_coordinate})); + submesh_shape, view_->get_devices(MeshCoordinateRange{offset_coord, end_coordinate})); auto submesh = std::make_shared( - scoped_devices_, - submesh_shape, - std::make_unique(submesh_devices_container), - shared_from_this()); + scoped_devices_, std::make_unique(submesh_devices_container), shared_from_this()); submeshes_.push_back(submesh); - log_trace( - LogMetal, - "Instantiating submesh {}: {}x{} with offset: {} {}", - submesh->id(), - submesh_shape.num_rows, - submesh_shape.num_cols, - offset.row, - offset.col); + log_trace(LogMetal, "Instantiating submesh {}: {} with offset: {}", submesh->id(), submesh_shape, offset); log_trace(LogMetal, "Submesh {} instantiated with {} devices", submesh->id(), submesh->get_devices().size()); - return submesh; } std::vector> MeshDevice::create_submeshes(const MeshShape& submesh_shape) { + // Calculate how many submeshes fit in each dimension. + tt::stl::SmallVector steps; + for (size_t dim = 0; dim < shape().dims(); dim++) { + TT_FATAL( + shape()[dim] % submesh_shape[dim] == 0, + "Shape {} is not divisible by submesh shape {} along dimension {}", + shape(), + submesh_shape, + dim); + uint32_t num_steps = shape()[dim] / submesh_shape[dim]; + steps.push_back(num_steps); + } + + // Stamp `submesh_shape` along each dimension, `steps` number of times. 
std::vector> submeshes; - for (int row = 0; row < this->num_rows(); row += submesh_shape.num_rows) { - for (int col = 0; col < this->num_cols(); col += submesh_shape.num_cols) { - auto submesh = this->create_submesh(submesh_shape, MeshOffset{row, col}); - submeshes.push_back(submesh); + for (const auto& step_position : MeshCoordinateRange(MeshShape(steps))) { + tt::stl::SmallVector offset_coords; + for (size_t dim = 0; dim < submesh_shape.dims(); dim++) { + offset_coords.push_back(step_position[dim] * submesh_shape[dim]); } + submeshes.push_back(create_submesh(submesh_shape, MeshCoordinate(offset_coords))); } + return submeshes; } @@ -251,11 +258,11 @@ tt::ARCH MeshDevice::arch() const { scoped_devices_->root_devices(), [](const auto& device) { return device->arch(); }); } -size_t MeshDevice::num_rows() const { return mesh_shape_.num_rows; } +size_t MeshDevice::num_rows() const { return view_->num_rows(); } -size_t MeshDevice::num_cols() const { return mesh_shape_.num_cols; } +size_t MeshDevice::num_cols() const { return view_->num_cols(); } -MeshShape MeshDevice::shape() const { return mesh_shape_; } +const MeshShape& MeshDevice::shape() const { return view_->shape(); } std::vector MeshDevice::get_row_major_devices(const MeshShape& new_shape) const { // MeshDeviceView requires devices to be provided as a 1D array in row-major order for the target mesh shape. @@ -281,7 +288,7 @@ std::vector MeshDevice::get_row_major_devices(const MeshShape& new_sha // From an MxN mesh, we can always reduce rank to a 1xM*N Line mesh. // However, going from a Line mesh to an MxN mesh is not always possible. - if (new_shape.num_rows == 1 || new_shape.num_cols == 1) { + if (is_line_topology(new_shape)) { return view_->get_line_devices(); } @@ -292,14 +299,10 @@ std::vector MeshDevice::get_row_major_devices(const MeshShape& new_sha if (physical_device_id_to_linearized_index.find(new_physical_device_ids[i]) == physical_device_id_to_linearized_index.end()) { TT_THROW( - "User has requested a reshape of the MeshDevice to shape: {}x{}, but it is not possible to form a " - "physically connected mesh of {}x{} grid with the opened devices from the original shape: {}x{}.", - new_shape.num_rows, - new_shape.num_cols, - new_shape.num_rows, - new_shape.num_cols, - this->num_rows(), - this->num_cols()); + "User has requested a reshape of the MeshDevice to shape: {}, but it is not possible to form a " + "physically connected mesh grid with the opened devices from the original shape: {}.", + new_shape, + view_->shape()); } } @@ -312,13 +315,11 @@ std::vector MeshDevice::get_row_major_devices(const MeshShape& new_sha void MeshDevice::reshape(const MeshShape& new_shape) { TT_FATAL( - new_shape.num_rows * new_shape.num_cols == this->num_devices(), + new_shape.mesh_size() == this->num_devices(), "New shape must have the same number of devices as current shape"); MeshContainer devices(new_shape, this->get_row_major_devices(new_shape)); auto new_view = std::make_unique(devices); - - mesh_shape_ = new_shape; view_ = std::move(new_view); } diff --git a/tt_metal/distributed/mesh_device_view.cpp b/tt_metal/distributed/mesh_device_view.cpp index 64b80167f31..e6f63b85033 100644 --- a/tt_metal/distributed/mesh_device_view.cpp +++ b/tt_metal/distributed/mesh_device_view.cpp @@ -37,7 +37,7 @@ MeshDeviceView::MeshDeviceView(const MeshContainer& devices) : devices } MeshDeviceView::MeshDeviceView(const MeshDevice& mesh_device) : - MeshDeviceView(MeshContainer(SimpleMeshShape(mesh_device.shape()), mesh_device.get_devices())) {} + 
MeshDeviceView(MeshContainer(MeshShape(mesh_device.shape()), mesh_device.get_devices())) {} MeshDeviceView::DeviceView MeshDeviceView::get_devices(const MeshCoordinateRange& range) const { DeviceView devices_in_region; @@ -47,7 +47,7 @@ MeshDeviceView::DeviceView MeshDeviceView::get_devices(const MeshCoordinateRange return devices_in_region; } -MeshDeviceView::DeviceView MeshDeviceView::get_devices(const SimpleMeshShape& submesh_shape) const { +MeshDeviceView::DeviceView MeshDeviceView::get_devices(const MeshShape& submesh_shape) const { return get_devices(MeshCoordinateRange(submesh_shape)); } @@ -91,7 +91,7 @@ std::vector> MeshDeviceView::get_column_views() const { bool MeshDeviceView::empty() const noexcept { return devices_.shape().mesh_size() == 0; } size_t MeshDeviceView::size() const noexcept { return devices_.shape().mesh_size(); } -SimpleMeshShape MeshDeviceView::shape() const noexcept { return devices_.shape(); } +const MeshShape& MeshDeviceView::shape() const noexcept { return devices_.shape(); } bool MeshDeviceView::contains(const MeshCoordinate& coord) const noexcept { return devices_.coord_range().contains(coord); diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index 10a20b6e433..f4d310ce7eb 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -16,7 +16,7 @@ namespace tt::tt_metal::distributed { class SystemMesh::Impl { private: - SimpleMeshShape logical_mesh_shape_; + MeshShape logical_mesh_shape_; CoordinateTranslationMap logical_to_physical_coordinates_; std::unordered_map logical_to_device_id_; std::unordered_map physical_coordinate_to_device_id_; @@ -27,7 +27,7 @@ class SystemMesh::Impl { bool is_system_mesh_initialized() const; void initialize(); - const SimpleMeshShape& get_shape() const; + const MeshShape& get_shape() const; std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; @@ -68,7 +68,7 @@ void SystemMesh::Impl::initialize() { } } -const SimpleMeshShape& SystemMesh::Impl::get_shape() const { return logical_mesh_shape_; } +const MeshShape& SystemMesh::Impl::get_shape() const { return logical_mesh_shape_; } chip_id_t SystemMesh::Impl::get_physical_device_id(const MeshCoordinate& coord) const { TT_FATAL( @@ -111,12 +111,7 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me } }(); - const bool line_topology = [&config]() { - const int non_unit_dims = - std::count_if(config.mesh_shape.cbegin(), config.mesh_shape.cend(), [](int dim) { return dim != 1; }); - return non_unit_dims <= 1; - }(); - if (line_topology) { + if (is_line_topology(config.mesh_shape)) { TT_FATAL( std::all_of(system_offset.coords().begin(), system_offset.coords().end(), [](int dim) { return dim == 0; }), "Offsets are unsupported for a line mesh"); @@ -206,7 +201,7 @@ chip_id_t SystemMesh::get_physical_device_id(const MeshCoordinate& coord) const return pimpl_->get_physical_device_id(coord); } -const SimpleMeshShape& SystemMesh::get_shape() const { return pimpl_->get_shape(); } +const MeshShape& SystemMesh::get_shape() const { return pimpl_->get_shape(); } std::vector SystemMesh::request_available_devices(const MeshDeviceConfig& config) const { return pimpl_->request_available_devices(config); diff --git a/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp 
b/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp index c15df5a5f95..247c6cec967 100644 --- a/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp +++ b/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp @@ -10,7 +10,7 @@ int main(int argc, char** argv) { using namespace tt::tt_metal::distributed; - auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 4)}); + auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape(2, 4)}); auto& cq = mesh_device->mesh_command_queue(); // In a typical single-device fashion, instantiate a program with diff --git a/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp b/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp index 9a401213a4f..7678985f273 100644 --- a/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp +++ b/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp @@ -19,7 +19,7 @@ int main(int argc, char** argv) { using namespace tt::tt_metal::distributed; using tt::tt_metal::distributed::ShardedBufferConfig; - auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 4)}); + auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape(2, 4)}); auto& cq = mesh_device->mesh_command_queue(); // Define the shape of the shard and the distributed buffer. diff --git a/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp b/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp index 7ed668c4c22..c5760403898 100644 --- a/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp +++ b/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp @@ -85,7 +85,7 @@ Program CreateEltwiseAddProgram( // The example showcases TT-Metalium's ability to abstract away the complexity // of distributed memory management and compute. int main(int argc, char** argv) { - auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 4)}); + auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape(2, 4)}); // Define the global buffer shape and shard shape for distributed buffers auto shard_shape = Shape2D{32, 32}; diff --git a/ttnn/cpp/ttnn/distributed/api.cpp b/ttnn/cpp/ttnn/distributed/api.cpp index e8f2846b3ba..0f6685dc5c3 100644 --- a/ttnn/cpp/ttnn/distributed/api.cpp +++ b/ttnn/cpp/ttnn/distributed/api.cpp @@ -25,12 +25,10 @@ std::shared_ptr open_mesh_device( size_t trace_region_size, size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, - const MeshOffset& offset, + const std::optional& offset, const std::vector& physical_device_ids) { - std::optional offset_opt = - offset.row != 0 || offset.col != 0 ? 
std::make_optional(offset.row, offset.col) : std::nullopt; - auto config = MeshDeviceConfig{ - .mesh_shape = SimpleMeshShape(mesh_shape), .offset = offset_opt, .physical_device_ids = physical_device_ids}; + auto config = + MeshDeviceConfig{.mesh_shape = mesh_shape, .offset = offset, .physical_device_ids = physical_device_ids}; return MeshDevice::create(config, l1_small_size, trace_region_size, num_command_queues, dispatch_core_config); } @@ -130,8 +128,7 @@ std::vector get_t3k_physical_device_ids_ring() { auto num_devices = instance.get_shape().mesh_size(); TT_FATAL(num_devices == 8, "T3000 ring topology only works with 8 devices"); - auto physical_device_ids = - instance.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}); + auto physical_device_ids = instance.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = MeshShape(1, 8)}); return physical_device_ids; } @@ -154,7 +151,9 @@ std::vector get_mapped_devices(const Tensor& tensor, MeshDevice& mesh_ return std::visit( tt::stl::overloaded{ [&](const ShardTensor2D& s) { - return mesh_device.get_view().get_devices(MeshShape{s.shard_mesh.y, s.shard_mesh.x}); + const tt::tt_metal::distributed::MeshCoordinateRange range( + MeshShape(s.shard_mesh.y, s.shard_mesh.x)); + return mesh_device.get_view().get_devices(range); }, [&](const auto&) { return get_workers_for_tensor(mesh_device.get_devices()); }}, host_storage.strategy); diff --git a/ttnn/cpp/ttnn/distributed/api.hpp b/ttnn/cpp/ttnn/distributed/api.hpp index da1758a16e2..4ecf4807734 100644 --- a/ttnn/cpp/ttnn/distributed/api.hpp +++ b/ttnn/cpp/ttnn/distributed/api.hpp @@ -18,7 +18,7 @@ std::shared_ptr open_mesh_device( size_t trace_region_size, size_t num_command_queues, const tt::tt_metal::DispatchCoreConfig& dispatch_core_config, - const MeshOffset& offset = MeshOffset(0, 0), + const std::optional& offset = std::nullopt, const std::vector& physical_device_ids = {}); void close_mesh_device(const std::shared_ptr& mesh_device); diff --git a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp index 92c02b515c3..3e96b6130bb 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp +++ b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp @@ -5,9 +5,12 @@ #include "ttnn/distributed/distributed_pybind.hpp" #include +#include + #include #include "tt-metalium/mesh_coord.hpp" #include "ttnn/distributed/api.hpp" +#include "ttnn/distributed/types.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/types.hpp" @@ -25,56 +28,80 @@ void py_module_types(py::module& module) { py::class_>(module, "MeshDevice"); py::class_(module, "MeshSubDeviceManagerId"); py::class_(module, "MeshShape", "Struct representing the shape of a mesh device."); - py::class_(module, "MeshOffset", "Struct representing the offset of a mesh device."); + py::class_(module, "MeshCoordinate", "Struct representing the coordinate of a mesh device."); } void py_module(py::module& module) { + // TODO: #17477 - Remove overloads that accept 'row' and 'col'. Instead, use generic ND terms. 
static_cast>(module.attr("MeshShape")) .def( py::init([](size_t num_rows, size_t num_cols) { return MeshShape(num_rows, num_cols); }), - "Constructor with specified number of rows and columns.", + "Constructor with the specified number of rows and columns.", py::arg("num_rows"), py::arg("num_cols")) - .def_readwrite("num_rows", &MeshShape::num_rows, "Number of rows in the mesh.") - .def_readwrite("num_cols", &MeshShape::num_cols, "Number of columns in the mesh.") + .def( + py::init([](size_t x, size_t y, size_t z) { return MeshShape(x, y, z); }), + "Constructor with the specified 3D shape.", + py::arg("x"), + py::arg("y"), + py::arg("z")) + .def( + py::init([](const std::vector& shape) { return MeshShape(shape); }), + "Constructor with the specified ND shape.", + py::arg("shape")) .def( "__repr__", [](const MeshShape& ms) { - return ""; + std::ostringstream str; + str << ms; + return str.str(); }) - .def("__iter__", [](const MeshShape& ms) { return py::iter(py::make_tuple(ms.num_rows, ms.num_cols)); }); - static_cast>(module.attr("MeshOffset")) .def( - py::init([](size_t row, size_t col) { return MeshOffset(row, col); }), + "__iter__", + [](const MeshShape& ms) { return py::make_iterator(ms.view().begin(), ms.view().end()); }, + py::keep_alive<0, 1>()); + static_cast>(module.attr("MeshCoordinate")) + .def( + py::init([](size_t row, size_t col) { return MeshCoordinate(row, col); }), "Constructor with specified row and column offsets.", py::arg("row"), py::arg("col")) - .def_readwrite("row", &MeshOffset::row, "Row offset in the mesh.") - .def_readwrite("col", &MeshOffset::col, "Column offset in the mesh.") + .def( + py::init([](size_t x, size_t y, size_t z) { return MeshCoordinate(x, y, z); }), + "Constructor with the specified 3D coordinate.", + py::arg("x"), + py::arg("y"), + py::arg("z")) + .def( + py::init([](const std::vector& coords) { return MeshCoordinate(coords); }), + "Constructor with the specified ND coordinate.", + py::arg("coords")) .def( "__repr__", - [](const MeshOffset& mo) { - return ""; + [](const MeshCoordinate& mc) { + std::ostringstream str; + str << mc; + return str.str(); }) - .def("__iter__", [](const MeshOffset& mo) { return py::iter(py::make_tuple(mo.row, mo.col)); }); + .def( + "__iter__", + [](const MeshCoordinate& mc) { return py::make_iterator(mc.coords().begin(), mc.coords().end()); }, + py::keep_alive<0, 1>()); auto py_mesh_device = static_cast>>(module.attr("MeshDevice")); py_mesh_device .def( - py::init([](const MeshShape& mesh_device_shape, + py::init([](const MeshShape& mesh_shape, size_t l1_small_size, size_t trace_region_size, size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, - const MeshOffset& offset, + const std::optional& offset, const std::vector& physical_device_ids) { return MeshDevice::create( MeshDeviceConfig{ - .mesh_shape = SimpleMeshShape(mesh_device_shape), - .offset = offset.row != 0 || offset.col != 0 - ? 
std::make_optional(offset.row, offset.col) - : std::nullopt, + .mesh_shape = mesh_shape, + .offset = offset, .physical_device_ids = physical_device_ids, }, l1_small_size, diff --git a/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp b/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp index 3d82d24714f..18995b49ed0 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp +++ b/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp @@ -52,58 +52,58 @@ class ShardTensorToMesh : public TensorToMesh { class ShardTensorTo2dMesh : public TensorToMesh { public: - ShardTensorTo2dMesh(const MeshShape& mesh_shape, const Shard2dConfig& config) : - mesh_shape_(mesh_shape), config_(config) {} + ShardTensorTo2dMesh(size_t mesh_rows, size_t mesh_cols, const Shard2dConfig& config) : + mesh_rows_(mesh_rows), mesh_cols_(mesh_cols), config_(config) {} std::vector map(const Tensor& tensor) const override { - const auto [rows, cols] = mesh_shape_; const auto [row_dim, col_dim] = config_; std::vector row_tensors; // Shard along rows if (!row_dim.has_value()) { - row_tensors.reserve(rows); - for (int i = 0; i < rows; ++i) { + row_tensors.reserve(mesh_rows_); + for (int i = 0; i < mesh_rows_; ++i) { row_tensors.push_back(tensor); } } else { - row_tensors = experimental::xtensor::chunk(tensor, rows, *row_dim); + row_tensors = experimental::xtensor::chunk(tensor, mesh_rows_, *row_dim); } std::vector tensor_shards; - tensor_shards.reserve(rows * cols); + tensor_shards.reserve(mesh_rows_ * mesh_cols_); // Shard along columns if (!col_dim.has_value()) { for (const auto& t : row_tensors) { - for (int i = 0; i < cols; ++i) { + for (int i = 0; i < mesh_cols_; ++i) { tensor_shards.push_back(t); } } } else { for (const auto& t : row_tensors) { - auto col_chunks = experimental::xtensor::chunk(t, cols, *col_dim); + auto col_chunks = experimental::xtensor::chunk(t, mesh_cols_, *col_dim); tensor_shards.insert(tensor_shards.end(), col_chunks.begin(), col_chunks.end()); } } TT_FATAL( - static_cast(tensor_shards.size()) == rows * cols, + static_cast(tensor_shards.size()) == mesh_rows_ * mesh_cols_, "ShardTensorTo2dMesh: Sharding failed. Number of shards should match the product of the mesh " "dimensions. 
Size: {}, rows: {}, cols: {}", tensor_shards.size(), - rows, - cols); + mesh_rows_, + mesh_cols_); return tensor_shards; } tt::tt_metal::DistributedTensorConfig config() const override { - return DistributedTensorConfig{ShardTensor2D{ShardMesh{mesh_shape_.num_rows, mesh_shape_.num_cols}}}; + return DistributedTensorConfig{ShardTensor2D{ShardMesh{mesh_rows_, mesh_cols_}}}; } private: - MeshShape mesh_shape_; + size_t mesh_rows_ = 0; + size_t mesh_cols_ = 0; Shard2dConfig config_; }; @@ -121,18 +121,17 @@ class ConcatMeshToTensor : public MeshToTensor { class Concat2dMeshToTensor : public MeshToTensor { public: - Concat2dMeshToTensor(MeshDevice& mesh_device, const Concat2dConfig& config) : - mesh_shape_(mesh_device.shape()), config_(config) {} + Concat2dMeshToTensor(size_t mesh_rows, size_t mesh_cols, const Concat2dConfig& config) : + mesh_rows_(mesh_rows), mesh_cols_(mesh_cols), config_(config) {} Tensor compose(const std::vector& tensors) const override { - const auto [rows, cols] = mesh_shape_; const auto [row_dim, col_dim] = config_; std::vector row_concatenated; - row_concatenated.reserve(rows); - for (int i = 0; i < rows; ++i) { - auto row_start = tensors.begin() + i * cols; - auto row_end = row_start + cols; + row_concatenated.reserve(mesh_rows_); + for (int i = 0; i < mesh_rows_; ++i) { + auto row_start = tensors.begin() + i * mesh_cols_; + auto row_end = row_start + mesh_cols_; std::vector row_tensors(row_start, row_end); row_concatenated.push_back(experimental::xtensor::concat(row_tensors, col_dim)); } @@ -141,7 +140,8 @@ class Concat2dMeshToTensor : public MeshToTensor { } private: - MeshShape mesh_shape_; + size_t mesh_rows_ = 0; + size_t mesh_cols_ = 0; Concat2dConfig config_; }; @@ -160,11 +160,13 @@ std::unique_ptr shard_tensor_to_2d_mesh_mapper( TT_FATAL( config.row_dim.has_value() || config.col_dim.has_value(), "Sharding a tensor to 2D mesh requires at least one dimension to shard"); + TT_FATAL(mesh_shape.dims() == 2, "Mesh shape is not 2D: {}", mesh_shape); + TT_FATAL(mesh_device.shape().dims() == 2, "Mesh device is not configured as a 2D mesh: {}", mesh_device.shape()); TT_FATAL( - mesh_shape.num_rows <= mesh_device.shape().num_rows && // - mesh_shape.num_cols <= mesh_device.shape().num_cols, + mesh_shape[0] <= mesh_device.shape()[0] && // + mesh_shape[1] <= mesh_device.shape()[1], "Device mesh shape does not match the provided mesh shape."); - return std::make_unique(mesh_shape, config); + return std::make_unique(mesh_shape[0], mesh_shape[1], config); } std::unique_ptr concat_mesh_to_tensor_composer(int dim) { @@ -177,7 +179,8 @@ std::unique_ptr concat_2d_mesh_to_tensor_composer(MeshDevice& mesh "Dimensions in 'dims' must be different; got row_dim: {}, col_dim: {}", config.row_dim, config.col_dim); - return std::make_unique(mesh_device, config); + TT_FATAL(mesh_device.shape().dims() == 2, "Mesh device is not configured as a 2D mesh: {}", mesh_device.shape()); + return std::make_unique(mesh_device.shape()[0], mesh_device.shape()[1], config); } Tensor distribute_tensor( diff --git a/ttnn/cpp/ttnn/distributed/types.hpp b/ttnn/cpp/ttnn/distributed/types.hpp index de8ae02c43a..c97df2a667d 100644 --- a/ttnn/cpp/ttnn/distributed/types.hpp +++ b/ttnn/cpp/ttnn/distributed/types.hpp @@ -13,9 +13,7 @@ namespace ttnn::distributed { using MeshShape = tt::tt_metal::distributed::MeshShape; -using SimpleMeshShape = tt::tt_metal::distributed::SimpleMeshShape; using MeshCoordinate = tt::tt_metal::distributed::MeshCoordinate; -using MeshOffset = tt::tt_metal::distributed::MeshOffset; using 
DeviceIds = tt::tt_metal::distributed::DeviceIds; using MeshDevice = tt::tt_metal::distributed::MeshDevice; using SystemMesh = tt::tt_metal::distributed::SystemMesh; @@ -33,10 +31,8 @@ using ttnn::distributed::MeshCoordinate; using ttnn::distributed::MeshDevice; using ttnn::distributed::MeshDeviceConfig; using ttnn::distributed::MeshDeviceView; -using ttnn::distributed::MeshOffset; using ttnn::distributed::MeshShape; using ttnn::distributed::MeshSubDeviceManagerId; -using ttnn::distributed::SimpleMeshShape; using ttnn::distributed::SystemMesh; } // namespace ttnn diff --git a/ttnn/cpp/ttnn/tensor/storage.cpp b/ttnn/cpp/ttnn/tensor/storage.cpp index cd6fb20179d..e8543b0b199 100644 --- a/ttnn/cpp/ttnn/tensor/storage.cpp +++ b/ttnn/cpp/ttnn/tensor/storage.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "ttnn/tensor/storage.hpp" +#include "tt-metalium/mesh_coord.hpp" namespace tt::tt_metal { @@ -26,20 +27,19 @@ MultiDeviceStorage::MultiDeviceStorage( // tensor spec. // // For now, this code ensures MeshBuffer backed tensors are compatible with the rest of the ops infra. - const auto [num_rows, num_cols] = mesh_buffer->device()->shape(); + const auto& mesh_shape = mesh_buffer->device()->shape(); + distributed::MeshCoordinateRange range(mesh_shape); - ordered_device_ids.reserve(num_rows * num_cols); - buffers.reserve(num_rows * num_cols); - specs.reserve(num_rows * num_cols); + ordered_device_ids.reserve(mesh_shape.mesh_size()); + buffers.reserve(mesh_shape.mesh_size()); + specs.reserve(mesh_shape.mesh_size()); - for (int row = 0; row < num_rows; ++row) { - for (int col = 0; col < num_cols; ++col) { - auto buffer = mesh_buffer->get_device_buffer(distributed::MeshCoordinate(row, col)); - const int device_id = buffer->device()->id(); - ordered_device_ids.push_back(device_id); - buffers.emplace(device_id, std::move(buffer)); - specs.emplace(device_id, tensor_spec); - } + for (const auto& coord : range) { + auto buffer = mesh_buffer->get_device_buffer(coord); + const int device_id = buffer->device()->id(); + ordered_device_ids.push_back(device_id); + buffers.emplace(device_id, std::move(buffer)); + specs.emplace(device_id, tensor_spec); } } diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp index baae4fb53a4..4673418e56c 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp @@ -583,7 +583,6 @@ Tensor to_host_mesh_tensor(const Tensor& tensor, bool blocking) { const auto& mesh_buffer = storage.mesh_buffer; ttnn::MeshDevice* device = mesh_buffer->device(); distributed::MeshCommandQueue& mesh_cq = device->mesh_command_queue(); - const auto [num_rows, num_cols] = device->shape(); const auto num_buffers = storage.buffers.size(); std::vector shard_data_transfers; @@ -592,8 +591,7 @@ Tensor to_host_mesh_tensor(const Tensor& tensor, bool blocking) { specs.reserve(num_buffers); buffers.reserve(num_buffers); shard_data_transfers.reserve(num_buffers); - distributed::MeshCoordinateRange coord_range( - distributed::MeshCoordinate(0, 0), distributed::MeshCoordinate(num_rows - 1, num_cols - 1)); + distributed::MeshCoordinateRange coord_range(device->shape()); auto shard_coord = coord_range.begin(); for (int id : storage.ordered_device_ids) { std::vector host_buffer; @@ -771,7 +769,7 @@ MultiDeviceStorage shard_to_mesh_buffer( buffers.reserve(storage.buffers.size()); specs.reserve(storage.buffers.size()); - const auto [num_rows, num_cols] = mesh_device->shape(); + const auto& mesh_shape = mesh_device->shape(); TT_FATAL( 
storage.buffers.size() <= mesh_device->num_devices(), "Number of host buffers {} exceeds the number of shards {}", @@ -781,8 +779,7 @@ MultiDeviceStorage shard_to_mesh_buffer( std::vector shard_data_transfers; shard_data_transfers.reserve(storage.buffers.size()); - distributed::MeshCoordinateRange coord_range( - distributed::MeshCoordinate(0, 0), distributed::MeshCoordinate(num_rows - 1, num_cols - 1)); + distributed::MeshCoordinateRange coord_range(mesh_shape); auto shard_coord = coord_range.begin(); for (int i = 0; i < storage.buffers.size(); ++shard_coord, i++) { TensorSpec shard_tensor_spec( diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index 1d1d9eea9d5..ada0fd82c6d 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -156,7 +156,7 @@ def manage_config(name, value): WormholeComputeKernelConfig, GrayskullComputeKernelConfig, MeshShape, - MeshOffset, + MeshCoordinate, UnaryWithParam, UnaryOpType, BinaryOpType, diff --git a/ttnn/ttnn/distributed/distributed.py b/ttnn/ttnn/distributed/distributed.py index 46ee1e58c73..f5adb7c0f50 100644 --- a/ttnn/ttnn/distributed/distributed.py +++ b/ttnn/ttnn/distributed/distributed.py @@ -138,7 +138,7 @@ def open_mesh_device( trace_region_size: int = ttnn._ttnn.device.DEFAULT_TRACE_REGION_SIZE, num_command_queues: int = 1, dispatch_core_config: ttnn.DispatchCoreConfig = ttnn.DispatchCoreConfig(), - offset: ttnn.MeshOffset = ttnn.MeshOffset(row=0, col=0), + offset: Optional[ttnn.MeshCoordinate] = None, physical_device_ids: List[int] = [], ): """ @@ -150,7 +150,7 @@ def open_mesh_device( trace_region_size (int, optional): Size of the trace region. Defaults to ttnn._ttnn.device.DEFAULT_TRACE_REGION_SIZE. num_command_queues (int, optional): Number of command queues. Defaults to 1. dispatch_core_type (int, optional): Type of dispatch core. Defaults to DispatchCoreType.WORKER. - offset (ttnn.MeshOffset, optional): Offset in logical mesh coordinates for the mesh device. Defaults to (0, 0). + offset (ttnn.MeshCoordinate, optional): Offset in logical mesh coordinates for the mesh device. Defaults to None. physical_device_ids (List[int], optional): List of physical device IDs to use. Defaults to []. 
Returns: diff --git a/ttnn/ttnn/types.py b/ttnn/ttnn/types.py index b210fe90f5f..d8cd7380a52 100644 --- a/ttnn/ttnn/types.py +++ b/ttnn/ttnn/types.py @@ -65,7 +65,7 @@ class ShardStrategy(Enum): MeshShape = ttnn._ttnn.multi_device.MeshShape -MeshOffset = ttnn._ttnn.multi_device.MeshOffset +MeshCoordinate = ttnn._ttnn.multi_device.MeshCoordinate ShardOrientation = ttnn._ttnn.tensor.ShardOrientation ShardMode = ttnn._ttnn.tensor.ShardMode ShardSpec = ttnn._ttnn.tensor.ShardSpec From 3002a18df3b3ff599153f5619afe2a2378d61c82 Mon Sep 17 00:00:00 2001 From: Austin Ho <109362939+tt-aho@users.noreply.github.com> Date: Mon, 24 Feb 2025 22:50:00 -0500 Subject: [PATCH 287/316] Revert "Decouple control plane init and configuring routing tables" (#18273) --- .../routing/test_tt_fabric_multi_hop_sanity.cpp | 1 - .../perf_microbenchmark/routing/test_tt_fabric_sanity.cpp | 1 - tt_metal/fabric/control_plane.cpp | 1 + tt_metal/impl/device/device_pool.cpp | 7 ++----- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index 100a2c523fb..2a1f17eeaaf 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -236,7 +236,6 @@ int main(int argc, char** argv) { std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / "tt_metal/fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; auto control_plane = std::make_unique(tg_mesh_graph_desc_path.string()); - control_plane->configure_routing_tables(); int num_devices = tt_metal::GetNumAvailableDevices(); if (test_device_id_l >= num_devices) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index 5273e8d37b5..224972472e4 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -143,7 +143,6 @@ typedef struct test_board { device_handle_map = tt::tt_metal::detail::CreateDevices(available_chip_ids); if (metal_fabric_init_level == 0) { _init_control_plane(mesh_graph_descriptor); - control_plane->configure_routing_tables(); } else { control_plane = tt::DevicePool::instance().get_control_plane(); } diff --git a/tt_metal/fabric/control_plane.cpp b/tt_metal/fabric/control_plane.cpp index c6595f0a802..0e0118a8bb7 100644 --- a/tt_metal/fabric/control_plane.cpp +++ b/tt_metal/fabric/control_plane.cpp @@ -52,6 +52,7 @@ ControlPlane::ControlPlane(const std::string& mesh_graph_desc_file) { this->routing_table_generator_->print_routing_tables(); this->initialize_from_mesh_graph_desc_file(mesh_graph_desc_file); + this->configure_routing_tables(); // Printing, only enabled with log_debug this->print_ethernet_channels(); diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index b7f1704a30b..a9c9840a9f6 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -403,12 +403,9 @@ void DevicePool::add_devices_to_pool(const std::vector& device_ids) { } // TODO: add handling of EDM - // Initialize control plane, does not configure kernels/routing tables - // We always need a control plane for mapping of logical devices to 
physical devices - _inst->initialize_control_plane(); if (this->fabric_setting == detail::FabricSetting::FABRIC) { - // write routing tables to all ethernet cores - this->control_plane->configure_routing_tables(); + // Initialize control plane, which writes routing tables to all ethernet cores + _inst->initialize_control_plane(); } this->using_fast_dispatch = (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr); if (this->using_fast_dispatch) { From 1fa6f52fcfa5b3c1a836295de3283d7841aa5005 Mon Sep 17 00:00:00 2001 From: Brian Liu Date: Mon, 24 Feb 2025 19:18:45 +0000 Subject: [PATCH 288/316] #18148: Modify create_row_major_owned_buffer to directly return owned buffer if possible --- ttnn/cpp/pybind11/pytensor.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ttnn/cpp/pybind11/pytensor.cpp b/ttnn/cpp/pybind11/pytensor.cpp index 51430ff6b2c..b4b0fffeb4c 100644 --- a/ttnn/cpp/pybind11/pytensor.cpp +++ b/ttnn/cpp/pybind11/pytensor.cpp @@ -401,6 +401,12 @@ owned_buffer::Buffer create_row_major_owned_buffer( return owned_buffer; } + // No modifications needed; directly return buffer + if (tensor_spec.layout() == Layout::ROW_MAJOR and tensor_spec.logical_2d_shape() == tensor_spec.physical_shape()) { + return owned_buffer; + } + + // TODO: Switch to use span in decode_tensor_data and avoid data copy here auto physical_data = owned_buffer.get(); // See implementation for documentation From ee7806f2dc89c31b05b5a8bef2caeda2612f941d Mon Sep 17 00:00:00 2001 From: Adrian Morrison Date: Mon, 24 Feb 2025 20:53:08 -0800 Subject: [PATCH 289/316] fix for falcon regression (bad multi-device behavior) (#18221) --- ttnn/cpp/ttnn/device_operation.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ttnn/cpp/ttnn/device_operation.hpp b/ttnn/cpp/ttnn/device_operation.hpp index c9794df5d6e..3e67bc6e5cf 100644 --- a/ttnn/cpp/ttnn/device_operation.hpp +++ b/ttnn/cpp/ttnn/device_operation.hpp @@ -432,8 +432,8 @@ typename device_operation_t::tensor_return_value_t launch_on_multi_device( std::vector outputs; outputs.reserve(num_shards); - for (auto shard_index = 0; shard_index < num_shards; shard_index++) { - auto device = storage.get_buffer_for_device_id(shard_index)->device(); + for (const auto &[shard_index, buffer] : storage.buffers ) { + auto device = buffer->device(); auto shard_tensor_args = get_shard_tensor_args(shard_index, device, tensor_args); outputs.push_back(launch_on_single_device(cq_id, operation_attributes, shard_tensor_args)); } From 0316ba7bbbd32cf068afe829435ba8d91fb0b289 Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Tue, 25 Feb 2025 05:03:54 +0000 Subject: [PATCH 290/316] Don't write fatal logs when any exception is thrown (#18258) ### Ticket https://github.com/tenstorrent/tt-metal/issues/18059 ### Problem description Currently we write FATAL logs on every invocation of TT_THROW, which creates strange logs for our users even when the exception is caught and handled ### What's changed Use `log_debug` instead of `log_fatal` when throwing an exception. If the exception won't get caught, the user should still see the error with text in the logs.
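As an illustration only (not part of this change), a minimal caller-side sketch of the new behavior; the `tt-metalium/assert.hpp` include path, the `run_with_recovery()` function, and the message contents are assumptions for the example:

```cpp
#include <tt-metalium/assert.hpp>  // assumed include path providing TT_THROW
#include <iostream>
#include <stdexcept>

void run_with_recovery() {
    try {
        // With this change, TT_THROW writes only a debug-level log entry here
        // (unless TT_ASSERT_ABORT is set), then throws std::runtime_error.
        TT_THROW("Unsupported configuration: {}", 42);
    } catch (const std::runtime_error& e) {
        // The caller can recover; no FATAL entry is emitted for a handled exception.
        std::cout << "Recovered: " << e.what() << std::endl;
    }
}
```

If the exception is never caught, it still propagates with the full message and backtrace, so the failure remains visible to the user.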
### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13510968044) - [x] New/Existing tests provide coverage for changes --- tt_metal/api/tt-metalium/assert.hpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tt_metal/api/tt-metalium/assert.hpp b/tt_metal/api/tt-metalium/assert.hpp index 9c6a31f35fd..6ac825f2316 100644 --- a/tt_metal/api/tt-metalium/assert.hpp +++ b/tt_metal/api/tt-metalium/assert.hpp @@ -92,20 +92,25 @@ inline std::string backtrace_to_string(int size = 64, int skip = 2, const std::s template [[noreturn]] void tt_throw_impl( char const* file, int line, char const* assert_type, char const* condition_str, Args const&... args) { + if (std::getenv("TT_ASSERT_ABORT")) { + if constexpr (sizeof...(args) > 0) { + log_fatal(args...); + Logger::get().flush(); + } + abort(); + } + std::stringstream trace_message_ss = {}; trace_message_ss << assert_type << " @ " << file << ":" << line << ": " << condition_str << std::endl; if constexpr (sizeof...(args) > 0) { trace_message_ss << "info:" << std::endl; trace_message_ss << fmt::format(args...) << std::endl; - log_fatal(args...); + log_debug(args...); + Logger::get().flush(); } trace_message_ss << "backtrace:\n"; trace_message_ss << tt::assert::backtrace_to_string(100, 3, " --- "); trace_message_ss << std::flush; - Logger::get().flush(); - if (std::getenv("TT_ASSERT_ABORT")) { - abort(); - } throw std::runtime_error(trace_message_ss.str()); } From a346d53ab225215f5f71cf3eb1eea61796b8da19 Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Tue, 25 Feb 2025 05:04:36 +0000 Subject: [PATCH 291/316] Properly handle missing file and exceptions in ttnn.as_tensor (#18261) ### Ticket https://github.com/tenstorrent/tt-metal/issues/18059 ### Problem description `ttnn.as_tensor` silently hides exceptions thrown trying to load a tensor from file ### What's changed Check if the cache file is present first, in this case no loading should be attempted If we attempted to load a tensor and an exception occurred we should show it as a warning, because we can recover from it by regenerating cache ### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13510971997) - [x] New/Existing tests provide coverage for changes --- ttnn/ttnn/operations/core.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ttnn/ttnn/operations/core.py b/ttnn/ttnn/operations/core.py index c47d76b4d3c..409480605bb 100644 --- a/ttnn/ttnn/operations/core.py +++ b/ttnn/ttnn/operations/core.py @@ -625,6 +625,10 @@ def from_torch_and_dump( cache_file_name = f"{cache_file_name}{storage_type}_dtype_{dtype_name}_layout_{layout_name}.bin" + cache_path = pathlib.Path(cache_file_name) + if not cache_path.exists() or not cache_path.is_file(): + return from_torch_and_dump(tensor, dtype, layout, cache_file_name, mesh_mapper) + try: tensor = ttnn._ttnn.tensor.load_tensor(cache_file_name, device=device) if tuple(tensor.shape) != tuple(tensor.shape): @@ -633,7 +637,8 @@ def from_torch_and_dump( ) tensor = from_torch_and_dump(tensor, dtype, layout, cache_file_name, mesh_mapper) logger.debug(f"Loaded cache for {cache_file_name} of shape {tensor.shape}") - except (FileNotFoundError, RuntimeError): + except RuntimeError as e: + log.warning(f"Failed to load cache for {cache_file_name}: {e}") tensor = from_torch_and_dump(tensor, dtype, layout, cache_file_name, mesh_mapper) return tensor From 57ba436ec4366d9129df6a53b2d9e1e828ef0356 Mon Sep 17 00:00:00 2001 
From: John Bauman Date: Fri, 21 Feb 2025 22:19:16 +0000 Subject: [PATCH 292/316] #16643: Disabling dispatch posting atomic increments on blackhole We seem to have NOC issues when posting atomic increments to ethernet cores on Blackhole, so disable it there. This may lead to a 250 cycle increase in GO message latency, worst-case. --- tt_metal/hw/firmware/src/brisc.cc | 13 +++++++++++-- tt_metal/hw/inc/blackhole/dev_mem_map.h | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 048cffe6106..992b4dd8d67 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -385,6 +385,15 @@ int main() { noc_local_state_init(noc_index); uint8_t prev_noc_mode = DM_DEDICATED_NOC; + +#if defined(ARCH_BLACKHOLE) + // When dispatch_s is on an ethernet core on blockhole, we've been seeing + // issues where posted atomic incremenets seem to fail to complete. + const bool post_atomic_increments = false; +#else + const bool post_atomic_increments = true; +#endif + while (1) { init_sync_registers(); reset_ncrisc_with_iram(); @@ -423,7 +432,7 @@ int main() { 1, 31 /*wrap*/, false /*linked*/, - true /*posted*/); + post_atomic_increments /*posted*/); } } @@ -550,7 +559,7 @@ int main() { 1, 31 /*wrap*/, false /*linked*/, - true /*posted*/); + post_atomic_increments /*posted*/); mailboxes->launch_msg_rd_ptr = (launch_msg_rd_ptr + 1) & (launch_msg_buffer_num_entries - 1); } } diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h b/tt_metal/hw/inc/blackhole/dev_mem_map.h index 7a6bdd3e585..f0a87e1567c 100644 --- a/tt_metal/hw/inc/blackhole/dev_mem_map.h +++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h @@ -48,7 +48,7 @@ ///////////// // Firmware/kernel code holes -#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 256) +#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 512) // TODO: perhaps put NCRISC FW in the scratch area and free 1.5K after init (GS/WH) #define MEM_NCRISC_FIRMWARE_SIZE 1536 #define MEM_TRISC0_FIRMWARE_SIZE 1536 From aa4b300d336e1095799bb7552638011c4f8583ef Mon Sep 17 00:00:00 2001 From: Virdhatchani Narayanamoorthy <138196495+VirdhatchaniKN@users.noreply.github.com> Date: Tue, 25 Feb 2025 14:19:34 +0530 Subject: [PATCH 293/316] #17863: Remove pop for eps in BN (#18202) ### Ticket https://github.com/tenstorrent/tt-metal/issues/17863 ### Problem description Need to remove pop for eps value as it is a scalar const. Previously, `cb_eps` was being popped after the first iteration, requiring unnecessary repopulation. Removing the pop line ensures epsilon is read once and retained throughout iterations, preventing redundant operations. 
### What's changed Removal of pop_front for eps scalar value ### Checklist - [x] [All post-commit tests](https://github.com/tenstorrent/tt-metal/actions/runs/13499693686) - [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13499697318) - [ ] Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) - [Link to test](https://github.com/tenstorrent/tt-metal/actions/runs/13499702976) - [x] [(Single-card) Demo tests](https://github.com/tenstorrent/tt-metal/actions/runs/13499706954) - [x] [(Single-card) Device perf regressions](https://github.com/tenstorrent/tt-metal/actions/runs/13509644007) - [ ] [Single-card Model perf tests](https://github.com/tenstorrent/tt-metal/actions/runs/13499714894) --- tests/ttnn/unit_tests/operations/test_batch_norm.py | 12 +++--------- .../device/kernels/compute/batch_norm_kernel.cpp | 8 +++----- .../kernels/compute/batch_norm_sfpu_kernel.cpp | 9 ++++----- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_batch_norm.py b/tests/ttnn/unit_tests/operations/test_batch_norm.py index fc2ab1abb6c..8846ad2256d 100644 --- a/tests/ttnn/unit_tests/operations/test_batch_norm.py +++ b/tests/ttnn/unit_tests/operations/test_batch_norm.py @@ -19,9 +19,7 @@ [ *(torch.Size([n, c, 32, 32]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), *(torch.Size([n, c, 23, 23]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), - *(torch.Size([n, c, 64, 120]) for n, c in product([1, 2], [1, 2, 3])), - torch.Size([3, 1, 64, 120]), - torch.Size([3, 2, 64, 120]), + *(torch.Size([n, c, 64, 120]) for n, c in product([1, 2, 3], [1, 2, 3, 4])), ], ) @pytest.mark.parametrize( @@ -171,9 +169,7 @@ def test_BN_fp32_full_value(device, channel_size, eps, weight, bias): [ *(torch.Size([n, c, 32, 32]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), *(torch.Size([n, c, 23, 23]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), - *(torch.Size([n, c, 64, 120]) for n, c in product([1, 2], [1, 2, 3])), - torch.Size([3, 1, 64, 120]), - torch.Size([3, 2, 64, 120]), + *(torch.Size([n, c, 64, 120]) for n, c in product([1, 2, 3], [1, 2, 3, 4])), ], ) @pytest.mark.parametrize( @@ -248,9 +244,7 @@ def test_batch_norm_fp32( [ *(torch.Size([n, c, 32, 32]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), *(torch.Size([n, c, 23, 23]) for n, c in product([1, 2, 3, 4], [1, 2, 3, 4])), - *(torch.Size([n, c, 64, 120]) for n, c in product([1, 2], [1, 2, 3])), - torch.Size([3, 1, 64, 120]), - torch.Size([3, 2, 64, 120]), + *(torch.Size([n, c, 64, 120]) for n, c in product([1, 2, 3], [1, 2, 3, 4])), ], ) @pytest.mark.parametrize( diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_kernel.cpp index 0de891f21cb..9ffbdeb1144 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_kernel.cpp @@ -53,7 +53,6 @@ ALWI void batchnorm_bcast_tiles( // 1/(sqrt(batch_var + eps)) cb_reserve_back(cb_den, onetile); cb_wait_front(cb_batch_var, 1); - cb_wait_front(cb_eps, 1); tile_regs_acquire(); add_tiles_init_with_dt(cb_batch_var, cb_eps); @@ -67,7 +66,6 @@ ALWI void batchnorm_bcast_tiles( tile_regs_release(); cb_pop_front(cb_batch_var, 1); - cb_pop_front(cb_eps, 1); cb_push_back(cb_den, onetile); // (input - batch_mean)/(sqrt(batch_var + 
eps)) = result @@ -164,6 +162,9 @@ void MAIN { sub_tiles_init(cb_other, cb_bcast); uint32_t complete_iterations = (num_tiles + tile_start) / tile_freq; uint32_t remaining_iterations = (num_tiles + tile_start) % tile_freq; + + cb_wait_front(cb_eps, 1); + for (uint32_t i = 0; i < complete_iterations; ++i, tile_start = 0) { batchnorm_bcast_tiles( cb_bcast, @@ -198,8 +199,5 @@ void MAIN { weight_has_value, bias_has_value); } - - constexpr uint32_t onetile = 1; - constexpr int dst0 = 0; } } // namespace NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp index 11ce1c3c086..007ed3e92ae 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/kernels/compute/batch_norm_sfpu_kernel.cpp @@ -63,7 +63,6 @@ ALWI void batchnorm_bcast_tiles( // 1/(sqrt(batch_var + eps)) cb_reserve_back(cb_den, onetile); cb_wait_front(cb_batch_var, onetile); - cb_wait_front(cb_eps, onetile); add_binary_tile_init(); rsqrt_tile_init(); @@ -86,7 +85,6 @@ ALWI void batchnorm_bcast_tiles( cb_push_back(cb_den, onetile); cb_pop_front(cb_batch_var, onetile); - cb_pop_front(cb_eps, onetile); // (input - batch_mean)/(sqrt(batch_var + eps)) = result cb_wait_front(cb_den, onetile); @@ -202,6 +200,10 @@ void MAIN { uint32_t complete_iterations = (num_tiles + tile_start) / tile_freq; uint32_t remaining_iterations = (num_tiles + tile_start) % tile_freq; + + constexpr uint32_t onetile = 1; + cb_wait_front(cb_eps, onetile); + for (uint32_t i = 0; i < complete_iterations; ++i, tile_start = 0) { batchnorm_bcast_tiles( cb_bcast, @@ -236,8 +238,5 @@ void MAIN { weight_has_value, bias_has_value); } - - constexpr uint32_t onetile = 1; - constexpr int dst0 = 0; } } // namespace NAMESPACE From 9dd5351f0acf5c7fdab08841515900241eb53672 Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Mon, 24 Feb 2025 18:02:45 +0000 Subject: [PATCH 294/316] #18206 Conv2d Block Sharded with ReLu If Conv2d has fused Relu with blocks sharding and activation block height override is used, pcc would fail as state of the packer wan't properly cleared (relu disabled) when compute kernel starts processing new block. 
--- .../unit_tests/operations/test_new_conv2d.py | 56 +++++++++++++++++++ .../conv_bmm_tilize_col_major_out_blocks.cpp | 3 + 2 files changed, 59 insertions(+) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 7a6a83ec276..dbc28079e16 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -72,6 +72,7 @@ def run_conv( weight_mesh_mapper=None, output_mesh_composer=None, enable_split_reader=False, + activation="", ): if isinstance(device, ttnn.MeshDevice): assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh" @@ -102,6 +103,8 @@ def run_conv( dilation=(dilation, dilation), groups=groups, ) + if activation == "relu": + torch_out_golden_tensor = torch.nn.functional.relu(torch_out_golden_tensor) reader_patterns_cache = {} @@ -134,6 +137,7 @@ def run_conv( enable_split_reader=enable_split_reader, enable_subblock_padding=False, output_layout=output_layout, + activation=activation, ) compute_config = ttnn.init_device_compute_kernel_config( device.arch(), @@ -2796,3 +2800,55 @@ def test_small_in_large_out_channels_auto_shard(device, torch_tensor_map): None, auto_shard=True, ) + + +# fmt: off +@pytest.mark.parametrize( + "batch, input_channels, output_channels, input_height, input_width, kernel, stride, padding", + ( + (1, 64, 64, 128, 128, (3, 3), (1, 1), (1, 1)), + ), +) +#fmt: on + +@pytest.mark.parametrize("shard_layout", [BS]) +@pytest.mark.parametrize("activation", ["relu"]) + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384*2}], indirect=True) +def test_block_sharding_relu_act_block_h( + device, + torch_tensor_map, + batch, + input_channels, + output_channels, + input_height, + input_width, + kernel, + stride, + padding, + shard_layout, + activation, +): + config_override = {} + config_override["act_block_h"] = 32 + run_conv( + device, + torch_tensor_map, + ttnn.MathFidelity.LoFi, + ttnn.bfloat16, + ttnn.bfloat16, + batch, + output_channels, + input_channels, + input_height, + input_width, + kernel[0], + kernel[1], + stride[0], + stride[1], + padding[0], + padding[1], + config_override=config_override, + shard_layout=shard_layout, + activation=activation, + ) diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp index 94545fc3704..7a7b06971c4 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp @@ -445,6 +445,9 @@ void MAIN { if constexpr (!tilize_in0) { mm_block_init_short(mm_in0_cb_id, in1_cb_id, false, out_subblock_w, out_subblock_h, in0_block_w); +#ifdef PACK_RELU + PACK((llk_pack_relu_config(ReluType::NO_RELU))); +#endif } } } // for in0_num_blocks_h From 2d6e2726c62b5de224338275bde81d9327a1e98c Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Tue, 25 Feb 2025 09:16:48 -0500 Subject: [PATCH 295/316] #18283: [skip ci] Increase yolov4 e2e perf threshold because there's some non-det perf drop happening around the beginning of week Feb 23, 2025 --- models/demos/yolov4/tests/test_perf_yolo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/demos/yolov4/tests/test_perf_yolo.py b/models/demos/yolov4/tests/test_perf_yolo.py index 1b07addbbfe..28c7c82cdb6 100644 --- 
a/models/demos/yolov4/tests/test_perf_yolo.py +++ b/models/demos/yolov4/tests/test_perf_yolo.py @@ -30,7 +30,7 @@ def get_expected_compile_time_sec(): def get_expected_inference_time_sec(): - return 0.237 + return 0.25 @pytest.mark.models_performance_bare_metal From 9b959d2cbd9abc472fcabfa16154f304140e150b Mon Sep 17 00:00:00 2001 From: Salar Hosseini <159165450+skhorasganiTT@users.noreply.github.com> Date: Tue, 25 Feb 2025 10:48:53 -0500 Subject: [PATCH 296/316] [skip ci] Update perf and latest features for llm models (Feb 24) (#18247) --- README.md | 13 ++++++++----- models/MODEL_UPDATES.md | 13 +++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9ff79c7fb7e..2bc7ceb6426 100644 --- a/README.md +++ b/README.md @@ -26,23 +26,26 @@ | Model | Batch | Hardware | ttft (ms) | t/s/u | Target
t/s/u | t/s | TT-Metalium Release | vLLM Tenstorrent Repo Release | |---------------------------------------------------------------|-------|----------------------------------------------------------|-----------|-------|-----------------|--------|---------------------------------------------------|---------------------------------------------------------------------------------------------------| -| [Falcon 7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 18.1 | 26 | 579.2 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | | +| [Falcon 7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 18.1 | 26 | 579.2 | [v0.56.0-rc33](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc33) | | | [Mistral 7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) | | | [Mamba 2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 48 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) | | | [Llama 3.1 8B](./models/demos/llama3) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 168 | 24.0 | 23 | 768.0 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | | [Llama 3.2 1B](./models/demos/llama3) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 56 | 59.4 | 160 | 1900.8 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | | [Llama 3.2 3B](./models/demos/llama3) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 97 | 36.5 | 60 | 1168.0 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | -| [Llama 3.2 11B Vision (TP=2)](./models/demos/llama3) | 16 | [n300](https://tenstorrent.com/hardware/wormhole) | 2550 | 15.8 | 17 | 252.8 | [v0.56.0-rc3](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc3) | [0fde628](https://github.com/tenstorrent/vllm/tree/0fde6285eb133f5c71522840a1beb6b57a2e3b70) | -| [Falcon 7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 88 | 15.5 | 26 | 3968.0 | [v0.55.0-rc18](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc18) | | +| [Llama 3.2 11B Vision (TP=2)](./models/demos/llama3) | 16 | [n300](https://tenstorrent.com/hardware/wormhole) | 2550 | 15.8 | 17 | 252.8 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | +| [Qwen 2.5 7B (TP=2)](./models/demos/llama3) | 32 | [n300](https://tenstorrent.com/hardware/wormhole) | 126 | 32.5 | 38 | 1040.0 | [v0.56.0-rc33](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc33) | [9ac3783](https://github.com/tenstorrent/vllm/tree/9ac3783d5e3a4547f879f2cdadaab8571047a0a8) | +| [Falcon 7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 88 | 15.5 | 26 | 3968.0 | [v0.56.0-rc33](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc33) | | | [Llama 3.1 70B (TP=8)](./models/demos/t3000/llama3_70b) | 
32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.54.0-rc2](https://github.com/tenstorrent/tt-metal/tree/v0.54.0-rc2) | [9531611](https://github.com/tenstorrent/vllm/tree/953161188c50f10da95a88ab305e23977ebd3750) | | [Falcon 40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.55.0-rc20](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc20) | | | [Mixtral 8x7B (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 227 | 14.9 | 33 | 476.8 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | | +| [Qwen 2.5 72B (TP=8)](./models/demos/llama3) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 333 | 14.5 | 20 | 464.0 | [v0.56.0-rc33](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc33) | [9ac3783](https://github.com/tenstorrent/vllm/tree/9ac3783d5e3a4547f879f2cdadaab8571047a0a8) | +| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](./models/demos/llama3) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | TBD | 16.4 | 20 | 524.8 | [v0.56.0-rc33](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc33) | [9ac3783](https://github.com/tenstorrent/vllm/tree/9ac3783d5e3a4547f879f2cdadaab8571047a0a8) | | [Falcon 7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 223 | 4.8 | 26 | 4915.2 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | | | [Llama 3.1 70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | | | [Llama 3.1 70B (TP=32)](./models/demos/llama3) | 32 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 763 | 13.5 | 80 | 432.0 | [v0.56.0-rc6](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc6) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | -| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](https://github.com/tenstorrent/tt-metal/tree/main/models/demos/llama3) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 1113 | 16.4 | 33 |524.8 | [main](https://github.com/tenstorrent/tt-metal/) | [b9564bf](https://github.com/tenstorrent/vllm/tree/b9564bf364e95a3850619fc7b2ed968cc71e30b7) | -> **Last Update:** February 10, 2025 + +> **Last Update:** February 24, 2025 > > **Notes:** > diff --git a/models/MODEL_UPDATES.md b/models/MODEL_UPDATES.md index d76b8df8387..78999bb9bd7 100644 --- a/models/MODEL_UPDATES.md +++ b/models/MODEL_UPDATES.md @@ -4,6 +4,19 @@ > > Please refer to the front-page [README](../README.md) for the latest verified release for each model. +## February 24, 2025 + +### [DeepSeek R1 Distill Llama 3.3 70B](demos/llama3) +- Added support for DeepSeek R1 Distill Llama 3.3 70B on T3000. + +### [Qwen 2.5](demos/llama3) +- Added support for Qwen2.5-7B on N300 and Qwen2.5-72B on T3000. + +### [Llama 3.1/3.2](demos/llama3) +> **Note:** This feature is available as of release [v0.56.0-rc37](https://github.com/tenstorrent/tt-metal/tree/v0.56.0-rc37) +- Overhauled the demo script (now called [simple_text_demo.py](demos/llama3/demo/simple_text_demo.py)) to use a simplified causal generation interface. +- Added support for custom input argument overrides to the demo. 
+ ## February 10, 2025 ### [Llama 3.1/3.2](demos/llama3) From 04368e20255d0c30739ad511f8c818d3f3517907 Mon Sep 17 00:00:00 2001 From: Dimitri Gnidash <119051828+dimitri-tenstorrent@users.noreply.github.com> Date: Tue, 25 Feb 2025 11:15:23 -0500 Subject: [PATCH 297/316] Revert "Allow the user to select the version of the docs" (#18291) Reverts tenstorrent/tt-metal#17434 --- .github/workflows/code-analysis.yaml | 1 + .../workflows/docs-latest-public-wrapper.yaml | 2 - .github/workflows/docs-latest-public.yaml | 20 +++---- .github/workflows/package-and-release.yaml | 2 +- docs/published_versions.json | 7 --- docs/source/common/_static/tt_theme.css | 8 --- docs/source/common/_templates/layout.html | 12 ----- docs/source/common/_templates/versions.html | 54 ------------------- 8 files changed, 9 insertions(+), 97 deletions(-) delete mode 100644 docs/published_versions.json delete mode 100644 docs/source/common/_templates/versions.html diff --git a/.github/workflows/code-analysis.yaml b/.github/workflows/code-analysis.yaml index 331921254f1..b096bb0c5e0 100644 --- a/.github/workflows/code-analysis.yaml +++ b/.github/workflows/code-analysis.yaml @@ -46,6 +46,7 @@ jobs: distro: ${{ inputs.distro }} version: ${{ inputs.version }} architecture: ${{ inputs.architecture }} + clang-tidy: name: 🤖 Clang Tidy needs: build-docker-image diff --git a/.github/workflows/docs-latest-public-wrapper.yaml b/.github/workflows/docs-latest-public-wrapper.yaml index 07164ddd381..35c1f016a80 100644 --- a/.github/workflows/docs-latest-public-wrapper.yaml +++ b/.github/workflows/docs-latest-public-wrapper.yaml @@ -15,5 +15,3 @@ jobs: needs: build-artifact uses: ./.github/workflows/docs-latest-public.yaml secrets: inherit - with: - version: latest diff --git a/.github/workflows/docs-latest-public.yaml b/.github/workflows/docs-latest-public.yaml index ef671c2f436..d3e918a6dcc 100644 --- a/.github/workflows/docs-latest-public.yaml +++ b/.github/workflows/docs-latest-public.yaml @@ -2,11 +2,6 @@ name: "[internal] Docs build and deploy to GitHub pages on main impl" on: workflow_call: - inputs: - version: - required: false - type: string - default: latest concurrency: # Note that people may spam the post-commit pipeline on their branch, and @@ -25,6 +20,7 @@ jobs: matrix: arch: [grayskull] env: + DOCS_VERSION: latest ARCH_NAME: ${{ matrix.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib @@ -61,23 +57,21 @@ jobs: - name: Prepare artifact - move output run: | mkdir gh_pages - mv docs/build/html gh_pages/${{ inputs.version }} + mv docs/build/html gh_pages/$DOCS_VERSION - name: Prepare artifact - create .nojekyll run: | touch gh_pages/.nojekyll - name: Prepare artifact - create root index run: | touch gh_pages/index.html + - name: Upload artifact + uses: actions/upload-pages-artifact@v3.0.1 + with: + path: "gh_pages" - name: Deploy to GitHub Pages if: ${{ github.ref == 'refs/heads/main' }} - uses: JamesIves/github-pages-deploy-action@v4 id: deployment - with: - token: ${{ secrets.GITHUB_TOKEN }} - branch: gh-pages - target-folder: ${{ inputs.version }} - folder: ./gh_pages/${{ inputs.version }} - force: false + uses: actions/deploy-pages@v4.0.4 - name: Delete artifact if deployment failed # When the deployment API call fails, the artifacts are not cleaned up correctly # and the next attempt (!) run will cause an error. 
diff --git a/.github/workflows/package-and-release.yaml b/.github/workflows/package-and-release.yaml index e6d92cb127e..7b4a4160167 100644 --- a/.github/workflows/package-and-release.yaml +++ b/.github/workflows/package-and-release.yaml @@ -210,7 +210,7 @@ jobs: create-and-upload-draft-release ] if: ${{ needs.get-params.outputs.is-release-candidate !='true' && needs.get-params.outputs.should-create-release == 'true' }} - uses: ./.github/workflows/docs-latest-public.yaml + uses: ./.github/workflows/docs-release.yaml with: version: ${{ needs.create-tag.outputs.version }} secrets: inherit diff --git a/docs/published_versions.json b/docs/published_versions.json deleted file mode 100644 index 978d82a8caf..00000000000 --- a/docs/published_versions.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "versions": [ - "latest", - "v0.55.0", - "v0.54.0" - ] -} diff --git a/docs/source/common/_static/tt_theme.css b/docs/source/common/_static/tt_theme.css index 9b81114bea5..a4f1176666d 100644 --- a/docs/source/common/_static/tt_theme.css +++ b/docs/source/common/_static/tt_theme.css @@ -453,11 +453,3 @@ html.writer-html5 background: var(--color-background-alt2) !important; color: var(--color-foreground) !important; } - -.rst-versions.shift-up { - overflow-y: auto; -} - -.project-versions { - font-size: small; -} diff --git a/docs/source/common/_templates/layout.html b/docs/source/common/_templates/layout.html index 34ce35ad1af..e80a0b044a7 100644 --- a/docs/source/common/_templates/layout.html +++ b/docs/source/common/_templates/layout.html @@ -17,18 +17,6 @@ {{ project }} -{%- if theme_display_version %} - {%- set nav_version = version %} - {%- if READTHEDOCS and current_version %} - {%- set nav_version = current_version %} - {%- endif %} - {%- if nav_version %} -
- {{ nav_version }} -
- {%- endif %} -{%- endif %} - {%- include "searchbox.html" %} {%- endblock %} diff --git a/docs/source/common/_templates/versions.html b/docs/source/common/_templates/versions.html deleted file mode 100644 index 6e118db8db7..00000000000 --- a/docs/source/common/_templates/versions.html +++ /dev/null @@ -1,54 +0,0 @@ -
- - Version: latest - - -
-
-
{{ _('Versions') }}
-
-
- -
-
- - From b2d121d09a262947da8a8aab515505797ca554fa Mon Sep 17 00:00:00 2001 From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:25:04 -0500 Subject: [PATCH 298/316] #18179: [skip ci] Add wheel to post commit wrapper for models since it uses wheel now (#18288) ### Ticket Quick fix for @mbahnasTT #18179 ### Problem description Provide context for the problem. ### What's changed Describe the approach used to solve the problem. Summarize the changes made and its impact. ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .github/workflows/models-post-commit-wrapper.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/models-post-commit-wrapper.yaml b/.github/workflows/models-post-commit-wrapper.yaml index b63c9fb6869..45e39806021 100644 --- a/.github/workflows/models-post-commit-wrapper.yaml +++ b/.github/workflows/models-post-commit-wrapper.yaml @@ -11,6 +11,8 @@ jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml secrets: inherit + with: + build-wheel: true models-unit-tests: needs: build-artifact secrets: inherit From 66be5d43196474c4ac824118336b1d74b685e1b9 Mon Sep 17 00:00:00 2001 From: Dimitri Gnidash <119051828+dimitri-tenstorrent@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:34:56 -0500 Subject: [PATCH 299/316] #18302: Fix the permissions missing in the docs wrapper workflow (#18301) ### Ticket #18302 ### Problem description When @bkeith-TT and I were re-deploying the docs back, the wrapper workflow did not have enough permissions to re-deploy the docs. Specifically, it was missing ``` id-token: write ``` ### What's changed Added the required permissions to the workflow. 
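For reference, the complete block added at the top of the wrapper workflow (shown in the diff below; the inline comments are explanatory only):

```yaml
permissions:
  id-token: write   # lets the reusable Pages deployment job authenticate via OIDC
  pages: write      # allows publishing the built docs to GitHub Pages
```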
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .github/workflows/docs-latest-public-wrapper.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/docs-latest-public-wrapper.yaml b/.github/workflows/docs-latest-public-wrapper.yaml index 35c1f016a80..60a56800209 100644 --- a/.github/workflows/docs-latest-public-wrapper.yaml +++ b/.github/workflows/docs-latest-public-wrapper.yaml @@ -3,6 +3,10 @@ name: "[post-commit] Docs build and deploy to GitHub pages on main" on: workflow_dispatch: +permissions: + id-token: write + pages: write + jobs: build-docker-artifact: uses: ./.github/workflows/build-docker-artifact.yaml From 9c9cbd035c7a40731ef4aa57255a4535a90ae032 Mon Sep 17 00:00:00 2001 From: William Ly Date: Tue, 25 Feb 2025 12:37:11 -0500 Subject: [PATCH 300/316] #18150: [skip ci] Drop xmltodict from requirements-dev.txt and use built-in xml.etree instead (#18298) ### Ticket Resolves https://github.com/tenstorrent/tt-metal/issues/18150 and concerns from PR https://github.com/tenstorrent/tt-metal/pull/18251 ### Problem description - pytest mysteriously crashes on tt-metal-ci-vm-24 when xmltodict is included in the dev env. [From PR 18251] - avoid installing infra deps each time we want to run the github action - prevent situation where deps can be installed outside of a docker container due to running `pip install` directly ### What's changed Remove `xmltodict` from `requirements-dev.txt` Refactor github action script to use built-in `xml.etree.ElementTree` instead. 
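For illustration, a minimal sketch of the ElementTree-based traversal (the full refactor is in the diff below; the helper name `failed_cases` is illustrative, while the element and attribute names follow gtest's JUnit XML layout):

```python
# Parse a gtest JUnit XML report using only the standard library (no xmltodict).
import xml.etree.ElementTree as ET

def failed_cases(xml_file):
    root = ET.parse(xml_file).getroot()
    for testsuite in root.findall("testsuite"):
        for testcase in testsuite.findall("testcase"):
            failure = testcase.find("failure")
            if failure is not None:
                # gtest records the source location on the testcase element and the
                # assertion text on the failure element's "message" attribute.
                yield testcase.attrib.get("file", ""), testcase.attrib.get("line"), failure.attrib.get("message")
```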
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13526080092 --- .../data_analysis/print_gtest_annotations.py | 52 ++++++------------- tt_metal/python_env/requirements-dev.txt | 1 - 2 files changed, 16 insertions(+), 37 deletions(-) diff --git a/.github/scripts/data_analysis/print_gtest_annotations.py b/.github/scripts/data_analysis/print_gtest_annotations.py index a599b4e440e..ad0b1403e15 100644 --- a/.github/scripts/data_analysis/print_gtest_annotations.py +++ b/.github/scripts/data_analysis/print_gtest_annotations.py @@ -1,19 +1,10 @@ import argparse -import xmltodict +import xml.etree.ElementTree as ET import glob import os from typing import Union -def _guaranteed_list(x): - if not x: - return [] - elif isinstance(x, list): - return x - else: - return [x] - - def _build_workflow_command( command_name: str, file: str, @@ -61,29 +52,18 @@ def _escape(s: str) -> str: # Iterate through each XML file for xml_file in xml_files: - with open(xml_file, "r") as f: - results = xmltodict.parse(f.read()) - - # Check for failed tests - failed_tests = [] - for testsuite in _guaranteed_list(results["testsuites"]["testsuite"]): - for testcase in _guaranteed_list(testsuite["testcase"]): - if "failure" in testcase: - failed_tests.append(testcase) - - # Create error annotations for each failed test - for failed_test in failed_tests: - failure_messages = _guaranteed_list(failed_test["failure"]) - if failure_messages: - # first message is often enough - failure_message = failure_messages[0]["@message"] - else: - failure_message = "unknown_failure_message" - - msg = _build_workflow_command( - command_name="error", - file=failed_test["@file"].lstrip("/work/"), - line=int(failed_test["@line"]), - message=failure_message, - ) - print(msg) + tree = ET.parse(xml_file) + root = tree.getroot() + for testsuite in root.findall("testsuite"): + for testcase in testsuite.findall("testcase"): + failure = testcase.find("failure") + # If failure exists, print the failure message + if failure is not None: + failure_message = failure.attrib.get("message") + msg = _build_workflow_command( + command_name="error", + file=testcase.attrib.get("file", "").lstrip("/work/"), + line=int(testcase.attrib["line"]), + message=failure_message, + ) + print(msg) diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt index 808205dc2ce..f1599339107 100644 --- a/tt_metal/python_env/requirements-dev.txt +++ b/tt_metal/python_env/requirements-dev.txt @@ -5,7 +5,6 @@ loguru # For github workflow unit test failure annotations -xmltodict pytest-github-actions-annotate-failures==0.3.0 # During dep resolution, black may install platformdirs >=4.0.0, which is From 854990fca346fd00477483208b39a81df9c09bbf Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Mon, 24 Feb 2025 19:36:06 +0000 Subject: [PATCH 301/316] #7449: add KernelBuildOptLevel --- tt_metal/api/tt-metalium/kernel_types.hpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tt_metal/api/tt-metalium/kernel_types.hpp b/tt_metal/api/tt-metalium/kernel_types.hpp index 4d1643fef7a..4bd746c4889 100644 --- a/tt_metal/api/tt-metalium/kernel_types.hpp +++ b/tt_metal/api/tt-metalium/kernel_types.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -15,6 +15,18 @@ namespace tt::tt_metal { using KernelHandle = std::uint32_t; +// Option that controls compiler optimization level +enum class KernelBuildOptLevel : uint8_t { + O1, // Level 1 optimization. Same as O. + O2, // Level 2 optimization. Turns on all flags specified by O1. + O3, // Level 3 optimizaiton. Turns on all flags specified by O2. + O0, // Reduce compilation time and make debugging produce the expected results. + Os, // Optimize for size. Enables O2 optimizations except for those that increase binary size. + Ofast, // Enable all O3 and non standard optimizations. + Og, // Optimize for debugging. + Oz, // Aggresively optimize for size rather than speed. +}; + struct DataMovementConfig { DataMovementProcessor processor = DataMovementProcessor::RISCV_0; // For data transfer kernels: NCRISC & BRISC NOC noc = NOC::RISCV_0_default; @@ -24,6 +36,8 @@ struct DataMovementConfig { // Each unique combination of defines will produce a unique compiled instantiation // This file is then automatically included in the generated compiled kernel files std::map defines; + // Kernel optimization level + KernelBuildOptLevel opt_level = KernelBuildOptLevel::Os; }; struct ReaderDataMovementConfig : public DataMovementConfig { @@ -46,6 +60,8 @@ struct ComputeConfig { // Each unique combination of defines will produce a unique compiled instantiation // This file is then automatically included in the generated compiled kernel files std::map defines; + // Kernel optimization level + KernelBuildOptLevel opt_level = KernelBuildOptLevel::O3; }; struct EthernetConfig { @@ -57,6 +73,8 @@ struct EthernetConfig { // Each unique combination of defines will produce a unique compiled instantiation // This file is then automatically included in the generated compiled kernel files std::map defines; + // Kernel optimization level + KernelBuildOptLevel opt_level = KernelBuildOptLevel::Os; }; } // namespace tt::tt_metal From ef1f62ab87faabc5c908a2ddf533e9679baecc1a Mon Sep 17 00:00:00 2001 From: Nigel Huang Date: Mon, 24 Feb 2025 23:54:13 +0000 Subject: [PATCH 302/316] #7449: allow users to specify compile opt level Add a field for users to set the compiler optimization level in the config passed to CreateKernel Default is still O3 for compute and Os for rest --- tt_metal/api/tt-metalium/build.hpp | 26 ++++-- tt_metal/api/tt-metalium/kernel.hpp | 21 ++++- tt_metal/api/tt-metalium/kernel_types.hpp | 19 ++--- tt_metal/impl/kernels/kernel.cpp | 25 +++++- tt_metal/jit_build/build.cpp | 82 ++++++++++--------- .../hello_world_compute_kernel.cpp | 6 +- 6 files changed, 116 insertions(+), 63 deletions(-) diff --git a/tt_metal/api/tt-metalium/build.hpp b/tt_metal/api/tt-metalium/build.hpp index 426d7d763d3..9ecdfffe9e6 100644 --- a/tt_metal/api/tt-metalium/build.hpp +++ b/tt_metal/api/tt-metalium/build.hpp @@ -1,8 +1,9 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 #pragma once +#include #include #include #include @@ -101,6 +102,14 @@ class alignas(CACHE_LINE_ALIGNMENT) JitBuildState { string link_objs_; + // Default compiler optimization setting + // Used when JitBuildSettings is not provided + string default_compile_opt_level_; + + // Default linker optimization setting + // Used when JitBuildSettings is not provided + string default_linker_opt_level_; + void compile(const string& log_file, const string& out_path, const JitBuildSettings* settings) const; void compile_one( const string& log_file, @@ -108,7 +117,7 @@ class alignas(CACHE_LINE_ALIGNMENT) JitBuildState { const JitBuildSettings* settings, const string& src, const string& obj) const; - void link(const string& log_file, const string& out_path) const; + void link(const string& log_file, const string& out_path, const JitBuildSettings* settings) const; void weaken(const string& log_file, const string& out_path) const; void copy_kernel(const string& kernel_in_path, const string& op_out_path) const; void extract_zone_src_locations(const string& log_file) const; @@ -169,12 +178,19 @@ class JitBuildIdleEthernet : public JitBuildState { // (eg, API specified settings) class JitBuildSettings { public: - virtual const string& get_full_kernel_name() const = 0; + // Returns the full kernel name + virtual const std::string& get_full_kernel_name() const = 0; + // Returns the compiler optimization level + virtual std::string_view get_compiler_opt_level() const = 0; + // Returns the linker optimization level + virtual std::string_view get_linker_opt_level() const = 0; + + // Called to process the user defines virtual void process_defines(const std::function) const = 0; + // Called to process the user compile time args virtual void process_compile_time_args(const std::function) const = 0; -private: - bool use_multi_threaded_compile = true; + virtual ~JitBuildSettings() = default; }; void jit_build(const JitBuildState& build, const JitBuildSettings* settings); diff --git a/tt_metal/api/tt-metalium/kernel.hpp b/tt_metal/api/tt-metalium/kernel.hpp index b419cde9698..2fd689411b6 100644 --- a/tt_metal/api/tt-metalium/kernel.hpp +++ b/tt_metal/api/tt-metalium/kernel.hpp @@ -1,9 +1,10 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 #pragma once +#include #include #include #include @@ -176,7 +177,11 @@ class DataMovementKernel : public Kernel { void process_defines(const std::function) const override; - private: + std::string_view get_compiler_opt_level() const override; + + std::string_view get_linker_opt_level() const override; + +private: const DataMovementConfig config_; uint8_t expected_num_binaries() const override; @@ -204,7 +209,11 @@ class EthernetKernel : public Kernel { void process_defines(const std::function) const override; - private: + std::string_view get_compiler_opt_level() const override; + + std::string_view get_linker_opt_level() const override; + +private: const EthernetConfig config_; uint8_t expected_num_binaries() const override; @@ -233,7 +242,11 @@ class ComputeKernel : public Kernel { void process_defines(const std::function) const override; - private: + std::string_view get_compiler_opt_level() const override; + + std::string_view get_linker_opt_level() const override; + +private: const ComputeConfig config_; uint8_t expected_num_binaries() const override; diff --git a/tt_metal/api/tt-metalium/kernel_types.hpp b/tt_metal/api/tt-metalium/kernel_types.hpp index 4bd746c4889..98620024fed 100644 --- a/tt_metal/api/tt-metalium/kernel_types.hpp +++ b/tt_metal/api/tt-metalium/kernel_types.hpp @@ -15,15 +15,14 @@ namespace tt::tt_metal { using KernelHandle = std::uint32_t; -// Option that controls compiler optimization level +// Option that controls build optimization level enum class KernelBuildOptLevel : uint8_t { - O1, // Level 1 optimization. Same as O. - O2, // Level 2 optimization. Turns on all flags specified by O1. - O3, // Level 3 optimizaiton. Turns on all flags specified by O2. + O1, // Turns on level 1 optimization. Same as O. + O2, // Turns on level 2 optimization and also all flags specified by O1. + O3, // Turns on level 3 optimization and also all flags specified by O2. O0, // Reduce compilation time and make debugging produce the expected results. - Os, // Optimize for size. Enables O2 optimizations except for those that increase binary size. - Ofast, // Enable all O3 and non standard optimizations. - Og, // Optimize for debugging. + Os, // Optimize for size and also O2 optimizations except for those that increase binary size. + Ofast, // Turns on level O3 and also non standard optimizations. Oz, // Aggresively optimize for size rather than speed. 
}; @@ -36,7 +35,7 @@ struct DataMovementConfig { // Each unique combination of defines will produce a unique compiled instantiation // This file is then automatically included in the generated compiled kernel files std::map defines; - // Kernel optimization level + // Set the compiler and linker optimization level KernelBuildOptLevel opt_level = KernelBuildOptLevel::Os; }; @@ -60,7 +59,7 @@ struct ComputeConfig { // Each unique combination of defines will produce a unique compiled instantiation // This file is then automatically included in the generated compiled kernel files std::map defines; - // Kernel optimization level + // Set the compiler and linker optimization level KernelBuildOptLevel opt_level = KernelBuildOptLevel::O3; }; @@ -73,7 +72,7 @@ struct EthernetConfig { // Each unique combination of defines will produce a unique compiled instantiation // This file is then automatically included in the generated compiled kernel files std::map defines; - // Kernel optimization level + // Set the compiler and linker optimization level KernelBuildOptLevel opt_level = KernelBuildOptLevel::Os; }; diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index 7e9d18c5ea6..2f0c7a1f69b 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -1,16 +1,19 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include +#include #include #include #include "llrt.hpp" +#include #include #include "tt_metal/impl/debug/watcher_server.hpp" #include "tt_metal/kernel.hpp" @@ -105,7 +108,7 @@ CoreType Kernel::get_kernel_core_type() const { return CoreType::WORKER; } -const string &Kernel::get_full_kernel_name() const { return this->kernel_full_name_; } +const std::string& Kernel::get_full_kernel_name() const { return this->kernel_full_name_; } void Kernel::add_defines(const std::map& defines) { this->defines_.insert(defines.begin(), defines.end()); @@ -141,6 +144,24 @@ void EthernetKernel::process_defines( callback("NOC_MODE", std::to_string(NOC_MODE::DM_DEDICATED_NOC)); } +std::string_view DataMovementKernel::get_compiler_opt_level() const { + return magic_enum::enum_name(this->config_.opt_level); +} + +std::string_view DataMovementKernel::get_linker_opt_level() const { return this->get_compiler_opt_level(); } + +std::string_view ComputeKernel::get_compiler_opt_level() const { + return magic_enum::enum_name(this->config_.opt_level); +} + +std::string_view ComputeKernel::get_linker_opt_level() const { return this->get_compiler_opt_level(); } + +std::string_view EthernetKernel::get_compiler_opt_level() const { + return magic_enum::enum_name(this->config_.opt_level); +} + +std::string_view EthernetKernel::get_linker_opt_level() const { return this->get_compiler_opt_level(); } + void Kernel::process_compile_time_args(const std::function callback) const { for (int i = 0; i < this->compile_time_args_.size(); i++) { callback(i, this->compile_time_args_[i]); diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index d5d8b6eaca8..6c99b210c4f 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -252,10 +252,12 @@ void JitBuildState::finish_init() { JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, const JitBuiltStateConfig& build_config) : JitBuildState(env, build_config) { TT_ASSERT(this->core_id_ >= 0 && this->core_id_ < 2, "Invalid data movement processor"); - + this->lflags_ = env.lflags_; + this->cflags_ = env.cflags_; + this->default_compile_opt_level_ = "Os"; + this->default_linker_opt_level_ = "Os"; this->out_path_ = this->is_fw_ ? env_.out_firmware_root_ : env_.out_kernel_root_; - - this->cflags_ = env_.cflags_ + "-Os " + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops + this->cflags_ = env_.cflags_ + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/firmware/src " + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io "; @@ -265,8 +267,6 @@ JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, const JitBuil uint32_t l1_cache_disable_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask( tt::llrt::RunTimeDebugFeatureDisableL1DataCache); - this->lflags_ = env_.lflags_ + "-Os "; - switch (this->core_id_) { case 0: this->target_name_ = "brisc"; @@ -324,11 +324,12 @@ JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, const JitBuil JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, const JitBuiltStateConfig& build_config) : JitBuildState(env, build_config) { TT_ASSERT(this->core_id_ >= 0 && this->core_id_ < 3, "Invalid compute processor"); - + this->lflags_ = env.lflags_; + this->cflags_ = env.cflags_; + this->default_compile_opt_level_ = "O3"; + this->default_linker_opt_level_ = "O3"; this->out_path_ = this->is_fw_ ? env_.out_firmware_root_ : env_.out_kernel_root_; - this->cflags_ = env_.cflags_ + "-O3 "; - this->defines_ = env_.defines_; uint32_t l1_cache_disable_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask( tt::llrt::RunTimeDebugFeatureDisableL1DataCache); @@ -353,8 +354,6 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, const JitBuiltStateConf this->srcs_.push_back("tt_metal/hw/firmware/src/trisck.cc"); } - this->lflags_ = env_.lflags_ + "-O3 "; - switch (this->core_id_) { case 0: this->target_name_ = "trisc0"; @@ -416,6 +415,10 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, const JitBuiltStateConf JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const JitBuiltStateConfig& build_config) : JitBuildState(env, build_config) { TT_ASSERT(this->core_id_ >= 0 && this->core_id_ < 1, "Invalid active ethernet processor"); + this->lflags_ = env.lflags_; + this->cflags_ = env.cflags_; + this->default_compile_opt_level_ = "Os"; + this->default_linker_opt_level_ = "Os"; this->out_path_ = this->is_fw_ ? 
env_.out_firmware_root_ : env_.out_kernel_root_; this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + @@ -437,8 +440,7 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit switch (build_class) { case 0: { this->target_name_ = "active_erisc"; - this->cflags_ = - env_.cflags_ + "-Os " + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops + this->cflags_ = env_.cflags_ + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops this->defines_ += "-DCOMPILE_FOR_ERISC " @@ -452,7 +454,6 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit } else { this->srcs_.push_back("tt_metal/hw/firmware/src/active_erisck.cc"); } - this->lflags_ = env_.lflags_ + "-Os "; if (this->is_fw_) { this->lflags_ += @@ -466,7 +467,7 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit } case 1: { this->target_name_ = "erisc"; - this->cflags_ = env_.cflags_ + "-Os -fno-delete-null-pointer-checks "; + this->cflags_ = env_.cflags_ + " -fno-delete-null-pointer-checks "; this->defines_ += "-DCOMPILE_FOR_ERISC " @@ -489,10 +490,7 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit } else { linker_str = "tt_metal/hw/toolchain/erisc-b0-kernel.ld "; } - this->lflags_ = env_.lflags_ + - "-Os " - "-L" + - env_.root_ + + this->lflags_ = env_.lflags_ + "-L" + env_.root_ + "/tt_metal/hw/toolchain " "-T" + env_.root_ + linker_str; @@ -513,6 +511,10 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit JitBuildIdleEthernet::JitBuildIdleEthernet(const JitBuildEnv& env, const JitBuiltStateConfig& build_config) : JitBuildState(env, build_config) { TT_ASSERT(this->core_id_ >= 0 && this->core_id_ < 2, "Invalid idle ethernet processor"); + this->lflags_ = env.lflags_; + this->cflags_ = env.cflags_; + this->default_compile_opt_level_ = "Os"; + this->default_linker_opt_level_ = "Os"; this->out_path_ = this->is_fw_ ? 
env_.out_firmware_root_ : env_.out_kernel_root_; this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + @@ -530,8 +532,7 @@ JitBuildIdleEthernet::JitBuildIdleEthernet(const JitBuildEnv& env, const JitBuil switch (this->core_id_) { case 0: { this->target_name_ = "idle_erisc"; - this->cflags_ = - env_.cflags_ + "-Os " + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops + this->cflags_ = env_.cflags_ + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops this->defines_ += "-DCOMPILE_FOR_IDLE_ERISC=0 " @@ -545,7 +546,6 @@ JitBuildIdleEthernet::JitBuildIdleEthernet(const JitBuildEnv& env, const JitBuil } else { this->srcs_.push_back("tt_metal/hw/firmware/src/idle_erisck.cc"); } - this->lflags_ = env_.lflags_ + "-Os "; if (this->is_fw_) { this->lflags_ += @@ -559,8 +559,7 @@ JitBuildIdleEthernet::JitBuildIdleEthernet(const JitBuildEnv& env, const JitBuil } case 1: { this->target_name_ = "slave_idle_erisc"; - this->cflags_ = - env_.cflags_ + "-Os " + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops + this->cflags_ = env_.cflags_ + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops this->defines_ += "-DCOMPILE_FOR_IDLE_ERISC=1 " "-DERISC " @@ -571,7 +570,6 @@ JitBuildIdleEthernet::JitBuildIdleEthernet(const JitBuildEnv& env, const JitBuil } else { this->srcs_.push_back("tt_metal/hw/firmware/src/idle_erisck.cc"); } - this->lflags_ = env_.lflags_ + "-Os "; if (this->is_fw_) { this->lflags_ += "-T" + env_.root_ + "runtime/hw/toolchain/" + get_alias(env_.arch_) + "/firmware_slave_ierisc.ld "; @@ -609,9 +607,11 @@ void JitBuildState::compile_one( // ZoneScoped; fs::create_directories(out_dir); - // Add kernel specific defines + string cmd{"cd " + out_dir + " && " + env_.gpp_}; string defines = this->defines_; - if (settings != nullptr) { + + if (settings) { + // Append user args if (process_defines_at_compile) { settings->process_defines([&defines](const string& define, const string& value) { defines += "-D" + define + "='" + value + "' "; @@ -621,15 +621,17 @@ void JitBuildState::compile_one( settings->process_compile_time_args([&defines](int i, uint32_t value) { defines += "-DKERNEL_COMPILE_TIME_ARG_" + to_string(i) + "=" + to_string(value) + " "; }); + + cmd += fmt::format("-{} ", settings->get_compiler_opt_level()); + } else { + cmd += fmt::format("-{} ", this->default_compile_opt_level_); } - string cmd; - cmd = "cd " + out_dir + " && "; - cmd += env_.gpp_; + // Append common args provided by the build state cmd += this->cflags_; - cmd += defines; cmd += this->includes_; - cmd += "-c -o " + obj + " " + src; + cmd += "-c -o " + obj + " " + src + " "; + cmd += defines; log_debug(tt::LogBuildKernels, " g++ compile cmd: {}", cmd); @@ -659,18 +661,16 @@ void JitBuildState::compile(const string& log_file, const string& out_dir, const } } -void JitBuildState::link(const string& log_file, const string& out_dir) const { +void JitBuildState::link(const string& log_file, const string& out_dir, const JitBuildSettings* settings) const { // ZoneScoped; + string cmd{"cd " + out_dir + " && " + env_.gpp_}; string lflags = this->lflags_; if (tt::llrt::RunTimeOptions::get_instance().get_build_map_enabled()) { lflags += "-Wl,-Map=" + out_dir + "linker.map "; } - string cmd; - cmd = "cd " + out_dir + " && "; - cmd += env_.gpp_; - cmd += lflags; - cmd += this->link_objs_; + // Append user args + cmd += fmt::format("-{} ", settings ? 
settings->get_linker_opt_level() : this->default_linker_opt_level_); if (!this->is_fw_) { string weakened_elf_name = @@ -678,6 +678,9 @@ void JitBuildState::link(const string& log_file, const string& out_dir) const { cmd += "-Wl,--just-symbols=" + weakened_elf_name + " "; } + // Append common args provided by the build state + cmd += lflags; + cmd += this->link_objs_; cmd += "-o " + out_dir + this->target_name_ + ".elf"; log_debug(tt::LogBuildKernels, " g++ link cmd: {}", cmd); if (!tt::utils::run_command(cmd, log_file, false)) { @@ -731,9 +734,8 @@ void JitBuildState::build(const JitBuildSettings* settings) const { if (fs::exists(log_file)) { std::remove(log_file.c_str()); } - compile(log_file, out_dir, settings); - link(log_file, out_dir); + link(log_file, out_dir, settings); if (this->is_fw_) { weaken(log_file, out_dir); } diff --git a/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp b/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp index 34d7cc5e282..6d774a9d726 100644 --- a/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp +++ b/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp @@ -1,9 +1,10 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 #include #include +#include "tt-metalium/kernel_types.hpp" using namespace tt; using namespace tt::tt_metal; @@ -28,7 +29,8 @@ int main(int argc, char** argv) { .math_fidelity = MathFidelity::HiFi4, .fp32_dest_acc_en = false, .math_approx_mode = false, - .compile_args = compute_kernel_args}); + .compile_args = compute_kernel_args, + .opt_level = KernelBuildOptLevel::O3}); // Configure Program and Start Program Execution on Device From 38699bcd2220733966f377afecb1a68864e007c3 Mon Sep 17 00:00:00 2001 From: Mohamed Bahnas <116673264+mbahnasTT@users.noreply.github.com> Date: Tue, 25 Feb 2025 11:41:54 -0800 Subject: [PATCH 303/316] fix yolov4 faster webdemo (#18178) ### Ticket Link to Github Issue ### Problem description Provide context for the problem. ### What's changed Describe the approach used to solve the problem. Summarize the changes made and its impact. 
### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13476817943 - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --------- Co-authored-by: Dalar Vartanians <132954887+dvartaniansTT@users.noreply.github.com> Co-authored-by: Dalar Vartanians --- .../wormhole/yolov4/test_yolov4_performant.py | 4 +- .../yolov4/test_yolov4_performant_webdemo.py | 44 +-- models/demos/yolov4/README.md | 27 +- models/demos/yolov4/demo/demo.py | 231 ++++++++-------- models/demos/yolov4/tests/test_perf_yolo.py | 18 +- .../yolov4/tests/yolov4_perfomant_webdemo.py | 250 ++--------------- .../demos/yolov4/tests/yolov4_test_infra.py | 63 ++--- models/demos/yolov4/ttnn/common.py | 8 + models/demos/yolov4/ttnn/genboxes.py | 256 ++++++++++++++++++ models/demos/yolov4/ttnn/yolov4.py | 35 ++- models/demos/yolov4/web_demo/README.md | 5 + .../demos/yolov4/web_demo/client/coco.names | 80 ++++++ .../yolov4/web_demo/client/requirements.txt | 1 + models/demos/yolov4/web_demo/client/yolov4.py | 181 ++++--------- .../yolov4/web_demo/server/fast_api_yolov4.py | 166 +++++++++++- tests/scripts/run_python_model_tests.sh | 2 +- .../yolov4/test_ttnn_downsample1.py | 10 +- .../yolov4/test_ttnn_downsample2.py | 10 +- .../yolov4/test_ttnn_downsample3.py | 11 +- .../yolov4/test_ttnn_downsample4.py | 9 +- .../yolov4/test_ttnn_downsample5.py | 9 +- .../yolov4/test_ttnn_head.py | 26 +- .../yolov4/test_ttnn_neck.py | 12 +- .../yolov4/test_ttnn_post_processing.py | 80 ++++++ .../yolov4/test_ttnn_yolov4.py | 134 +++++---- 25 files changed, 959 insertions(+), 713 deletions(-) create mode 100644 models/demos/yolov4/ttnn/genboxes.py create mode 100644 models/demos/yolov4/web_demo/client/coco.names mode change 100755 => 100644 models/demos/yolov4/web_demo/server/fast_api_yolov4.py create mode 100644 tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py diff --git a/models/demos/wormhole/yolov4/test_yolov4_performant.py b/models/demos/wormhole/yolov4/test_yolov4_performant.py index ec4819711a9..81357bfdd70 100644 --- a/models/demos/wormhole/yolov4/test_yolov4_performant.py +++ b/models/demos/wormhole/yolov4/test_yolov4_performant.py @@ -24,7 +24,7 @@ def test_run_yolov4_inference(device, use_program_cache, batch_size, act_dtype, @run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 1843200}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 6422528}], indirect=True) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype", ((1, ttnn.bfloat16, ttnn.bfloat16),), @@ -50,7 +50,7 @@ def test_run_yolov4_trace_inference( @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 3686400, "num_command_queues": 2}], indirect=True + "device_params", 
[{"l1_small_size": 24576, "trace_region_size": 6397952, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype", diff --git a/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py b/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py index b4940fbd2ab..bf716285a53 100644 --- a/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py +++ b/models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py @@ -8,52 +8,12 @@ import torch from models.utility_functions import run_for_wormhole_b0 -from models.demos.yolov4.tests.yolov4_perfomant_webdemo import ( - run_yolov4_inference, - run_yolov4_trace_inference, - run_yolov4_trace_2cqs_inference, - Yolov4Trace2CQ, -) - - -@run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) -@pytest.mark.parametrize( - "batch_size, act_dtype, weight_dtype", - ((1, ttnn.bfloat16, ttnn.bfloat16),), -) -def test_run_yolov4_inference(device, use_program_cache, batch_size, act_dtype, weight_dtype, model_location_generator): - run_yolov4_inference(device, batch_size, act_dtype, weight_dtype, model_location_generator) - - -@run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 1617920}], indirect=True) -@pytest.mark.parametrize( - "batch_size, act_dtype, weight_dtype", - ((1, ttnn.bfloat16, ttnn.bfloat16),), -) -@pytest.mark.parametrize("enable_async_mode", (False, True), indirect=True) -def test_run_yolov4_trace_inference( - device, - use_program_cache, - batch_size, - act_dtype, - weight_dtype, - enable_async_mode, - model_location_generator, -): - run_yolov4_trace_inference( - device, - batch_size, - act_dtype, - weight_dtype, - model_location_generator, - ) +from models.demos.yolov4.tests.yolov4_perfomant_webdemo import Yolov4Trace2CQ @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 1617920, "num_command_queues": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 3211264, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "batch_size, act_dtype, weight_dtype", diff --git a/models/demos/yolov4/README.md b/models/demos/yolov4/README.md index 6e6f560379c..006e1eaacf9 100644 --- a/models/demos/yolov4/README.md +++ b/models/demos/yolov4/README.md @@ -2,24 +2,31 @@ ## How to run yolov4 -- Use the following command to run the yolov4 performant impelementation (95 FPS): +### Model code running with Trace+2CQ +- Use the following command to run the yolov4 performant implementation (71 FPS): + ```bash + pytest models/demos/wormhole/yolov4/test_yolov4_performant_webdemo.py::test_run_yolov4_trace_2cqs_inference[True-1-act_dtype0-weight_dtype0-device_params0] ``` - pytest models/demos/wormhole/yolov4/test_yolov4_performant.py::test_run_yolov4_trace_2cqs_inference[True-1-act_dtype0-weight_dtype0-device_params0] - ``` - -- You may try the interactive web demo following the instructions here: models/demos/yolov4/web_demo/README.md (25-30 FPS). NOTE: The post-processing is currently running on host. It will be moved to device soon which should significantly improve the end to end FPS. - -- Use the following command to run a single-image demo for visualization. NOTE: the following demos are intented for visualization. It is not the performant implementation yet. And, the post processing is currently done on host which we will be moving to device soon. 
+### Single Image Demo - Use the following command to run the yolov4 with a giraffe image: - ``` + ```bash pytest models/demos/yolov4/demo/demo.py ``` +- The output file `ttnn_yolov4_320_prediction_demo.jpg` will be generated. - Use the following command to run the yolov4 with different input image: - ``` + ```bash pytest --disable-warnings --input-path= models/demos/yolov4/demo/demo.py ``` -Once you run the command, The output file named `ttnn_prediction_demo.jpg` will be generated. + +### mAP Accuracy Test +- To be added soon + +### Web Demo +- You may try the interactive web demo (35 FPS end-2-end) following the instructions: +``` +models/demos/yolov4/web_demo/README.md +``` diff --git a/models/demos/yolov4/demo/demo.py b/models/demos/yolov4/demo/demo.py index 277e28deab0..987f0c7b509 100644 --- a/models/demos/yolov4/demo/demo.py +++ b/models/demos/yolov4/demo/demo.py @@ -140,10 +140,10 @@ def yolo_forward_dynamic( by_bh /= output.size(2) # Shape: [batch, num_anchors * H * W, 1] - bx = bx_bw[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) - by = by_bh[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) - bw = bx_bw[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) - bh = by_bh[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bx = bx_bw[:, :num_anchors].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + by = by_bh[:, :num_anchors].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bw = bx_bw[:, num_anchors:].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bh = by_bh[:, num_anchors:].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) bx1 = bx - bw * 0.5 by1 = by - bh * 0.5 @@ -324,12 +324,6 @@ def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): def post_processing(img, conf_thresh, nms_thresh, output): - # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] - # num_anchors = 9 - # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] - # strides = [8, 16, 32] - # anchor_step = len(anchors) // num_anchors - # [batch, num, 1, 4] box_array = output[0] # [batch, num, num_classes] @@ -464,34 +458,7 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class output_tensor3 = output_tensor3.reshape(1, 10, 10, 255) output_tensor3 = torch.permute(output_tensor3, (0, 3, 1, 2)) - yolo1 = YoloLayer( - anchor_mask=[0, 1, 2], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=8, - ) - - yolo2 = YoloLayer( - anchor_mask=[3, 4, 5], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=16, - ) - - yolo3 = YoloLayer( - anchor_mask=[6, 7, 8], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=32, - ) - - y1 = yolo1(output_tensor1) - y2 = yolo2(output_tensor2) - y3 = yolo3(output_tensor3) - + y1, y2, y3 = gen_yolov4_boxes_confs([output_tensor1, output_tensor2, output_tensor3]) output = get_region_boxes([y1, y2, y3]) t2 = time.time() @@ -511,37 +478,8 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class else: t1 = time.time() output = model(img) - - yolo1 = YoloLayer( - anchor_mask=[0, 1, 2], - 
num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=8, - ) - - yolo2 = YoloLayer( - anchor_mask=[3, 4, 5], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=16, - ) - - yolo3 = YoloLayer( - anchor_mask=[6, 7, 8], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=32, - ) - - y1 = yolo1(output[0]) - y2 = yolo2(output[1]) - y3 = yolo3(output[2]) - + y1, y2, y3 = gen_yolov4_boxes_confs(output) output = get_region_boxes([y1, y2, y3]) - t2 = time.time() print("-----------------------------------") @@ -556,66 +494,117 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class plot_boxes_cv2(img, boxes[0], "torch_prediction_demo.jpg", class_names) +def gen_yolov4_boxes_confs(output): + n_classes = 80 + anchors_array = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] + num_anchors = 9 + anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] + strides = [8, 16, 32] + + yolo1 = YoloLayer( + anchor_mask=anchor_masks[0], + num_classes=n_classes, + anchors=anchors_array, + num_anchors=num_anchors, + stride=strides[0], + ) + + yolo2 = YoloLayer( + anchor_mask=anchor_masks[1], + num_classes=n_classes, + anchors=anchors_array, + num_anchors=num_anchors, + stride=strides[1], + ) + + yolo3 = YoloLayer( + anchor_mask=anchor_masks[2], + num_classes=n_classes, + anchors=anchors_array, + num_anchors=num_anchors, + stride=strides[2], + ) + + y1 = yolo1(output[0]) + y2 = yolo2(output[1]) + y3 = yolo3(output[2]) + + return y1, y2, y3 + + @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) -@pytest.mark.parametrize( - "use_pretrained_weight", - [True, False], - ids=[ - "pretrained_weight_true", - "pretrained_weight_false", - ], -) -def test_yolov4_model(device, model_location_generator, reset_seeds, input_path, use_pretrained_weight): +def test_yolov4(device, reset_seeds, model_location_generator): + torch.manual_seed(0) model_path = model_location_generator("models", model_subdir="Yolo") - if use_pretrained_weight: - if model_path == "models": - if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"): # check if yolov4.th is availble - os.system( - "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh" - ) # execute the yolov4_weights_download.sh file - - weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" - else: - weights_pth = str(model_path / "yolov4.pth") - - ttnn_model = TtYOLOv4(device, weights_pth) - torch_model = Yolov4() - new_state_dict = {} - ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] + if model_path == "models": + if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"): # check if yolov4.th is availble + os.system( + "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh" + ) # execute the yolov4_weights_download.sh file - torch_model.load_state_dict(new_state_dict) - torch_model.eval() + weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" else: - torch_model = Yolov4.from_random_weights() - ttnn_weights = 
update_weight_parameters(OrderedDict(torch_model.state_dict())) - ttnn_model = TtYOLOv4(device, ttnn_weights) + weights_pth = str(model_path / "yolov4.pth") - n_classes = 80 - namesfile = "models/demos/yolov4/demo/coco.names" - if input_path == "": - imgfile = "models/demos/yolov4/demo/giraffe_320.jpg" - else: - imgfile = input_path + ttnn_model = TtYOLOv4(weights_pth, device) + + imgfile = "models/demos/yolov4/demo/giraffe_320.jpg" width = 320 height = 320 - img = cv2.imread(imgfile) - - # Inference input size is 416*416 does not mean training size is the same - # Training size could be 608*608 or even other sizes - # Optional inference sizes: - # Hight in {320, 416, 512, 608, ... 320 + 96 * n} - # Width in {320, 416, 512, 608, ... 320 + 96 * m} - sized = cv2.resize(img, (width, height)) - sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) - - for i in range(2): # This 'for' loop is for speed check - # Because the first iteration is usually longer - do_detect(ttnn_model, sized, 0.3, 0.4, n_classes, device, class_name=namesfile, imgfile=imgfile) + img = cv2.resize(img, (width, height)) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image + img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + elif type(img) == np.ndarray and len(img.shape) == 4: + img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) + else: + exit() + torch_input = torch.autograd.Variable(img) + + input_tensor = torch.permute(torch_input, (0, 2, 3, 1)) + ttnn_input = ttnn.from_torch(input_tensor, ttnn.bfloat16) + + torch_model = Yolov4() + new_state_dict = dict(zip(torch_model.state_dict().keys(), ttnn_model.torch_model.values())) + torch_model.load_state_dict(new_state_dict) + torch_model.eval() + + torch_output_tensor = torch_model(torch_input) + + ref1, ref2, ref3 = gen_yolov4_boxes_confs(torch_output_tensor) + ref_boxes, ref_confs = get_region_boxes([ref1, ref2, ref3]) + + ttnn_output_tensor = ttnn_model(ttnn_input) + result_boxes_padded = ttnn.to_torch(ttnn_output_tensor[0]) + result_confs = ttnn.to_torch(ttnn_output_tensor[1]) + + result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) + result_boxes_list = [] + # Unpadding + # That ttnn tensor is the concat output of 3 padded tensors + # As a perf workaround I'm doing the unpadding on the torch output here. 
+ # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized + box_1_start_i = 0 + box_1_end_i = 6100 + box_2_start_i = 6128 + box_2_end_i = 6228 + box_3_start_i = 6256 + box_3_end_i = 6356 + result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) + result_boxes = torch.cat(result_boxes_list, dim=1) + + ## Giraffe image detection + conf_thresh = 0.3 + nms_thresh = 0.4 + output = [result_boxes.to(torch.float16), result_confs.to(torch.float16)] + + boxes = post_processing(img, conf_thresh, nms_thresh, output) + namesfile = "models/demos/yolov4/demo/coco.names" + class_names = load_class_names(namesfile) + img = cv2.imread(imgfile) + plot_boxes_cv2(img, boxes[0], "ttnn_yolov4_320_prediction_demo.jpg", class_names) diff --git a/models/demos/yolov4/tests/test_perf_yolo.py b/models/demos/yolov4/tests/test_perf_yolo.py index 28c7c82cdb6..4230aa818e3 100644 --- a/models/demos/yolov4/tests/test_perf_yolo.py +++ b/models/demos/yolov4/tests/test_perf_yolo.py @@ -26,12 +26,11 @@ def get_expected_compile_time_sec(): - return 60 + return 75 def get_expected_inference_time_sec(): - return 0.25 - + return 0.37 @pytest.mark.models_performance_bare_metal @pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) @@ -60,14 +59,15 @@ def test_yolov4( weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth" else: weights_pth = str(model_path / "yolov4.pth") - ttnn_model = TtYOLOv4(device, weights_pth) + ttnn_model = TtYOLOv4(weights_pth, device) torch_input_tensor = torch.rand(input_shape, dtype=torch.bfloat16) ttnn_input = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) logger.info(f"Compiling model with warmup run") profiler.start(f"inference_and_compile_time") - out1, out2, out3 = ttnn_model(ttnn_input) + ttnn_output_tensor = ttnn_model(ttnn_input) + profiler.end(f"inference_and_compile_time") inference_and_compile_time = profiler.get("inference_and_compile_time") @@ -79,10 +79,8 @@ def test_yolov4( for idx in range(iterations): profiler.start("inference_time") profiler.start(f"inference_time_{idx}") - out1, out2, out3 = ttnn_model(ttnn_input) - outputs.append(ttnn.from_device(out1, blocking=False)) - outputs.append(ttnn.from_device(out2, blocking=False)) - outputs.append(ttnn.from_device(out3, blocking=False)) + ttnn_output_tensor = ttnn_model(ttnn_input) + profiler.end(f"inference_time_{idx}") profiler.end("inference_time") @@ -126,7 +124,7 @@ def test_perf_device_bare_metal_yolov4(batch_size, model_name): num_iterations = 1 margin = 0.03 - expected_perf = 234 + expected_perf = 102 command = f"pytest tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py" cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] diff --git a/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py b/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py index 0968152e3ce..f8b5486060c 100644 --- a/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py +++ b/models/demos/yolov4/tests/yolov4_perfomant_webdemo.py @@ -9,8 +9,6 @@ is_wormhole_b0, ) from models.demos.yolov4.tests.yolov4_test_infra import create_test_infra -from models.demos.yolov4.demo.demo import YoloLayer - try: from tracy import signpost @@ -31,175 +29,6 @@ def buffer_address(tensor): ttnn.buffer_address = buffer_address -def run_yolov4_inference( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator, -): - 
test_infra = create_test_infra( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator=model_location_generator, - ) - - tt_inputs_host, self.input_mem_config = test_infra.setup_l1_sharded_input(device) - - # # First run configures convs JIT - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.run() - test_infra.validate() - test_infra.dealloc_output() - - # Optimized run - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.run() - test_infra.validate() - test_infra.dealloc_output() - - # More optimized run with caching - if use_signpost: - signpost(header="start") - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.run() - if use_signpost: - signpost(header="stop") - test_infra.validate() - test_infra.dealloc_output() - - -def run_yolov4_trace_inference( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator, -): - test_infra = create_test_infra( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator=model_location_generator, - ) - tt_inputs_host, self.input_mem_config = test_infra.setup_l1_sharded_input(device) - - # First run configures convs JIT - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - spec = test_infra.input_tensor.spec - test_infra.run() - test_infra.validate() - test_infra.dealloc_output() - - # Optimized run - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.run() - test_infra.validate() - - # Capture - test_infra.input_tensor = tt_inputs_host.to(device, self.input_mem_config) - test_infra.dealloc_output() - trace_input_addr = ttnn.buffer_address(test_infra.input_tensor) - self.tid = ttnn.begin_trace_capture(device, cq_id=0) - test_infra.run() - tt_image_res = ttnn.allocate_tensor_on_device(spec, device) - ttnn.end_trace_capture(device, self.tid, cq_id=0) - assert trace_input_addr == ttnn.buffer_address(tt_image_res) - - # More optimized run with caching - if use_signpost: - signpost(header="start") - ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 0) - ttnn.execute_trace(device, self.tid, cq_id=0, blocking=True) - if use_signpost: - signpost(header="stop") - test_infra.validate() - - ttnn.release_trace(device, self.tid) - test_infra.dealloc_output() - - -def run_yolov4_trace_2cqs_inference( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator, -): - test_infra = create_test_infra( - device, - device_batch_size, - act_dtype, - weight_dtype, - model_location_generator=model_location_generator, - ) - tt_inputs_host, sharded_mem_config_DRAM, self.input_mem_config = test_infra.setup_dram_sharded_input(device) - tt_image_res = tt_inputs_host.to(device, sharded_mem_config_DRAM) - op_event = ttnn.create_event(device) - write_event = ttnn.create_event(device) - # Initialize the op event so we can write - ttnn.record_event(0, op_event) - - # First run configures convs JIT - ttnn.wait_for_event(1, op_event) - ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) - ttnn.record_event(1, write_event) - ttnn.wait_for_event(0, write_event) - test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, self.input_mem_config) - spec = test_infra.input_tensor.spec - ttnn.record_event(0, op_event) - test_infra.run() - test_infra.validate() - test_infra.dealloc_output() - - # Optimized run - ttnn.wait_for_event(1, op_event) - 
ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) - ttnn.record_event(1, write_event) - ttnn.wait_for_event(0, write_event) - test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, self.input_mem_config) - ttnn.record_event(0, op_event) - test_infra.run() - test_infra.validate() - - # Capture - ttnn.wait_for_event(1, op_event) - ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) - ttnn.record_event(1, write_event) - ttnn.wait_for_event(0, write_event) - test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, self.input_mem_config) - ttnn.record_event(0, op_event) - test_infra.dealloc_output() - trace_input_addr = ttnn.buffer_address(test_infra.input_tensor) - self.tid = ttnn.begin_trace_capture(device, cq_id=0) - test_infra.run() - self.input_tensor = ttnn.allocate_tensor_on_device(spec, device) - ttnn.end_trace_capture(device, self.tid, cq_id=0) - assert trace_input_addr == ttnn.buffer_address(self.input_tensor) - - # More optimized run with caching - if use_signpost: - signpost(header="start") - for iter in range(0, 2): - ttnn.wait_for_event(1, op_event) - ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) - ttnn.record_event(1, write_event) - ttnn.wait_for_event(0, write_event) - # TODO: Add in place support to ttnn to_memory_config - self.input_tensor = ttnn.reshard(tt_image_res, self.input_mem_config, self.input_tensor) - ttnn.record_event(0, op_event) - ttnn.execute_trace(device, self.tid, cq_id=0, blocking=False) - ttnn.synchronize_devices(device) - - if use_signpost: - signpost(header="stop") - - ttnn.release_trace(device, self.tid) - - class Yolov4Trace2CQ: def __init__(self): ... @@ -267,12 +96,7 @@ def initialize_yolov4_trace_2cqs_inference( self.device = device - # More optimized run with caching - # if use_signpost: - # signpost(header="start") - def get_region_boxes(self, boxes_and_confs): - print("Getting boxes from boxes and confs ...") boxes_list = [] confs_list = [] @@ -280,8 +104,6 @@ def get_region_boxes(self, boxes_and_confs): boxes_list.append(item[0]) confs_list.append(item[1]) - # boxes: [batch, num1 + num2 + num3, 1, 4] - # confs: [batch, num1 + num2 + num3, num_classes] boxes = torch.cat(boxes_list, dim=1) confs = torch.cat(confs_list, dim=1) @@ -298,57 +120,29 @@ def execute_yolov4_trace_2cqs_inference(self, tt_inputs_host=None): ttnn.record_event(0, self.op_event) ttnn.execute_trace(self.device, self.tid, cq_id=0, blocking=False) ttnn.synchronize_devices(self.device) - output = self.test_infra.output_tensor - - output_tensor1 = ttnn.to_torch(output[0]) - output_tensor1 = output_tensor1.reshape(1, 40, 40, 255) - output_tensor1 = torch.permute(output_tensor1, (0, 3, 1, 2)) - - output_tensor2 = ttnn.to_torch(output[1]) - output_tensor2 = output_tensor2.reshape(1, 20, 20, 255) - output_tensor2 = torch.permute(output_tensor2, (0, 3, 1, 2)) - - output_tensor3 = ttnn.to_torch(output[2]) - output_tensor3 = output_tensor3.reshape(1, 10, 10, 255) - output_tensor3 = torch.permute(output_tensor3, (0, 3, 1, 2)) - - n_classes = 80 - - yolo1 = YoloLayer( - anchor_mask=[0, 1, 2], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=8, - ) - - yolo2 = YoloLayer( - anchor_mask=[3, 4, 5], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=16, - ) - - yolo3 = YoloLayer( - anchor_mask=[6, 7, 8], - num_classes=n_classes, - anchors=[12, 16, 19, 36, 40, 
28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], - num_anchors=9, - stride=32, - ) - - y1 = yolo1(output_tensor1) - y2 = yolo2(output_tensor2) - y3 = yolo3(output_tensor3) - - output = self.get_region_boxes([y1, y2, y3]) - - return output - # return self.test_infra.output_tensor - # if use_signpost: - # signpost(header="stop") + ttnn_output_tensor = self.test_infra.output_tensor + + result_boxes_padded = ttnn.to_torch(ttnn_output_tensor[0]) + result_confs = ttnn.to_torch(ttnn_output_tensor[1]) + + result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) + result_boxes_list = [] + # That ttnn tensor is the concat output of 3 padded tensors + # As a perf workaround I'm doing the unpadding on the torch output here. + # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized + box_1_start_i = 0 + box_1_end_i = 6100 + box_2_start_i = 6128 + box_2_end_i = 6228 + box_3_start_i = 6256 + box_3_end_i = 6356 + result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) + result_boxes = torch.cat(result_boxes_list, dim=1) + + return [result_boxes, result_confs] def release_yolov4_trace_2cqs_inference(self): ttnn.release_trace(self.device, self.tid) diff --git a/models/demos/yolov4/tests/yolov4_test_infra.py b/models/demos/yolov4/tests/yolov4_test_infra.py index 1c82369c476..474e2f2e87e 100644 --- a/models/demos/yolov4/tests/yolov4_test_infra.py +++ b/models/demos/yolov4/tests/yolov4_test_infra.py @@ -11,6 +11,8 @@ import ttnn from models.demos.yolov4.reference.yolov4 import Yolov4 from models.demos.yolov4.ttnn.yolov4 import TtYOLOv4 +from models.demos.yolov4.demo.demo import YoloLayer, get_region_boxes, gen_yolov4_boxes_confs + from models.utility_functions import ( is_wormhole_b0, @@ -40,15 +42,7 @@ def load_yolov4_weight(model_location_generator=None): def load_yolov4_model(ttnn_model): torch_model = Yolov4() - new_state_dict = {} - ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ttnn_model.torch_model.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() return torch_model @@ -72,13 +66,16 @@ def __init__( self.act_dtype = act_dtype self.weight_dtype = weight_dtype self.model_location_generator = model_location_generator - self.ttnn_yolov4_model = TtYOLOv4(device, load_yolov4_weight(self.model_location_generator)) + self.ttnn_yolov4_model = TtYOLOv4(load_yolov4_weight(self.model_location_generator), device) + torch_model = load_yolov4_model(self.ttnn_yolov4_model) input_shape = (1, 320, 320, 3) torch_input_tensor = torch.randn(input_shape, dtype=torch.float32) self.input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) self.torch_input_tensor = torch_input_tensor.permute(0, 3, 1, 2) self.torch_output_tensor = torch_model(self.torch_input_tensor) + ref1, ref2, ref3 = gen_yolov4_boxes_confs(self.torch_output_tensor) + self.ref_boxes, self.ref_confs = get_region_boxes([ref1, ref2, ref3]) def run(self): self.output_tensor = self.ttnn_yolov4_model(self.input_tensor) @@ -130,38 +127,42 @@ def setup_dram_sharded_input(self, device, torch_input_tensor=None, mesh_mapper= def validate(self, 
output_tensor=None): output_tensor = self.output_tensor if output_tensor is None else output_tensor - output_tensor = ttnn.to_torch(self.output_tensor[0]) - output_tensor = output_tensor.reshape(1, 40, 40, 255) - output_tensor = torch.permute(output_tensor, (0, 3, 1, 2)) - - valid_pcc = 0.985 - self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor[0], output_tensor, pcc=valid_pcc) + result_boxes_padded = ttnn.to_torch(self.output_tensor[0]) + result_confs = ttnn.to_torch(self.output_tensor[1]) + + result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) + result_boxes_list = [] + # That ttnn tensor is the concat output of 3 padded tensors + # As a perf workaround I'm doing the unpadding on the torch output here. + # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized + box_1_start_i = 0 + box_1_end_i = 6100 + box_2_start_i = 6128 + box_2_end_i = 6228 + box_3_start_i = 6256 + box_3_end_i = 6356 + result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) + result_boxes = torch.cat(result_boxes_list, dim=1) + + valid_pcc = 0.99 + self.pcc_passed, self.pcc_message = assert_with_pcc(self.ref_boxes, result_boxes, pcc=valid_pcc) logger.info( - f"Yolov4 batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" + f"Yolov4 - Bboxes. batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" ) - output_tensor = ttnn.to_torch(self.output_tensor[1]) - output_tensor = torch.reshape(output_tensor, (self.batch_size, 20, 20, 255)) - output_tensor = torch.permute(output_tensor, (0, 3, 1, 2)) - self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor[1], output_tensor, pcc=valid_pcc) - - logger.info( - f"Yolov4 batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" - ) + valid_pcc = 0.71 + self.pcc_passed, self.pcc_message = assert_with_pcc(self.ref_confs, result_confs, pcc=valid_pcc) - output_tensor = ttnn.to_torch(self.output_tensor[2]) - output_tensor = torch.reshape(output_tensor, (self.batch_size, 10, 10, 255)) - output_tensor = torch.permute(output_tensor, (0, 3, 1, 2)) - self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor[2], output_tensor, pcc=valid_pcc) logger.info( - f"Yolov4 batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" + f"Yolov4 - Confs. 
batch_size={self.batch_size}, act_dtype={self.act_dtype}, weight_dtype={self.weight_dtype}, PCC={self.pcc_message}" ) def dealloc_output(self): ttnn.deallocate(self.output_tensor[0]) ttnn.deallocate(self.output_tensor[1]) - ttnn.deallocate(self.output_tensor[2]) def create_test_infra( diff --git a/models/demos/yolov4/ttnn/common.py b/models/demos/yolov4/ttnn/common.py index 70ead902094..e20814a3a73 100644 --- a/models/demos/yolov4/ttnn/common.py +++ b/models/demos/yolov4/ttnn/common.py @@ -52,9 +52,17 @@ def __init__( else: weight = model[path + ".conv.0.weight"] bias = model[path + ".conv.0.bias"] + # padding the channel dim in the last conv in the head module from 255 to 256 + # to avoid additional padding in the model graph + if weight.shape[0] == 255: + weight = torch.nn.functional.pad(weight, (0, 0, 0, 0, 0, 0, 0, 1)) self.weights = ttnn.from_torch(weight) bias = bias.reshape(1, 1, 1, -1) + # padding the channel dim in the last conv in the head module from 255 to 256 + if bias.shape[-1] == 255: + bias = torch.nn.functional.pad(bias, (0, 1, 0, 0, 0, 0, 0, 0)) self.bias = ttnn.from_torch(bias) + self.input_params = input_params self.kernel_size = (self.weights.shape[2], self.weights.shape[3]) self.conv_params = conv_params diff --git a/models/demos/yolov4/ttnn/genboxes.py b/models/demos/yolov4/ttnn/genboxes.py new file mode 100644 index 00000000000..fb8bb49867d --- /dev/null +++ b/models/demos/yolov4/ttnn/genboxes.py @@ -0,0 +1,256 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import math +import numpy as np +import ttnn +from models.utility_functions import _nearest_32 + + +def create_conv_bias_tensor(torch_tensor, N, K, pad=0): + bias_shape = [1, 1, N, K] + bias_padded_shape = [1, 1, _nearest_32(N), _nearest_32(K)] + tt_tensor = ttnn.Tensor(torch.flatten(torch_tensor).tolist(), bias_shape, ttnn.bfloat16, ttnn.ROW_MAJOR_LAYOUT).pad( + bias_shape, (0, 0, 0, 0), 0.0 + ) + tt_tensor = tt_tensor.pad_to_tile(pad).to(ttnn.TILE_LAYOUT) + return tt_tensor + + +class TtGenBoxes: + def __init__(self, device) -> None: + self.thresh = 0.6 + self.num_classes = 80 + self.num_anchors = 3 + + self.grid_x = [] + self.grid_y = [] + for H in (40, 20, 10): + grid_x_i = torch.reshape( + torch.flatten( + torch.from_numpy( + np.expand_dims( + np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=0).repeat(H, 0), axis=0), + axis=0, + ) + ) + ), + (1, 1, 1, H * H), + ) + + grid_y_i = torch.reshape( + torch.flatten( + torch.from_numpy( + np.expand_dims( + np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(H, 1), axis=0), + axis=0, + ) + ) + ), + (1, 1, 1, H * H), + ) + self.grid_x.append( + ttnn.from_torch(grid_x_i, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + ) # , 1, H*H)) + self.grid_y.append( + ttnn.from_torch(grid_y_i, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + ) # , 1, H*H)) + + def __call__(self, device, input_tensor): + B, __, HW, dim = input_tensor.shape + H = W = int(math.sqrt(HW)) + AHW = self.num_anchors * HW + A = self.num_anchors + + if HW == 1600: + group = 0 + elif HW == 400: + group = 1 + elif HW == 100: + group = 2 + + # Pre-derived from the torch function + if group == 0: + anchor_w_a = 1.5 + anchor_w_b = 2.375 + anchor_w_c = 5.0 + anchor_h_a = 2.0 + anchor_h_b = 4.5 + anchor_h_c = 3.5 + elif group == 1: + anchor_w_a = 2.25 + anchor_w_b = 4.75 + anchor_w_c = 4.5 + anchor_h_a = 4.6875 + anchor_h_b = 3.4375 + anchor_h_c = 9.125 + elif group == 2: + anchor_w_a = 
4.4375 + anchor_w_b = 6.0 + anchor_w_c = 14.34375 + anchor_h_a = 3.4375 + anchor_h_b = 7.59375 + anchor_h_c = 12.53125 + + input_tensor_i = ttnn.to_memory_config(input_tensor, ttnn.L1_MEMORY_CONFIG) + input_tensor_i = ttnn.to_layout(input_tensor_i, ttnn.ROW_MAJOR_LAYOUT) + input_tensor_i = ttnn.permute(input_tensor_i, (0, 1, 3, 2)) + + # first anchor + bx_a = ttnn.slice(input_tensor_i, [0, 0, 0, 0], [1, 1, 1, HW]) + by_a = ttnn.slice(input_tensor_i, [0, 0, 1, 0], [1, 1, 2, HW]) + bw_a = ttnn.slice(input_tensor_i, [0, 0, 2, 0], [1, 1, 3, HW]) + bh_a = ttnn.slice(input_tensor_i, [0, 0, 3, 0], [1, 1, 4, HW]) + det_confs_a = ttnn.slice(input_tensor_i, [0, 0, 4, 0], [1, 1, 5, HW]) + cls_confs_a = ttnn.slice(input_tensor_i, [0, 0, 5, 0], [1, 1, 85, HW]) + # second anchor + bx_b = ttnn.slice(input_tensor_i, [0, 0, 85, 0], [1, 1, 86, HW]) + by_b = ttnn.slice(input_tensor_i, [0, 0, 86, 0], [1, 1, 87, HW]) + bw_b = ttnn.slice(input_tensor_i, [0, 0, 87, 0], [1, 1, 88, HW]) + bh_b = ttnn.slice(input_tensor_i, [0, 0, 88, 0], [1, 1, 89, HW]) + det_confs_b = ttnn.slice(input_tensor_i, [0, 0, 89, 0], [1, 1, 90, HW]) + cls_confs_b = ttnn.slice(input_tensor_i, [0, 0, 90, 0], [1, 1, 170, HW]) + # third anchor + bx_c = ttnn.slice(input_tensor_i, [0, 0, 170, 0], [1, 1, 171, HW]) + by_c = ttnn.slice(input_tensor_i, [0, 0, 171, 0], [1, 1, 172, HW]) + bw_c = ttnn.slice(input_tensor_i, [0, 0, 172, 0], [1, 1, 173, HW]) + bh_c = ttnn.slice(input_tensor_i, [0, 0, 173, 0], [1, 1, 174, HW]) + det_confs_c = ttnn.slice(input_tensor_i, [0, 0, 174, 0], [1, 1, 175, HW]) + cls_confs_c = ttnn.slice(input_tensor_i, [0, 0, 175, 0], [1, 1, 255, HW]) + + ############# + # Confs + ############# + + det_confs_a = ttnn.to_layout(det_confs_a, ttnn.TILE_LAYOUT) + det_confs_b = ttnn.to_layout(det_confs_b, ttnn.TILE_LAYOUT) + det_confs_c = ttnn.to_layout(det_confs_c, ttnn.TILE_LAYOUT) + cls_confs_a = ttnn.to_layout(cls_confs_a, ttnn.TILE_LAYOUT) + cls_confs_b = ttnn.to_layout(cls_confs_b, ttnn.TILE_LAYOUT) + cls_confs_c = ttnn.to_layout(cls_confs_c, ttnn.TILE_LAYOUT) + + det_confs_a = ttnn.sigmoid(det_confs_a) + det_confs_b = ttnn.sigmoid(det_confs_b) + det_confs_c = ttnn.sigmoid(det_confs_c) + cls_confs_a = ttnn.sigmoid(cls_confs_a) + cls_confs_b = ttnn.sigmoid(cls_confs_b) + cls_confs_c = ttnn.sigmoid(cls_confs_c) + + confs_a = ttnn.multiply(det_confs_a, cls_confs_a) + confs_b = ttnn.multiply(det_confs_b, cls_confs_b) + confs_c = ttnn.multiply(det_confs_c, cls_confs_c) + + confs = ttnn.concat([confs_a, confs_b, confs_c], dim=1) + confs = ttnn.permute(confs, (0, 1, 3, 2)) + confs = ttnn.reshape(confs, (B, AHW, self.num_classes)) + + ################# + ## Boxes + ################# + + # expensive TilizeWithValPadding + bx_a = ttnn.to_layout(bx_a, ttnn.TILE_LAYOUT) + by_a = ttnn.to_layout(by_a, ttnn.TILE_LAYOUT) + bw_a = ttnn.to_layout(bw_a, ttnn.TILE_LAYOUT) + bh_a = ttnn.to_layout(bh_a, ttnn.TILE_LAYOUT) + bx_a = ttnn.sigmoid(bx_a) + by_a = ttnn.sigmoid(by_a) + bw_a = ttnn.exp(bw_a) + bh_a = ttnn.exp(bh_a) + + bx_b = ttnn.to_layout(bx_b, ttnn.TILE_LAYOUT) + by_b = ttnn.to_layout(by_b, ttnn.TILE_LAYOUT) + bw_b = ttnn.to_layout(bw_b, ttnn.TILE_LAYOUT) + bh_b = ttnn.to_layout(bh_b, ttnn.TILE_LAYOUT) + bx_b = ttnn.sigmoid(bx_b) + by_b = ttnn.sigmoid(by_b) + bw_b = ttnn.exp(bw_b) + bh_b = ttnn.exp(bh_b) + + bx_c = ttnn.to_layout(bx_c, ttnn.TILE_LAYOUT) + by_c = ttnn.to_layout(by_c, ttnn.TILE_LAYOUT) + bw_c = ttnn.to_layout(bw_c, ttnn.TILE_LAYOUT) + bh_c = ttnn.to_layout(bh_c, ttnn.TILE_LAYOUT) + bx_c = ttnn.sigmoid(bx_c) + by_c = 
ttnn.sigmoid(by_c) + bw_c = ttnn.exp(bw_c) + bh_c = ttnn.exp(bh_c) + + #### + ## Grid tensor derivation + #### + + grid_x = self.grid_x[group] # .to(device, mem_config=ttnn.L1_MEMORY_CONFIG) + grid_y = self.grid_y[group] # .to(device, mem_config=ttnn.L1_MEMORY_CONFIG) + + bx_a = ttnn.add(bx_a, grid_x) + by_a = ttnn.add(by_a, grid_y) + bx_b = ttnn.add(bx_b, grid_x) + by_b = ttnn.add(by_b, grid_y) + bx_c = ttnn.add(bx_c, grid_x) + by_c = ttnn.add(by_c, grid_y) + + bx_a = ttnn.multiply(bx_a, 1 / W) + by_a = ttnn.multiply(by_a, 1 / H) + bx_b = ttnn.multiply(bx_b, 1 / W) + by_b = ttnn.multiply(by_b, 1 / H) + bx_c = ttnn.multiply(bx_c, 1 / W) + by_c = ttnn.multiply(by_c, 1 / H) + + bw_a = bw_a * (anchor_w_a / W) + bw_b = bw_b * (anchor_w_b / W) + bw_c = bw_c * (anchor_w_c / W) + + bh_a = bh_a * (anchor_h_a / H) + bh_b = bh_b * (anchor_h_b / H) + bh_c = bh_c * (anchor_h_c / H) + + bw_a_half = bw_a * (0.5) + bw_b_half = bw_b * (0.5) + bw_c_half = bw_c * (0.5) + + bh_a_half = bh_a * (0.5) + bh_b_half = bh_b * (0.5) + bh_c_half = bh_c * (0.5) + + bx1_a = bx_a - bw_a_half + by1_a = by_a - bh_a_half + bx2_a = bx1_a + bw_a + by2_a = by1_a + bh_a + + bx1_b = bx_b - bw_b_half + by1_b = by_b - bh_b_half + bx2_b = bx1_b + bw_b + by2_b = by1_b + bh_b + + bx1_c = bx_c - bw_c_half + by1_c = by_c - bh_c_half + bx2_c = bx1_c + bw_c + by2_c = by1_c + bh_c + + bx1_a = ttnn.to_layout(bx1_a, ttnn.ROW_MAJOR_LAYOUT) + bx2_a = ttnn.to_layout(bx2_a, ttnn.ROW_MAJOR_LAYOUT) + by1_a = ttnn.to_layout(by1_a, ttnn.ROW_MAJOR_LAYOUT) + by2_a = ttnn.to_layout(by2_a, ttnn.ROW_MAJOR_LAYOUT) + + bx1_b = ttnn.to_layout(bx1_b, ttnn.ROW_MAJOR_LAYOUT) + bx2_b = ttnn.to_layout(bx2_b, ttnn.ROW_MAJOR_LAYOUT) + by1_b = ttnn.to_layout(by1_b, ttnn.ROW_MAJOR_LAYOUT) + by2_b = ttnn.to_layout(by2_b, ttnn.ROW_MAJOR_LAYOUT) + + bx1_c = ttnn.to_layout(bx1_c, ttnn.ROW_MAJOR_LAYOUT) + bx2_c = ttnn.to_layout(bx2_c, ttnn.ROW_MAJOR_LAYOUT) + by1_c = ttnn.to_layout(by1_c, ttnn.ROW_MAJOR_LAYOUT) + by2_c = ttnn.to_layout(by2_c, ttnn.ROW_MAJOR_LAYOUT) + + bx1 = ttnn.concat([bx1_a, bx1_b, bx1_c], dim=2) + by1 = ttnn.concat([by1_a, by1_b, by1_c], dim=2) + bx2 = ttnn.concat([bx2_a, bx2_b, bx2_c], dim=2) + by2 = ttnn.concat([by2_a, by2_b, by2_c], dim=2) + + # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4] + boxes = ttnn.concat((bx1, by1, bx2, by2), dim=1) + + return boxes, confs diff --git a/models/demos/yolov4/ttnn/yolov4.py b/models/demos/yolov4/ttnn/yolov4.py index 42f1a9cd7fe..307e0fc55ca 100644 --- a/models/demos/yolov4/ttnn/yolov4.py +++ b/models/demos/yolov4/ttnn/yolov4.py @@ -21,10 +21,11 @@ from models.demos.yolov4.ttnn.downsample5 import Down5 from models.demos.yolov4.ttnn.neck import TtNeck from models.demos.yolov4.ttnn.head import TtHead +from models.demos.yolov4.ttnn.genboxes import TtGenBoxes class TtYOLOv4: - def __init__(self, device, path) -> None: + def __init__(self, path, device) -> None: if type(path) is str: self.torch_model = torch.load(path) else: @@ -39,7 +40,12 @@ def __init__(self, device, path) -> None: self.neck = TtNeck(device, self) self.head = TtHead(device, self) + self.boxes_confs_0 = TtGenBoxes(device) + self.boxes_confs_1 = TtGenBoxes(device) + self.boxes_confs_2 = TtGenBoxes(device) + self.downs = [] # [self.down1] + self.device = device def __call__(self, input_tensor): d1 = self.down1(input_tensor) @@ -52,7 +58,32 @@ def __call__(self, input_tensor): x20, x13, x6 = self.neck([d5, d4, d3]) x4, x5, x6 = self.head([x20, x13, x6]) - return x4, x5, x6 + orig = 0 + if orig: + return x4, x5, x6 + 
else: + x4_boxes_confs = self.boxes_confs_0(self.device, x4) + x5_boxes_confs = self.boxes_confs_1(self.device, x5) + x6_boxes_confs = self.boxes_confs_2(self.device, x6) + + confs_1 = ttnn.to_layout(x4_boxes_confs[1], ttnn.ROW_MAJOR_LAYOUT) + confs_2 = ttnn.to_layout(x5_boxes_confs[1], ttnn.ROW_MAJOR_LAYOUT) + confs_3 = ttnn.to_layout(x6_boxes_confs[1], ttnn.ROW_MAJOR_LAYOUT) + confs = ttnn.concat([confs_1, confs_2, confs_3], dim=1) + + boxes_1 = ttnn.to_layout(x4_boxes_confs[0], ttnn.ROW_MAJOR_LAYOUT) + boxes_2 = ttnn.to_layout(x5_boxes_confs[0], ttnn.ROW_MAJOR_LAYOUT) + boxes_3 = ttnn.to_layout(x6_boxes_confs[0], ttnn.ROW_MAJOR_LAYOUT) + boxes_1 = ttnn.reshape(boxes_1, (1, 4, 1, 4800)) + boxes_2 = ttnn.reshape(boxes_2, (1, 4, 1, 1200)) + boxes_3 = ttnn.pad(boxes_3, ((0, 0), (0, 0), (0, 0), (0, 28)), 0) + boxes_3 = ttnn.reshape(boxes_3, (1, 4, 1, 384)) + boxes_1 = ttnn.permute(boxes_1, (0, 2, 3, 1)) + boxes_2 = ttnn.permute(boxes_2, (0, 2, 3, 1)) + boxes_3 = ttnn.permute(boxes_3, (0, 2, 3, 1)) + boxes = ttnn.concat([boxes_1, boxes_2, boxes_3], dim=2) + + return boxes, confs def __str__(self) -> str: this_str = "" diff --git a/models/demos/yolov4/web_demo/README.md b/models/demos/yolov4/web_demo/README.md index d35bb31c518..5b112cadaa6 100644 --- a/models/demos/yolov4/web_demo/README.md +++ b/models/demos/yolov4/web_demo/README.md @@ -12,6 +12,11 @@ pip install -r models/demos/yolov4/web_demo/server/requirements.txt ``` +- After installing the server side requirments, ONLY if you are running the demo on an N300 card,run the following to export the approprite envirement variable for N300. + ``` + export WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml + ``` + - From the server run: ``` source models/demos/yolov4/web_demo/server/run_uvicorn.sh diff --git a/models/demos/yolov4/web_demo/client/coco.names b/models/demos/yolov4/web_demo/client/coco.names new file mode 100644 index 00000000000..ca76c80b5b2 --- /dev/null +++ b/models/demos/yolov4/web_demo/client/coco.names @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/models/demos/yolov4/web_demo/client/requirements.txt b/models/demos/yolov4/web_demo/client/requirements.txt index 282195275da..be5f168cc74 100644 --- a/models/demos/yolov4/web_demo/client/requirements.txt +++ b/models/demos/yolov4/web_demo/client/requirements.txt @@ -1,3 +1,4 @@ opencv-python==4.6.0.66 streamlit==1.26.0 streamlit-webrtc==0.47.0 +orjson==3.10.12 diff --git a/models/demos/yolov4/web_demo/client/yolov4.py b/models/demos/yolov4/web_demo/client/yolov4.py index 5fc4ea6c692..ada420cbdad 100644 --- a/models/demos/yolov4/web_demo/client/yolov4.py +++ b/models/demos/yolov4/web_demo/client/yolov4.py @@ -11,7 +11,9 @@ import cv2 import requests import torch +import orjson import av +import logging import streamlit as st import numpy as np @@ -20,78 +22,16 @@ from streamlit_webrtc import 
VideoProcessorBase, webrtc_streamer +# Configure the logger +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()] +) + + class VideoProcessor(VideoProcessorBase): def __init__(self): self.frame_count = 0 - def post_processing(self, img, conf_thresh, nms_thresh, output): - box_array = output[0] - confs = output[1].float() - - t1 = time.time() - - if type(box_array).__name__ != "ndarray": - box_array = box_array.cpu().detach().numpy() - confs = confs.cpu().detach().numpy() - - num_classes = confs.shape[2] - - # [batch, num, 4] - box_array = box_array[:, :, 0] - - # [batch, num, num_classes] --> [batch, num] - max_conf = np.max(confs, axis=2) - max_id = np.argmax(confs, axis=2) - - t2 = time.time() - - bboxes_batch = [] - for i in range(box_array.shape[0]): - argwhere = max_conf[i] > conf_thresh - l_box_array = box_array[i, argwhere, :] - l_max_conf = max_conf[i, argwhere] - l_max_id = max_id[i, argwhere] - - bboxes = [] - # nms for each class - for j in range(num_classes): - cls_argwhere = l_max_id == j - ll_box_array = l_box_array[cls_argwhere, :] - ll_max_conf = l_max_conf[cls_argwhere] - ll_max_id = l_max_id[cls_argwhere] - - keep = self.nms_cpu(ll_box_array, ll_max_conf, nms_thresh) - - if keep.size > 0: - ll_box_array = ll_box_array[keep, :] - ll_max_conf = ll_max_conf[keep] - ll_max_id = ll_max_id[keep] - - for k in range(ll_box_array.shape[0]): - bboxes.append( - [ - ll_box_array[k, 0], - ll_box_array[k, 1], - ll_box_array[k, 2], - ll_box_array[k, 3], - ll_max_conf[k], - ll_max_conf[k], - ll_max_id[k], - ] - ) - - bboxes_batch.append(bboxes) - - t3 = time.time() - - print("-----------------------------------") - print(" max and argmax : %f" % (t2 - t1)) - print(" nms : %f" % (t3 - t2)) - print("Post processing total : %f" % (t3 - t1)) - print("-----------------------------------") - - return bboxes_batch - def load_class_names(self, namesfile): class_names = [] with open(namesfile, "r") as fp: @@ -101,41 +41,6 @@ def load_class_names(self, namesfile): class_names.append(line) return class_names - def nms_cpu(self, boxes, confs, nms_thresh=0.5, min_mode=False): - x1 = boxes[:, 0] - y1 = boxes[:, 1] - x2 = boxes[:, 2] - y2 = boxes[:, 3] - - areas = (x2 - x1) * (y2 - y1) - order = confs.argsort()[::-1] - - keep = [] - while order.size > 0: - idx_self = order[0] - idx_other = order[1:] - - keep.append(idx_self) - - xx1 = np.maximum(x1[idx_self], x1[idx_other]) - yy1 = np.maximum(y1[idx_self], y1[idx_other]) - xx2 = np.minimum(x2[idx_self], x2[idx_other]) - yy2 = np.minimum(y2[idx_self], y2[idx_other]) - - w = np.maximum(0.0, xx2 - xx1) - h = np.maximum(0.0, yy2 - yy1) - inter = w * h - - if min_mode: - over = inter / np.minimum(areas[order[0]], areas[order[1:]]) - else: - over = inter / (areas[order[0]] + areas[order[1:]] - inter) - - inds = np.where(over <= nms_thresh)[0] - order = order[inds + 1] - - return np.array(keep) - def plot_boxes_cv2(self, bgr_img, boxes, savename=None, class_names=None, color=None): img = np.copy(bgr_img) colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) @@ -196,52 +101,60 @@ def get_color(c, x, max_val): def recv(self, frame): t0 = time.time() + + # Convert frame to PIL image and resize pil_image = frame.to_image() - # resize on the client side - new_size = (320, 320) - pil_image = pil_image.resize(new_size) + pil_image = pil_image.resize((320, 320)) # Resize to target dimensions t1 = time.time() + + # Save image as JPEG in-memory 
with optimized settings buf = io.BytesIO() - pil_image.save(buf, format="JPEG") + pil_image.save(buf, format="JPEG", quality=85, optimize=True) byte_im = buf.getvalue() file = {"file": byte_im} - # Argument Parser to grab namespace_id of server pod from user - parser = argparse.ArgumentParser(description="YOLOv4 script") - parser.add_argument("--api-url", type=str, help="URL for the object detection API", required=True) - args = parser.parse_args() - apiurl = args.api_url - url = f"{apiurl}/objdetection_v2" - r = requests.post(url, files=file) - if r.status_code == 200: - try: - # Get the JSON response as a dictionary - response_dict = r.json() - output = [torch.tensor(tensor_data) for tensor_data in response_dict["output"]] - except ValueError: - st.error("Failed to parse JSON. The response is not in JSON format.") - else: - st.error(f"Request failed with status code {r.status_code}") + # Parse API URL once at the class level for efficiency + if not hasattr(self, "api_url"): + parser = argparse.ArgumentParser(description="YOLOv4 script") + parser.add_argument("--api-url", type=str, required=True, help="URL for the object detection API") + args = parser.parse_args() + self.api_url = args.api_url + + url = f"{self.api_url}/objdetection_v2" + + try: + # Use a persistent session for multiple requests + with requests.Session() as session: + # Post request with a timeout + response = session.post(url, files=file, timeout=5) + + # Check if response is successful + if response.status_code == 200: + # Parse JSON response + output = orjson.loads(response.content) + else: + print(f"Request failed with status code {response.status_code}") + # return None + except requests.exceptions.RequestException as e: + print(f"Request failed: {e}") + return None t3 = time.time() + # Convert frame to ndarray and perform post-processing bgr_image = frame.to_ndarray(format="bgr24") conf_thresh = 0.6 nms_thresh = 0.5 - boxes = self.post_processing(bgr_image, conf_thresh, nms_thresh, output) + + # Load class names and plot bounding boxes namesfile = "coco.names" class_names = self.load_class_names(namesfile) + image_final = self.plot_boxes_cv2(bgr_image, output, None, class_names) - # random_number = random.randint(1, 100) - # save_name = "ttnn_prediction_demo" + str(random_number) + ".jpg" - save_name = None - - image_final = self.plot_boxes_cv2(bgr_image, boxes[0], save_name, class_names) t4 = time.time() - print() - print(f" IMG-IN | WH | Post | Total time: ") - print(f" {(t1-t0):.3f} | {(t3-t1):.3f} | {(t4-t3):.3f} || {(t4-t0):.3f} ") + logging.info( + f" IMG-IN | WH | Post | Total time: {(t1-t0):.3f} | {(t3-t1):.3f} | {(t4-t3):.3f} || {(t4-t0):.3f} " + ) - # return image_final return av.VideoFrame.from_ndarray(image_final, format="bgr24") @@ -254,10 +167,8 @@ def recv(self, frame): media_stream_constraints={ "video": { "width": {"min": 320, "ideal": 400, "max": 960}, - # "height": {"min": 180, "ideal": 225, "max": 450}, "height": {"min": 320, "ideal": 400, "max": 960}, "frameRate": {"min": 1, "ideal": 50, "max": 60}, } }, - # async_processing=True # Use asynchronous processing for long tasks ) diff --git a/models/demos/yolov4/web_demo/server/fast_api_yolov4.py b/models/demos/yolov4/web_demo/server/fast_api_yolov4.py old mode 100755 new mode 100644 index 19732cbc074..83af1d6e14b --- a/models/demos/yolov4/web_demo/server/fast_api_yolov4.py +++ b/models/demos/yolov4/web_demo/server/fast_api_yolov4.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import json +import os +import logging from fastapi import 
FastAPI, File, UploadFile from io import BytesIO from PIL import Image @@ -25,14 +27,43 @@ async def root(): return {"message": "Hello World"} +# Configure the logger +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()] +) + + +def get_dispatch_core_type(): + # TODO: 11059 move dispatch_core_type to device_params when all tests are updated to not use WH_ARCH_YAML env flag + dispatch_core_type = ttnn.device.DispatchCoreType.WORKER + # if ("WH_ARCH_YAML" in os.environ) and os.environ["WH_ARCH_YAML"] == "wormhole_b0_80_arch_eth_dispatch.yaml": + if os.environ["WH_ARCH_YAML"] == "wormhole_b0_80_arch_eth_dispatch.yaml": + dispatch_core_type = ttnn.device.DispatchCoreType.ETH + return dispatch_core_type + + @app.on_event("startup") async def startup(): - device_id = 0 - device = ttnn.CreateDevice(device_id, l1_small_size=24576, trace_region_size=1617920, num_command_queues=2) - ttnn.enable_program_cache(device) global model - model = Yolov4Trace2CQ() - model.initialize_yolov4_trace_2cqs_inference(device) + if ("WH_ARCH_YAML" in os.environ) and os.environ["WH_ARCH_YAML"] == "wormhole_b0_80_arch_eth_dispatch.yaml": + print("WH_ARCH_YAML:", os.environ.get("WH_ARCH_YAML")) + device_id = 0 + device = ttnn.CreateDevice( + device_id, + dispatch_core_type=get_dispatch_core_type(), + l1_small_size=24576, + trace_region_size=3211264, + num_command_queues=2, + ) + ttnn.enable_program_cache(device) + model = Yolov4Trace2CQ() + model.initialize_yolov4_trace_2cqs_inference(device) + else: + device_id = 0 + device = ttnn.CreateDevice(device_id, l1_small_size=24576, trace_region_size=3211264, num_command_queues=2) + ttnn.enable_program_cache(device) + model = Yolov4Trace2CQ() + model.initialize_yolov4_trace_2cqs_inference(device) @app.on_event("shutdown") @@ -40,16 +71,112 @@ async def shutdown(): model.release_yolov4_trace_2cqs_inference() -def process_request(output): - # Convert all tensors to lists for JSON serialization - output_serializable = {"output": [tensor.tolist() for tensor in output]} - return output_serializable +def process_output(output): + outs = [] + output = output + cnt = 0 + for item in output: + cnt = cnt + 1 + output_i = [element.item() for element in item] + outs.append(output_i) + return outs + + +def post_processing(img, conf_thresh, nms_thresh, output): + box_array = output[0] + confs = output[1] + + box_array = np.array(box_array.to(torch.float32)) + confs = np.array(confs.to(torch.float32)) + + num_classes = confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + bboxes_batch = [] + for i in range(box_array.shape[0]): + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for each class + for j in range(num_classes): + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh) + + if keep.size > 0: + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + + for k in range(ll_box_array.shape[0]): + bboxes.append( + [ + ll_box_array[k, 0], + ll_box_array[k, 1], + ll_box_array[k, 2], + ll_box_array[k, 3], + ll_max_conf[k], + ll_max_conf[k], + ll_max_id[k], + ] 
+ ) + + bboxes_batch.append(bboxes) + + return bboxes_batch + + +def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1) * (y2 - y1) + order = confs.argsort()[::-1] + + keep = [] + while order.size > 0: + idx_self = order[0] + idx_other = order[1:] + + keep.append(idx_self) + + xx1 = np.maximum(x1[idx_self], x1[idx_other]) + yy1 = np.maximum(y1[idx_self], y1[idx_other]) + xx2 = np.minimum(x2[idx_self], x2[idx_other]) + yy2 = np.minimum(y2[idx_self], y2[idx_other]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + + if min_mode: + over = inter / np.minimum(areas[order[0]], areas[order[1:]]) + else: + over = inter / (areas[order[0]] + areas[order[1:]] - inter) + + inds = np.where(over <= nms_thresh)[0] + order = order[inds + 1] + + return np.array(keep) @app.post("/objdetection_v2") async def objdetection_v2(file: UploadFile = File(...)): contents = await file.read() - # Load and convert the image to RGB image = Image.open(BytesIO(contents)).convert("RGB") image = np.array(image) @@ -60,11 +187,24 @@ async def objdetection_v2(file: UploadFile = File(...)): else: print("unknow image type") exit(-1) + t1 = time.time() response = model.run_traced_inference(image) t2 = time.time() - print("the inference on the sever side took: ", t2 - t1) + logging.info("The inference on the sever side took: %.3f seconds", t2 - t1) + conf_thresh = 0.6 + nms_thresh = 0.5 + + boxes = post_processing(image, conf_thresh, nms_thresh, response) + output = boxes[0] + # output = boxes + try: + output = process_output(output) + except Exception as E: + print("the Exception is: ", E) + print("No objects detected!") + return [] + t3 = time.time() + logging.info("The post-processing to get the boxes took: %.3f seconds", t3 - t2) - # Convert response tensors to JSON-serializable format - output = process_request(response) return output diff --git a/tests/scripts/run_python_model_tests.sh b/tests/scripts/run_python_model_tests.sh index 0290537e6e3..576ef139fc7 100755 --- a/tests/scripts/run_python_model_tests.sh +++ b/tests/scripts/run_python_model_tests.sh @@ -35,7 +35,7 @@ run_python_model_tests_wormhole_b0() { # higher sequence lengths and different formats trigger memory issues pytest models/demos/falcon7b_common/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py -k "seq_len_128 and in0_BFLOAT16-in1_BFLOAT8_B-out_BFLOAT16-weights_DRAM" pytest tests/ttnn/integration_tests/resnet/test_ttnn_functional_resnet50.py -k "pretrained_weight_false" - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/yolov4/demo/demo.py -k "pretrained_weight_false" + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py -k "pretrained_weight_false" # Unet Shallow WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -svv models/experimental/functional_unet/tests/test_unet_model.py diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py index 3ae46d4970c..9dd13940717 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample1.py @@ -36,16 +36,8 @@ def test_down1(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample1() - - 
new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down1."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py index 5efc12af3f1..ba7da86ee8c 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample2.py @@ -35,16 +35,10 @@ def test_down2(device, reset_seeds, model_location_generator): torch_input = torch.randn((1, 160, 160, 64), dtype=torch.bfloat16) ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() - torch_model = DownSample2() - new_state_dict = {} + torch_model = DownSample2() ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down2."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py index 23c015fbb5b..8ae58e41470 100644 --- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py +++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample3.py @@ -36,15 +36,8 @@ def test_down3(device, reset_seeds, model_location_generator): ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16) torch_input = torch_input.permute(0, 3, 1, 2).float() torch_model = DownSample3() - - new_state_dict = {} ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down3."))} - - keys = [name for name, parameter in torch_model.state_dict().items()] - values = [parameter for name, parameter in ds_state_dict.items()] - for i in range(len(keys)): - new_state_dict[keys[i]] = values[i] - + new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values())) torch_model.load_state_dict(new_state_dict) torch_model.eval() @@ -58,4 +51,4 @@ def test_down3(device, reset_seeds, model_location_generator): ref = torch_model(torch_input) ref = ref.permute(0, 2, 3, 1) result = result.reshape(ref.shape) - assert_with_pcc(result, ref, 0.95) # PCC 0.95 - The PCC will improve once #3612 is resolved. + assert_with_pcc(result, ref, 0.96) # PCC 0.96 - The PCC will improve once #3612 is resolved. 
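
The state-dict loading refactor repeated across these yolov4 tests replaces the manual key-copy loop with dict(zip(...)). Below is a minimal sketch of that pattern, assuming the filtered checkpoint values are stored in the same order as the reference module's state_dict() keys (the property the positional zip relies on); remap_state_dict is a hypothetical helper name and is not part of this patch.

import torch

def remap_state_dict(reference_module: torch.nn.Module, checkpoint: dict, prefix: str = "") -> dict:
    # Keep only the entries belonging to this submodule; dicts preserve insertion order.
    filtered = {k: v for k, v in checkpoint.items() if k.startswith(prefix)}
    # Pair the reference module's parameter names with the checkpoint tensors positionally.
    return dict(zip(reference_module.state_dict().keys(), filtered.values()))

# Hypothetical usage mirroring the tests above:
# torch_model = DownSample3()
# torch_model.load_state_dict(remap_state_dict(torch_model, ttnn_model.torch_model, "down3."))
# torch_model.eval()
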
diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py
index 35579f14664..b791e9fc813 100644
--- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py
+++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample4.py
@@ -36,15 +36,8 @@ def test_down4(device, reset_seeds, model_location_generator):
     ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16)
     torch_input = torch_input.permute(0, 3, 1, 2).float()
     torch_model = DownSample4()
-
-    new_state_dict = {}
     ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down4."))}
-
-    keys = [name for name, parameter in torch_model.state_dict().items()]
-    values = [parameter for name, parameter in ds_state_dict.items()]
-    for i in range(len(keys)):
-        new_state_dict[keys[i]] = values[i]
-
+    new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values()))
     torch_model.load_state_dict(new_state_dict)
     torch_model.eval()

diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py
index 8809d4d8275..d53eab4825e 100644
--- a/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py
+++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_downsample5.py
@@ -36,15 +36,8 @@ def test_down5(device, reset_seeds, model_location_generator):
     ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16)
     torch_input = torch_input.permute(0, 3, 1, 2).float()
     torch_model = DownSample5()
-
-    new_state_dict = {}
     ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("down5."))}
-
-    keys = [name for name, parameter in torch_model.state_dict().items()]
-    values = [parameter for name, parameter in ds_state_dict.items()]
-    for i in range(len(keys)):
-        new_state_dict[keys[i]] = values[i]
-
+    new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values()))
     torch_model.load_state_dict(new_state_dict)
     torch_model.eval()

diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py
index 126e3713645..155885f2cb3 100644
--- a/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py
+++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_head.py
@@ -6,6 +6,7 @@
 import ttnn
 from models.demos.yolov4.reference.head import Head
 from tests.ttnn.utils_for_testing import assert_with_pcc
+from models.utility_functions import skip_for_grayskull
 import pytest
 import time
 from models.demos.yolov4.ttnn.head import TtHead
@@ -13,6 +14,7 @@
 import os


+@skip_for_grayskull()
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
 def test_head(device, reset_seeds, model_location_generator):
     torch.manual_seed(0)
@@ -56,15 +58,8 @@ def test_head(device, reset_seeds, model_location_generator):
     torch_input_tensor = [torch_input_tensor1, torch_input_tensor2, torch_input_tensor3]

     torch_model = Head()
-
-    new_state_dict = {}
     ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("head."))}
-
-    keys = [name for name, parameter in torch_model.state_dict().items()]
-    values = [parameter for name, parameter in ds_state_dict.items()]
-    for i in range(len(keys)):
-        new_state_dict[keys[i]] = values[i]
-
+    new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values()))
     torch_model.load_state_dict(new_state_dict)
     torch_model.eval()

@@ -79,19 +74,22 @@ def test_head(device, reset_seeds, model_location_generator):
     result_3 = ttnn.to_torch(result_ttnn[2])

     ref1, ref2, ref3 = torch_model(torch_input_tensor[0], torch_input_tensor[1], torch_input_tensor[2])

-    result_1 = result_1.reshape(1, ref1.shape[2], ref1.shape[3], 255)
+    num_channels = ref1.shape[1]  # 255
+    num_channels_padded = num_channels + 1
+
+    result_1 = result_1.reshape(1, ref1.shape[2], ref1.shape[3], num_channels_padded)
     result_1 = result_1.permute(0, 3, 1, 2)

-    result_2 = result_2.reshape(1, ref2.shape[2], ref2.shape[3], 255)
+    result_2 = result_2.reshape(1, ref2.shape[2], ref2.shape[3], num_channels_padded)
     result_2 = result_2.permute(0, 3, 1, 2)

-    result_3 = result_3.reshape(1, ref3.shape[2], ref3.shape[3], 255)
+    result_3 = result_3.reshape(1, ref3.shape[2], ref3.shape[3], num_channels_padded)
     result_3 = result_3.permute(0, 3, 1, 2)

     # Output is sliced because ttnn.conv returns 256 channels instead of 255.
-    result_1 = result_1[:, :255, :, :]
-    result_2 = result_2[:, :255, :, :]
-    result_3 = result_3[:, :255, :, :]
+    result_1 = result_1[:, :num_channels, :, :]
+    result_2 = result_2[:, :num_channels, :, :]
+    result_3 = result_3[:, :num_channels, :, :]
     pcc_passed, pcc_message = assert_with_pcc(result_1, ref1, 0.99)
     logger.info(pcc_message)
diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py
index 41ac8781fc1..02c9d81f75d 100644
--- a/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py
+++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_neck.py
@@ -6,6 +6,7 @@
 import ttnn
 from models.demos.yolov4.ttnn.neck import TtNeck
 from models.demos.yolov4.reference.neck import Neck
+from models.utility_functions import skip_for_grayskull
 from tests.ttnn.utils_for_testing import assert_with_pcc
 import pytest
 import time
@@ -13,6 +14,7 @@
 import os


+@skip_for_grayskull()
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
 def test_neck(device, reset_seeds, model_location_generator):
     torch.manual_seed(0)
@@ -50,16 +52,10 @@ def test_neck(device, reset_seeds, model_location_generator):
     torch_input_tensor2 = torch_input_tensor2.permute(0, 3, 1, 2).float()
     torch_input_tensor3 = torch_input_tensor3.permute(0, 3, 1, 2).float()
     torch_input_tensor = [torch_input_tensor1, torch_input_tensor2, torch_input_tensor3]
-    torch_model = Neck()

-    new_state_dict = {}
+    torch_model = Neck()
     ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items() if (k.startswith("neek."))}
-
-    keys = [name for name, parameter in torch_model.state_dict().items()]
-    values = [parameter for name, parameter in ds_state_dict.items()]
-    for i in range(len(keys)):
-        new_state_dict[keys[i]] = values[i]
-
+    new_state_dict = dict(zip(torch_model.state_dict().keys(), ds_state_dict.values()))
     torch_model.load_state_dict(new_state_dict)
     torch_model.eval()

diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py
new file mode 100644
index 00000000000..128a0c93f43
--- /dev/null
+++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_post_processing.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import ttnn
+from models.utility_functions import skip_for_grayskull
+from tests.ttnn.utils_for_testing import assert_with_pcc
+from models.demos.yolov4.ttnn.genboxes import TtGenBoxes
+from models.demos.yolov4.demo.demo import YoloLayer, get_region_boxes, gen_yolov4_boxes_confs
+
+import pytest
+import os
+
+
+@skip_for_grayskull()
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
+def test_yolov4_post_processing(device, reset_seeds, model_location_generator):
+    torch.manual_seed(0)
+
+    torch_input_1 = torch.randn((1, 1, 1600, 256), dtype=torch.bfloat16)
+    ttnn_input_1 = ttnn.from_torch(
+        torch_input_1, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG
+    )
+    torch_input_2 = torch.randn((1, 1, 400, 256), dtype=torch.bfloat16)
+    ttnn_input_2 = ttnn.from_torch(
+        torch_input_2, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG
+    )
+    torch_input_3 = torch.randn((1, 1, 100, 256), dtype=torch.bfloat16)
+    ttnn_input_3 = ttnn.from_torch(
+        torch_input_3, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG
+    )
+
+    torch_input_1 = torch_input_1[:, :, :, :255]
+    torch_input_1 = torch_input_1.reshape(1, 40, 40, 255)
+    torch_input_1 = torch.permute(torch_input_1, (0, 3, 1, 2))
+    torch_input_2 = torch_input_2[:, :, :, :255]
+    torch_input_2 = torch_input_2.reshape(1, 20, 20, 255)
+    torch_input_2 = torch.permute(torch_input_2, (0, 3, 1, 2))
+    torch_input_3 = torch_input_3[:, :, :, :255]
+    torch_input_3 = torch_input_3.reshape(1, 10, 10, 255)
+    torch_input_3 = torch.permute(torch_input_3, (0, 3, 1, 2))
+
+    ref1, ref2, ref3 = gen_yolov4_boxes_confs([torch_input_1, torch_input_2, torch_input_3])
+
+    boxes_confs_1 = TtGenBoxes(device)
+    boxes_confs_2 = TtGenBoxes(device)
+    boxes_confs_3 = TtGenBoxes(device)
+
+    result_1 = boxes_confs_1(device, ttnn_input_1)
+    result_2 = boxes_confs_2(device, ttnn_input_2)
+    result_3 = boxes_confs_3(device, ttnn_input_3)
+
+    result_1_bb = ttnn.to_torch(result_1[0])
+    result_2_bb = ttnn.to_torch(result_2[0])
+    result_3_bb = ttnn.to_torch(result_3[0])
+
+    result_1_bb = result_1_bb.permute(0, 2, 3, 1)
+    result_2_bb = result_2_bb.permute(0, 2, 3, 1)
+    result_3_bb = result_3_bb.permute(0, 2, 3, 1)
+
+    result_1_bb = result_1_bb.reshape(1, 4800, 1, 4)
+    result_2_bb = result_2_bb.reshape(1, 1200, 1, 4)
+    result_3_bb = result_3_bb.reshape(1, 300, 1, 4)
+
+    result_1_conf = ttnn.to_torch(result_1[1])
+    result_2_conf = ttnn.to_torch(result_2[1])
+    result_3_conf = ttnn.to_torch(result_3[1])
+
+    assert_with_pcc(ref1[0], result_1_bb, 0.99)
+    assert_with_pcc(ref2[0], result_2_bb, 0.99)
+    assert_with_pcc(ref3[0], result_3_bb, 0.99)
+
+    assert_with_pcc(ref1[1], result_1_conf, 0.99)
+    assert_with_pcc(ref2[1], result_2_conf, 0.99)
+    assert_with_pcc(ref3[1], result_3_conf, 0.99)
+
+    output = get_region_boxes(
+        [(result_1_bb, result_1_conf), (result_2_bb, result_2_conf), (result_3_bb, result_3_conf)]
+    )
diff --git a/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py
index ff9a9d4c1dc..6e22f222474 100644
--- a/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py
+++ b/tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py
@@ -4,70 +4,96 @@
 import torch
 import ttnn
-from models.utility_functions import skip_for_grayskull
 from models.demos.yolov4.reference.yolov4 import Yolov4
 from tests.ttnn.utils_for_testing import assert_with_pcc
+from models.utility_functions import skip_for_grayskull
 from models.demos.yolov4.ttnn.yolov4 import TtYOLOv4
+from models.demos.yolov4.demo.demo import YoloLayer, get_region_boxes, gen_yolov4_boxes_confs
+from models.demos.yolov4.ttnn.weight_parameter_update import update_weight_parameters
+from collections import OrderedDict
+
+import cv2
+import numpy as np
+
 import pytest
 import os


 @skip_for_grayskull()
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
-def test_yolov4(device, reset_seeds, model_location_generator):
+@pytest.mark.parametrize(
+    "use_pretrained_weight",
+    [True, False],
+    ids=[
+        "pretrained_weight_true",
+        "pretrained_weight_false",
+    ],
+)
+def test_yolov4(device, reset_seeds, model_location_generator, use_pretrained_weight):
     torch.manual_seed(0)
     model_path = model_location_generator("models", model_subdir="Yolo")
-    if model_path == "models":
-        if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"):  # check if yolov4.th is availble
-            os.system(
-                "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh"
-            )  # execute the yolov4_weights_download.sh file
-
-        weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth"
+    if use_pretrained_weight:
+        if model_path == "models":
+            if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"):  # check if yolov4.th is availble
+                os.system(
+                    "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh"
+                )  # execute the yolov4_weights_download.sh file
+
+            weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth"
+        else:
+            weights_pth = str(model_path / "yolov4.pth")
+
+        ttnn_model = TtYOLOv4(weights_pth, device)
+        torch_model = Yolov4()
+        new_state_dict = dict(zip(torch_model.state_dict().keys(), ttnn_model.torch_model.values()))
+        torch_model.load_state_dict(new_state_dict)
+        torch_model.eval()
     else:
-        weights_pth = str(model_path / "yolov4.pth")
-
-    ttnn_model = TtYOLOv4(device, weights_pth)
-
-    torch_input = torch.randn((1, 320, 320, 3), dtype=torch.bfloat16)
-    ttnn_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16)
-    torch_input = torch_input.permute(0, 3, 1, 2).float()
-    torch_model = Yolov4()
-
-    new_state_dict = {}
-    ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()}
-
-    keys = [name for name, parameter in torch_model.state_dict().items()]
-    values = [parameter for name, parameter in ds_state_dict.items()]
-
-    for i in range(len(keys)):
-        new_state_dict[keys[i]] = values[i]
-
-    torch_model.load_state_dict(new_state_dict)
-    torch_model.eval()
-
-    result_1, result_2, result_3 = ttnn_model(ttnn_input)
-    result_1 = ttnn.to_torch(result_1)
-    result_2 = ttnn.to_torch(result_2)
-    result_3 = ttnn.to_torch(result_3)
-
-    ref1, ref2, ref3 = torch_model(torch_input)
-
-    result_1 = result_1.reshape(1, ref1.shape[2], ref1.shape[3], 255)
-    result_1 = result_1.permute(0, 3, 1, 2)
-
-    result_2 = result_2.reshape(1, ref2.shape[2], ref2.shape[3], 255)
-    result_2 = result_2.permute(0, 3, 1, 2)
-
-    result_3 = result_3.reshape(1, ref3.shape[2], ref3.shape[3], 255)
-    result_3 = result_3.permute(0, 3, 1, 2)
-
-    # Output is sliced because ttnn.conv returns 256 channels instead of 255.
- result_1 = result_1[:, :255, :, :] - result_2 = result_2[:, :255, :, :] - result_3 = result_3[:, :255, :, :] - - assert_with_pcc(result_1, ref1, 0.99) - assert_with_pcc(result_2, ref2, 0.99) - assert_with_pcc(result_3, ref3, 0.98) + torch_model = Yolov4.from_random_weights() + ttnn_weights = update_weight_parameters(OrderedDict(torch_model.state_dict())) + ttnn_model = TtYOLOv4(ttnn_weights, device) + + imgfile = "models/demos/yolov4/demo/giraffe_320.jpg" + width = 320 + height = 320 + img = cv2.imread(imgfile) + img = cv2.resize(img, (width, height)) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image + img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + elif type(img) == np.ndarray and len(img.shape) == 4: + img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) + torch_input = torch.autograd.Variable(img) + + input_tensor = torch.permute(torch_input, (0, 2, 3, 1)) + ttnn_input = ttnn.from_torch(input_tensor, ttnn.bfloat16) + + torch_output_tensor = torch_model(torch_input) + + ref1, ref2, ref3 = gen_yolov4_boxes_confs(torch_output_tensor) + ref_boxes, ref_confs = get_region_boxes([ref1, ref2, ref3]) + + ttnn_output_tensor = ttnn_model(ttnn_input) + result_boxes_padded = ttnn.to_torch(ttnn_output_tensor[0]) + result_confs = ttnn.to_torch(ttnn_output_tensor[1]) + + result_boxes_padded = result_boxes_padded.permute(0, 2, 1, 3) + result_boxes_list = [] + # Unpadding + # That ttnn tensor is the concat output of 3 padded tensors + # As a perf workaround I'm doing the unpadding on the torch output here. + # TODO: cleaner ttnn code when ttnn.untilize() is fully optimized + box_1_start_i = 0 + box_1_end_i = 6100 + box_2_start_i = 6128 + box_2_end_i = 6228 + box_3_start_i = 6256 + box_3_end_i = 6356 + result_boxes_list.append(result_boxes_padded[:, box_1_start_i:box_1_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_2_start_i:box_2_end_i]) + result_boxes_list.append(result_boxes_padded[:, box_3_start_i:box_3_end_i]) + result_boxes = torch.cat(result_boxes_list, dim=1) + + assert_with_pcc(ref_boxes, result_boxes, 0.99) + assert_with_pcc(ref_confs, result_confs, 0.71) From 54c42a208131af079f92e5e8a8a9655916763d67 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Tue, 25 Feb 2025 16:53:14 +0000 Subject: [PATCH 304/316] #0: Fix dprint of edm packet header --- .../fabric_edm_packet_transmission.hpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp index 5e8f59954c2..2c946eaf9cf 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp @@ -38,7 +38,8 @@ FORCE_INLINE void print_pkt_hdr_routing_fields(volatile tt::fabric::LowLatencyPa #endif } -FORCE_INLINE void print_pkt_header_noc_fields(volatile PACKET_HEADER_TYPE *const packet_start) { +template +FORCE_INLINE void print_pkt_header_noc_fields(volatile T *const packet_start) { #ifdef DEBUG_PRINT_ENABLED switch (packet_start->noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { @@ -69,15 +70,15 @@ FORCE_INLINE void print_pkt_header(volatile tt::fabric::PacketHeader *const pack } FORCE_INLINE void print_pkt_header(volatile tt::fabric::LowLatencyPacketHeader *const packet_start) { - #ifdef 
DEBUG_PRINT_ENABLED - auto const& header = *packet_start; - DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type << - ", src_chip:" << (uint32_t) packet_start->src_ch_id << - ", payload_size_bytes:" << (uint32_t) packet_start->payload_size_bytes << "\n"; - print_pkt_hdr_routing_fields(packet_start); - print_pkt_header_noc_fields(packet_start); - #endif - } +#ifdef DEBUG_PRINT_ENABLED + auto const& header = *packet_start; + DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type << + ", src_chip:" << (uint32_t) packet_start->src_ch_id << + ", payload_size_bytes:" << (uint32_t) packet_start->payload_size_bytes << "\n"; + print_pkt_hdr_routing_fields(packet_start); + print_pkt_header_noc_fields(packet_start); +#endif +} // Since we unicast to local, we must omit the packet header From 14c537eb533e4c8fb5964b3b104c2603eeeff1a1 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Tue, 25 Feb 2025 15:34:52 -0500 Subject: [PATCH 305/316] #17477: Use ND mesh coordinates for mesh events, trace, workload (#18256) ### Ticket #17477 ### Problem description This is the final PR for adopting ND shapes in TT distributed stack. ### What's changed * Removed `LogicalDeviceRange`, `LogicalDeviceRangeSet`, `DeviceCoord` and instead used ND `MeshCoordinate`, `MeshCoordinateRange`, `MeshCoordinateRangeSet`. * Implemented `MeshCoordinateRange::intersects`, `::intersection`, `::contains`. * Implemented `MeshCoordinateRangeSet` that supports merging of ranges. * Implemented "subtraction" of ranges. ### Checklist - [X] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13527733629) - [X] New/Existing tests provide coverage for changes - [X] Ran TT-distributed tests on T3K manually --- .../tt_metal/distributed/test_mesh_coord.cpp | 136 ++++++++- .../tt_metal/distributed/test_mesh_events.cpp | 89 +++--- .../distributed/test_mesh_sub_device.cpp | 31 +- .../tt_metal/distributed/test_mesh_trace.cpp | 279 ++++++++++-------- .../distributed/test_mesh_workload.cpp | 268 ++++++++--------- tt_metal/api/tt-metalium/distributed.hpp | 6 +- .../api/tt-metalium/mesh_command_queue.hpp | 15 +- tt_metal/api/tt-metalium/mesh_common.hpp | 11 - tt_metal/api/tt-metalium/mesh_coord.hpp | 41 +++ tt_metal/api/tt-metalium/mesh_event.hpp | 2 +- tt_metal/api/tt-metalium/mesh_trace.hpp | 8 +- tt_metal/api/tt-metalium/mesh_workload.hpp | 17 +- tt_metal/common/mesh_coord.cpp | 162 +++++++++- tt_metal/distributed/distributed.cpp | 7 +- tt_metal/distributed/mesh_command_queue.cpp | 207 ++++++------- tt_metal/distributed/mesh_trace.cpp | 15 +- tt_metal/distributed/mesh_workload.cpp | 23 +- tt_metal/distributed/mesh_workload_utils.cpp | 67 ----- tt_metal/distributed/mesh_workload_utils.hpp | 2 - .../distributed_program_dispatch.cpp | 7 +- .../distributed_eltwise_add.cpp | 7 +- .../distributed_trace_and_events.cpp | 16 +- 22 files changed, 817 insertions(+), 599 deletions(-) diff --git a/tests/tt_metal/distributed/test_mesh_coord.cpp b/tests/tt_metal/distributed/test_mesh_coord.cpp index 6d87c191930..c9a28a44310 100644 --- a/tests/tt_metal/distributed/test_mesh_coord.cpp +++ b/tests/tt_metal/distributed/test_mesh_coord.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -6,12 +6,15 @@ #include #include +#include "gmock/gmock.h" #include "mesh_coord.hpp" namespace tt::tt_metal::distributed { namespace { using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; using ::testing::UnorderedElementsAre; TEST(MeshShapeTest, Construction) { @@ -195,6 +198,12 @@ TEST(MeshCoordinateRangeTest, SubrangeOneElement) { EXPECT_THAT(coords, ElementsAre(MeshCoordinate(1, 1, 1))); } +TEST(MeshCoordinateRangeTest, ContainsInvalidDimensions) { + MeshCoordinateRange range(MeshCoordinate(1, 1, 3), MeshCoordinate(1, 1, 3)); + EXPECT_ANY_THROW(range.contains(MeshCoordinate(1, 1))); + EXPECT_ANY_THROW(range.contains(MeshCoordinateRange(MeshCoordinate(1, 1), MeshCoordinate(1, 1)))); +} + TEST(MeshCoordinateRangeTest, Contains) { MeshCoordinateRange range(MeshCoordinate(1, 1, 3), MeshCoordinate(1, 1, 3)); EXPECT_TRUE(range.contains(MeshCoordinate(1, 1, 3))); @@ -207,6 +216,52 @@ TEST(MeshCoordinateRangeTest, Contains) { EXPECT_FALSE(range.contains(MeshCoordinate(2, 2))); } +TEST(MeshCoordinateRangeTest, ContainsRange) { + MeshCoordinateRange range(MeshCoordinate(1, 1, 3), MeshCoordinate(1, 1, 3)); + EXPECT_TRUE(range.contains(range)); + + EXPECT_FALSE(range.contains(MeshCoordinateRange(MeshCoordinate(1, 1, 2), MeshCoordinate(1, 1, 3)))); + EXPECT_FALSE(range.contains(MeshCoordinateRange(MeshCoordinate(1, 1, 3), MeshCoordinate(1, 1, 4)))); + + range = MeshCoordinateRange(MeshCoordinate(1, 1), MeshCoordinate(2, 2)); + EXPECT_FALSE(range.contains(MeshCoordinateRange(MeshCoordinate(0, 0), MeshCoordinate(0, 0)))); + EXPECT_FALSE(range.contains(MeshCoordinateRange(MeshCoordinate(0, 3), MeshCoordinate(0, 3)))); + EXPECT_FALSE(range.contains(MeshCoordinateRange(MeshCoordinate(0, 0), MeshCoordinate(0, 1)))); + EXPECT_FALSE(range.contains(MeshCoordinateRange(MeshCoordinate(0, 2), MeshCoordinate(1, 2)))); + EXPECT_TRUE(range.contains(MeshCoordinateRange(MeshCoordinate(1, 1), MeshCoordinate(1, 2)))); +} + +TEST(MeshCoordinateRangeTest, Intersection) { + MeshCoordinateRange range(MeshCoordinate(1, 1), MeshCoordinate(3, 3)); + auto intersection = range.intersection(MeshCoordinateRange(MeshCoordinate(2, 2), MeshCoordinate(4, 4))); + ASSERT_TRUE(intersection.has_value()); + EXPECT_EQ(intersection->start_coord(), MeshCoordinate(2, 2)); + EXPECT_EQ(intersection->end_coord(), MeshCoordinate(3, 3)); + + intersection = range.intersection(MeshCoordinateRange(MeshCoordinate(1, 1), MeshCoordinate(1, 1))); + ASSERT_TRUE(intersection.has_value()); + EXPECT_EQ(intersection->start_coord(), MeshCoordinate(1, 1)); + EXPECT_EQ(intersection->end_coord(), MeshCoordinate(1, 1)); + + intersection = range.intersection(MeshCoordinateRange(MeshCoordinate(3, 3), MeshCoordinate(3, 3))); + ASSERT_TRUE(intersection.has_value()); + EXPECT_EQ(intersection->start_coord(), MeshCoordinate(3, 3)); + EXPECT_EQ(intersection->end_coord(), MeshCoordinate(3, 3)); + + intersection = range.intersection(MeshCoordinateRange(MeshCoordinate(2, 2), MeshCoordinate(2, 2))); + ASSERT_TRUE(intersection.has_value()); + EXPECT_EQ(intersection->start_coord(), MeshCoordinate(2, 2)); + EXPECT_EQ(intersection->end_coord(), MeshCoordinate(2, 2)); + + intersection = range.intersection(MeshCoordinateRange(MeshCoordinate(0, 0), MeshCoordinate(5, 5))); + ASSERT_TRUE(intersection.has_value()); + EXPECT_EQ(intersection->start_coord(), MeshCoordinate(1, 1)); + EXPECT_EQ(intersection->end_coord(), MeshCoordinate(3, 3)); + + intersection = range.intersection(MeshCoordinateRange(MeshCoordinate(5, 
5), MeshCoordinate(6, 6))); + EXPECT_FALSE(intersection.has_value()); +} + TEST(MeshCoordinateRangeTest, Dimensionality) { EXPECT_EQ(MeshCoordinateRange(MeshCoordinate(0), MeshCoordinate(5)).dims(), 1); EXPECT_EQ(MeshCoordinateRange(MeshCoordinate(0, 1), MeshCoordinate(5, 1)).dims(), 2); @@ -232,6 +287,85 @@ TEST(MeshCoordinateRangeTest, InvalidRange) { EXPECT_ANY_THROW(MeshCoordinateRange(start, end)); } +TEST(MeshCoordinateRangeSetTest, MergeInvalidDimensions) { + MeshCoordinateRangeSet range_set; + range_set.merge(MeshCoordinateRange(MeshCoordinate(0, 0), MeshCoordinate(1, 1))); + + EXPECT_ANY_THROW(range_set.merge(MeshCoordinateRange(MeshCoordinate(0, 0, 0), MeshCoordinate(1, 1, 1)))); +} + +TEST(MeshCoordinateRangeSetTest, Merge1D) { + MeshCoordinateRangeSet set; + // Merge first range: [0, 3]. + MeshCoordinateRange r1(MeshCoordinate(0), MeshCoordinate(3)); + set.merge(r1); + + // Merge an adjacent range: [4, 6] (adjacent to r1, since 3 and 4 touch). + MeshCoordinateRange r2(MeshCoordinate(4), MeshCoordinate(6)); + set.merge(r2); + ASSERT_EQ(set.size(), 1); + auto merged_range = set.ranges().front(); + EXPECT_EQ(merged_range.start_coord(), MeshCoordinate(0)); + EXPECT_EQ(merged_range.end_coord(), MeshCoordinate(6)); + + // Merge a separate range: [8, 10]. + MeshCoordinateRange r3(MeshCoordinate(8), MeshCoordinate(10)); + set.merge(r3); + ASSERT_EQ(set.size(), 2); + + // Merge a range bridging the gap: [7, 7] should merge all into one [0, 10]. + MeshCoordinateRange r4(MeshCoordinate(7), MeshCoordinate(7)); + set.merge(r4); + ASSERT_EQ(set.size(), 1); + merged_range = set.ranges().front(); + EXPECT_EQ(merged_range.start_coord(), MeshCoordinate(0)); + EXPECT_EQ(merged_range.end_coord(), MeshCoordinate(10)); +} + +TEST(MeshCoordinateRangeSetTest, SubtractInvalidDimensions) { + EXPECT_ANY_THROW(subtract( + MeshCoordinateRange(MeshCoordinate(0, 0, 0), MeshCoordinate(1, 1, 1)), + MeshCoordinateRange(MeshCoordinate(0, 0), MeshCoordinate(1, 1)))); +} + +TEST(MeshCoordinateRangeSetTest, SubtractNoIntersection) { + MeshCoordinateRange parent(MeshCoordinate(0, 0), MeshCoordinate(4, 10)); + MeshCoordinateRange intersection(MeshCoordinate(5, 5), MeshCoordinate(12, 12)); + EXPECT_THAT(subtract(parent, intersection).ranges(), ElementsAre(Eq(parent))); +} + +TEST(MeshCoordinateRangeSetTest, SubtractParentEqualsIntersection) { + MeshCoordinateRange parent(MeshCoordinate(0, 0), MeshCoordinate(4, 10)); + MeshCoordinateRange intersection(MeshCoordinate(0, 0), MeshCoordinate(4, 10)); + EXPECT_THAT(subtract(parent, intersection).ranges(), IsEmpty()); +} + +TEST(MeshCoordinateRangeSetTest, Subtract1DAdjacentIntersection) { + // Parent [0, 10] and intersection [3, 7] should yield [0,2] and [8,10]. + MeshCoordinateRange parent(MeshCoordinate(0), MeshCoordinate(10)); + MeshCoordinateRange intersection(MeshCoordinate(3), MeshCoordinate(7)); + + EXPECT_THAT( + subtract(parent, intersection).ranges(), + ElementsAre( + Eq(MeshCoordinateRange(MeshCoordinate(0), MeshCoordinate(2))), + Eq(MeshCoordinateRange(MeshCoordinate(8), MeshCoordinate(10))))); +} + +TEST(MeshCoordinateRangeSetTest, Subtract2DNonAdjacentIntersection) { + // Parent [(0,0) to (2,2)] and intersection [(1,1) to (1,1)]. 
+ MeshCoordinateRange parent(MeshCoordinate(0, 0), MeshCoordinate(2, 2)); + MeshCoordinateRange intersection(MeshCoordinate(1, 1), MeshCoordinate(1, 1)); + + EXPECT_THAT( + subtract(parent, intersection).ranges(), + UnorderedElementsAre( + Eq(MeshCoordinateRange(MeshCoordinate(0, 0), MeshCoordinate(0, 2))), + Eq(MeshCoordinateRange(MeshCoordinate(1, 0), MeshCoordinate(2, 0))), + Eq(MeshCoordinateRange(MeshCoordinate(2, 1), MeshCoordinate(2, 1))), + Eq(MeshCoordinateRange(MeshCoordinate(1, 2), MeshCoordinate(2, 2))))); +} + TEST(ToLinearIndexTest, Basic) { MeshShape shape(2, 2, 3); diff --git a/tests/tt_metal/distributed/test_mesh_events.cpp b/tests/tt_metal/distributed/test_mesh_events.cpp index 4b942f0391d..85d5cae74d7 100644 --- a/tests/tt_metal/distributed/test_mesh_events.cpp +++ b/tests/tt_metal/distributed/test_mesh_events.cpp @@ -47,16 +47,10 @@ TEST_F(MeshEventsTestSuite, ReplicatedAsyncIO) { EnqueueWaitForEvent(mesh_device_->mesh_command_queue(1), event); // Reads on CQ 1 - for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) { - for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) { - readback_vecs.push_back({}); - auto shard = buf->get_device_buffer(MeshCoordinate(logical_y, logical_x)); - ReadShard( - mesh_device_->mesh_command_queue(1), - readback_vecs.back(), - buf, - MeshCoordinate(logical_y, logical_x)); - } + for (const auto& coord : MeshCoordinateRange(mesh_device_->shape())) { + readback_vecs.push_back({}); + auto shard = buf->get_device_buffer(coord); + ReadShard(mesh_device_->mesh_command_queue(1), readback_vecs.back(), buf, coord); } for (auto& vec : readback_vecs) { @@ -123,11 +117,11 @@ TEST_F(MeshEventsTestSuite, AsyncWorkloadAndIO) { auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( mesh_device_, src0_bufs, src1_bufs, output_bufs); auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1}); + MeshCoordinateRange devices_0(MeshCoordinate{0, 0}, MeshCoordinate{0, mesh_device_->num_cols() - 1}); + MeshCoordinateRange devices_1(MeshCoordinate{1, 0}, MeshCoordinate{1, mesh_device_->num_cols() - 1}); - AddProgramToMeshWorkload(mesh_workload, *programs[0], devices_0); - AddProgramToMeshWorkload(mesh_workload, *programs[1], devices_1); + AddProgramToMeshWorkload(mesh_workload, std::move(*programs[0]), devices_0); + AddProgramToMeshWorkload(mesh_workload, std::move(*programs[1]), devices_1); for (int iter = 0; iter < num_iters; iter++) { std::vector src0_vec = create_constant_vector_of_bfloat16(src0_bufs[0]->size(), iter + 2); @@ -167,24 +161,23 @@ TEST_F(MeshEventsTestSuite, AsyncWorkloadAndIO) { } // Issue reads on MeshCQ 1 - for (std::size_t logical_y = 0; logical_y < mesh_device_->num_rows(); logical_y++) { - for (std::size_t logical_x = 0; logical_x < mesh_device_->num_cols(); logical_x++) { - for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { - for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { - std::vector dst_vec = {}; - ReadShard( - mesh_device_->mesh_command_queue(1), - dst_vec, - output_bufs[col_idx * worker_grid_size.y + row_idx], - MeshCoordinate(logical_y, logical_x)); - if (logical_y == 0) { - for (int i = 0; i < dst_vec.size(); i++) { - EXPECT_EQ(dst_vec[i].to_float(), (2 * iter + 5)); - } - } else { - for (int i = 0; i < dst_vec.size(); i++) { - 
EXPECT_EQ(dst_vec[i].to_float(), (iter + 2) * (iter + 3)); - } + for (const auto& device_coord : MeshCoordinateRange(mesh_device_->shape())) { + std::vector dst_vec = {}; + for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { + for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { + std::vector dst_vec = {}; + ReadShard( + mesh_device_->mesh_command_queue(1), + dst_vec, + output_bufs[col_idx * worker_grid_size.y + row_idx], + device_coord); + if (device_coord[0] == 0) { + for (int i = 0; i < dst_vec.size(); i++) { + EXPECT_EQ(dst_vec[i].to_float(), (2 * iter + 5)); + } + } else { + for (int i = 0; i < dst_vec.size(); i++) { + EXPECT_EQ(dst_vec[i].to_float(), (iter + 2) * (iter + 3)); } } } @@ -213,8 +206,8 @@ TEST_F(MeshEventsTestSuite, CustomDeviceRanges) { for (std::size_t i = 0; i < num_iterations; i++) { std::vector src_vec(NUM_TILES * single_tile_size / sizeof(uint32_t), i); std::iota(src_vec.begin(), src_vec.end(), i); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1}); + MeshCoordinateRange devices_0(MeshCoordinate{0, 0}, MeshCoordinate{0, mesh_device_->num_cols() - 1}); + MeshCoordinateRange devices_1(MeshCoordinate{1, 0}, MeshCoordinate{1, mesh_device_->num_cols() - 1}); std::vector> readback_vecs = {}; std::shared_ptr event_0 = std::make_shared(); @@ -224,32 +217,20 @@ TEST_F(MeshEventsTestSuite, CustomDeviceRanges) { EnqueueRecordEvent(mesh_device_->mesh_command_queue(1), event_0, {}, devices_0); EnqueueWaitForEvent(mesh_device_->mesh_command_queue(0), event_0); - for (std::size_t logical_x = devices_0.start_coord.x; logical_x < devices_0.end_coord.x; logical_x++) { - for (std::size_t logical_y = devices_0.start_coord.y; logical_y < devices_0.end_coord.y; logical_y++) { - readback_vecs.push_back({}); - auto shard = buf->get_device_buffer(MeshCoordinate(logical_y, logical_x)); - ReadShard( - mesh_device_->mesh_command_queue(0), - readback_vecs.back(), - buf, - MeshCoordinate(logical_y, logical_x)); - } + for (const auto& coord : devices_0) { + readback_vecs.push_back({}); + auto shard = buf->get_device_buffer(coord); + ReadShard(mesh_device_->mesh_command_queue(0), readback_vecs.back(), buf, coord); } mesh_device_->mesh_command_queue(1).enqueue_write_shard_to_sub_grid(*buf, src_vec.data(), devices_1, false); EnqueueRecordEventToHost(mesh_device_->mesh_command_queue(1), event_1, {}, devices_1); EventSynchronize(event_1); - for (std::size_t logical_x = devices_1.start_coord.x; logical_x < devices_1.end_coord.x; logical_x++) { - for (std::size_t logical_y = devices_1.start_coord.y; logical_y < devices_1.end_coord.y; logical_y++) { - readback_vecs.push_back({}); - auto shard = buf->get_device_buffer(MeshCoordinate(logical_y, logical_x)); - ReadShard( - mesh_device_->mesh_command_queue(0), - readback_vecs.back(), - buf, - MeshCoordinate(logical_y, logical_x)); - } + for (const auto& coord : devices_1) { + readback_vecs.push_back({}); + auto shard = buf->get_device_buffer(coord); + ReadShard(mesh_device_->mesh_command_queue(0), readback_vecs.back(), buf, coord); } for (auto& vec : readback_vecs) { EXPECT_EQ(vec, src_vec); diff --git a/tests/tt_metal/distributed/test_mesh_sub_device.cpp b/tests/tt_metal/distributed/test_mesh_sub_device.cpp index b39608a0781..90ae82320d4 100644 --- a/tests/tt_metal/distributed/test_mesh_sub_device.cpp +++ b/tests/tt_metal/distributed/test_mesh_sub_device.cpp @@ -25,14 +25,13 @@ 
TEST_F(MeshSubDeviceTestSuite, SyncWorkloadsOnSubDevice) { auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_sync_program(mesh_device_.get(), sub_device_1, sub_device_2); - LogicalDeviceRange devices = - LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + MeshCoordinateRange devices(mesh_device_->shape()); auto waiter_mesh_workload = CreateMeshWorkload(); auto syncer_mesh_workload = CreateMeshWorkload(); auto incrementer_mesh_workload = CreateMeshWorkload(); - AddProgramToMeshWorkload(waiter_mesh_workload, waiter_program, devices); - AddProgramToMeshWorkload(syncer_mesh_workload, syncer_program, devices); - AddProgramToMeshWorkload(incrementer_mesh_workload, incrementer_program, devices); + AddProgramToMeshWorkload(waiter_mesh_workload, std::move(waiter_program), devices); + AddProgramToMeshWorkload(syncer_mesh_workload, std::move(syncer_program), devices); + AddProgramToMeshWorkload(incrementer_mesh_workload, std::move(incrementer_program), devices); for (uint32_t i = 0; i < num_iters; i++) { EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), waiter_mesh_workload, false); mesh_device_->set_sub_device_stall_group({SubDeviceId{0}}); @@ -103,11 +102,10 @@ TEST_F(MeshSubDeviceTestSuite, DataCopyOnSubDevices) { auto syncer_mesh_workload = CreateMeshWorkload(); auto datacopy_mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices = - LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + MeshCoordinateRange devices(mesh_device_->shape()); - AddProgramToMeshWorkload(syncer_mesh_workload, sync_and_incr_program, devices); - AddProgramToMeshWorkload(datacopy_mesh_workload, datacopy_program, devices); + AddProgramToMeshWorkload(syncer_mesh_workload, std::move(sync_and_incr_program), devices); + AddProgramToMeshWorkload(datacopy_mesh_workload, std::move(datacopy_program), devices); for (int i = 0; i < 50; i++) { mesh_device_->set_sub_device_stall_group({SubDeviceId{2}}); @@ -158,21 +156,20 @@ TEST_F(MeshSubDeviceTestSuite, SubDeviceSwitching) { uint32_t num_iters = 100; // Create MeshWorkloads corresponding to different SubDevice configs, // so we can single-shot dispatch to the entire Mesh - LogicalDeviceRange devices = - LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + MeshCoordinateRange devices(mesh_device_->shape()); auto waiter_mesh_workload = CreateMeshWorkload(); auto syncer_mesh_workload = CreateMeshWorkload(); auto incrementer_mesh_workload = CreateMeshWorkload(); - AddProgramToMeshWorkload(waiter_mesh_workload, waiter_program, devices); - AddProgramToMeshWorkload(syncer_mesh_workload, syncer_program, devices); - AddProgramToMeshWorkload(incrementer_mesh_workload, incrementer_program, devices); + AddProgramToMeshWorkload(waiter_mesh_workload, std::move(waiter_program), devices); + AddProgramToMeshWorkload(syncer_mesh_workload, std::move(syncer_program), devices); + AddProgramToMeshWorkload(incrementer_mesh_workload, std::move(incrementer_program), devices); auto waiter_mesh_workload_1 = CreateMeshWorkload(); auto syncer_mesh_workload_1 = CreateMeshWorkload(); auto incrementer_mesh_workload_1 = CreateMeshWorkload(); - AddProgramToMeshWorkload(waiter_mesh_workload_1, waiter_program_1, devices); - AddProgramToMeshWorkload(syncer_mesh_workload_1, syncer_program_1, devices); - AddProgramToMeshWorkload(incrementer_mesh_workload_1, incrementer_program_1, devices); + AddProgramToMeshWorkload(waiter_mesh_workload_1, 
std::move(waiter_program_1), devices); + AddProgramToMeshWorkload(syncer_mesh_workload_1, std::move(syncer_program_1), devices); + AddProgramToMeshWorkload(incrementer_mesh_workload_1, std::move(incrementer_program_1), devices); // Load SubDevice configs, run corresponding workloads, reset ... repeat for (uint32_t i = 0; i < num_iters; i++) { diff --git a/tests/tt_metal/distributed/test_mesh_trace.cpp b/tests/tt_metal/distributed/test_mesh_trace.cpp index b3e51f352c2..cea977bda16 100644 --- a/tests/tt_metal/distributed/test_mesh_trace.cpp +++ b/tests/tt_metal/distributed/test_mesh_trace.cpp @@ -8,7 +8,9 @@ #include #include #include +#include +#include "indestructible.hpp" #include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" #include "tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp" #include "tests/tt_metal/distributed/utils.hpp" @@ -16,21 +18,34 @@ namespace tt::tt_metal::distributed::test { namespace { +// Helper functions that return MeshCoordinateRange spanning various parts of the T3000 device. +const MeshCoordinateRange& t3k_bottom_row() { + static tt::stl::Indestructible bottom_row(MeshCoordinate{1, 0}, MeshCoordinate{1, 3}); + return bottom_row.get(); +} + +const MeshCoordinateRange& t3k_top_row() { + static tt::stl::Indestructible top_row(MeshCoordinate{0, 0}, MeshCoordinate{0, 3}); + return top_row.get(); +} + +const MeshCoordinateRange& t3k_full_grid() { + static tt::stl::Indestructible full_grid(MeshCoordinate{0, 0}, MeshCoordinate{1, 3}); + return full_grid.get(); +} + // Define custom fixtures initializing a trace region on the MeshDevice -class GenericMeshDeviceTraceFixture : public MeshDeviceFixtureBase { +class MeshTraceTestSuite : public MeshDeviceFixtureBase { protected: - GenericMeshDeviceTraceFixture() : MeshDeviceFixtureBase(Config{.num_cqs = 1, .trace_region_size = (64 << 20)}) {} + MeshTraceTestSuite() : MeshDeviceFixtureBase(Config{.num_cqs = 1, .trace_region_size = (64 << 20)}) {} }; -class T3000MeshDeviceTraceFixture : public MeshDeviceFixtureBase { +class MeshTraceTestT3000 : public MeshDeviceFixtureBase { protected: - T3000MeshDeviceTraceFixture() : + MeshTraceTestT3000() : MeshDeviceFixtureBase(Config{.mesh_device_type = MeshDeviceType::T3000, .trace_region_size = (64 << 20)}) {} }; -using MeshTraceTestT3000 = T3000MeshDeviceTraceFixture; -using MeshTraceTestSuite = GenericMeshDeviceTraceFixture; - TEST_F(MeshTraceTestSuite, Sanity) { auto random_seed = 10; uint32_t seed = tt::parse_env("TT_METAL_SEED", random_seed); @@ -41,15 +56,14 @@ TEST_F(MeshTraceTestSuite, Sanity) { uint32_t num_traces = 4; uint32_t num_iters = 10; - LogicalDeviceRange all_devices = - LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + MeshCoordinateRange all_devices(mesh_device_->shape()); std::vector> mesh_workloads = {}; for (int i = 0; i < num_workloads_per_trace * num_traces; i++) { auto workload = std::make_shared(); auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( 1, mesh_device_->compute_with_storage_grid_size(), seed); - AddProgramToMeshWorkload(*workload, *programs[0], all_devices); + AddProgramToMeshWorkload(*workload, std::move(*programs[0]), all_devices); EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *workload, false); mesh_workloads.push_back(workload); } @@ -80,7 +94,7 @@ TEST_F(MeshTraceTestSuite, Sanity) { } class MeshTraceSweepTest : public MeshTraceTestT3000, - public testing::WithParamInterface>> {}; + public testing::WithParamInterface>> {}; 
TEST_P(MeshTraceSweepTest, Sweep) { auto random_seed = 10; @@ -99,7 +113,7 @@ TEST_P(MeshTraceSweepTest, Sweep) { for (auto& program_grid : workload_grid) { auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( 1, mesh_device_->compute_with_storage_grid_size(), seed); - AddProgramToMeshWorkload(*workload, *programs[0], program_grid); + AddProgramToMeshWorkload(*workload, std::move(*programs[0]), program_grid); } EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *workload, false); mesh_workloads.push_back(workload); @@ -121,78 +135,100 @@ INSTANTIATE_TEST_SUITE_P( MeshTraceSweepTests, MeshTraceSweepTest, ::testing::Values( - std::vector>({ - {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid - {LogicalDeviceRange({1, 0}, {1, 1})}, // Run on single center column - {LogicalDeviceRange({2, 0}, {2, 0})}, // Run on single device - top row, center - {LogicalDeviceRange({3, 1}, {3, 1})}, // Run on bottom right device - {LogicalDeviceRange({0, 0}, {0, 0})}, // Run on top left device - {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + std::vector>({ + {t3k_full_grid()}, + // Run on single center column: + {MeshCoordinateRange({0, 1}, {1, 1})}, + // Run on single device - top row, center: + {MeshCoordinateRange({0, 2}, {0, 2})}, + // Run on bottom right device: + {MeshCoordinateRange({1, 3}, {1, 3})}, + // Run on top left device: + {MeshCoordinateRange({0, 0}, {0, 0})}, + {t3k_full_grid()}, }), - std::vector>({ - {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid - {LogicalDeviceRange({1, 0}, {1, 1}), - LogicalDeviceRange({2, 0}, {2, 1}), - LogicalDeviceRange({3, 0}, {3, 1}), - LogicalDeviceRange({0, 0}, {0, 1})}, // Split grid into 4 columns - {LogicalDeviceRange({0, 0}, {3, 0}), LogicalDeviceRange({0, 1}, {3, 1})}, // Split grid into 2 rows + std::vector>({ + {t3k_full_grid()}, + // Split grid into 4 columns: + {MeshCoordinateRange({0, 1}, {1, 1}), + MeshCoordinateRange({0, 2}, {1, 2}), + MeshCoordinateRange({0, 3}, {1, 3}), + MeshCoordinateRange({0, 0}, {1, 0})}, + // Split grid into 2 rows: + {t3k_top_row(), t3k_bottom_row()}, }), - std::vector>({ - {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid - {LogicalDeviceRange({0, 0}, {3, 0}), LogicalDeviceRange({0, 1}, {3, 1})}, // Split grid into 2 rows - {LogicalDeviceRange({0, 0}, {1, 1}), LogicalDeviceRange({2, 0}, {3, 1})}, // Split grid into 2 columns - {LogicalDeviceRange({0, 0}, {1, 1}), - LogicalDeviceRange({2, 0}, {2, 1}), - LogicalDeviceRange({3, 0}, {3, 1})}, // Split grid into 3 columns - {LogicalDeviceRange({0, 0}, {0, 1}), - LogicalDeviceRange({1, 0}, {1, 1}), - LogicalDeviceRange({2, 0}, {2, 1}), - LogicalDeviceRange({3, 0}, {3, 1})}, // Split grid into 4 columns + std::vector>({ + {t3k_full_grid()}, + // Split grid into 2 rows: + {t3k_top_row(), t3k_bottom_row()}, + // Split grid into 2 columns: + {MeshCoordinateRange({0, 0}, {1, 1}), // + MeshCoordinateRange({0, 2}, {1, 3})}, + // Split grid into 3 columns: + {MeshCoordinateRange({0, 0}, {1, 1}), // + MeshCoordinateRange({0, 2}, {1, 2}), // + MeshCoordinateRange({0, 3}, {1, 3})}, + // Split grid into 4 columns: + {MeshCoordinateRange({0, 0}, {1, 0}), // + MeshCoordinateRange({0, 1}, {1, 1}), // + MeshCoordinateRange({0, 2}, {1, 2}), // + MeshCoordinateRange({0, 3}, {1, 3})}, }), - std::vector>({ - {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid - {LogicalDeviceRange({0, 0}, {0, 0}), - LogicalDeviceRange({1, 0}, {1, 0}), - LogicalDeviceRange({2, 0}, {2, 0}), - LogicalDeviceRange({3, 0}, {3, 0}), - LogicalDeviceRange({0, 1}, {0, 1}), - 
LogicalDeviceRange({1, 1}, {1, 1}), - LogicalDeviceRange({2, 1}, {2, 1}), - LogicalDeviceRange({3, 1}, {3, 1})}, // Run on individual devices - {LogicalDeviceRange({1, 0}, {2, 1})}, // Run on 2 center columns - {LogicalDeviceRange({2, 0}, {2, 1})}, // Run on single center column - {LogicalDeviceRange({1, 1}, {2, 1})}, // Run on 2 devices on the bottom row + std::vector>({ + {t3k_full_grid()}, + // Run on individual devices: + {MeshCoordinateRange({0, 0}, {0, 0}), + MeshCoordinateRange({0, 1}, {0, 1}), + MeshCoordinateRange({0, 2}, {0, 2}), + MeshCoordinateRange({0, 3}, {0, 3}), + MeshCoordinateRange({1, 0}, {1, 0}), + MeshCoordinateRange({1, 1}, {1, 1}), + MeshCoordinateRange({1, 2}, {1, 2}), + MeshCoordinateRange({1, 3}, {1, 3})}, + // Run on 2 center columns: + {MeshCoordinateRange({0, 1}, {1, 2})}, + // Run on single center column: + {MeshCoordinateRange({0, 2}, {1, 2})}, + // Run on 2 devices on the bottom row: + {MeshCoordinateRange({1, 1}, {1, 2})}, }), - std::vector>({ - {LogicalDeviceRange({0, 0}, {0, 1}), - LogicalDeviceRange({1, 0}, {1, 1}), - LogicalDeviceRange({2, 0}, {2, 1}), - LogicalDeviceRange({3, 0}, {3, 1})}, // Split grid into 4 columns - {LogicalDeviceRange({0, 0}, {3, 0}), LogicalDeviceRange({0, 1}, {3, 1})}, // Split grid into 2 rows - {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid - {LogicalDeviceRange({0, 0}, {3, 0})}, // Run on top row only - {LogicalDeviceRange({0, 1}, {3, 1})}, // Run on bottom row only + std::vector>({ + // Split grid into 4 columns: + {MeshCoordinateRange({0, 0}, {1, 0}), + MeshCoordinateRange({0, 1}, {1, 1}), + MeshCoordinateRange({0, 2}, {1, 2}), + MeshCoordinateRange({0, 3}, {1, 3})}, + // Split grid into 2 rows: + {t3k_top_row(), t3k_bottom_row()}, + {t3k_full_grid()}, + {t3k_top_row()}, + {t3k_bottom_row()}, }), - std::vector>({ - {LogicalDeviceRange({0, 0}, {3, 0})}, // Run on top row only - {LogicalDeviceRange({0, 1}, {3, 1})}, // Run on bottom row only - {LogicalDeviceRange({0, 0}, {0, 1})}, // Run on left most column only - {LogicalDeviceRange({1, 0}, {3, 1})}, // Run on right most 3-columns only - {LogicalDeviceRange({0, 0}, {1, 1})}, // Run on left most 2-columns only - {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + std::vector>({ + {t3k_top_row()}, + {t3k_bottom_row()}, + // Run on left most column only: + {MeshCoordinateRange({0, 0}, {1, 0})}, + // Run on right most 3-columns only: + {MeshCoordinateRange({0, 1}, {1, 3})}, + // Run on left most 2-columns only: + {MeshCoordinateRange({0, 0}, {1, 1})}, + // Full grid: + {MeshCoordinateRange({0, 0}, {1, 3})}, }), - std::vector>({ - {LogicalDeviceRange({0, 0}, {0, 0}), - LogicalDeviceRange({1, 0}, {1, 0}), - LogicalDeviceRange({2, 0}, {2, 0}), - LogicalDeviceRange({3, 0}, {3, 0}), - LogicalDeviceRange({0, 1}, {0, 1}), - LogicalDeviceRange({1, 1}, {1, 1}), - LogicalDeviceRange({2, 1}, {2, 1}), - LogicalDeviceRange({3, 1}, {3, 1})}, // Run on individual devices - {LogicalDeviceRange({0, 0}, {3, 0})}, // Run on top row only - {LogicalDeviceRange({0, 1}, {3, 1})}, // Run on bottom row only - {LogicalDeviceRange({0, 0}, {3, 1})}, // Full grid + std::vector>({ + // Run on individual devices: + {MeshCoordinateRange({0, 0}, {0, 0}), + MeshCoordinateRange({0, 1}, {0, 1}), + MeshCoordinateRange({0, 2}, {0, 2}), + MeshCoordinateRange({0, 3}, {0, 3}), + MeshCoordinateRange({1, 0}, {1, 0}), + MeshCoordinateRange({1, 1}, {1, 1}), + MeshCoordinateRange({1, 2}, {1, 2}), + MeshCoordinateRange({1, 3}, {1, 3})}, + {t3k_top_row()}, + {t3k_bottom_row()}, + {t3k_full_grid()}, }))); 
TEST_F(MeshTraceTestT3000, EltwiseBinaryMeshTrace) { @@ -205,34 +241,34 @@ TEST_F(MeshTraceTestT3000, EltwiseBinaryMeshTrace) { CoreCoord worker_grid_size = mesh_device_->compute_with_storage_grid_size(); // Separate Mesh into top and bottom rows - LogicalDeviceRange row_0 = LogicalDeviceRange({0, 0}, {3, 0}); - LogicalDeviceRange row_1 = LogicalDeviceRange({0, 1}, {3, 1}); + MeshCoordinateRange row_0 = t3k_top_row(); + MeshCoordinateRange row_1 = t3k_bottom_row(); // Separate Mesh into 3 columns - LogicalDeviceRange col_0 = LogicalDeviceRange({0, 0}, {1, 1}); - LogicalDeviceRange col_1 = LogicalDeviceRange({2, 0}, {2, 1}); - LogicalDeviceRange col_2 = LogicalDeviceRange({3, 0}, {3, 1}); + MeshCoordinateRange col_0({0, 0}, {1, 1}); + MeshCoordinateRange col_1({0, 2}, {1, 2}); + MeshCoordinateRange col_2({0, 3}, {1, 3}); // Create first workload: running addition on top row and multiplication on bottom row auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( mesh_device_, src0_bufs, src1_bufs, intermed_bufs_0); auto mesh_workload = CreateMeshWorkload(); - AddProgramToMeshWorkload(mesh_workload, *programs[0], row_0); - AddProgramToMeshWorkload(mesh_workload, *programs[1], row_1); + AddProgramToMeshWorkload(mesh_workload, std::move(*programs[0]), row_0); + AddProgramToMeshWorkload(mesh_workload, std::move(*programs[1]), row_1); // Create second workload: running addition on top row (src1 + intermed0) and multiplication on // bottom row (src1 * intermed0) auto programs_1 = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( mesh_device_, intermed_bufs_0, src1_bufs, intermed_bufs_1); auto mesh_workload_1 = CreateMeshWorkload(); - AddProgramToMeshWorkload(mesh_workload_1, *programs_1[1], row_0); - AddProgramToMeshWorkload(mesh_workload_1, *programs_1[0], row_1); + AddProgramToMeshWorkload(mesh_workload_1, std::move(*programs_1[1]), row_0); + AddProgramToMeshWorkload(mesh_workload_1, std::move(*programs_1[0]), row_1); // Create third workload: running addition on 1st col (src1 + intermed1), multiplication on // second col (src1 * intermed1) and subtraction on the third col( src1 - intermed1) auto programs_2 = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( mesh_device_, intermed_bufs_1, src1_bufs, output_bufs); auto mesh_workload_2 = CreateMeshWorkload(); - AddProgramToMeshWorkload(mesh_workload_2, *programs_2[0], col_0); - AddProgramToMeshWorkload(mesh_workload_2, *programs_2[1], col_1); - AddProgramToMeshWorkload(mesh_workload_2, *programs_2[2], col_2); + AddProgramToMeshWorkload(mesh_workload_2, std::move(*programs_2[0]), col_0); + AddProgramToMeshWorkload(mesh_workload_2, std::move(*programs_2[1]), col_1); + AddProgramToMeshWorkload(mesh_workload_2, std::move(*programs_2[2]), col_2); // Initialize inputs std::vector src0_vec = create_constant_vector_of_bfloat16(src0_bufs[0]->size(), 2); @@ -303,12 +339,11 @@ TEST_F(MeshTraceTestSuite, SyncWorkloadsOnSubDeviceTrace) { create_basic_sync_program(mesh_device_.get(), sub_device_1, sub_device_2); // Top row - first MeshWorkload set - LogicalDeviceRange top_row = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); + MeshCoordinateRange top_row({0, 0}, {0, mesh_device_->num_cols() - 1}); // Bottom row - second MeshWorkload set - LogicalDeviceRange bottom_row = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1}); + MeshCoordinateRange bottom_row({1, 0}, {1, mesh_device_->num_cols() - 1}); // All devices: third MeshWorkload set - LogicalDeviceRange all_devices = 
- LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + MeshCoordinateRange all_devices(mesh_device_->shape()); // Initialize and construct all MeshWorkloads running on different SubDevices auto waiter_0 = CreateMeshWorkload(); @@ -323,17 +358,17 @@ TEST_F(MeshTraceTestSuite, SyncWorkloadsOnSubDeviceTrace) { auto syncer_2 = CreateMeshWorkload(); auto incrementer_2 = CreateMeshWorkload(); - AddProgramToMeshWorkload(waiter_0, waiter_program_0, top_row); - AddProgramToMeshWorkload(syncer_0, syncer_program_0, top_row); - AddProgramToMeshWorkload(incrementer_0, incrementer_program_0, top_row); + AddProgramToMeshWorkload(waiter_0, std::move(waiter_program_0), top_row); + AddProgramToMeshWorkload(syncer_0, std::move(syncer_program_0), top_row); + AddProgramToMeshWorkload(incrementer_0, std::move(incrementer_program_0), top_row); - AddProgramToMeshWorkload(waiter_1, waiter_program_1, bottom_row); - AddProgramToMeshWorkload(syncer_1, syncer_program_1, bottom_row); - AddProgramToMeshWorkload(incrementer_1, incrementer_program_1, bottom_row); + AddProgramToMeshWorkload(waiter_1, std::move(waiter_program_1), bottom_row); + AddProgramToMeshWorkload(syncer_1, std::move(syncer_program_1), bottom_row); + AddProgramToMeshWorkload(incrementer_1, std::move(incrementer_program_1), bottom_row); - AddProgramToMeshWorkload(waiter_2, waiter_program_2, all_devices); - AddProgramToMeshWorkload(syncer_2, syncer_program_2, all_devices); - AddProgramToMeshWorkload(incrementer_2, incrementer_program_2, all_devices); + AddProgramToMeshWorkload(waiter_2, std::move(waiter_program_2), all_devices); + AddProgramToMeshWorkload(syncer_2, std::move(syncer_program_2), all_devices); + AddProgramToMeshWorkload(incrementer_2, std::move(incrementer_program_2), all_devices); // Compile all MeshWorkloads EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), waiter_0, false); @@ -477,23 +512,22 @@ TEST_F(MeshTraceTestSuite, DataCopyOnSubDevicesTrace) { SetRuntimeArgs(add_program_2, add_kernel_2, add_core, add_rt_args_2); CBHandle add_cb_2 = CreateCircularBuffer(add_program_2, add_core, cb_src0_config); - LogicalDeviceRange devices = - LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); - LogicalDeviceRange top_row = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); - LogicalDeviceRange bottom_row = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1}); + MeshCoordinateRange devices(mesh_device_->shape()); + MeshCoordinateRange top_row({0, 0}, {0, mesh_device_->num_cols() - 1}); + MeshCoordinateRange bottom_row({1, 0}, {1, mesh_device_->num_cols() - 1}); // Create and initialize MeshWorkloads auto syncer_mesh_workload = CreateMeshWorkload(); auto datacopy_mesh_workload = CreateMeshWorkload(); auto add_mesh_workload = CreateMeshWorkload(); // Sync program goes to entire Mesh - AddProgramToMeshWorkload(syncer_mesh_workload, sync_and_incr_program, devices); + AddProgramToMeshWorkload(syncer_mesh_workload, std::move(sync_and_incr_program), devices); // Datacopy goes to top row - AddProgramToMeshWorkload(datacopy_mesh_workload, datacopy_program, top_row); + AddProgramToMeshWorkload(datacopy_mesh_workload, std::move(datacopy_program), top_row); // First addition goes to bottom row - AddProgramToMeshWorkload(datacopy_mesh_workload, add_program, bottom_row); + AddProgramToMeshWorkload(datacopy_mesh_workload, std::move(add_program), bottom_row); // Second addition goes to bottom row - AddProgramToMeshWorkload(add_mesh_workload, add_program_2, 
bottom_row); + AddProgramToMeshWorkload(add_mesh_workload, std::move(add_program_2), bottom_row); // Compile and load workloads mesh_device_->set_sub_device_stall_group({SubDeviceId{2}}); @@ -528,20 +562,17 @@ TEST_F(MeshTraceTestSuite, DataCopyOnSubDevicesTrace) { device->id(), syncer_core_phys, std::vector{1}, global_sem.address()); } mesh_device_->reset_sub_device_stall_group(); - for (std::size_t logical_x = 0; logical_x < output_buf->device()->num_cols(); logical_x++) { - for (std::size_t logical_y = 0; logical_y < 1; logical_y++) { - std::vector dst_vec; - ReadShard(mesh_device_->mesh_command_queue(), dst_vec, output_buf, MeshCoordinate(logical_y, logical_x)); - EXPECT_EQ(dst_vec, src_vec); - } + for (const auto& device_coord : top_row) { + std::vector dst_vec; + ReadShard(mesh_device_->mesh_command_queue(), dst_vec, output_buf, device_coord); + EXPECT_EQ(dst_vec, src_vec); } - for (std::size_t logical_x = 0; logical_x < output_buf->device()->num_cols(); logical_x++) { - for (std::size_t logical_y = 1; logical_y < 2; logical_y++) { - std::vector dst_vec; - ReadShard(mesh_device_->mesh_command_queue(), dst_vec, output_buf, MeshCoordinate(logical_y, logical_x)); - for (int j = 0; j < dst_vec.size(); j++) { - EXPECT_EQ(dst_vec[j], src_vec[j] + 3); - } + + for (const auto& device_coord : bottom_row) { + std::vector dst_vec; + ReadShard(mesh_device_->mesh_command_queue(), dst_vec, output_buf, device_coord); + for (int j = 0; j < dst_vec.size(); j++) { + EXPECT_EQ(dst_vec[j], src_vec[j] + 3); } } } diff --git a/tests/tt_metal/distributed/test_mesh_workload.cpp b/tests/tt_metal/distributed/test_mesh_workload.cpp index 5e88493d029..9ede136ed3d 100644 --- a/tests/tt_metal/distributed/test_mesh_workload.cpp +++ b/tests/tt_metal/distributed/test_mesh_workload.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" #include "tests/tt_metal/distributed/utils.hpp" @@ -82,34 +83,30 @@ void verify_cb_config( NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); for (const auto& device_range : workload.get_logical_device_ranges()) { - for (std::size_t logical_x = device_range.start_coord.x; logical_x < device_range.end_coord.x; logical_x++) { - for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y; - logical_y++) { - auto device = mesh_device->get_device(logical_y, logical_x); - uint32_t l1_unreserved_base = device->allocator()->get_base_allocator_addr(HalMemType::L1); - for (const auto& core_range : crs.ranges()) { - for (const auto& core_coord : core_range) { - ::tt::tt_metal::detail::ReadFromDeviceL1( - device, - core_coord, - workload.get_cb_base_addr(mesh_device, core_coord, CoreType::WORKER), - cb_config_buffer_size, - cb_config_vector); - - uint32_t cb_addr = l1_unreserved_base; - for (uint32_t i = 0; i < golden_cb_config.size(); i++) { - const uint32_t index = golden_cb_config[i].cb_id * sizeof(uint32_t); - const uint32_t cb_num_pages = golden_cb_config[i].num_pages; - const uint32_t cb_size = cb_num_pages * golden_cb_config[i].page_size; - const bool addr_match = cb_config_vector.at(index) == cb_addr; - const bool size_match = cb_config_vector.at(index + 1) == cb_size; - const bool num_pages_match = cb_config_vector.at(index + 2) == cb_num_pages; - EXPECT_TRUE(addr_match); - EXPECT_TRUE(size_match); - EXPECT_TRUE(num_pages_match); - - cb_addr += cb_size; - } + for (const auto& coord : device_range) { + auto device = mesh_device->get_device(coord); + 
uint32_t l1_unreserved_base = device->allocator()->get_base_allocator_addr(HalMemType::L1); + for (const auto& core_range : crs.ranges()) { + for (const auto& core_coord : core_range) { + ::tt::tt_metal::detail::ReadFromDeviceL1( + device, + core_coord, + workload.get_cb_base_addr(mesh_device, core_coord, CoreType::WORKER), + cb_config_buffer_size, + cb_config_vector); + + uint32_t cb_addr = l1_unreserved_base; + for (uint32_t i = 0; i < golden_cb_config.size(); i++) { + const uint32_t index = golden_cb_config[i].cb_id * sizeof(uint32_t); + const uint32_t cb_num_pages = golden_cb_config[i].num_pages; + const uint32_t cb_size = cb_num_pages * golden_cb_config[i].page_size; + const bool addr_match = cb_config_vector.at(index) == cb_addr; + const bool size_match = cb_config_vector.at(index + 1) == cb_size; + const bool num_pages_match = cb_config_vector.at(index + 2) == cb_num_pages; + EXPECT_TRUE(addr_match); + EXPECT_TRUE(size_match); + EXPECT_TRUE(num_pages_match); + cb_addr += cb_size; } } } @@ -144,17 +141,15 @@ TEST_F(MeshWorkloadTestT3000, MeshWorkloadOnActiveEthAsserts) { // A MeshWorkload cannot be run on ethernet core - Runtime should assert if the // user tries this. Verify this functionality here. std::shared_ptr workload = std::make_shared(); - uint32_t x_end = mesh_device_->num_cols(); - uint32_t y_end = mesh_device_->num_rows(); uint32_t seed = 0; - for (std::size_t logical_x = 0; logical_x < x_end; logical_x++) { - for (std::size_t logical_y = 0; logical_y < y_end; logical_y++) { - IDevice* device = mesh_device_->get_device(logical_y, logical_x); - auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( - 1, mesh_device_->compute_with_storage_grid_size(), seed, device->get_active_ethernet_cores(true)); - LogicalDeviceRange devices = {{logical_x, logical_y}, {logical_x, logical_y}}; - AddProgramToMeshWorkload(*workload, *programs[0], devices); - } + for (const auto& coord : MeshCoordinateRange(mesh_device_->shape())) { + IDevice* device = mesh_device_->get_device(coord); + auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( + /*num_programs=*/1, + mesh_device_->compute_with_storage_grid_size(), + seed, + device->get_active_ethernet_cores(true)); + AddProgramToMeshWorkload(*workload, std::move(*programs[0]), MeshCoordinateRange(coord, coord)); } EXPECT_THROW(EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *workload, false), std::exception); } @@ -178,15 +173,15 @@ TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) { for (int i = 0; i < num_programs; i += 2) { std::shared_ptr random_workload = std::make_shared(); if (i % 2) { - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1}); - AddProgramToMeshWorkload(*random_workload, *programs[i], devices_0); - AddProgramToMeshWorkload(*random_workload, *programs[i + 1], devices_1); + MeshCoordinateRange devices_0(MeshCoordinate{0, 0}, MeshCoordinate{0, 3}); + MeshCoordinateRange devices_1(MeshCoordinate{1, 0}, MeshCoordinate{1, 3}); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i]), devices_0); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 1]), devices_1); } else { - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {1, 1}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({2, 0}, {3, 1}); - AddProgramToMeshWorkload(*random_workload, *programs[i], devices_0); - AddProgramToMeshWorkload(*random_workload, *programs[i + 1], devices_1); + 
MeshCoordinateRange devices_0(MeshCoordinate{0, 0}, MeshCoordinate{1, 1}); + MeshCoordinateRange devices_1(MeshCoordinate{0, 2}, MeshCoordinate{1, 3}); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i]), devices_0); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 1]), devices_1); } EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *random_workload, false); mesh_workloads.push_back(random_workload); @@ -195,14 +190,14 @@ TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) { num_programs, mesh_device_->compute_with_storage_grid_size(), seed); for (int i = 0; i < num_programs; i += 4) { std::shared_ptr random_workload = std::make_shared(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {0, 1}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({1, 0}, {1, 1}); - LogicalDeviceRange devices_2 = LogicalDeviceRange({2, 0}, {2, 1}); - LogicalDeviceRange devices_3 = LogicalDeviceRange({3, 0}, {3, 1}); - AddProgramToMeshWorkload(*random_workload, *programs[i], devices_0); - AddProgramToMeshWorkload(*random_workload, *programs[i + 1], devices_1); - AddProgramToMeshWorkload(*random_workload, *programs[i + 2], devices_2); - AddProgramToMeshWorkload(*random_workload, *programs[i + 3], devices_3); + MeshCoordinateRange devices_0(MeshCoordinate{0, 0}, MeshCoordinate{1, 0}); + MeshCoordinateRange devices_1(MeshCoordinate{0, 1}, MeshCoordinate{1, 1}); + MeshCoordinateRange devices_2(MeshCoordinate{0, 2}, MeshCoordinate{1, 2}); + MeshCoordinateRange devices_3(MeshCoordinate{0, 3}, MeshCoordinate{1, 3}); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i]), devices_0); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 1]), devices_1); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 2]), devices_2); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 3]), devices_3); EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *random_workload, false); mesh_workloads.push_back(random_workload); } @@ -210,23 +205,23 @@ TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) { num_heterogeneous_programs, mesh_device_->compute_with_storage_grid_size(), seed); for (int i = 0; i < num_heterogeneous_programs; i += 8) { std::shared_ptr random_workload = std::make_shared(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {0, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {0, 1}); - LogicalDeviceRange devices_2 = LogicalDeviceRange({1, 0}, {1, 0}); - LogicalDeviceRange devices_3 = LogicalDeviceRange({1, 1}, {1, 1}); - LogicalDeviceRange devices_4 = LogicalDeviceRange({2, 0}, {2, 0}); - LogicalDeviceRange devices_5 = LogicalDeviceRange({2, 1}, {2, 1}); - LogicalDeviceRange devices_6 = LogicalDeviceRange({3, 0}, {3, 0}); - LogicalDeviceRange devices_7 = LogicalDeviceRange({3, 1}, {3, 1}); - - AddProgramToMeshWorkload(*random_workload, *programs[i], devices_0); - AddProgramToMeshWorkload(*random_workload, *programs[i + 1], devices_1); - AddProgramToMeshWorkload(*random_workload, *programs[i + 2], devices_2); - AddProgramToMeshWorkload(*random_workload, *programs[i + 3], devices_3); - AddProgramToMeshWorkload(*random_workload, *programs[i + 4], devices_4); - AddProgramToMeshWorkload(*random_workload, *programs[i + 5], devices_5); - AddProgramToMeshWorkload(*random_workload, *programs[i + 6], devices_6); - AddProgramToMeshWorkload(*random_workload, *programs[i + 7], devices_7); + MeshCoordinateRange devices_0(MeshCoordinate{0, 0}, MeshCoordinate{0, 0}); + 
MeshCoordinateRange devices_1(MeshCoordinate{0, 1}, MeshCoordinate{0, 1}); + MeshCoordinateRange devices_2(MeshCoordinate{0, 2}, MeshCoordinate{0, 2}); + MeshCoordinateRange devices_3(MeshCoordinate{0, 3}, MeshCoordinate{0, 3}); + MeshCoordinateRange devices_4(MeshCoordinate{1, 0}, MeshCoordinate{1, 0}); + MeshCoordinateRange devices_5(MeshCoordinate{1, 1}, MeshCoordinate{1, 1}); + MeshCoordinateRange devices_6(MeshCoordinate{1, 2}, MeshCoordinate{1, 2}); + MeshCoordinateRange devices_7(MeshCoordinate{1, 3}, MeshCoordinate{1, 3}); + + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i]), devices_0); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 1]), devices_1); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 2]), devices_2); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 3]), devices_3); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 4]), devices_4); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 5]), devices_5); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 6]), devices_6); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i + 7]), devices_7); EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *random_workload, false); mesh_workloads.push_back(random_workload); } @@ -254,8 +249,8 @@ TEST_F(MeshWorkloadTestSuite, RandomizedMeshWorkload) { auto programs = tt::tt_metal::distributed::test::utils::create_random_programs( num_programs, mesh_device_->compute_with_storage_grid_size(), seed); std::mt19937 rng(seed); - std::uniform_int_distribution gen_x(1, mesh_device_->num_cols()); - std::uniform_int_distribution gen_y(1, mesh_device_->num_rows()); + std::uniform_int_distribution gen_col(1, mesh_device_->num_cols()); + std::uniform_int_distribution gen_row(1, mesh_device_->num_rows()); std::vector> mesh_workloads = {}; // Create multiple mesh workloads on grids of random sizes. 
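The test refactors above capture the new calling convention: device ranges are MeshCoordinateRange values addressed as (row, col), and AddProgramToMeshWorkload now takes the Program by rvalue reference. A minimal sketch of the pattern, assuming a 2x4 mesh and placeholder names (mesh_device, program_a, program_b, output_buf) that are not part of this change:

MeshCoordinateRange top_row(MeshCoordinate{0, 0}, MeshCoordinate{0, 3});     // row 0, columns 0..3
MeshCoordinateRange bottom_row(MeshCoordinate{1, 0}, MeshCoordinate{1, 3});  // row 1, columns 0..3

auto workload = CreateMeshWorkload();
AddProgramToMeshWorkload(workload, std::move(program_a), top_row);     // program is moved into the workload
AddProgramToMeshWorkload(workload, std::move(program_b), bottom_row);
EnqueueMeshWorkload(mesh_device->mesh_command_queue(), workload, /*blocking=*/false);

// Iterating a range visits each device coordinate once, replacing the old nested x/y loops.
for (const auto& coord : top_row) {
    std::vector<uint32_t> dst;  // element type should match the buffer contents
    ReadShard(mesh_device->mesh_command_queue(), dst, output_buf, coord);
}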
@@ -263,9 +258,9 @@ TEST_F(MeshWorkloadTestSuite, RandomizedMeshWorkload) { log_info(tt::LogTest, "Compile and load {} MeshWorkloads", num_programs); for (int i = 0; i < num_programs; i += 1) { // Choose a grid of random dimensions and run a MeshWorkload on it - LogicalDeviceRange device_range = LogicalDeviceRange({0, 0}, {gen_x(rng) - 1, gen_y(rng) - 1}); + MeshCoordinateRange device_range(MeshCoordinate{0, 0}, MeshCoordinate{gen_row(rng) - 1, gen_col(rng) - 1}); auto random_workload = std::make_shared(); - AddProgramToMeshWorkload(*random_workload, *programs[i], device_range); + AddProgramToMeshWorkload(*random_workload, std::move(*programs[i]), device_range); EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *random_workload, false); mesh_workloads.push_back(random_workload); } @@ -291,11 +286,12 @@ TEST_F(MeshWorkloadTestSuite, EltwiseBinaryMeshWorkload) { auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs( mesh_device_, src0_bufs, src1_bufs, output_bufs); auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange( - {0, mesh_device_->num_rows() - 1}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); - AddProgramToMeshWorkload(mesh_workload, *programs[0], devices_0); - AddProgramToMeshWorkload(mesh_workload, *programs[1], devices_1); + MeshCoordinateRange devices_0(MeshCoordinate{0, 0}, MeshCoordinate{0, mesh_device_->num_cols() - 1}); + MeshCoordinateRange devices_1( + MeshCoordinate{mesh_device_->num_rows() - 1, 0}, + MeshCoordinate{mesh_device_->num_rows() - 1, mesh_device_->num_cols() - 1}); + AddProgramToMeshWorkload(mesh_workload, std::move(*programs[0]), devices_0); + AddProgramToMeshWorkload(mesh_workload, std::move(*programs[1]), devices_1); std::vector src0_vec = create_constant_vector_of_bfloat16(src0_bufs[0]->size(), 2); std::vector src1_vec = create_constant_vector_of_bfloat16(src1_bufs[0]->size(), 3); @@ -313,24 +309,22 @@ TEST_F(MeshWorkloadTestSuite, EltwiseBinaryMeshWorkload) { EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload, false); } - for (std::size_t logical_y = 0; logical_y < mesh_device_->num_rows(); logical_y++) { - for (std::size_t logical_x = 0; logical_x < mesh_device_->num_cols(); logical_x++) { - for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { - for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { - std::vector dst_vec = {}; - ReadShard( - mesh_device_->mesh_command_queue(), - dst_vec, - output_bufs[col_idx * worker_grid_size.y + row_idx], - MeshCoordinate(logical_y, logical_x)); - if (logical_y == 0) { - for (int i = 0; i < dst_vec.size(); i++) { - EXPECT_EQ(dst_vec[i].to_float(), 5); - } - } else { - for (int i = 0; i < dst_vec.size(); i++) { - EXPECT_EQ(dst_vec[i].to_float(), 6); - } + for (const auto& device_coord : MeshCoordinateRange(mesh_device_->shape())) { + for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { + for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { + std::vector dst_vec = {}; + ReadShard( + mesh_device_->mesh_command_queue(), + dst_vec, + output_bufs[col_idx * worker_grid_size.y + row_idx], + device_coord); + if (device_coord[0] == 0) { + for (int i = 0; i < dst_vec.size(); i++) { + EXPECT_EQ(dst_vec[i].to_float(), 5); + } + } else { + for (int i = 0; i < dst_vec.size(); i++) { + EXPECT_EQ(dst_vec[i].to_float(), 6); } } } @@ -403,11 +397,12 @@ 
TEST_F(MeshWorkloadTestSuite, MeshWorkloadSanity) { } auto program_1 = initialize_dummy_program(worker_grid_size); auto mesh_workload = MeshWorkload(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange( - {0, mesh_device_->num_rows() - 1}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); - AddProgramToMeshWorkload(mesh_workload, program, devices_0); - AddProgramToMeshWorkload(mesh_workload, *program_1, devices_1); + MeshCoordinateRange devices_0(MeshCoordinate{0, 0}, MeshCoordinate{0, mesh_device_->num_cols() - 1}); + MeshCoordinateRange devices_1( + MeshCoordinate{mesh_device_->num_rows() - 1, 0}, + MeshCoordinate{mesh_device_->num_rows() - 1, mesh_device_->num_cols() - 1}); + AddProgramToMeshWorkload(mesh_workload, std::move(program), devices_0); + AddProgramToMeshWorkload(mesh_workload, std::move(*program_1), devices_1); std::size_t buffer_idx = 0; std::vector src_vec = create_constant_vector_of_bfloat16(dram_buffer_size, 1); @@ -430,23 +425,21 @@ TEST_F(MeshWorkloadTestSuite, MeshWorkloadSanity) { } EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload, false); buffer_idx = 0; - for (std::size_t logical_x = devices_0.start_coord.x; logical_x < devices_0.end_coord.x; logical_x++) { - for (std::size_t logical_y = devices_0.start_coord.y; logical_y < devices_0.end_coord.y; logical_y++) { - for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { - for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { - std::vector dst_vec = {}; - ReadShard( - mesh_device_->mesh_command_queue(), - dst_vec, - output_buffers[col_idx * worker_grid_size.y + row_idx], - MeshCoordinate(logical_y, logical_x)); - for (int i = 0; i < dst_vec.size(); i++) { - float ref_val = std::pow(2, (iter % 2) + 1); - if (i >= 512) { - ref_val = std::pow(2, 2 * ((iter % 2) + 1)); - } - EXPECT_EQ(dst_vec[i].to_float(), ref_val); + for (const auto& device_coord : devices_0) { + for (std::size_t col_idx = 0; col_idx < worker_grid_size.x; col_idx++) { + for (std::size_t row_idx = 0; row_idx < worker_grid_size.y; row_idx++) { + std::vector dst_vec = {}; + ReadShard( + mesh_device_->mesh_command_queue(), + dst_vec, + output_buffers[col_idx * worker_grid_size.y + row_idx], + device_coord); + for (int i = 0; i < dst_vec.size(); i++) { + float ref_val = std::pow(2, (iter % 2) + 1); + if (i >= 512) { + ref_val = std::pow(2, 2 * ((iter % 2) + 1)); } + EXPECT_EQ(dst_vec[i].to_float(), ref_val); } } } @@ -470,10 +463,9 @@ TEST_F(MeshWorkloadTestSuite, MeshWorkloadCBUpdate) { initialize_dummy_kernels(*program, cr_set); auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices = - LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + MeshCoordinateRange devices(mesh_device_->shape()); - AddProgramToMeshWorkload(mesh_workload, *program, devices); + AddProgramToMeshWorkload(mesh_workload, std::move(*program), devices); EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload, false); Finish(mesh_device_->mesh_command_queue()); verify_cb_config(mesh_device_, mesh_workload, cb_config_vector, cr_set); @@ -501,9 +493,8 @@ TEST_F(MeshWorkloadTestSuite, MeshWorkloadSemaphoreSanity) { expected_semaphore_values.push_back(sem); } auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices = - LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); - 
AddProgramToMeshWorkload(mesh_workload, program, devices); + MeshCoordinateRange devices(mesh_device_->shape()); + AddProgramToMeshWorkload(mesh_workload, std::move(program), devices); EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload, false); Finish(mesh_device_->mesh_command_queue()); @@ -528,27 +519,24 @@ TEST_F(MeshWorkloadTestSuite, MeshWorkloadSemaphoreDifferentPrograms) { expected_semaphore_values_1.push_back(sem + 1); } auto mesh_workload = CreateMeshWorkload(); - LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0}); - LogicalDeviceRange devices_1 = LogicalDeviceRange( - {0, mesh_device_->num_rows() - 1}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); + MeshCoordinateRange devices_0({0, 0}, {0, mesh_device_->num_cols() - 1}); + MeshCoordinateRange devices_1( + MeshCoordinate{mesh_device_->num_rows() - 1, 0}, + MeshCoordinate{mesh_device_->num_rows() - 1, mesh_device_->num_cols() - 1}); - AddProgramToMeshWorkload(mesh_workload, program0, devices_0); - AddProgramToMeshWorkload(mesh_workload, program1, devices_1); + AddProgramToMeshWorkload(mesh_workload, std::move(program0), devices_0); + AddProgramToMeshWorkload(mesh_workload, std::move(program1), devices_1); EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), mesh_workload, false); Finish(mesh_device_->mesh_command_queue()); - for (std::size_t logical_x = devices_0.start_coord.x; logical_x < devices_0.end_coord.x; logical_x++) { - for (std::size_t logical_y = devices_0.start_coord.y; logical_y < devices_0.end_coord.y; logical_y++) { - auto device = mesh_device_->get_device(logical_y, logical_x); - validate_sems(mesh_device_, device, full_grid, mesh_workload, expected_semaphore_values_0); - } + for (const auto& device_coord : devices_0) { + auto device = mesh_device_->get_device(device_coord); + validate_sems(mesh_device_, device, full_grid, mesh_workload, expected_semaphore_values_0); } - for (std::size_t logical_x = devices_1.start_coord.x; logical_x < devices_1.end_coord.x; logical_x++) { - for (std::size_t logical_y = devices_1.start_coord.y; logical_y < devices_1.end_coord.y; logical_y++) { - auto device = mesh_device_->get_device(logical_y, logical_x); - validate_sems(mesh_device_, device, full_grid, mesh_workload, expected_semaphore_values_1); - } + for (const auto& device_coord : devices_1) { + auto device = mesh_device_->get_device(device_coord); + validate_sems(mesh_device_, device, full_grid, mesh_workload, expected_semaphore_values_1); } } diff --git a/tt_metal/api/tt-metalium/distributed.hpp b/tt_metal/api/tt-metalium/distributed.hpp index 31e02050724..f2eba83be1d 100644 --- a/tt_metal/api/tt-metalium/distributed.hpp +++ b/tt_metal/api/tt-metalium/distributed.hpp @@ -21,7 +21,7 @@ namespace distributed { MeshWorkload CreateMeshWorkload(); -void AddProgramToMeshWorkload(MeshWorkload& mesh_workload, Program& program, const LogicalDeviceRange& device_range); +void AddProgramToMeshWorkload(MeshWorkload& mesh_workload, Program&& program, const MeshCoordinateRange& device_range); void EnqueueMeshWorkload(MeshCommandQueue& mesh_cq, MeshWorkload& mesh_workload, bool blocking); @@ -83,13 +83,13 @@ void EnqueueRecordEvent( MeshCommandQueue& mesh_cq, const std::shared_ptr& event, tt::stl::Span sub_device_ids = {}, - const std::optional& device_range = std::nullopt); + const std::optional& device_range = std::nullopt); void EnqueueRecordEventToHost( MeshCommandQueue& mesh_cq, const std::shared_ptr& event, tt::stl::Span sub_device_ids = {}, - const 
std::optional& device_range = std::nullopt); + const std::optional& device_range = std::nullopt); void EnqueueWaitForEvent(MeshCommandQueue& mesh_cq, const std::shared_ptr& event); diff --git a/tt_metal/api/tt-metalium/mesh_command_queue.hpp b/tt_metal/api/tt-metalium/mesh_command_queue.hpp index 386b5418aa4..201724f695b 100644 --- a/tt_metal/api/tt-metalium/mesh_command_queue.hpp +++ b/tt_metal/api/tt-metalium/mesh_command_queue.hpp @@ -49,12 +49,12 @@ class MeshCommandQueue { const std::shared_ptr& event, tt::stl::Span sub_device_ids, bool notify_host, - const std::optional& device_range = std::nullopt); + const std::optional& device_range = std::nullopt); // Trace capture utility functions // Captures dispatch commands associated with running a program on a Virtual Mesh subgrid // inside the appropriate trace staging vector (corresponding to the specified subgrid) void capture_program_trace_on_subgrid( - const LogicalDeviceRange& sub_grid, + const MeshCoordinateRange& sub_grid, ProgramCommandSequence& program_cmd_seq, bool stall_first, bool stall_before_program); @@ -63,7 +63,7 @@ class MeshCommandQueue { // When running trace, the dispatch commands responsible for forwarding go signals must be // captured on these subgrids. void capture_go_signal_trace_on_unused_subgrids( - std::vector& active_sub_grids, + const MeshCoordinateRange& active_sub_grids, const SubDeviceId& sub_device_id, uint32_t expected_num_workers_completed, bool mcast_go_signals, @@ -71,7 +71,7 @@ class MeshCommandQueue { // Workload dispatch utility functions // Write dispatch commands associated with running a program on a Virtual Mesh subgrid void write_program_cmds_to_subgrid( - const LogicalDeviceRange& sub_grid, + const MeshCoordinateRange& sub_grid, ProgramCommandSequence& program_cmd_seq, bool stall_first, bool stall_before_program, @@ -129,7 +129,7 @@ class MeshCommandQueue { void enqueue_write_shard_to_sub_grid( const MeshBuffer& buffer, const void* host_data, - const LogicalDeviceRange& device_range, + const MeshCoordinateRange& device_range, bool blocking, std::optional region = std::nullopt); void enqueue_write_mesh_buffer(const std::shared_ptr& buffer, const void* host_data, bool blocking); @@ -148,11 +148,11 @@ class MeshCommandQueue { void enqueue_record_event( const std::shared_ptr& event, tt::stl::Span sub_device_ids = {}, - const std::optional& device_range = std::nullopt); + const std::optional& device_range = std::nullopt); void enqueue_record_event_to_host( const std::shared_ptr& event, tt::stl::Span sub_device_ids = {}, - const std::optional& device_range = std::nullopt); + const std::optional& device_range = std::nullopt); void enqueue_wait_for_event(const std::shared_ptr& sync_event); void drain_events_from_completion_queue(); void verify_reported_events_after_draining(const std::shared_ptr& event); @@ -163,7 +163,6 @@ class MeshCommandQueue { const vector_memcpy_aligned& go_signal_noc_data); void record_begin(const MeshTraceId& trace_id, const std::shared_ptr& ctx); void record_end(); - const std::vector& get_mesh_trace_md(); void enqueue_trace(const MeshTraceId& trace_id, bool blocking); }; diff --git a/tt_metal/api/tt-metalium/mesh_common.hpp b/tt_metal/api/tt-metalium/mesh_common.hpp index 5433e133d99..a4b875aa19f 100644 --- a/tt_metal/api/tt-metalium/mesh_common.hpp +++ b/tt_metal/api/tt-metalium/mesh_common.hpp @@ -11,14 +11,3 @@ // Define common types used across TT-Mesh data-structures and APIs using MeshTraceId = tt::stl::StrongType; - -// TODO (Issue #17477): MeshWorkload and 
MeshEvent currently rely on the coordinate systems -// exposed below. These must be uplifted to an ND coordinate system (DeviceCoord and DeviceRange), -// keeping things more consistent across the stack. -// For now, since the LogicalDeviceRange concept is fundamentally identical to the CoreRange concept -// on a 2D Mesh use this definition. CoreRange contains several utility functions required -// in the MeshWorkload context. - -using DeviceCoord = CoreCoord; -using LogicalDeviceRange = CoreRange; -using LogicalDeviceRangeSet = CoreRangeSet; diff --git a/tt_metal/api/tt-metalium/mesh_coord.hpp b/tt_metal/api/tt-metalium/mesh_coord.hpp index 0823ca1205d..a8f5e961616 100644 --- a/tt_metal/api/tt-metalium/mesh_coord.hpp +++ b/tt_metal/api/tt-metalium/mesh_coord.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include @@ -114,6 +115,15 @@ class MeshCoordinateRange { // Returns true if the range contains the given coordinate. bool contains(const MeshCoordinate& coord) const; + // Returns true if the range contains the given range. + bool contains(const MeshCoordinateRange& range) const; + + // Returns true if the range intersects with the given range. + bool intersects(const MeshCoordinateRange& range) const; + + // Returns the intersection of the range with the given range. + std::optional intersection(const MeshCoordinateRange& range) const; + class Iterator { public: Iterator& operator++(); @@ -138,12 +148,33 @@ class MeshCoordinateRange { friend bool operator==(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs); friend bool operator!=(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs); + friend std::ostream& operator<<(std::ostream& os, const MeshCoordinateRange& range); private: MeshCoordinate start_; MeshCoordinate end_; }; +// Represents a set of non-overlapping MeshCoordinateRanges. +class MeshCoordinateRangeSet { +public: + MeshCoordinateRangeSet() = default; + + // Merges the given range into the set. + void merge(const MeshCoordinateRange& range); + + size_t size() const { return ranges_.size(); } + bool empty() const { return ranges_.empty(); } + + const auto& ranges() const { return ranges_; } + +private: + std::vector ranges_; +}; + +// Returns the set of ranges that result from subtracting the intersection from the parent range. +MeshCoordinateRangeSet subtract(const MeshCoordinateRange& parent, const MeshCoordinateRange& intersection); + namespace detail { // Proxy class that allows convenient structured binding to a pair of a coordinate and the value it points to. 
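The range algebra added to mesh_coord.hpp above is easiest to see on a small grid. A sketch under an assumed 2x4 mesh; the include path is illustrative, and only calls declared in this header are used:

#include <tt-metalium/mesh_coord.hpp>  // assumed install path for the header shown above

using namespace tt::tt_metal::distributed;

void range_algebra_example() {
    MeshCoordinateRange full(MeshCoordinate{0, 0}, MeshCoordinate{1, 3});     // whole 2x4 mesh
    MeshCoordinateRange top_row(MeshCoordinate{0, 0}, MeshCoordinate{0, 3});  // row 0 only

    bool covered = full.contains(top_row);     // true: every coordinate of top_row is in full
    bool overlap = full.intersects(top_row);   // true
    auto common = full.intersection(top_row);  // (0,0)-(0,3); std::nullopt when disjoint

    // Subtracting the intersection leaves the bottom row as a single range.
    MeshCoordinateRangeSet rest = subtract(full, top_row);
    // rest.ranges() == { (1,0)-(1,3) }
}

The std::hash specialization added further down makes MeshCoordinateRange usable as an unordered_map key, which appears to be how MeshWorkload and MeshTrace key their per-range data.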
@@ -414,4 +445,14 @@ struct hash { } }; +template <> +struct hash { + size_t operator()(const tt::tt_metal::distributed::MeshCoordinateRange& range) const noexcept { + size_t seed = 0; + tt::utils::hash_combine(seed, range.start_coord()); + tt::utils::hash_combine(seed, range.end_coord()); + return seed; + } +}; + } // namespace std diff --git a/tt_metal/api/tt-metalium/mesh_event.hpp b/tt_metal/api/tt-metalium/mesh_event.hpp index f115a118d15..72beaeaef94 100644 --- a/tt_metal/api/tt-metalium/mesh_event.hpp +++ b/tt_metal/api/tt-metalium/mesh_event.hpp @@ -11,7 +11,7 @@ namespace tt::tt_metal::distributed { class MeshEvent { public: MeshDevice* device = nullptr; - LogicalDeviceRange device_range = LogicalDeviceRange({0, 0}); + MeshCoordinateRange device_range = MeshCoordinateRange(MeshCoordinate(0, 0), MeshCoordinate(0, 0)); uint32_t cq_id = 0; uint32_t event_id = 0; }; diff --git a/tt_metal/api/tt-metalium/mesh_trace.hpp b/tt_metal/api/tt-metalium/mesh_trace.hpp index 3d242248d45..3255f275405 100644 --- a/tt_metal/api/tt-metalium/mesh_trace.hpp +++ b/tt_metal/api/tt-metalium/mesh_trace.hpp @@ -27,8 +27,8 @@ namespace tt::tt_metal::distributed { // - The offset and size of the dispatch commands in the sysmem_manager // staging vector struct MeshTraceStagingMetadata { - LogicalDeviceRange device_range = LogicalDeviceRange({0, 0}); - DeviceCoord sysmem_manager_coord = DeviceCoord(0, 0); + MeshCoordinateRange device_range = MeshCoordinateRange(MeshShape(0, 0)); + MeshCoordinate sysmem_manager_coord = MeshCoordinate(0, 0); std::size_t offset = 0; std::size_t size = 0; }; @@ -36,8 +36,8 @@ struct MeshTraceStagingMetadata { // Finalized/Consolidated dispatch commands on a device_range, corresponding // to a trace struct MeshTraceData { - LogicalDeviceRange device_range = LogicalDeviceRange({0, 0}); - std::vector data = {}; + MeshCoordinateRange device_range = MeshCoordinateRange(MeshShape(0, 0)); + std::vector data; }; // Wrapper around the MeshTraceData. 
Captures the complete state of a MeshTrace diff --git a/tt_metal/api/tt-metalium/mesh_workload.hpp b/tt_metal/api/tt-metalium/mesh_workload.hpp index f57bccb3edf..961ad885980 100644 --- a/tt_metal/api/tt-metalium/mesh_workload.hpp +++ b/tt_metal/api/tt-metalium/mesh_workload.hpp @@ -45,10 +45,10 @@ class MeshWorkload { std::vector>> kernels_; std::vector>> kernel_groups_; std::vector semaphores_; - std::unordered_map programs_; - std::vector logical_device_ranges_; + std::unordered_map programs_; + std::vector logical_device_ranges_; bool finalized_ = false; - std::unordered_map> runtime_args_; + std::unordered_map> runtime_args_; MeshCommandQueue* last_used_command_queue_ = nullptr; template @@ -61,12 +61,11 @@ class MeshWorkload { public: // Main User-Facing API building blocks MeshWorkload(); - void add_program(const LogicalDeviceRange& device_range, Program&& program); - const std::unordered_map& get_programs() const { return this->programs_; } - const std::vector get_logical_device_ranges() const { return this->logical_device_ranges_; } - Program& get_program_on_device_range(const LogicalDeviceRange& device_range) { - return this->programs_.at(device_range); - } + void add_program(const MeshCoordinateRange& device_range, Program&& program); + const std::unordered_map& get_programs() const { return programs_; } + const std::vector get_logical_device_ranges() const { return logical_device_ranges_; } + Program& get_program_on_device_range(const MeshCoordinateRange& device_range) { return programs_.at(device_range); } + // For testing purposes only void set_last_used_command_queue_for_testing(MeshCommandQueue* mesh_cq); MeshCommandQueue* get_last_used_command_queue() const; diff --git a/tt_metal/common/mesh_coord.cpp b/tt_metal/common/mesh_coord.cpp index aefdb409642..8aa7602c791 100644 --- a/tt_metal/common/mesh_coord.cpp +++ b/tt_metal/common/mesh_coord.cpp @@ -3,9 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include +#include #include -#include #include #include #include @@ -22,6 +23,36 @@ MeshCoordinate shape_back(const MeshShape& shape) { return MeshCoordinate(coords); } +// Returns a list of dimensions that differ between the two ranges. +std::vector find_diff_dimensions(const MeshCoordinateRange& a, const MeshCoordinateRange& b) { + TT_ASSERT(a.dims() == b.dims(), "Cannot compare ranges with different dimensions: {} != {}", a.dims(), b.dims()); + + std::vector diff_dims; + for (size_t i = 0; i < a.dims(); ++i) { + if (a.start_coord()[i] != b.start_coord()[i] || a.end_coord()[i] != b.end_coord()[i]) { + diff_dims.push_back(i); + } + } + return diff_dims; +} + +// Returns true if the two ranges are mergeable: ranges must either be identical, have an intersection, or be adjacent +// along exactly one dimension. 
+bool check_mergeable(const MeshCoordinateRange& a, const MeshCoordinateRange& b) { + TT_ASSERT(a.dims() == b.dims(), "Cannot compare ranges with different dimensions: {} != {}", a.dims(), b.dims()); + + auto diff_dims = find_diff_dimensions(a, b); + if (diff_dims.empty()) { + return true; + } else if (diff_dims.size() == 1) { + size_t diff_dim = diff_dims[0]; + return std::max(a.start_coord()[diff_dim], b.start_coord()[diff_dim]) <= + std::min(a.end_coord()[diff_dim], b.end_coord()[diff_dim]) + 1; + } else { + return false; + } +} + } // namespace MeshShape::MeshShape(uint32_t x) : MeshShape({x}) {} @@ -128,6 +159,34 @@ bool MeshCoordinateRange::contains(const MeshCoordinate& coord) const { return true; } +bool MeshCoordinateRange::contains(const MeshCoordinateRange& range) const { + return contains(range.start_coord()) && contains(range.end_coord()); +} + +bool MeshCoordinateRange::intersects(const MeshCoordinateRange& range) const { + TT_FATAL(range.dims() == dims(), "Coordinate dimensions do not match: {} != {}", range.dims(), dims()); + for (int i = 0; i < range.dims(); ++i) { + if (range.end_coord()[i] < start_[i] || range.start_coord()[i] > end_[i]) { + return false; + } + } + return true; +} + +std::optional MeshCoordinateRange::intersection(const MeshCoordinateRange& range) const { + if (!intersects(range)) { + return std::nullopt; + } + + tt::stl::SmallVector intersection_start(dims(), 0); + tt::stl::SmallVector intersection_end(dims(), 0); + for (size_t i = 0; i < dims(); ++i) { + intersection_start[i] = std::max(start_coord()[i], range.start_coord()[i]); + intersection_end[i] = std::min(end_coord()[i], range.end_coord()[i]); + } + return MeshCoordinateRange(MeshCoordinate(intersection_start), MeshCoordinate(intersection_end)); +} + MeshCoordinateRange::Iterator::Iterator( const MeshCoordinateRange* range, const MeshCoordinate& current, size_t linear_index) : range_(range), current_coord_(current), linear_index_(linear_index) {} @@ -168,6 +227,11 @@ bool operator==(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs) } bool operator!=(const MeshCoordinateRange& lhs, const MeshCoordinateRange& rhs) { return !(lhs == rhs); } +std::ostream& operator<<(std::ostream& os, const MeshCoordinateRange& range) { + os << "MeshCoordinateRange(start=" << range.start_coord() << ", end=" << range.end_coord() << ")"; + return os; +} + size_t to_linear_index(const MeshShape& shape, const MeshCoordinate& coord) { TT_FATAL( shape.dims() == coord.dims(), @@ -183,4 +247,100 @@ size_t to_linear_index(const MeshShape& shape, const MeshCoordinate& coord) { return linear_index; } +void MeshCoordinateRangeSet::merge(const MeshCoordinateRange& to_merge) { + TT_FATAL( + ranges_.empty() || ranges_.front().dims() == to_merge.dims(), + "Cannot merge range with different dimensions into a range set: {} != {}", + ranges_.front().dims(), + to_merge.dims()); + + // Iteratively merge the new range with existing ranges until no more merges are possible. 
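    // Worked example: if the set currently holds {(0,0)-(0,3)} and (1,0)-(1,3) is merged, the two
    // ranges differ only along dimension 0 and are adjacent there, so they coalesce into (0,0)-(1,3).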
+ MeshCoordinateRange merged = to_merge; + bool did_merge = true; + while (did_merge) { + did_merge = false; + for (auto it = ranges_.begin(); it != ranges_.end(); ++it) { + if (check_mergeable(merged, *it)) { + tt::stl::SmallVector new_start; + tt::stl::SmallVector new_end; + for (size_t i = 0; i < merged.dims(); ++i) { + new_start.push_back(std::min(merged.start_coord()[i], it->start_coord()[i])); + new_end.push_back(std::max(merged.end_coord()[i], it->end_coord()[i])); + } + merged = MeshCoordinateRange(MeshCoordinate(new_start), MeshCoordinate(new_end)); + ranges_.erase(it); + did_merge = true; + break; + } + } + } + ranges_.push_back(merged); +} + +MeshCoordinateRangeSet subtract(const MeshCoordinateRange& parent, const MeshCoordinateRange& intersection) { + TT_FATAL( + parent.dims() == intersection.dims(), + "Parent and intersection dimensions do not match: {} != {}", + parent.dims(), + intersection.dims()); + + MeshCoordinateRangeSet complement_set; + if (parent == intersection) { + return complement_set; + } + + if (!parent.intersects(intersection)) { + complement_set.merge(parent); + return complement_set; + } + + // Fast path: parent and intersection differ in exactly one dimension. + auto diff_dims = find_diff_dimensions(parent, intersection); + if (diff_dims.size() == 1) { + const size_t diff_dim = diff_dims[0]; + + // Left complement: portion before the intersection in diff_dim. + if (parent.start_coord()[diff_dim] < intersection.start_coord()[diff_dim]) { + tt::stl::SmallVector left_start; + tt::stl::SmallVector left_end; + for (size_t i = 0; i < parent.dims(); ++i) { + if (i == diff_dim) { + left_start.push_back(parent.start_coord()[i]); + left_end.push_back(intersection.start_coord()[i] - 1); + } else { + left_start.push_back(parent.start_coord()[i]); + left_end.push_back(parent.end_coord()[i]); + } + } + complement_set.merge(MeshCoordinateRange(MeshCoordinate(left_start), MeshCoordinate(left_end))); + } + + // Right complement: portion after the intersection in diff_dim. + if (intersection.end_coord()[diff_dim] < parent.end_coord()[diff_dim]) { + tt::stl::SmallVector right_start; + tt::stl::SmallVector right_end; + for (size_t i = 0; i < parent.dims(); ++i) { + if (i == diff_dim) { + right_start.push_back(intersection.end_coord()[i] + 1); + right_end.push_back(parent.end_coord()[i]); + } else { + right_start.push_back(parent.start_coord()[i]); + right_end.push_back(parent.end_coord()[i]); + } + } + complement_set.merge(MeshCoordinateRange(MeshCoordinate(right_start), MeshCoordinate(right_end))); + } + + return complement_set; + } else { + // Slow path: iterate over all coordinates in the parent range, and create ranges for the complement. 
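        // Worked example: subtracting (0,0)-(0,0) from a 2x2 parent differs along both dimensions, so
        // the fast path does not apply; the remaining coordinates (0,1), (1,0), (1,1) are merged one
        // by one, which coalesces them into (1,0)-(1,0) and (0,1)-(1,1).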
+ for (const auto& coord : parent) { + if (!intersection.contains(coord)) { + complement_set.merge(MeshCoordinateRange(coord, coord)); + } + } + return complement_set; + } +} + } // namespace tt::tt_metal::distributed diff --git a/tt_metal/distributed/distributed.cpp b/tt_metal/distributed/distributed.cpp index 8d067316db1..0152e2c4e2f 100644 --- a/tt_metal/distributed/distributed.cpp +++ b/tt_metal/distributed/distributed.cpp @@ -8,8 +8,7 @@ namespace tt::tt_metal::distributed { MeshWorkload CreateMeshWorkload() { return MeshWorkload(); } -void AddProgramToMeshWorkload( - MeshWorkload& mesh_workload, Program& program, const LogicalDeviceRange& device_range) { +void AddProgramToMeshWorkload(MeshWorkload& mesh_workload, Program&& program, const MeshCoordinateRange& device_range) { mesh_workload.add_program(device_range, std::move(program)); } @@ -24,7 +23,7 @@ void EnqueueRecordEvent( MeshCommandQueue& mesh_cq, const std::shared_ptr& event, tt::stl::Span sub_device_ids, - const std::optional& device_range) { + const std::optional& device_range) { mesh_cq.enqueue_record_event(event, sub_device_ids, device_range); } @@ -32,7 +31,7 @@ void EnqueueRecordEventToHost( MeshCommandQueue& mesh_cq, const std::shared_ptr& event, tt::stl::Span sub_device_ids, - const std::optional& device_range) { + const std::optional& device_range) { mesh_cq.enqueue_record_event_to_host(event, sub_device_ids, device_range); } diff --git a/tt_metal/distributed/mesh_command_queue.cpp b/tt_metal/distributed/mesh_command_queue.cpp index 2b5c09252a1..f7309cb4e8f 100644 --- a/tt_metal/distributed/mesh_command_queue.cpp +++ b/tt_metal/distributed/mesh_command_queue.cpp @@ -9,6 +9,7 @@ #include #include "buffer.hpp" +#include "mesh_coord.hpp" #include "tt_metal/distributed/mesh_workload_utils.hpp" #include "tt_metal/impl/buffers/dispatch.hpp" #include "tt_metal/impl/program/dispatch.hpp" @@ -21,7 +22,7 @@ namespace tt::tt_metal::distributed { struct MeshReadEventDescriptor { ReadEventDescriptor single_device_descriptor; - LogicalDeviceRange device_range; + MeshCoordinateRange device_range; }; MeshCommandQueue::MeshCommandQueue(MeshDevice* mesh_device, uint32_t id) { @@ -106,7 +107,7 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b dispatch_metadata); std::unordered_set chip_ids_in_workload = {}; - std::vector active_sub_grids = {}; + std::vector active_sub_grids = {}; // Iterate over all programs. Update dispatch commands per program to reflect // current device state. Write the finalized program command sequence to each // physical device tied to the program. 
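Before capturing go signals during trace, the command queue (in the hunk that follows) consolidates the per-program sub-grids and then computes which devices were left uncovered. A hedged sketch of that flow on an assumed 2x4 mesh, using only the range operations introduced in this change:

MeshCoordinateRangeSet active;
active.merge(MeshCoordinateRange(MeshCoordinate{0, 0}, MeshCoordinate{0, 3}));  // program A: top row
active.merge(MeshCoordinateRange(MeshCoordinate{1, 0}, MeshCoordinate{1, 3}));  // program B: bottom row
// Adjacent rows coalesce, so the active region is a single convex range covering the mesh;
// a workload whose merged region is not a single range is rejected (see the TT_FATAL below).
MeshCoordinateRange active_grid = active.ranges().front();

// Devices outside the active region still need go signals during trace capture.
MeshCoordinateRangeSet unused = subtract(MeshCoordinateRange(MeshShape(2, 4)), active_grid);
// Here the programs cover the full mesh, so unused.empty() is true and nothing extra is captured.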
@@ -146,8 +147,17 @@ void MeshCommandQueue::enqueue_mesh_workload(MeshWorkload& mesh_workload, bool b this->write_go_signal_to_unused_sub_grids( chip_ids_in_workload, sub_device_id, expected_num_workers_completed, mcast_go_signals, unicast_go_signals); } else { + MeshCoordinateRangeSet active_sub_grids_set; + for (const auto& sub_grid : active_sub_grids) { + active_sub_grids_set.merge(sub_grid); + } + TT_FATAL(active_sub_grids_set.size() == 1, "Cannot support non convex grids."); this->capture_go_signal_trace_on_unused_subgrids( - active_sub_grids, sub_device_id, expected_num_workers_completed, mcast_go_signals, unicast_go_signals); + active_sub_grids_set.ranges().front(), + sub_device_id, + expected_num_workers_completed, + mcast_go_signals, + unicast_go_signals); } // Increment Launch Message Buffer Write Pointers if (mcast_go_signals) { @@ -376,18 +386,14 @@ void MeshCommandQueue::read_sharded_buffer(MeshBuffer& buffer, void* dst) { void MeshCommandQueue::enqueue_write_shard_to_sub_grid( const MeshBuffer& buffer, const void* host_data, - const LogicalDeviceRange& device_range, + const MeshCoordinateRange& device_range, bool blocking, std::optional region) { if (buffer.global_layout() == MeshBufferLayout::REPLICATED) { - for (std::size_t logical_x = device_range.start_coord.x; logical_x < device_range.end_coord.x + 1; - logical_x++) { - for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y + 1; - logical_y++) { - auto device_shard_view = buffer.get_device_buffer(MeshCoordinate(logical_y, logical_x)); - const BufferRegion buffer_region = region.value_or(BufferRegion(0, device_shard_view->size())); - this->write_shard_to_device(device_shard_view, host_data, buffer_region); - } + for (const auto& coord : device_range) { + auto device_shard_view = buffer.get_device_buffer(coord); + const BufferRegion buffer_region = region.value_or(BufferRegion(0, device_shard_view->size())); + this->write_shard_to_device(device_shard_view, host_data, buffer_region); } } else { this->write_sharded_buffer(buffer, host_data); @@ -399,7 +405,7 @@ void MeshCommandQueue::enqueue_write_shard_to_sub_grid( void MeshCommandQueue::enqueue_write_mesh_buffer( const std::shared_ptr& buffer, const void* host_data, bool blocking) { - LogicalDeviceRange mesh_device_extent({0, 0}, {buffer->device()->num_cols() - 1, buffer->device()->num_rows() - 1}); + MeshCoordinateRange mesh_device_extent(buffer->device()->shape()); this->enqueue_write_shard_to_sub_grid(*buffer, host_data, mesh_device_extent, blocking); } @@ -447,61 +453,47 @@ void MeshCommandQueue::enqueue_record_event_helper( const std::shared_ptr& event, tt::stl::Span sub_device_ids, bool notify_host, - const std::optional& device_range) { + const std::optional& device_range) { auto& sysmem_manager = this->reference_sysmem_manager(); event->cq_id = id_; event->event_id = sysmem_manager.get_next_event(id_); event->device = mesh_device_; - event->device_range = - device_range.value_or(LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1})); + event->device_range = device_range.value_or(MeshCoordinateRange(mesh_device_->shape())); sub_device_ids = buffer_dispatch::select_sub_device_ids(mesh_device_, sub_device_ids); - for (std::size_t logical_x = event->device_range.start_coord.x; logical_x < event->device_range.end_coord.x + 1; - logical_x++) { - for (std::size_t logical_y = event->device_range.start_coord.y; logical_y < event->device_range.end_coord.y + 1; - logical_y++) { - 
event_dispatch::issue_record_event_commands( - mesh_device_, - event->event_id, - id_, - mesh_device_->num_hw_cqs(), - mesh_device_->get_device(logical_y, logical_x)->sysmem_manager(), - sub_device_ids, - expected_num_workers_completed_, - notify_host); - } + for (const auto& coord : event->device_range) { + event_dispatch::issue_record_event_commands( + mesh_device_, + event->event_id, + id_, + mesh_device_->num_hw_cqs(), + mesh_device_->get_device(coord)->sysmem_manager(), + sub_device_ids, + expected_num_workers_completed_, + notify_host); } } void MeshCommandQueue::enqueue_record_event( const std::shared_ptr& event, tt::stl::Span sub_device_ids, - const std::optional& device_range) { + const std::optional& device_range) { this->enqueue_record_event_helper(event, sub_device_ids, false, device_range); } void MeshCommandQueue::enqueue_record_event_to_host( const std::shared_ptr& event, tt::stl::Span sub_device_ids, - const std::optional& device_range) { + const std::optional& device_range) { this->enqueue_record_event_helper(event, sub_device_ids, true, device_range); event_descriptors_.push(std::make_shared(MeshReadEventDescriptor{ .single_device_descriptor = ReadEventDescriptor(event->event_id), .device_range = event->device_range})); } void MeshCommandQueue::enqueue_wait_for_event(const std::shared_ptr& sync_event) { - for (std::size_t logical_x = sync_event->device_range.start_coord.x; - logical_x < sync_event->device_range.end_coord.x + 1; - logical_x++) { - for (std::size_t logical_y = sync_event->device_range.start_coord.y; - logical_y < sync_event->device_range.end_coord.y + 1; - logical_y++) { - event_dispatch::issue_wait_for_event_commands( - id_, - sync_event->cq_id, - mesh_device_->get_device(logical_y, logical_x)->sysmem_manager(), - sync_event->event_id); - } + for (const auto& coord : sync_event->device_range) { + event_dispatch::issue_wait_for_event_commands( + id_, sync_event->cq_id, mesh_device_->get_device(coord)->sysmem_manager(), sync_event->event_id); } } @@ -511,23 +503,15 @@ void MeshCommandQueue::drain_events_from_completion_queue() { for (std::size_t event_idx = 0; event_idx < num_events; event_idx++) { auto& mesh_read_descriptor = event_descriptors_.front(); auto& device_range = mesh_read_descriptor->device_range; - for (std::size_t logical_x = device_range.start_coord.x; logical_x < device_range.end_coord.x + 1; - logical_x++) { - for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y + 1; - logical_y++) { - auto device = mesh_device_->get_device(logical_y, logical_x); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - bool exit_condition = false; - device->sysmem_manager().completion_queue_wait_front(id_, exit_condition); - - event_dispatch::read_events_from_completion_queue( - mesh_read_descriptor->single_device_descriptor, - mmio_device_id, - channel, - id_, - device->sysmem_manager()); - } + for (const auto& coord : device_range) { + auto device = mesh_device_->get_device(coord); + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); + bool exit_condition = false; + device->sysmem_manager().completion_queue_wait_front(id_, exit_condition); + + event_dispatch::read_events_from_completion_queue( + mesh_read_descriptor->single_device_descriptor, 
mmio_device_id, channel, id_, device->sysmem_manager()); } event_descriptors_.pop(); } @@ -535,16 +519,11 @@ void MeshCommandQueue::drain_events_from_completion_queue() { void MeshCommandQueue::verify_reported_events_after_draining(const std::shared_ptr& event) { auto& device_range = event->device_range; - for (std::size_t logical_x = device_range.start_coord.x; logical_x < device_range.end_coord.x + 1; logical_x++) { - for (std::size_t logical_y = device_range.start_coord.y; logical_y < device_range.end_coord.y + 1; - logical_y++) { - TT_FATAL( - mesh_device_->get_device(logical_y, logical_x) - ->sysmem_manager() - .get_last_completed_event(event->cq_id) >= event->event_id, - "Expected to see event id {} in completion queue", - event->event_id); - } + for (const auto& coord : device_range) { + TT_FATAL( + mesh_device_->get_device(coord)->sysmem_manager().get_last_completed_event(event->cq_id) >= event->event_id, + "Expected to see event id {} in completion queue", + event->event_id); } } @@ -571,7 +550,7 @@ void MeshCommandQueue::reset_worker_state( } void MeshCommandQueue::write_program_cmds_to_subgrid( - const LogicalDeviceRange& sub_grid, + const MeshCoordinateRange& sub_grid, ProgramCommandSequence& program_cmd_seq, bool stall_first, bool stall_before_program, @@ -579,17 +558,15 @@ void MeshCommandQueue::write_program_cmds_to_subgrid( auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); CoreType dispatch_core_type = dispatch_core_config.get_core_type(); - for (std::size_t logical_x = sub_grid.start_coord.x; logical_x < sub_grid.end_coord.x + 1; logical_x++) { - for (std::size_t logical_y = sub_grid.start_coord.y; logical_y < sub_grid.end_coord.y + 1; logical_y++) { - program_dispatch::write_program_command_sequence( - program_cmd_seq, - this->mesh_device_->get_device(logical_y, logical_x)->sysmem_manager(), - id_, - dispatch_core_type, - stall_first, - stall_before_program); - chip_ids_in_workload.insert(this->mesh_device_->get_device(logical_y, logical_x)->id()); - } + for (const auto& coord : sub_grid) { + program_dispatch::write_program_command_sequence( + program_cmd_seq, + this->mesh_device_->get_device(coord)->sysmem_manager(), + id_, + dispatch_core_type, + stall_first, + stall_before_program); + chip_ids_in_workload.insert(this->mesh_device_->get_device(coord)->id()); } } @@ -616,12 +593,11 @@ void MeshCommandQueue::write_go_signal_to_unused_sub_grids( } void MeshCommandQueue::capture_program_trace_on_subgrid( - const LogicalDeviceRange& sub_grid, + const MeshCoordinateRange& sub_grid, ProgramCommandSequence& program_cmd_seq, bool stall_first, bool stall_before_program) { - auto start_coord = sub_grid.start_coord; - auto& sysmem_manager_for_trace = mesh_device_->get_device(start_coord.y, start_coord.x)->sysmem_manager(); + auto& sysmem_manager_for_trace = mesh_device_->get_device(sub_grid.start_coord())->sysmem_manager(); uint32_t sysmem_manager_offset = sysmem_manager_for_trace.get_issue_queue_write_ptr(id_); auto dispatch_core_config = DispatchQueryManager::instance().get_dispatch_core_config(); @@ -631,48 +607,39 @@ void MeshCommandQueue::capture_program_trace_on_subgrid( program_cmd_seq, sysmem_manager_for_trace, id_, dispatch_core_type, stall_first, stall_before_program); auto mesh_trace_md = MeshTraceStagingMetadata{ sub_grid, - start_coord, + sub_grid.start_coord(), sysmem_manager_offset, sysmem_manager_for_trace.get_issue_queue_write_ptr(id_) - sysmem_manager_offset}; ordered_mesh_trace_md_.push_back(mesh_trace_md); } void 
MeshCommandQueue::capture_go_signal_trace_on_unused_subgrids( - std::vector& active_sub_grids, + const MeshCoordinateRange& active_grid, const SubDeviceId& sub_device_id, uint32_t expected_num_workers_completed, bool mcast_go_signals, bool unicast_go_signals) { - LogicalDeviceRangeSet active_ranges = active_sub_grids[0]; - for (int i = 1; i < active_sub_grids.size(); i++) { - active_ranges = active_ranges.merge(active_sub_grids[i]); - } - TT_FATAL(active_ranges.size() == 1, "Cannot support non convex grids"); - CoreRange active_grid = active_ranges.bounding_box(); - CoreRange full_grid = CoreRange({0, 0}, {mesh_device_->num_cols() - 1, mesh_device_->num_rows() - 1}); - if (active_grid != full_grid) { - LogicalDeviceRangeSet unused_grids = relative_complement(full_grid, active_grid); - for (auto& unused_grid : unused_grids.ranges()) { - auto start_coord = unused_grid.start_coord; - auto& sysmem_manager_for_trace = mesh_device_->get_device(start_coord.y, start_coord.x)->sysmem_manager(); - uint32_t sysmem_manager_offset = sysmem_manager_for_trace.get_issue_queue_write_ptr(id_); - write_go_signal( - id_, - mesh_device_, - sub_device_id, - sysmem_manager_for_trace, - expected_num_workers_completed, - this->virtual_program_dispatch_core(), - mcast_go_signals, - unicast_go_signals, - mesh_device_->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, sub_device_id)); - auto mesh_trace_md = MeshTraceStagingMetadata{ - unused_grid, - start_coord, - sysmem_manager_offset, - sysmem_manager_for_trace.get_issue_queue_write_ptr(id_) - sysmem_manager_offset}; - ordered_mesh_trace_md_.push_back(mesh_trace_md); - } + MeshCoordinateRange full_grid(mesh_device_->shape()); + MeshCoordinateRangeSet unused_grids = subtract(full_grid, active_grid); + for (const auto& unused_grid : unused_grids.ranges()) { + auto& sysmem_manager_for_trace = mesh_device_->get_device(unused_grid.start_coord())->sysmem_manager(); + uint32_t sysmem_manager_offset = sysmem_manager_for_trace.get_issue_queue_write_ptr(id_); + write_go_signal( + id_, + mesh_device_, + sub_device_id, + sysmem_manager_for_trace, + expected_num_workers_completed, + this->virtual_program_dispatch_core(), + mcast_go_signals, + unicast_go_signals, + mesh_device_->num_worker_cores(HalProgrammableCoreType::ACTIVE_ETH, sub_device_id)); + auto mesh_trace_md = MeshTraceStagingMetadata{ + unused_grid, + unused_grid.start_coord(), + sysmem_manager_offset, + sysmem_manager_for_trace.get_issue_queue_write_ptr(id_) - sysmem_manager_offset}; + ordered_mesh_trace_md_.push_back(mesh_trace_md); } } @@ -725,7 +692,7 @@ void MeshCommandQueue::record_begin(const MeshTraceId& trace_id, const std::shar } void MeshCommandQueue::record_end() { - trace_ctx_->assemble_dispatch_commands(this->device(), this->get_mesh_trace_md()); + trace_ctx_->assemble_dispatch_commands(this->device(), ordered_mesh_trace_md_); trace_id_ = std::nullopt; trace_ctx_ = nullptr; @@ -744,8 +711,6 @@ void MeshCommandQueue::record_end() { } } -const std::vector& MeshCommandQueue::get_mesh_trace_md() { return ordered_mesh_trace_md_; } - SystemMemoryManager& MeshCommandQueue::reference_sysmem_manager() { return mesh_device_->get_device(0, 0)->sysmem_manager(); } diff --git a/tt_metal/distributed/mesh_trace.cpp b/tt_metal/distributed/mesh_trace.cpp index 536f48bd977..e3117c4e86c 100644 --- a/tt_metal/distributed/mesh_trace.cpp +++ b/tt_metal/distributed/mesh_trace.cpp @@ -4,6 +4,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include "tt_metal/distributed/mesh_workload_utils.hpp" @@ 
-21,7 +22,7 @@ void MeshTraceDescriptor::assemble_dispatch_commands( auto& trace_data = this->ordered_trace_data; for (auto& trace_md : mesh_trace_md) { auto& sysmem_mgr_coord = trace_md.sysmem_manager_coord; - auto& sysmem_manager = mesh_device->get_device(sysmem_mgr_coord.y, sysmem_mgr_coord.x)->sysmem_manager(); + auto& sysmem_manager = mesh_device->get_device(sysmem_mgr_coord)->sysmem_manager(); auto trace_data_word_offset = trace_md.offset / sizeof(uint32_t); auto trace_data_size_words = trace_md.size / sizeof(uint32_t); auto& bypass_data = sysmem_manager.get_bypass_data(); @@ -31,13 +32,13 @@ void MeshTraceDescriptor::assemble_dispatch_commands( std::vector program_cmds_vector( std::make_move_iterator(bypass_data.begin() + trace_data_word_offset), std::make_move_iterator(bypass_data.begin() + trace_data_word_offset + trace_data_size_words)); - std::vector device_ranges_to_invalidate = {}; + std::vector device_ranges_to_invalidate; for (auto& program : trace_data) { if (program.device_range.intersects(trace_md.device_range)) { // The current program intersects with a program that was previously // placed on the Mesh. intersection_found = true; - auto intersection = program.device_range.intersection(trace_md.device_range).value(); + auto intersection = *program.device_range.intersection(trace_md.device_range); if (intersection == program.device_range) { // Intersection matches the originally placed program. program.data.insert( @@ -46,8 +47,8 @@ void MeshTraceDescriptor::assemble_dispatch_commands( std::make_move_iterator(program_cmds_vector.end())); } else { // Intersection is a subset of the originally placed program. - auto complement = relative_complement(program.device_range, intersection); - for (auto& complement_range : complement.ranges()) { + auto complement = subtract(program.device_range, intersection); + for (const auto& complement_range : complement.ranges()) { intermed_trace_data.push_back(MeshTraceData{complement_range, program.data}); } intermed_trace_data.push_back(MeshTraceData{intersection, program.data}); @@ -77,7 +78,7 @@ void MeshTraceDescriptor::assemble_dispatch_commands( } this->total_trace_size += trace_md.size; } - auto bcast_device_range = LogicalDeviceRange({0, 0}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); + MeshCoordinateRange bcast_device_range(mesh_device->shape()); std::vector exec_buf_end = {}; DeviceCommand command_sequence(hal.get_alignment(HalMemType::HOST)); @@ -134,7 +135,7 @@ void MeshTrace::populate_mesh_buffer(MeshCommandQueue& mesh_cq, std::shared_ptr< trace_buffer->mesh_buffer = MeshBuffer::create(global_trace_buf_config, device_local_trace_buf_config, mesh_cq.device()); - std::unordered_map write_offset_per_device_range = {}; + std::unordered_map write_offset_per_device_range = {}; for (auto& mesh_trace_data : trace_buffer->desc->ordered_trace_data) { auto& device_range = mesh_trace_data.device_range; if (write_offset_per_device_range.find(device_range) == write_offset_per_device_range.end()) { diff --git a/tt_metal/distributed/mesh_workload.cpp b/tt_metal/distributed/mesh_workload.cpp index a9efcb406c7..a3b999cd7e3 100644 --- a/tt_metal/distributed/mesh_workload.cpp +++ b/tt_metal/distributed/mesh_workload.cpp @@ -10,6 +10,15 @@ #include "tt_metal/distributed/mesh_workload_utils.hpp" namespace tt::tt_metal::distributed { +namespace { + +// TODO: Consider how this can be extended to ND. 
+uint32_t encode_device_range(const MeshCoordinate& coord) { + TT_FATAL(coord.dims() == 2, "Expected 2D coordinate: {}", coord); + return (coord[0] << 24) | (coord[1] << 16); +} + +} // namespace MeshWorkload::MeshWorkload() { // A MeshWorkload tracks maintains its own handles to kernels across all @@ -18,7 +27,7 @@ MeshWorkload::MeshWorkload() { kernels_.resize(hal.get_programmable_core_type_count()); } -void MeshWorkload::add_program(const LogicalDeviceRange& device_range, Program&& program) { +void MeshWorkload::add_program(const MeshCoordinateRange& device_range, Program&& program) { // Add a program to a MeshWorkload and tie it a specific logical device range programs_[device_range] = std::move(program); logical_device_ranges_.push_back(device_range); @@ -73,7 +82,6 @@ void MeshWorkload::load_binaries(MeshCommandQueue& mesh_cq) { MeshBuffer::create(global_kernel_bin_buf_config, device_local_kernel_bin_buf_config, mesh_device); // Iterate over the sub-grids and EnqueueWriteMeshBuffer to each sub-grid that runs an individual program for (auto& [device_range, program] : this->programs_) { - auto& grid_start = device_range.start_coord; std::size_t kernel_bin_size = program.get_program_transfer_info().binary_data.size() * sizeof(uint32_t); global_kernel_bin_buf_config.size = kernel_bin_size; auto kernel_bin_buf_view = MeshBuffer::create( @@ -155,9 +163,9 @@ bool MeshWorkload::kernel_binary_always_stored_in_ringbuffer() { std::unordered_map>& MeshWorkload::get_kernels( uint32_t programmable_core_type_index) { // Get all kernels across all programs in the MeshWorkload - if (not kernels_.at(programmable_core_type_index).size()) { + if (kernels_.at(programmable_core_type_index).empty()) { for (auto& [device_range, program] : programs_) { - uint32_t device_range_handle = (device_range.start_coord.y << 24) | (device_range.start_coord.x << 16); + uint32_t device_range_handle = encode_device_range(device_range.start_coord()); for (const auto& kernel : program.get_kernels(programmable_core_type_index)) { KernelHandle handle = (device_range_handle | kernel.first); kernels_.at(programmable_core_type_index).insert({handle, kernel.second}); @@ -169,9 +177,9 @@ std::unordered_map>& MeshWorkload::get_ker std::vector>& MeshWorkload::get_kernel_groups(uint32_t programmable_core_type_index) { // Get all kernel groups across all programs in the MeshWorkload - if (not kernel_groups_.at(programmable_core_type_index).size()) { + if (kernel_groups_.at(programmable_core_type_index).empty()) { for (auto& [device_range, program] : programs_) { - uint32_t device_range_handle = (device_range.start_coord.y << 24) | (device_range.start_coord.x << 16); + uint32_t device_range_handle = encode_device_range(device_range.start_coord()); for (auto& kg : program.get_kernel_groups(programmable_core_type_index)) { for (auto& optional_kernel_id : kg->kernel_ids) { if (optional_kernel_id.has_value()) { @@ -216,8 +224,7 @@ std::unordered_set MeshWorkload::determine_sub_device_ids(MeshDevic // Get the sub device ids for all program across all devices in the Workload std::unordered_set sub_devices_; for (auto& [device_range, program] : programs_) { - auto grid_start = device_range.start_coord; - IDevice* device = mesh_device->get_device(grid_start.y, grid_start.x); + IDevice* device = mesh_device->get_device(device_range.start_coord()); auto sub_devs_for_program = program.determine_sub_device_ids(mesh_device); for (auto& sub_dev : sub_devs_for_program) { sub_devices_.insert(sub_dev); diff --git 
a/tt_metal/distributed/mesh_workload_utils.cpp b/tt_metal/distributed/mesh_workload_utils.cpp index 2bbc713c87c..4af6698c34e 100644 --- a/tt_metal/distributed/mesh_workload_utils.cpp +++ b/tt_metal/distributed/mesh_workload_utils.cpp @@ -76,71 +76,4 @@ void write_go_signal( sysmem_manager.fetch_queue_reserve_back(cq_id); sysmem_manager.fetch_queue_write(cmd_sequence_sizeB, cq_id); } - -bool is_row_major_intersection(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { - return intersection.grid_size().x == parent.grid_size().x; -} -bool matching_dimensions(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { - auto intersection_grid_size = intersection.grid_size(); - auto parent_grid_size = parent.grid_size(); - return intersection_grid_size.x == parent_grid_size.x || intersection_grid_size.y == parent_grid_size.y; -} - -bool matching_vertices(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { - return (intersection.start_coord.x == parent.start_coord.x && intersection.start_coord.y == parent.start_coord.y) || - (intersection.end_coord.x == parent.end_coord.x && intersection.end_coord.y == parent.end_coord.y); -} - -bool has_convex_relative_complement(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { - return matching_dimensions(parent, intersection) && matching_vertices(parent, intersection); -} - -LogicalDeviceRange convex_relative_complement( - const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { - TT_FATAL(parent.contains(intersection), "Parent must contain intersection"); - auto intersection_grid_size = intersection.grid_size(); - auto parent_grid_size = parent.grid_size(); - TT_FATAL(has_convex_relative_complement(parent, intersection), "Non convex grids not supported"); - - if (is_row_major_intersection(parent, intersection)) { - if (intersection.start_coord.y == parent.start_coord.y) { - return LogicalDeviceRange( - {parent.start_coord.x, intersection.end_coord.y + 1}, {parent.end_coord.x, parent.end_coord.y}); - } else { - return LogicalDeviceRange( - {parent.start_coord.x, parent.start_coord.y}, {parent.end_coord.x, intersection.start_coord.y - 1}); - } - } else { - if (intersection.start_coord.x == parent.start_coord.x) { - return LogicalDeviceRange( - {intersection.end_coord.x + 1, parent.start_coord.y}, {parent.end_coord.x, parent.end_coord.y}); - } else { - return LogicalDeviceRange( - {parent.start_coord.x, parent.start_coord.y}, {intersection.start_coord.x - 1, parent.end_coord.y}); - } - } -} - -LogicalDeviceRangeSet relative_complement(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection) { - TT_FATAL(parent.contains(intersection), "Parent must contain intersection"); - if (has_convex_relative_complement(parent, intersection)) { - return convex_relative_complement(parent, intersection); - } - std::vector relative_complement = {}; - std::unordered_set devices_in_intersection = {}; - for (auto& intersection_device : intersection) { - devices_in_intersection.insert(intersection_device); - } - for (auto& parent_device : parent) { - if (devices_in_intersection.find(parent_device) == devices_in_intersection.end()) { - relative_complement.push_back(CoreRange(parent_device)); - } - } - LogicalDeviceRangeSet merged_complement = relative_complement[0]; - for (int i = 1; i < relative_complement.size(); i++) { - merged_complement = merged_complement.merge(relative_complement[i]); - } - return merged_complement; -} - } // namespace 
tt::tt_metal::distributed diff --git a/tt_metal/distributed/mesh_workload_utils.hpp b/tt_metal/distributed/mesh_workload_utils.hpp index 577aff84af7..acc97ee27eb 100644 --- a/tt_metal/distributed/mesh_workload_utils.hpp +++ b/tt_metal/distributed/mesh_workload_utils.hpp @@ -20,6 +20,4 @@ void write_go_signal( bool send_unicasts, int num_unicast_txns = -1); -LogicalDeviceRangeSet relative_complement(const LogicalDeviceRange& parent, const LogicalDeviceRange& intersection); - } // namespace tt::tt_metal::distributed diff --git a/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp b/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp index 247c6cec967..e13cbe73ef6 100644 --- a/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp +++ b/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp @@ -33,12 +33,9 @@ int main(int argc, char** argv) { // Instantiate a MeshWorkload and attach the example program. We'll broadcast // this program by enqueueing it across all devices in our 2x4 mesh. auto mesh_workload = CreateMeshWorkload(); - auto target_devices = LogicalDeviceRange{ - DeviceCoord{0, 0} /* start_coord */, DeviceCoord{mesh_device->num_cols() - 1, mesh_device->num_rows() - 1} - /* end_coord */ - }; + auto target_devices = MeshCoordinateRange(mesh_device->shape()); - AddProgramToMeshWorkload(mesh_workload, example_program, target_devices); + AddProgramToMeshWorkload(mesh_workload, std::move(example_program), target_devices); EnqueueMeshWorkload(cq, mesh_workload, false /* blocking */); // Synchronize the mesh command queue to ensure the workload has completed. 
diff --git a/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp b/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp index c5760403898..e0234835539 100644 --- a/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp +++ b/tt_metal/programming_examples/distributed/3_distributed_eltwise_add/distributed_eltwise_add.cpp @@ -128,12 +128,9 @@ int main(int argc, char** argv) { // Create mesh workload and broadcast the program across all devices auto mesh_workload = CreateMeshWorkload(); - auto device_range = LogicalDeviceRange{ - DeviceCoord{0, 0} /* start_coord */, DeviceCoord{mesh_device->num_cols() - 1, mesh_device->num_rows() - 1} - /* end_coord */ - }; + auto device_range = MeshCoordinateRange(mesh_device->shape()); - AddProgramToMeshWorkload(mesh_workload, program, device_range); + AddProgramToMeshWorkload(mesh_workload, std::move(program), device_range); EnqueueMeshWorkload(cq, mesh_workload, false /* blocking */); // Read back results diff --git a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp index f64154f3c74..8f2611e637a 100644 --- a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp +++ b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp @@ -4,6 +4,7 @@ #include #include +#include using namespace tt; using namespace tt::tt_metal; @@ -170,10 +171,11 @@ int main(int argc, char** argv) { // =========== Step 3: Create Workloads to run on the Virtual Mesh =========== // Specify Device Ranges on which the Workloads will run - LogicalDeviceRange all_devices({0, 0}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); - LogicalDeviceRange top_row({0, 0}, {mesh_device->num_cols() - 1, 0}); - LogicalDeviceRange bottom_row( - {0, mesh_device->num_rows() - 1}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); + MeshCoordinateRange all_devices(mesh_device->shape()); + MeshCoordinateRange top_row(MeshCoordinate{0, 0}, MeshCoordinate{0, mesh_device->num_cols() - 1}); + MeshCoordinateRange bottom_row( + MeshCoordinate{mesh_device->num_rows() - 1, 0}, + MeshCoordinate{mesh_device->num_rows() - 1, mesh_device->num_cols() - 1}); // Create three eltwise binary ops using a simple program generation function auto add_program = EltwiseBinaryProgramGenerator( add_src0_buf, @@ -204,14 +206,14 @@ int main(int argc, char** argv) { auto add_mesh_workload = CreateMeshWorkload(); auto multiply_and_subtract_mesh_workload = CreateMeshWorkload(); AddProgramToMeshWorkload( - add_mesh_workload, *add_program, all_devices); // Addition runs on the full grid (sub_device 1) + add_mesh_workload, std::move(*add_program), all_devices); // Addition runs on the full grid (sub_device 1) AddProgramToMeshWorkload( multiply_and_subtract_mesh_workload, - *multiply_program, + std::move(*multiply_program), top_row); // Multiplication runs on the top row (sub_device 2) AddProgramToMeshWorkload( multiply_and_subtract_mesh_workload, - *subtract_program, + std::move(*subtract_program), bottom_row); // Subtraction runs on the bottom row (sub device 2) // =========== Step 4: Compile and Load Workloads on the Mesh =========== From 2f2c1b612eea0e067881147cfe41a9458eff9635 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: 
Tue, 25 Feb 2025 15:49:28 -0500 Subject: [PATCH 306/316] [skip ci] Remove old Llama test (#18314) ### Ticket #17038 ### Problem description A test got removed, but this one still referenced it. ### What's changed Removed the test. A replacement is forthcoming, so keeping the workflow. ### Checklist - [x] TG Nightly [passes](https://github.com/tenstorrent/tt-metal/actions/runs/13530511542) --- .github/workflows/tg-nightly-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tg-nightly-tests.yaml b/.github/workflows/tg-nightly-tests.yaml index 4e67f799a6b..5ffd94a6b0e 100644 --- a/.github/workflows/tg-nightly-tests.yaml +++ b/.github/workflows/tg-nightly-tests.yaml @@ -15,7 +15,7 @@ jobs: fail-fast: false matrix: test-group: [ - { name: "tg_llama3_70b_tests", arch: wormhole_b0, cmd: run_tg_llama3_70b_tests, timeout: 90, owner_id: U03FJB5TM5Y}, # Colman Glagovich + { name: "placeholder", arch: wormhole_b0, cmd: "echo 'Placeholder'", timeout: 90, owner_id: U03FJB5TM5Y}, # Colman Glagovich ] name: ${{ matrix.test-group.name }} env: From 87b1d577d9cac8f1702dfaf443204f0d3c59499e Mon Sep 17 00:00:00 2001 From: Brian Beggs Date: Tue, 25 Feb 2025 12:56:02 -0800 Subject: [PATCH 307/316] [skip ci] Update INSTALLING.md (#18259) ### Ticket N/A ### Problem description Installing.MD needs to be ready for Blackhole Launch. ### What's changed Adding Device Entry for Blackhole Launch. ### Checklist - [ ] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes - [ ] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (if applicable) - [ ] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (if applicable) - [ ] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes (if applicable) - [ ] **(For models and ops writers)** Full [new models tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) CI passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- INSTALLING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALLING.md b/INSTALLING.md index 9f92d48c045..880aeb88f3a 100644 --- a/INSTALLING.md +++ b/INSTALLING.md @@ -20,9 +20,9 @@ Note the current compatability matrix: | Device | OS | Python | Driver (TT-KMD) | Firmware (TT-Flash) | TT-SMI | TT-Topology | |---------------------|-----------------|----------|--------------------|--------------------------------------------|-----------------------|--------------------------------| -| Grayskull | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.9.0.0 (v80.9.0.0) | v2.2.0 or above | N/A | | Wormhole | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.13.0.0 (v80.13.0.0) | v2.2.0 or above | N/A | | T3000 (Wormhole) | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.13.0.0 (v80.13.0.0) | v2.2.0 or above | v1.1.3 or above, `mesh` config | +| Blackhole | Ubuntu 20.04 | 3.10 | v1.31 | fw_pack-80.15.0.0 (v80.15.0.0) | v3.0.5 or above | v1.1.3 or above, 'mesh' config | --- From 663244a587f70974b812a1d668d14105d845dd85 Mon Sep 17 00:00:00 2001 From: Mohamed Bahnas <116673264+mbahnasTT@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:56:15 -0800 Subject: [PATCH 308/316] #0: fix a pre commit issue in yolo perf code (#18315) Co-authored-by: Dalar Vartanians --- models/demos/yolov4/tests/test_perf_yolo.py | 1 + 1 file 
changed, 1 insertion(+) diff --git a/models/demos/yolov4/tests/test_perf_yolo.py b/models/demos/yolov4/tests/test_perf_yolo.py index 4230aa818e3..aaa3f883f31 100644 --- a/models/demos/yolov4/tests/test_perf_yolo.py +++ b/models/demos/yolov4/tests/test_perf_yolo.py @@ -32,6 +32,7 @@ def get_expected_compile_time_sec(): def get_expected_inference_time_sec(): return 0.37 + @pytest.mark.models_performance_bare_metal @pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) @pytest.mark.parametrize( From 598d43df3287152774544a337b366aa9f15970b1 Mon Sep 17 00:00:00 2001 From: William Ly Date: Tue, 25 Feb 2025 15:59:12 -0500 Subject: [PATCH 309/316] #0: [skip ci] Fix produce data crash when job is stuck in pending state (#18316) ### Ticket ... ### Problem description Fix crash in produce data: https://github.com/tenstorrent/tt-metal/actions/runs/13518418725/job/37810462342#step:10:494 ``` get_job_row_from_github_job assert github_job["status"] == "completed", f"{github_job_id} is not completed" AssertionError: 37759949624 is not completed ``` `build-deploy-docs` can sometimes be left in a pending status when cancelled: https://github.com/tenstorrent/tt-metal/actions/runs/13514047027/job/37759949624 This causes the produce data flow to trip an assert that only completed jobs are to be processed. ### What's changed Downgrade assert to warning and skip the job if its not completed instead of exiting. ### Checklist - [x] New/Existing tests provide coverage for changes Rerun of failed workflow in fix branch https://github.com/tenstorrent/tt-metal/actions/runs/13530691987 --- infra/data_collection/github/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/infra/data_collection/github/utils.py b/infra/data_collection/github/utils.py index 7c58d351b5f..6f5a90b2e1c 100644 --- a/infra/data_collection/github/utils.py +++ b/infra/data_collection/github/utils.py @@ -192,7 +192,9 @@ def get_job_row_from_github_job(github_job, github_job_id_to_annotations): name = github_job["name"] - assert github_job["status"] == "completed", f"{github_job_id} is not completed" + if github_job["status"] != "completed": + logger.warning(f"{github_job_id} is not completed, skipping this job") + return None # Best effort card type getting @@ -286,9 +288,10 @@ def get_job_row_from_github_job(github_job, github_job_id_to_annotations): def get_job_rows_from_github_info(github_pipeline_json, github_jobs_json, github_job_id_to_annotations): - return list( + job_rows = list( map(lambda job: get_job_row_from_github_job(job, github_job_id_to_annotations), github_jobs_json["jobs"]) ) + return [x for x in job_rows if x is not None] def get_github_partial_benchmark_json_filenames(): From a32c40117cb822062e720991ae0fe22c9601980d Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Tue, 25 Feb 2025 14:06:08 -0800 Subject: [PATCH 310/316] [skip ci] Add workflow to delete pre-releases (#18262) --- .github/workflows/release-cleanup.yaml | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/workflows/release-cleanup.yaml diff --git a/.github/workflows/release-cleanup.yaml b/.github/workflows/release-cleanup.yaml new file mode 100644 index 00000000000..faa0ac464fd --- /dev/null +++ b/.github/workflows/release-cleanup.yaml @@ -0,0 +1,46 @@ +name: Release Cleanup + +on: + schedule: + - cron: "0 7 * * *" # Runs daily at midnight UTC + workflow_dispatch: + inputs: + months_back: + description: "Number of months back to check for pre-releases" + 
required: false + default: "3" # Default set to 3 months + +jobs: + cleanup: + runs-on: ubuntu-latest + steps: + - name: Cleanup old pre-releases + uses: actions/github-script@v6 + with: + script: | + const monthsBack = parseInt(core.getInput("months_back") || "3"); // Default to 3 months + const now = new Date(); + const cutoffDate = new Date(now.setMonth(now.getMonth() - monthsBack)); + + // Retrieve all releases using pagination + const releases = await github.paginate(github.rest.repos.listReleases, { + owner: context.repo.owner, + repo: context.repo.repo, + }); + + for (const release of releases) { + if (release.prerelease && new Date(release.created_at) < cutoffDate) { + console.log(`Deleting pre-release: ${release.name || release.tag_name} (created at: ${release.created_at})`); + + try { + await github.rest.repos.deleteRelease({ + owner: context.repo.owner, + repo: context.repo.repo, + release_id: release.id, + }); + console.log(`Successfully deleted release: ${release.name || release.tag_name}`); + } catch (releaseError) { + console.error(`Failed to delete release ${release.name || release.tag_name}: ${releaseError.message}`); + } + } + } From 5db78f8b6fc7d48f94144b366641090a9ef9b932 Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Tue, 25 Feb 2025 22:53:29 +0000 Subject: [PATCH 311/316] Disable async mode for single device use cases (#18114) ### Ticket ### Problem description We're moving towards always using a single mesh and removing the concurrency code, including calls to push_work from ttnn. This PR disables async for a single device, which should be the only path once we handle multi-device use cases with a single mesh. ### What's changed Ignore calls to `enable_async()` for a single device, logging a warning that it's being ignored. Add a mutex in `push_work` for sync mode, which provides the same call serialization guarantee as the worker queue does in async mode. Use a direct function call if the number of workers is 1 in `run_operation.cpp`. 
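To make the new dispatch concrete, here is a minimal, self-contained sketch of the single- vs multi-device behavior described above. The `Device`/`MeshDevice` structs and `std::cout` logging are stand-ins for illustration only (the real classes and `tt::log_warning` live in tt_metal); only the control flow mirrors the `device.cpp` and `mesh_device.cpp` hunks in this patch.

```cpp
#include <iostream>
#include <vector>

// Stand-in types for illustration; the real Device/MeshDevice live in tt_metal.
struct Device {
    bool async_enabled = false;

    // MeshDevice calls this directly for true multi-device meshes.
    void force_enable_async(bool enable) { async_enabled = enable; }

    // Single-device callers can no longer opt into async mode.
    void enable_async(bool enable) {
        if (enable) {
            std::cout << "warning: async mode is always disabled for a single device, ignoring enable_async call\n";
            return;
        }
        force_enable_async(false);
    }
};

struct MeshDevice {
    std::vector<Device*> devices;

    void enable_async(bool enable) {
        if (enable && devices.size() == 1) {
            std::cout << "warning: async mode is always disabled for a single device, ignoring enable_async call\n";
            return;
        }
        // Multi-device meshes bypass the single-device guard via force_enable_async.
        for (auto* device : devices) {
            device->force_enable_async(enable);
        }
    }
};

int main() {
    Device single;
    single.enable_async(true);   // ignored with a warning, stays synchronous

    MeshDevice one_device_mesh{{&single}};
    one_device_mesh.enable_async(true);   // 1-device mesh: also ignored

    Device a, b;
    MeshDevice multi{{&a, &b}};
    multi.enable_async(true);    // multi-device mesh: async actually enabled
    return 0;
}
```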
### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13518698157) - [x] [Model perf CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13514345756) - [x] [T3K model perf CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/13514349012) - [x] New/Existing tests provide coverage for changes --- .../tensors/test_async_tensor_apis.cpp | 7 ---- .../unit_tests/gtests/test_async_runtime.cpp | 4 --- tt_metal/api/tt-metalium/device_impl.hpp | 3 ++ tt_metal/api/tt-metalium/mesh_device.hpp | 1 + tt_metal/api/tt-metalium/work_executor.hpp | 3 ++ tt_metal/distributed/mesh_device.cpp | 11 +++++-- tt_metal/impl/device/device.cpp | 8 +++++ ttnn/cpp/ttnn/decorators.hpp | 4 +-- ttnn/cpp/ttnn/run_operation.cpp | 33 +++---------------- ttnn/cpp/ttnn/run_operation.hpp | 9 ++--- ttnn/cpp/ttnn/tensor/tensor_ops.cpp | 10 ++++-- 11 files changed, 41 insertions(+), 52 deletions(-) diff --git a/tests/tt_eager/tensors/test_async_tensor_apis.cpp b/tests/tt_eager/tensors/test_async_tensor_apis.cpp index 884160d86c3..7fd705644fa 100644 --- a/tests/tt_eager/tensors/test_async_tensor_apis.cpp +++ b/tests/tt_eager/tensors/test_async_tensor_apis.cpp @@ -187,11 +187,7 @@ TEST_F(DispatchFixture, TestAsyncRefCountManager) { /*layout=*/std::nullopt, *device); uint32_t tensor2_device_buf_addr = get_device_buffer_address(tensor2); - // Assign tensor1 to tensor2 and ensure that ref counts are appropriately updated with the buffer for tensor2 - // deallocated tensor2 = tensor1; - EXPECT_EQ(tensor2.tensor_attributes->main_thread_ref_count, 2); - EXPECT_EQ(tensor1.tensor_attributes->main_thread_ref_count, 2); // To check if tensor2 is deallocated, create a third tensor on device and ensure that its address matches the // prev addr for tensor2 Tensor tensor3 = ttnn::full( @@ -215,7 +211,6 @@ TEST_F(DispatchFixture, TestAsyncRefCountManager) { // This step will copy the tensor to a temp rval and std::move it back to the caller's instance of device_tensor // Ensure ref count and address remain unchanged device_tensor = tensor_identity_copy_function(device_tensor); - EXPECT_EQ(device_tensor.tensor_attributes->main_thread_ref_count, 1); EXPECT_EQ(get_device_buffer_address(device_tensor), device_tensor_address); } @@ -228,7 +223,6 @@ TEST_F(DispatchFixture, TestAsyncRefCountManager) { /*layout=*/std::nullopt, *device); Tensor tensor2 = std::move(tensor1); - EXPECT_EQ(tensor2.tensor_attributes->main_thread_ref_count, 1); } log_info(LogTest, "Testing Device tensor self-assignment"); @@ -240,7 +234,6 @@ TEST_F(DispatchFixture, TestAsyncRefCountManager) { *device); uint32_t tensor_to_self_assign_address = get_device_buffer_address(tensor_to_self_assign); tensor_to_self_assign = tensor_to_self_assign; - EXPECT_EQ(tensor_to_self_assign.tensor_attributes->main_thread_ref_count, 1); tensor_to_self_assign = std::move(tensor_to_self_assign); EXPECT_EQ(get_device_buffer_address(tensor_to_self_assign), tensor_to_self_assign_address); auto barrier_tensor = tensor_to_self_assign.cpu(); diff --git a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp index 5cf8b13da82..8c26a6b93c6 100644 --- a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp +++ b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp @@ -83,10 +83,6 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { EXPECT_EQ(ttnn::event_query(workload_event), true); // Read output back, once workload is complete ttnn::read_buffer(io_cq, 
output_tensor, {readback_data}); - // Ensure that reference count book keeping is done correctly - // Tensors only have one reference in the main thread. Ensure this is true. - EXPECT_EQ(input_tensor.tensor_attributes->main_thread_ref_count, 1); - EXPECT_EQ(output_tensor.tensor_attributes->main_thread_ref_count, 1); // Buffers are currently jointly owned by the original buffer object, the storage object and the tensor (3). EXPECT_EQ(input_buffer.use_count(), 3); EXPECT_EQ(output_buffer.use_count(), 3); diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 878569038d2..40499b619f2 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -155,7 +155,10 @@ class Device : public IDevice { // Puts device into reset bool close() override; + // Calls to enable_async are ignored in effort to forcefully disable async for single device use-cases + // MeshDevice calls force_enable_async directly avoiding enable_async call for multi-device use-case void enable_async(bool enable) override; + void force_enable_async(bool enable); void synchronize() override; WorkExecutorMode get_worker_mode() override { return work_executor_.get_worker_mode(); } bool is_worker_queue_empty() const override { return work_executor_.worker_queue.empty(); } diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index db0ebf1b7ca..d1712d36383 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -65,6 +65,7 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this sub_device_manager_tracker_; std::unordered_map> trace_buffer_pool_; uint32_t trace_buffers_size_ = 0; + std::recursive_mutex push_work_mutex_; // This is a reference device used to query properties that are the same for all devices in the mesh. IDevice* reference_device() const; diff --git a/tt_metal/api/tt-metalium/work_executor.hpp b/tt_metal/api/tt-metalium/work_executor.hpp index 9064024ce06..004b2762254 100644 --- a/tt_metal/api/tt-metalium/work_executor.hpp +++ b/tt_metal/api/tt-metalium/work_executor.hpp @@ -143,6 +143,8 @@ class WorkExecutor { if (use_passthrough()) { // Worker is pushing to itself (nested work) or worker thread is not running. Execute work in current // thread. + // Using a lock to provide the same call serialization guarantee as with worker queue. + std::lock_guard guard(passthrough_mutex); work_executor(); } else { // Push to worker queue. 
@@ -200,6 +202,7 @@ class WorkExecutor { int managed_device_id; std::condition_variable cv; std::mutex cv_mutex; + std::recursive_mutex passthrough_mutex; inline void start_worker() { this->worker_queue.parent_thread_id = std::this_thread::get_id(); diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 03f73ceaed9..0200eef6afd 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -357,8 +357,13 @@ std::vector> MeshDevice::get_submeshes() const { ret std::ostream& operator<<(std::ostream& os, const MeshDevice& mesh_device) { return os << mesh_device.to_string(); } void MeshDevice::enable_async(bool enable) { - for (auto device : this->get_devices()) { - device->enable_async(enable); + auto devices = this->get_devices(); + if (enable && devices.size() == 1) { + tt::log_warning("Async mode is always disabled for a single device, ignoring enable_async call"); + return; + } + for (auto device : devices) { + dynamic_cast(device)->force_enable_async(enable); } } @@ -675,6 +680,8 @@ WorkExecutorMode MeshDevice::get_worker_mode() { return WorkExecutorMode::SYNCHR bool MeshDevice::is_worker_queue_empty() const { return true; } void MeshDevice::push_work(std::function work, bool blocking) { // Execute inline synchronously. + // Using a lock to provide the same call serialization guarantee as an async single device scheduling. + std::lock_guard lock(push_work_mutex_); work(); } program_cache::detail::ProgramCache& MeshDevice::get_program_cache() { return reference_device()->get_program_cache(); } diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index f92904fa902..a7798a35ba9 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -1256,6 +1256,14 @@ void Device::set_worker_mode(const WorkExecutorMode& mode) { } void Device::enable_async(bool enable) { + if (enable) { + tt::log_warning("Async mode is always disabled for a single device, ignoring enable_async call"); + } else { + force_enable_async(false); + } +} + +void Device::force_enable_async(bool enable) { auto mode = enable ? WorkExecutorMode::ASYNCHRONOUS : WorkExecutorMode::SYNCHRONOUS; this->set_worker_mode(mode); // If a worker thread is spawned for a device, register/track it in a runtime structure. 
diff --git a/ttnn/cpp/ttnn/decorators.hpp b/ttnn/cpp/ttnn/decorators.hpp index 3e9d8ac323a..c122b6a601d 100644 --- a/ttnn/cpp/ttnn/decorators.hpp +++ b/ttnn/cpp/ttnn/decorators.hpp @@ -332,7 +332,6 @@ struct registered_operation_t { const OptionalTensors optional_output_tensors = detail::extract_args_to_vector>(args...); - bool enable_autoformat = false; tt::tt_metal::operation::launch_op( [args...]( const Tensors& input_tensors, @@ -350,8 +349,7 @@ struct registered_operation_t { input_tensors, output_tensors, optional_input_tensors, - optional_output_tensors, - enable_autoformat); + optional_output_tensors); if constexpr (std::is_same_v, Tensor>) { return output_tensors.at(0); diff --git a/ttnn/cpp/ttnn/run_operation.cpp b/ttnn/cpp/ttnn/run_operation.cpp index 3e317d67a22..da5b97be6f0 100644 --- a/ttnn/cpp/ttnn/run_operation.cpp +++ b/ttnn/cpp/ttnn/run_operation.cpp @@ -607,18 +607,18 @@ void launch_op_func( const Tensors input_tensors, OutputType& output_tensors, const OptionalConstTensors optional_input_tensors, - const OptionalTensors optional_output_tensors, - bool enable_autoformat_device) { + const OptionalTensors optional_output_tensors) { // Send host side op compile and run to the worker queue // Assert to ensure that worker threads are specified. ZoneScopedN("LaunchOp"); auto& workers = detail::get_workers(output_tensors); std::size_t workers_size = workers.size(); - if (not enable_autoformat_device and workers.empty() or tt::tt_metal::detail::InWorkerThread()) { + if (workers.size() <= 1 || tt::tt_metal::detail::InWorkerThread()) { // Run in main thread or immediately in worker thread output_tensors = op_func(input_tensors, optional_input_tensors, optional_output_tensors); return; } + detail::check_output(output_tensors, workers); validate_worker_modes(workers); // Record ref counts for all tensors before pushing to worker queue. @@ -667,27 +667,6 @@ void launch_op_func( // If so, mark them in use by current worker. Tensors shared across workers // are only supported when each tensor is tied to a single device/worker // (example all-gather). - if (workers_size == 1) { - // Single worker per tensor and. - for (int i = 0; i < async_safe_input_tensors.size(); i++) { - if (async_safe_input_tensors[i].get_workers().size() and - async_safe_input_tensors[i].get_workers()[0] != workers[0]) { - // This input has a worker assigned that doesn't match the worker of the output being created (its - // shared). 
- async_safe_input_tensors[i].tensor_attributes->num_sibling_workers_sharing_tensor++; - cross_worker_input_tensor_idx.insert(i); - } - } - for (int i = 0; i < async_safe_optional_input_tensors.size(); i++) { - if (async_safe_optional_input_tensors[i].has_value() and - async_safe_optional_input_tensors[i].value().get_workers().size() and - async_safe_optional_input_tensors[i].value().get_workers()[0] != workers[0]) { - async_safe_optional_input_tensors[i].value().tensor_attributes->num_sibling_workers_sharing_tensor++; - cross_worker_optional_input_tensor_idx.insert(i); - } - } - } - { ZoneScopedN("PushOpToWorkers"); auto work_lambda = std::make_shared>( @@ -810,14 +789,12 @@ template void launch_op_func( const Tensors input_tensors, Tensors& output_tensors, const OptionalConstTensors optional_input_tensors, - const OptionalTensors optional_output_tensors, - bool enable_autoformat_device); + const OptionalTensors optional_output_tensors); template void launch_op_func( const std::function& op_func, const Tensors input_tensors, OptionalTensors& output_tensors, const OptionalConstTensors optional_input_tensors, - const OptionalTensors optional_output_tensors, - bool enable_autoformat_device); + const OptionalTensors optional_output_tensors); } // namespace tt::tt_metal::operation diff --git a/ttnn/cpp/ttnn/run_operation.hpp b/ttnn/cpp/ttnn/run_operation.hpp index f83319dd02f..132fe0e9b2a 100644 --- a/ttnn/cpp/ttnn/run_operation.hpp +++ b/ttnn/cpp/ttnn/run_operation.hpp @@ -126,8 +126,7 @@ __attribute__((noinline)) void launch_op_func( const Tensors input_tensors, OutputType& output_tensors, const OptionalConstTensors optional_input_tensors = {}, - const OptionalTensors optional_output_tensors = {}, - bool enable_autoformat_device = true); + const OptionalTensors optional_output_tensors = {}); /* */ @@ -137,16 +136,14 @@ void launch_op( const Tensors input_tensors, OutputType& output_tensors, const OptionalConstTensors optional_input_tensors = {}, - const OptionalTensors optional_output_tensors = {}, - bool enable_autoformat_device = true) { + const OptionalTensors optional_output_tensors = {}) { using FuncType = std::function; launch_op_func( FuncType(std::forward(op_func)), input_tensors, output_tensors, optional_input_tensors, - optional_output_tensors, - enable_autoformat_device); + optional_output_tensors); } void launch_with_autoformat( diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp index 913d67c136e..24ca1f4514d 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp @@ -146,7 +146,7 @@ Tensor tensor_to_layout(const Tensor& input_tensor, Layout target_layout, IDevic ZoneScoped; GraphTracker::instance().track_function_start("Tensor::to_layout", input_tensor, target_layout, worker); // Only push layout conversion to worker if running in async mode - if (worker and worker->get_worker_mode() == WorkExecutorMode::ASYNCHRONOUS) { + if (worker && worker->get_worker_mode() == WorkExecutorMode::ASYNCHRONOUS) { // Tensor can be using borrowed storage. If so, when running in async mode, copy this tensor to owned storage. 
Tensor async_safe_tensor = copy_borrowed_tensor_in_async_mode(worker, input_tensor); Tensor tensor_modified_layout = Tensor(1); @@ -163,12 +163,18 @@ Tensor tensor_to_layout(const Tensor& input_tensor, Layout target_layout, IDevic GraphTracker::instance().track_function_end(tensor_modified_layout); return tensor_modified_layout; } + // Running without worker threads (non-async) TT_ASSERT( input_tensor.storage_type() != StorageType::DEVICE or input_tensor.storage_type() != StorageType::MULTI_DEVICE && "Bring tensor to host before converting to target layout"); - auto output = tensor_impl::to_layout_wrapper(input_tensor, target_layout); + Tensor output; + if (worker) { + worker->push_work([&] { output = tensor_impl::to_layout_wrapper(input_tensor, target_layout); }); + } else { + output = tensor_impl::to_layout_wrapper(input_tensor, target_layout); + } output = tt::tt_metal::set_tensor_id(output); GraphTracker::instance().track_function_end(output); return output; From cbe0e1a4b1153bacc70430d72cd7c0dec1840056 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Tue, 25 Feb 2025 18:06:52 -0500 Subject: [PATCH 312/316] #0: Fix test_all_gather_multiple_submeshes (#18319) ### Ticket N/A ### Problem description `test_all_gather_multiple_submeshes` is broken, as the all gather OP assumes a ring-connected topology. The physical 2D mesh of devices is as follows (the first 2x2 submesh of T3K): ``` 4 0 5 1 ``` ... which gives `4 0 5 1` row-major ordering, while all gather expects `4 0 1 5`. ### What's changed Reshape submesh to `1x4` to force the correct ordering. ### Checklist - [X] Ran the test locally and confirmed it fixes the issue. --- tests/ttnn/unit_tests/test_multi_device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/ttnn/unit_tests/test_multi_device.py b/tests/ttnn/unit_tests/test_multi_device.py index 845ab31c894..231fa015962 100644 --- a/tests/ttnn/unit_tests/test_multi_device.py +++ b/tests/ttnn/unit_tests/test_multi_device.py @@ -681,6 +681,8 @@ def test_all_gather_multiple_submeshes(mesh_device): pytest.skip() def model(submesh): + # Reshape to a 1x4 mesh to enforce ring connected topological order. 
+ submesh.reshape(ttnn.MeshShape(1, 4)) full_tensor = torch.ones((1, 1, 32, 32 * submesh.get_num_devices()), dtype=torch.bfloat16) for i in range(submesh.get_num_devices()): full_tensor[..., i * 32 : (i + 1) * 32] = i From eb34ec8e399fa56cb8917adbbec7f5f2ed9bcfd0 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Tue, 25 Feb 2025 15:56:03 -0800 Subject: [PATCH 313/316] [skip ci] Make fabric tests run on known good machines (#18320) --- .github/workflows/t3000-unit-tests-impl.yaml | 28 ++++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/t3000-unit-tests-impl.yaml b/.github/workflows/t3000-unit-tests-impl.yaml index 3d761f5b530..4b4e403f900 100644 --- a/.github/workflows/t3000-unit-tests-impl.yaml +++ b/.github/workflows/t3000-unit-tests-impl.yaml @@ -14,19 +14,19 @@ jobs: fail-fast: false matrix: test-group: [ - { name: "t3k ttmetal tests", arch: wormhole_b0, cmd: run_t3000_ttmetal_tests, timeout: 30, owner_id: ULMEPM2MA}, #Sean Nijjar - { name: "t3k fabric tests", arch: wormhole_b0, cmd: run_t3000_ttfabric_tests, timeout: 30, owner_id: UJ45FEC7M}, # Allan Liu - { name: "t3k ttnn tests", arch: wormhole_b0, cmd: run_t3000_ttnn_tests, timeout: 120, owner_id: UBHPP2NDP}, #Joseph Chu - { name: "t3k falcon7b tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 30, owner_id: UBHPP2NDP}, #Joseph Chu - { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 30, owner_id: U053W15B6JF}, #Djordje Ivanovic - { name: "t3k llama3-small tests", arch: wormhole_b0, cmd: run_t3000_llama3-small_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz - { name: "t3k llama3.2-11b tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz - { name: "t3k llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_unit_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich - { name: "t3k n300 mesh llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_unit_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich - { name: "t3k llama3.1-70b tests", arch: wormhole_b0, cmd: run_t3000_llama3.1-70b_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz - { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz - { name: "t3k grok tests", arch: wormhole_b0, cmd: run_t3000_grok_tests, timeout: 30, owner_id: U03HY7MK4BT}, #Mark O'Connor - { name: "t3k unet shallow tests", arch: wormhole_b0, cmd: run_t3000_unet_shallow_tests, timeout: 30, owner_id: U06ECNVR0EN}, #Evan Smal + { name: "t3k ttmetal tests", arch: wormhole_b0, cmd: run_t3000_ttmetal_tests, timeout: 30, label: pipeline-functional, owner_id: ULMEPM2MA}, #Sean Nijjar + { name: "t3k fabric tests", arch: wormhole_b0, cmd: run_t3000_ttfabric_tests, timeout: 30, label: pipeline-fabric, owner_id: UJ45FEC7M}, # Allan Liu + { name: "t3k ttnn tests", arch: wormhole_b0, cmd: run_t3000_ttnn_tests, timeout: 120, label: pipeline-functional, owner_id: UBHPP2NDP}, #Joseph Chu + { name: "t3k falcon7b tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 30, label: pipeline-functional, owner_id: UBHPP2NDP}, #Joseph Chu + { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 30, label: pipeline-functional, owner_id: U053W15B6JF}, #Djordje Ivanovic + { name: "t3k llama3-small tests", arch: wormhole_b0, 
cmd: run_t3000_llama3-small_tests, timeout: 30, label: pipeline-functional, owner_id: U03PUAKE719}, #Miguel Tairum Cruz + { name: "t3k llama3.2-11b tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b_tests, timeout: 30, label: pipeline-functional, owner_id: U03PUAKE719}, #Miguel Tairum Cruz + { name: "t3k llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_unit_tests, timeout: 30, label: pipeline-functional, owner_id: U03FJB5TM5Y}, #Colman Glagovich + { name: "t3k n300 mesh llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_unit_tests, timeout: 30, label: pipeline-functional, owner_id: U03FJB5TM5Y}, #Colman Glagovich + { name: "t3k llama3.1-70b tests", arch: wormhole_b0, cmd: run_t3000_llama3.1-70b_tests, timeout: 30, label: pipeline-functional, owner_id: U03PUAKE719}, #Miguel Tairum Cruz + { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, label: pipeline-functional, owner_id: U03PUAKE719}, #Miguel Tairum Cruz + { name: "t3k grok tests", arch: wormhole_b0, cmd: run_t3000_grok_tests, timeout: 30, label: pipeline-functional, owner_id: U03HY7MK4BT}, #Mark O'Connor + { name: "t3k unet shallow tests", arch: wormhole_b0, cmd: run_t3000_unet_shallow_tests, timeout: 30, label: pipeline-functional, owner_id: U06ECNVR0EN}, #Evan Smal ] name: ${{ matrix.test-group.name }} env: @@ -36,7 +36,7 @@ jobs: runs-on: - arch-wormhole_b0 - config-t3000 - - pipeline-functional + - ${{ matrix.test-group.label}} - ${{ inputs.extra-tag }} steps: - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main From 2d2a4c5d6b23db6af38a2e8b4ee6b147e138cca4 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Tue, 25 Feb 2025 23:16:05 -0500 Subject: [PATCH 314/316] #0: Disable ttnn::experimental::view path for multi device storage types (#17760) ### Ticket N/A ### Problem description Fix inconsistency between single vs multi device storage types. ### What's changed Disable `ttnn::experimental::view` path for multi device storage types. Dedup code in the similar place in `permute` op. 
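As a rough illustration of the guard change, the sketch below contrasts the old and new predicates with a stand-in `StorageType` enum and free functions. It is not the actual ttnn code; it only mirrors the predicate logic in the `reshape.cpp` hunk below, where the old check recognized only `StorageType::DEVICE`, so multi-device tensors fell through to the `ttnn::experimental::view` fallback meant for host tensors.

```cpp
#include <iostream>

// Stand-in enum for illustration; the real StorageType lives in ttnn.
enum class StorageType { OWNED, BORROWED, DEVICE, MULTI_DEVICE, MULTI_DEVICE_HOST };

// Old guard: only single-device storage counted as "on device", so multi-device
// tensors took the experimental::view fallback intended for host tensors.
bool old_guard_uses_view_fallback(StorageType storage) {
    return storage != StorageType::DEVICE;
}

// New guard (mirroring is_tensor_on_device_or_multidevice): multi-device storage is
// treated like device storage and must go through the regular reshape/view checks.
bool new_guard_uses_view_fallback(StorageType storage) {
    return storage != StorageType::DEVICE && storage != StorageType::MULTI_DEVICE;
}

int main() {
    auto storage = StorageType::MULTI_DEVICE;
    std::cout << "old path falls back to experimental::view: "
              << old_guard_uses_view_fallback(storage) << "\n";  // prints 1
    std::cout << "new path falls back to experimental::view: "
              << new_guard_uses_view_fallback(storage) << "\n";  // prints 0
    return 0;
}
```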
### Checklist - [X] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13532177165) - pending --- tests/ttnn/unit_tests/test_reshape.py | 22 ++++++++++++++++ .../data_movement/permute/permute.cpp | 7 +---- .../data_movement/reshape_view/reshape.cpp | 26 ++++++++++--------- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/tests/ttnn/unit_tests/test_reshape.py b/tests/ttnn/unit_tests/test_reshape.py index 25dd9c37a07..40fd7c15052 100644 --- a/tests/ttnn/unit_tests/test_reshape.py +++ b/tests/ttnn/unit_tests/test_reshape.py @@ -533,3 +533,25 @@ def test_reshape_zero_element(input_shape, output_shape, layout, ttnn_reshape, u tt_output_tensor = ttnn.from_device(tt_output_tensor) tt_output_tensor = ttnn.to_torch(tt_output_tensor) assert tt_output_tensor.shape == torch.Size(output_shape) + + +@pytest.mark.xfail( + reason="Test that the previously supported reshape accounting for the physical shape is no longer possible" +) +@pytest.mark.parametrize( + "input_shape, output_shape", + [ + ([32, 256], [1, 256]), + ], +) +def test_reshape_replicated_tensor(mesh_device, input_shape, output_shape): + torch_input_tensor = torch.randn(input_shape) + mesh_mapper = ttnn.ReplicateTensorToMesh(mesh_device) + tt_input_tensor = ttnn.from_torch( + torch_input_tensor, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, mesh_mapper=mesh_mapper, device=mesh_device + ) + tt_output_tensor = ttnn.reshape(tt_input_tensor, ttnn.Shape(output_shape)) + + for tensor_shard in ttnn.get_device_tensors(tt_output_tensor): + tt_output_tensor = ttnn.to_torch(tensor_shard) + assert tt_output_tensor.shape == torch.Size(output_shape) diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp index f0c4ee555ed..98ad66655c0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp @@ -19,11 +19,6 @@ namespace ttnn::operations::data_movement { namespace detail { -inline bool is_on_device(const Tensor& t) { - return ttnn::has_storage_type_of(t, ttnn::StorageType::DEVICE) or - ttnn::has_storage_type_of(t, ttnn::StorageType::MULTI_DEVICE); -} - ttnn::Tensor permute_impl( const ttnn::Tensor& a, const ttnn::SmallVector& dims, @@ -185,7 +180,7 @@ ttnn::Tensor ExecutePermute::invoke( TT_FATAL( input_rank == dims.size(), "The number of dimensions in the tensor input does not match the length of the desired ordering"); - TT_FATAL(detail::is_on_device(input_tensor), "Tensor must already be on device"); + TT_FATAL(is_tensor_on_device_or_multidevice(input_tensor), "Tensor must already be on device"); SmallVector normalized_dims(dims.size()); std::transform(dims.begin(), dims.end(), normalized_dims.begin(), [input_tensor](std::int64_t idx) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index 982271baf61..90b35c86243 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -366,28 +366,34 @@ ttnn::Tensor ReshapeViewOperation::invoke( default_pad_value = (uint32_t)0; } - //const uint32_t tile_first_dim =tensor.get_tile().get_width(); - //const uint32_t tile_second_dim =tensor.get_tile().get_height(); + // const uint32_t tile_first_dim =tensor.get_tile().get_width(); + // const uint32_t tile_second_dim =tensor.get_tile().get_height(); const uint32_t tile_first_dim = 32; const uint32_t 
tile_second_dim = 32; - //The following case should only be called for the device storage case, the rest is a bandaid - //for issue 15317 + // The following case should only be called for the device storage case, the rest is a bandaid + // for issue 15317 const uint32_t shape_last_dim = logical_shape.rank() >= 1 ? logical_shape[-1] : 1; const uint32_t tensor_shape_last_dim = tensor_shape.rank() >= 1 ? tensor_shape[-1] : 1; const uint32_t shape_second_last_dim = logical_shape.rank() >= 2 ? logical_shape[-2] : 1; - const uint32_t tensor_shape_second_last_dim = tensor_shape.rank() >= 2 ? tensor_shape[-2]:1; + const uint32_t tensor_shape_second_last_dim = tensor_shape.rank() >= 2 ? tensor_shape[-2] : 1; // Just edit shape if shape has a 0 dimension if (tensor.get_logical_volume() == 0) { TT_FATAL(logical_shape.volume() == 0, "Tensor volume is 0, but shape's volume is not"); - TT_FATAL((tensor.storage_type() != StorageType::MULTI_DEVICE && - tensor.storage_type() != StorageType::MULTI_DEVICE_HOST), - "Reshaping a multi-device tensor with 0 volume is not supported"); + TT_FATAL( + (tensor.storage_type() != StorageType::MULTI_DEVICE && + tensor.storage_type() != StorageType::MULTI_DEVICE_HOST), + "Reshaping a multi-device tensor with 0 volume is not supported"); return ttnn::experimental::view(tensor, logical_shape, padded_shape); } TT_FATAL(logical_shape.volume() != 0, "Tensor volume is not 0, but shape volume is 0"); + if (!is_tensor_on_device_or_multidevice(tensor)) { + // This case has been allowed in the past though it means introducing padding values to the data + return ttnn::experimental::view(tensor, logical_shape, padded_shape); + } + bool this_is_view = (tensor_shape_last_dim == shape_last_dim) && (mem_config.is_sharded() == tensor.memory_config().is_sharded()) && (mem_config.is_l1() == tensor.memory_config().is_l1()) && @@ -395,10 +401,6 @@ ttnn::Tensor ReshapeViewOperation::invoke( (tensor_shape_second_last_dim == shape_second_last_dim) || // Second last dimension is the same (shape_second_last_dim % tile_second_dim == 0 && tensor_shape_second_last_dim % tile_first_dim == 0)); // There is no padding on the second last dimension - if (!(ttnn::has_storage_type_of(tensor, ttnn::StorageType::DEVICE))) { - // This case has been allowed in the past though it means introducing padding values to the data - return ttnn::experimental::view(tensor, logical_shape, padded_shape); - } if (this_is_view) { return PerformView(tensor, logical_shape, padded_shape, tile_first_dim, tile_second_dim); From 5fc77c98da21d0dd0968b9c06284945aa97b599b Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Tue, 25 Feb 2025 23:31:27 -0500 Subject: [PATCH 315/316] enable -O3 compile for 1D fabric (#18312) ~ 13% perf bump @ 4k packet size --- .../ethernet/test_fabric_edm_bandwidth.py | 4 ++-- ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/tt_metal/microbenchmarks/ethernet/test_fabric_edm_bandwidth.py b/tests/tt_metal/microbenchmarks/ethernet/test_fabric_edm_bandwidth.py index de0e3ac5181..152b3c00296 100644 --- a/tests/tt_metal/microbenchmarks/ethernet/test_fabric_edm_bandwidth.py +++ b/tests/tt_metal/microbenchmarks/ethernet/test_fabric_edm_bandwidth.py @@ -90,7 +90,7 @@ def run_fabric_edm( @pytest.mark.parametrize("packet_size", [4096]) @pytest.mark.parametrize( "expected_bw", - [5.65], + [6.5], ) def test_fabric_edm_mcast_bw( num_mcasts, num_unicasts, num_links, num_op_invocations, line_sync, line_size, packet_size, expected_bw @@ 
-117,7 +117,7 @@ def test_fabric_edm_mcast_bw( @pytest.mark.parametrize("packet_size", [4096]) @pytest.mark.parametrize( "expected_bw", - [7.13], + [7.5], ) def test_fabric_edm_unicast_bw( num_mcasts, num_unicasts, num_links, num_op_invocations, line_sync, line_size, packet_size, expected_bw diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp index 3d684c08996..faa87870ab8 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp @@ -269,7 +269,8 @@ KernelHandle generate_edm_kernel_impl( EDMBuilder const& edm_builder, std::string const& kernel_path, CoreCoord const& eth_core, - NOC noc_id) { + NOC noc_id, + std::optional opt_level = std::nullopt) { edm_builder.dump_to_log(); std::vector const edm_kernel_rt_args = edm_builder.get_runtime_args(); @@ -281,11 +282,15 @@ KernelHandle generate_edm_kernel_impl( log_trace(tt::LogOp, "\t{}", s); } + auto kernel_config = tt::tt_metal::EthernetConfig{.noc = noc_id, .compile_args = eth_sender_ct_args}; + if (opt_level.has_value()) { + kernel_config.opt_level = opt_level.value(); + } auto eth_sender_kernel = tt::tt_metal::CreateKernel( program, kernel_path, eth_core, - tt::tt_metal::EthernetConfig{.noc = noc_id, .compile_args = eth_sender_ct_args}); + kernel_config); tt::tt_metal::SetRuntimeArgs(program, eth_sender_kernel, eth_core, edm_kernel_rt_args); @@ -311,7 +316,8 @@ KernelHandle generate_edm_kernel( edm_builder, "ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp", eth_core, - noc_id); + noc_id, + tt::tt_metal::KernelBuildOptLevel::O3); } KernelHandle generate_edm_kernel( @@ -327,7 +333,7 @@ KernelHandle generate_edm_kernel( ccl::EriscDatamoverBuilder create_erisc_datamover_builder( std::size_t num_channels, uint32_t page_size, - std::size_t num_buffers_per_channel, + size_t num_buffers_per_channel, ccl::EriscDataMoverBufferSharingMode buffer_sharing_mode, ccl::EriscDataMoverTerminationMode termination_mode) { ccl::EriscDatamoverConfig config; From e0585b2cdb34e013544623d04beb2120209bda8e Mon Sep 17 00:00:00 2001 From: "Jush (yupiop12)" <36951064+JushBJJ@users.noreply.github.com> Date: Wed, 26 Feb 2025 15:39:39 +1000 Subject: [PATCH 316/316] Add cstdint into queue_id.hpp (#18174) --- ttnn/cpp/ttnn/common/queue_id.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ttnn/cpp/ttnn/common/queue_id.hpp b/ttnn/cpp/ttnn/common/queue_id.hpp index dc9d801bbc6..2f0ecaf8578 100644 --- a/ttnn/cpp/ttnn/common/queue_id.hpp +++ b/ttnn/cpp/ttnn/common/queue_id.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include namespace ttnn { /*