From da63d50e6d718c8622c229dc63df60234c09e8c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Wed, 26 Feb 2025 15:37:36 +0100 Subject: [PATCH] [UMD] Switch pci_cores and dram_cores to CoreCoord api. (#17620) ### Ticket Related to https://github.com/tenstorrent/tt-metal/issues/17002 ### Problem description Switched .pci_cores and .dram_cores to new API. ### What's changed - Changed .pci_cores and .dram_cores to get_cores - Changed old get_core_for_dram_channel with new get_dram_core_for_channel ### Checklist - [x] All post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13543882883 - [x] Blackhole post-commit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13543884998 - [x] (Single-card) Model perf tests : https://github.com/tenstorrent/tt-metal/actions/runs/13543887484 - [ ] (Single-card) Device perf regressions : https://github.com/tenstorrent/tt-metal/actions/runs/13543889644 - [ ] (T3K) T3000 unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13543891837 - [ ] (T3K) T3000 demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13543893764 - [x] (TG) TG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13543895516 - [ ] (TG) TG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13543897570 - [x] (TGG) TGG unit tests : https://github.com/tenstorrent/tt-metal/actions/runs/13543899812 - [x] (TGG) TGG demo tests : https://github.com/tenstorrent/tt-metal/actions/runs/13543902321 --- .../perf_microbenchmark/dispatch/common.h | 5 +-- .../dispatch/test_bw_and_latency.cpp | 4 +-- .../api/tt-metalium/metal_soc_descriptor.h | 6 ---- tt_metal/common/metal_soc_descriptor.cpp | 33 ------------------- tt_metal/impl/device/device.cpp | 10 +++--- tt_metal/jit_build/build_env_manager.cpp | 2 +- tt_metal/llrt/sanitize_noc_host.hpp | 14 ++++++-- tt_metal/llrt/tt_cluster.cpp | 8 +++-- tt_metal/third_party/umd | 2 +- 9 files changed, 30 insertions(+), 54 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index 2d9742cb83d..264d5007bf3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -119,8 +119,9 @@ DeviceData::DeviceData( this->amt_written = 0; const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - const std::vector& pcie_cores = soc_d.get_pcie_cores(); - for (CoreCoord core : pcie_cores) { + const std::vector& pcie_cores = soc_d.get_cores(CoreType::PCIE, soc_d.get_umd_coord_system()); + for (const CoreCoord& core_coord : pcie_cores) { + CoreCoord core = {core_coord.x, core_coord.y}; // TODO: make this all work w/ phys coords // this is really annoying // the PCIE phys core conflicts w/ worker logical cores diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 31f7c2296ed..2dbc138b64b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -208,7 +208,7 @@ int main(int argc, char** argv) { case 0: default: { src_mem = "FROM_PCIE"; - vector pcie_cores = soc_d.get_pcie_cores(); + vector pcie_cores = soc_d.get_cores(CoreType::PCIE, soc_d.get_umd_coord_system()); TT_ASSERT(pcie_cores.size() > 0); noc_addr_x = pcie_cores[0].x; noc_addr_y = pcie_cores[0].y; @@ -216,7 +216,7 @@ int main(int argc, char** argv) { } break; case 1: { src_mem = "FROM_DRAM"; - vector dram_cores = soc_d.get_dram_cores(); + vector dram_cores = soc_d.get_cores(CoreType::DRAM, soc_d.get_umd_coord_system()); TT_ASSERT(dram_cores.size() > dram_channel_g); noc_addr_x = dram_cores[dram_channel_g].x; noc_addr_y = dram_cores[dram_channel_g].y; diff --git a/tt_metal/api/tt-metalium/metal_soc_descriptor.h b/tt_metal/api/tt-metalium/metal_soc_descriptor.h index 26d17e84fd8..c256f9e219b 100644 --- a/tt_metal/api/tt-metalium/metal_soc_descriptor.h +++ b/tt_metal/api/tt-metalium/metal_soc_descriptor.h @@ -35,9 +35,6 @@ struct metal_SocDescriptor : public tt_SocDescriptor { size_t get_channel_for_dram_view(int dram_view) const; size_t get_num_dram_views() const; - const std::vector& get_pcie_cores() const; - const std::vector get_dram_cores() const; - int get_dram_channel_from_logical_core(const CoreCoord& logical_coord) const; CoreCoord get_physical_ethernet_core_from_logical(const CoreCoord& logical_coord) const; @@ -62,7 +59,4 @@ struct metal_SocDescriptor : public tt_SocDescriptor { void load_dram_metadata_from_device_descriptor(); void generate_logical_eth_coords_mapping(); void generate_physical_routing_to_profiler_flat_id(); - // This is temporary until virtual coordinates are enabled because BH chips on - // different cards use different physical PCIe NoC endpoints - void update_pcie_cores(const BoardType& board_type); }; diff --git a/tt_metal/common/metal_soc_descriptor.cpp b/tt_metal/common/metal_soc_descriptor.cpp index 80341239f3b..0535250dd26 100644 --- a/tt_metal/common/metal_soc_descriptor.cpp +++ b/tt_metal/common/metal_soc_descriptor.cpp @@ -60,21 +60,6 @@ size_t metal_SocDescriptor::get_channel_for_dram_view(int dram_view) const { size_t metal_SocDescriptor::get_num_dram_views() const { return this->dram_view_eth_cores.size(); } -const std::vector& metal_SocDescriptor::get_pcie_cores() const { return this->pcie_cores; } - -const std::vector metal_SocDescriptor::get_dram_cores() const { - std::vector cores; - - // This is inefficient, but is currently not used in a perf path - for (const auto& channel_it : this->dram_cores) { - for (const auto& core_it : channel_it) { - cores.push_back(core_it); - } - } - - return cores; -} - int metal_SocDescriptor::get_dram_channel_from_logical_core(const CoreCoord& logical_coord) const { const uint32_t num_dram_views = this->get_num_dram_views(); TT_FATAL( @@ -205,23 +190,6 @@ void metal_SocDescriptor::generate_physical_routing_to_profiler_flat_id() { #endif } -// TODO: This should be deleted once we switch to virtual coordinates -void metal_SocDescriptor::update_pcie_cores(const BoardType& board_type) { - if (this->arch != tt::ARCH::BLACKHOLE) { - return; - } - switch (board_type) { - case P100: - case UNKNOWN: { // Workaround for BHs running FW that does not return board type in the cluster yaml - this->pcie_cores = {CoreCoord(11, 0)}; - } break; - case P150A: { - this->pcie_cores = {CoreCoord(2, 0)}; - } break; - default: TT_THROW("Need to update PCIe core assignment for new Blackhole type, file issue to abhullar"); - } -} - // UMD initializes and owns tt_SocDescriptor // For architectures with translation tables enabled, UMD will remove the last x rows from the descriptors in // tt_SocDescriptor (workers list and worker_log_to_routing_x/y maps) This creates a virtual coordinate system, where @@ -235,5 +203,4 @@ metal_SocDescriptor::metal_SocDescriptor(const tt_SocDescriptor& other, const Bo this->load_dram_metadata_from_device_descriptor(); this->generate_logical_eth_coords_mapping(); this->generate_physical_routing_to_profiler_flat_id(); - this->update_pcie_cores(board_type); } diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index a7798a35ba9..71cae1833ab 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -597,14 +597,14 @@ void Device::initialize_and_launch_firmware() { core_info->noc_dram_addr_base = 0; core_info->noc_dram_addr_end = soc_d.dram_core_size; - const std::vector &pcie_cores = soc_d.get_pcie_cores(); - const std::vector &dram_cores = soc_d.get_dram_cores(); + const std::vector& pcie_cores = soc_d.get_cores(CoreType::PCIE, soc_d.get_umd_coord_system()); + const std::vector& dram_cores = soc_d.get_cores(CoreType::DRAM, soc_d.get_umd_coord_system()); const std::vector& eth_cores = soc_d.get_cores(CoreType::ETH, CoordSystem::PHYSICAL); // The SOC descriptor can list a dram core multiple times, depending on how GDDR is assigned to banks // Get a list of unique DRAM cores. std::unordered_set unique_dram_cores(dram_cores.begin(), dram_cores.end()); TT_ASSERT( - pcie_cores.size() + unique_dram_cores.size() + eth_cores.size() <= MAX_NON_WORKER_CORES, + pcie_cores.size() + dram_cores.size() + eth_cores.size() <= MAX_NON_WORKER_CORES, "Detected more pcie/dram/eth cores than fit in the device mailbox."); TT_ASSERT( eth_cores.size() <= MAX_VIRTUAL_NON_WORKER_CORES, @@ -617,10 +617,10 @@ void Device::initialize_and_launch_firmware() { } int non_worker_cores_idx = 0; - for (const CoreCoord &core : pcie_cores) { + for (const tt::umd::CoreCoord& core : pcie_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::PCIE}; } - for (const CoreCoord &core : unique_dram_cores) { + for (const tt::umd::CoreCoord& core : dram_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::DRAM}; } for (const tt::umd::CoreCoord& core : eth_cores) { diff --git a/tt_metal/jit_build/build_env_manager.cpp b/tt_metal/jit_build/build_env_manager.cpp index 0d0c0217ac0..afac9cb6c7b 100644 --- a/tt_metal/jit_build/build_env_manager.cpp +++ b/tt_metal/jit_build/build_env_manager.cpp @@ -70,7 +70,7 @@ std::map initialize_device_kernel_defines(chip_id_t de // TODO (abhullar): Until we switch to virtual coordinates, we need to pass physical PCIe coordinates to device // because Blackhole PCIe endpoint is dependent on board type - auto pcie_cores = soc_d.get_pcie_cores(); + auto pcie_cores = soc_d.get_cores(CoreType::PCIE, soc_d.get_umd_coord_system()); CoreCoord pcie_core = pcie_cores.empty() ? soc_d.grid_size : pcie_cores[0]; device_kernel_defines.emplace("PCIE_NOC_X", std::to_string(pcie_core.x)); diff --git a/tt_metal/llrt/sanitize_noc_host.hpp b/tt_metal/llrt/sanitize_noc_host.hpp index 17305169bfc..15b93b9af4e 100644 --- a/tt_metal/llrt/sanitize_noc_host.hpp +++ b/tt_metal/llrt/sanitize_noc_host.hpp @@ -21,6 +21,16 @@ namespace tt { ((((a) >= HAL_MEM_ETH_BASE) && ((a) + (l) <= HAL_MEM_ETH_BASE + HAL_MEM_ETH_SIZE)) || \ (DEBUG_VALID_REG_ADDR(a) && (l) == 4)) +static bool coord_found_p(std::vector coords, CoreCoord core) { + for (const tt::umd::CoreCoord& core_coord : coords) { + CoreCoord item = {core_coord.x, core_coord.y}; + if (item == core) { + return true; + } + } + return false; +} + static bool coord_found_p(std::vector coords, CoreCoord core) { for (CoreCoord item : coords) { if (item == core) { @@ -68,9 +78,9 @@ static void watcher_sanitize_host_noc( const CoreCoord& core, uint64_t addr, uint32_t lbytes) { - if (coord_found_p(soc_d.get_pcie_cores(), core)) { + if (coord_found_p(soc_d.get_cores(CoreType::PCIE, soc_d.get_umd_coord_system()), core)) { TT_THROW("Host watcher: bad {} NOC coord {}", what, core.str()); - } else if (coord_found_p(soc_d.get_dram_cores(), core)) { + } else if (coord_found_p(soc_d.get_cores(CoreType::DRAM, soc_d.get_umd_coord_system()), core)) { uint64_t dram_addr_base = 0; uint64_t dram_addr_size = soc_d.dram_core_size; uint64_t dram_addr_end = dram_addr_size - dram_addr_base; diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index d6f678b7217..11e3fdc30cc 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -512,7 +512,9 @@ void Cluster::write_dram_vec(std::vector &vec, tt_target_dram dram, ui TT_ASSERT( d_subchannel < desc_to_use.dram_cores.at(d_chan).size(), "Trying to address dram sub channel that doesnt exist in the device descriptor"); - tt_cxy_pair dram_core = tt_cxy_pair(chip_id, desc_to_use.get_core_for_dram_channel(d_chan, d_subchannel)); + tt::umd::CoreCoord dram_core_coord = + desc_to_use.get_dram_core_for_channel(d_chan, d_subchannel, CoordSystem::VIRTUAL); + tt_cxy_pair dram_core = tt_cxy_pair(chip_id, dram_core_coord.x, dram_core_coord.y); size_t offset = desc_to_use.get_address_offset(d_view); write_core(vec.data(), vec.size() * sizeof(uint32_t), dram_core, addr + offset, small_access); } @@ -531,7 +533,9 @@ void Cluster::read_dram_vec( TT_ASSERT( d_subchannel < desc_to_use.dram_cores.at(d_chan).size(), "Trying to address dram sub channel that doesnt exist in the device descriptor"); - tt_cxy_pair dram_core = tt_cxy_pair(chip_id, desc_to_use.get_core_for_dram_channel(d_chan, d_subchannel)); + tt::umd::CoreCoord dram_core_coord = + desc_to_use.get_dram_core_for_channel(d_chan, d_subchannel, CoordSystem::VIRTUAL); + tt_cxy_pair dram_core = tt_cxy_pair(chip_id, dram_core_coord.x, dram_core_coord.y); size_t offset = desc_to_use.get_address_offset(d_view); read_core(vec, sz_in_bytes, dram_core, addr + offset, small_access); } diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index ebb0f945ed8..89e23562388 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit ebb0f945ed8d3c05e043158978201ed6fab884ec +Subproject commit 89e235623881014e475cdfcd172c2222217346a4