From 1eb7bce6cfaa9a8faad2f086b22d96822b7a7a15 Mon Sep 17 00:00:00 2001 From: Brett Grady Date: Fri, 21 Feb 2025 18:09:10 +0000 Subject: [PATCH] dataflow api refactor --- tt_metal/hw/firmware/src/brisc.cc | 1 + tt_metal/hw/firmware/src/brisck.cc | 1 + tt_metal/hw/firmware/src/erisc.cc | 1 + tt_metal/hw/firmware/src/erisck.cc | 1 + tt_metal/hw/firmware/src/idle_erisc.cc | 1 + tt_metal/hw/firmware/src/idle_erisck.cc | 1 + tt_metal/hw/firmware/src/ncrisc.cc | 1 + tt_metal/hw/firmware/src/ncrisck.cc | 1 + tt_metal/hw/firmware/src/slave_idle_erisc.cc | 1 + tt_metal/hw/inc/dataflow_api.h | 640 +----------------- tt_metal/hw/inc/dataflow_api_addrgen.h | 654 +++++++++++++++++++ tt_metal/tools/profiler/kernel_profiler.hpp | 10 +- 12 files changed, 673 insertions(+), 640 deletions(-) create mode 100644 tt_metal/hw/inc/dataflow_api_addrgen.h diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 992b4dd8d67..3714698339f 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -17,6 +17,7 @@ #include "tdma_xmov.h" #include "noc_nonblocking_api.h" #include "firmware_common.h" +#include "dataflow_api.h" #include "tools/profiler/kernel_profiler.hpp" #include "dev_msgs.h" #include "risc_attribs.h" diff --git a/tt_metal/hw/firmware/src/brisck.cc b/tt_metal/hw/firmware/src/brisck.cc index bac6b3c611e..9484d571f21 100644 --- a/tt_metal/hw/firmware/src/brisck.cc +++ b/tt_metal/hw/firmware/src/brisck.cc @@ -15,6 +15,7 @@ #include "c_tensix_core.h" #include "noc_nonblocking_api.h" #include "firmware_common.h" +#include "dataflow_api.h" #include "tools/profiler/kernel_profiler.hpp" #include #if defined ALIGN_LOCAL_CBS_TO_REMOTE_CBS diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index 883b615c9c7..7dc0390f329 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -7,6 +7,7 @@ #include "firmware_common.h" #include "noc_parameters.h" #include 
"risc_attribs.h" +#include "dataflow_api.h" #include "tools/profiler/kernel_profiler.hpp" #include "debug/watcher_common.h" diff --git a/tt_metal/hw/firmware/src/erisck.cc b/tt_metal/hw/firmware/src/erisck.cc index e2c8f068a34..f7be09ef433 100644 --- a/tt_metal/hw/firmware/src/erisck.cc +++ b/tt_metal/hw/firmware/src/erisck.cc @@ -16,6 +16,7 @@ #include "stream_io_map.h" #include "tdma_xmov.h" #include "debug/dprint.h" +#include "dataflow_api.h" #include "tools/profiler/kernel_profiler.hpp" #include #include diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index fe762348a5a..68df5a39dc6 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -16,6 +16,7 @@ #include "tdma_xmov.h" #include "noc_nonblocking_api.h" #include "firmware_common.h" +#include "dataflow_api.h" #include "tools/profiler/kernel_profiler.hpp" #include "dev_msgs.h" #include "risc_attribs.h" diff --git a/tt_metal/hw/firmware/src/idle_erisck.cc b/tt_metal/hw/firmware/src/idle_erisck.cc index 3aa42a8af14..ee9a9ba31e5 100644 --- a/tt_metal/hw/firmware/src/idle_erisck.cc +++ b/tt_metal/hw/firmware/src/idle_erisck.cc @@ -16,6 +16,7 @@ #include "tdma_xmov.h" #include "noc_nonblocking_api.h" #include "firmware_common.h" +#include "dataflow_api.h" #include "tools/profiler/kernel_profiler.hpp" #include "dataflow_api.h" diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index f79713cc239..015ed8bf2af 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -9,6 +9,7 @@ #include "dev_msgs.h" #include "stream_io_map.h" #include "firmware_common.h" +#include "dataflow_api.h" #include "tools/profiler/kernel_profiler.hpp" #include "risc_attribs.h" #include "circular_buffer.h" diff --git a/tt_metal/hw/firmware/src/ncrisck.cc b/tt_metal/hw/firmware/src/ncrisck.cc index 5b99da3bd1b..9a28f743761 100644 --- a/tt_metal/hw/firmware/src/ncrisck.cc +++ 
b/tt_metal/hw/firmware/src/ncrisck.cc @@ -15,6 +15,7 @@ #include "risc_perf.h" #endif #include "firmware_common.h" +#include "dataflow_api.h" #include "tools/profiler/kernel_profiler.hpp" #include "tensix_functions.h" #include "c_tensix_core.h" diff --git a/tt_metal/hw/firmware/src/slave_idle_erisc.cc b/tt_metal/hw/firmware/src/slave_idle_erisc.cc index 0454641e778..90f2bd4ad17 100644 --- a/tt_metal/hw/firmware/src/slave_idle_erisc.cc +++ b/tt_metal/hw/firmware/src/slave_idle_erisc.cc @@ -9,6 +9,7 @@ #include "dev_msgs.h" #include "stream_io_map.h" #include "firmware_common.h" +#include "dataflow_api.h" #include "tools/profiler/kernel_profiler.hpp" #include "risc_attribs.h" #include "circular_buffer.h" diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 4800b0dc42b..f8d5699e2b9 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -26,119 +26,7 @@ #include "utils/utils.h" #include "debug/assert.h" #include "dev_msgs.h" - -#if defined(COMPILE_FOR_BRISC) -constexpr uint8_t proc_type = static_cast>(TensixProcessorTypes::DM0); -#else -constexpr uint8_t proc_type = static_cast>(TensixProcessorTypes::DM1); -#endif -#if defined(KERNEL_BUILD) -constexpr uint8_t noc_index = NOC_INDEX; -constexpr uint8_t noc_mode = NOC_MODE; -#else - -extern uint8_t noc_index; -constexpr uint8_t noc_mode = DM_DEDICATED_NOC; -#endif -extern uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS]; -extern int32_t bank_to_dram_offset[NUM_DRAM_BANKS]; -extern uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS]; -extern int32_t bank_to_l1_offset[NUM_L1_BANKS]; - -extern uint32_t tt_l1_ptr* rta_l1_base; -extern uint32_t tt_l1_ptr* crta_l1_base; -extern uint32_t tt_l1_ptr* sem_l1_base[]; - -/** @file */ - -/** - * \private - */ - -// Use VC 1 for unicast writes, and VC 4 for mcast writes -#define NOC_UNICAST_WRITE_VC 1 -#define NOC_MULTICAST_WRITE_VC 4 -#define NOC_DISPATCH_MULTICAST_WRITE_VC 5 // Only to be used by the dispatch cores - 
-#define EXCLUDE_ENABLED 1 -#define EXCLUDE_ENABLED_OFFSET 22 -#define EXCLUDE_DIRECTION_Y_OFFSET 21 -#define EXCLUDE_DIRECTION_X_OFFSET 20 -#define EXCLUDE_START_Y_OFFSET 14 -#define EXCLUDE_START_X_OFFSET 8 -#define DYNAMIC_NOC_DIRECTION(noc, direction) (noc == 1 ? 1 - direction : direction) - -static_assert(NUM_NOCS == 2); -// "Scratch" in L1 has space allocated for 256 DRAM and L1 enteries, to store offsets and NOC XY data. -// (MEM_BANK_TO_NOC_XY_SCRATCH and MEM_BANK_OFFSET_SCRATCH) -static_assert((NUM_DRAM_BANKS + NUM_L1_BANKS) <= 256); - -namespace interleaved_addr_gen { - -template -FORCE_INLINE uint32_t get_bank_offset_index(uint32_t id) { - if constexpr (DRAM) { // DRAM -#ifdef IS_NOT_POW2_NUM_DRAM_BANKS - return udivsi3_const_divisor(id); -#else - return id >> LOG_BASE_2_OF_NUM_DRAM_BANKS; -#endif - } else { // L1 -#ifdef IS_NOT_POW2_NUM_L1_BANKS - return udivsi3_const_divisor(id); -#else - return id >> LOG_BASE_2_OF_NUM_L1_BANKS; -#endif - } -} - -template -FORCE_INLINE uint32_t get_bank_index(uint32_t id, uint32_t bank_offset_index) { - if constexpr (DRAM) { // DRAM - return id - bank_offset_index * NUM_DRAM_BANKS; - } else { // L1 - return id - bank_offset_index * NUM_L1_BANKS; - } -} - -template -FORCE_INLINE uint32_t get_noc_xy(uint32_t bank_index, uint8_t noc = noc_index) { - if constexpr (DRAM) { // DRAM - return dram_bank_to_noc_xy[noc][bank_index]; - } else { // L1 - return l1_bank_to_noc_xy[noc][bank_index]; - } -} - -template -FORCE_INLINE uint32_t get_bank_offset(uint32_t bank_index) { - if constexpr (DRAM) { // DRAM - return bank_to_dram_offset[bank_index]; - } else { // L1 - return bank_to_l1_offset[bank_index]; - } -} - -template -FORCE_INLINE -constexpr uint32_t get_allocator_alignment() { - if constexpr (DRAM) { - return DRAM_ALIGNMENT; - } else { - return L1_ALIGNMENT; - } -} - -template -FORCE_INLINE -constexpr uint32_t get_log_base2_of_allocator_alignment() { - if constexpr (DRAM) { - return LOG_BASE_2_OF_DRAM_ALIGNMENT; - } else { - 
return LOG_BASE_2_OF_L1_ALIGNMENT; - } -} -} // namespace interleaved_addr_gen +#include "dataflow_api_addrgen.h" // clang-format off /** @@ -221,44 +109,6 @@ FORCE_INLINE T get_common_arg_val(int arg_idx) { // clang-format on #define get_compile_time_arg_val(arg_idx) KERNEL_COMPILE_TIME_ARG_##arg_idx -template -FORCE_INLINE constexpr static std::uint32_t MUL_WITH_TILE_SIZE(uint format, uint index) { - constexpr uint8_t datum_shift = (tile_hw == 1024) ? 10 - : (tile_hw == 512) ? 9 - : (tile_hw == 256) ? 8 - : (tile_hw == 128) ? 7 - : (tile_hw == 64) ? 6 - : (tile_hw == 32) ? 5 - : (tile_hw == 16) ? 4 - : 10; - - constexpr uint8_t exp_shift = (tile_hw == 1024) ? 6 - : (tile_hw == 512) ? 5 - : (tile_hw == 256) ? 4 - : (tile_hw == 128) ? 4 - : (tile_hw == 64) ? 4 - : (tile_hw == 32) ? 4 - : (tile_hw == 16) ? 4 - : 6; - switch (format & 0x1F) { - case ((uint8_t)DataFormat::UInt8): return (index << datum_shift); - case ((uint8_t)DataFormat::UInt16): - case ((uint8_t)DataFormat::Float16): - case ((uint8_t)DataFormat::Float16_b): return (index << (datum_shift + 1)); - case ((uint8_t)DataFormat::Int32): - case ((uint8_t)DataFormat::UInt32): - case ((uint8_t)DataFormat::Float32): return (index << (datum_shift + 2)); - case ((uint8_t)DataFormat::Bfp2): - case ((uint8_t)DataFormat::Bfp2_b): return ((index << (datum_shift - 2)) + (index << (exp_shift))); - case ((uint8_t)DataFormat::Bfp4): - case ((uint8_t)DataFormat::Bfp4_b): return ((index << (datum_shift - 1)) + (index << (exp_shift))); - case ((uint8_t)DataFormat::Bfp8): - case ((uint8_t)DataFormat::Bfp8_b): - // Keep default as Bfp8? - default: return ((index << datum_shift) + (index << (exp_shift))); - }; -} - // clang-format off /** * Pushes a given number of tiles in the back of the specified CB’s queue. 
@@ -580,121 +430,6 @@ void cb_wait_front(int32_t operand, int32_t num_pages) { // simple APIs -FORCE_INLINE -std::uint64_t get_noc_multicast_addr( - std::uint32_t noc_x_start, - std::uint32_t noc_y_start, - std::uint32_t noc_x_end, - std::uint32_t noc_y_end, - std::uint32_t addr, - uint8_t noc = noc_index) { - /* - Get an encoding which contains tensix core and address you want to - read from/write to via the noc - */ - return NOC_MULTICAST_ADDR( - DYNAMIC_NOC_X(noc, noc_x_start), - DYNAMIC_NOC_Y(noc, noc_y_start), - DYNAMIC_NOC_X(noc, noc_x_end), - DYNAMIC_NOC_Y(noc, noc_y_end), - addr); -} - -FORCE_INLINE -std::uint64_t get_noc_addr(std::uint32_t noc_x, std::uint32_t noc_y, std::uint32_t addr, uint8_t noc = noc_index) { - /* - Get an encoding which contains tensix core and address you want to - write to via the noc multicast - */ - - return NOC_XY_ADDR(DYNAMIC_NOC_X(noc, noc_x), DYNAMIC_NOC_Y(noc, noc_y), addr); -} - -/* - Need an alias to get_noc_addr so that the structs below don't confuse the above get_noc_addr with - the struct variant -*/ -FORCE_INLINE -std::uint64_t get_noc_addr_helper(std::uint32_t noc_xy, std::uint32_t addr) { - /* - Get an encoding which contains tensix core and address you want to - write to via the noc multicast - */ - return ((uint64_t)(noc_xy) << NOC_ADDR_COORD_SHIFT) | addr; -} - -FORCE_INLINE -std::uint32_t get_noc_exclude_region( - std::uint32_t exclude_start_x, - std::uint32_t exclude_start_y, - std::uint32_t exclude_dir_x, - std::uint32_t exclude_dir_y, - uint8_t noc = noc_index) { - /* - Get an encoding which contians the definition of the exclusion area - */ - return ( - EXCLUDE_ENABLED << EXCLUDE_ENABLED_OFFSET | - DYNAMIC_NOC_DIRECTION(noc, exclude_dir_y) << EXCLUDE_DIRECTION_Y_OFFSET | - DYNAMIC_NOC_DIRECTION(noc, exclude_dir_x) << EXCLUDE_DIRECTION_X_OFFSET | - DYNAMIC_NOC_Y(noc, exclude_start_y) << EXCLUDE_START_Y_OFFSET | - DYNAMIC_NOC_X(noc, exclude_start_x) << EXCLUDE_START_X_OFFSET); -} - -uint64_t get_dram_noc_addr( 
- const uint32_t id, - const uint32_t page_size, - const uint32_t bank_base_address, - const uint32_t offset = 0, - uint8_t noc = noc_index) { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t addr = - (bank_offset_index * align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment())) + - bank_base_address + offset + bank_to_dram_offset[bank_index]; - uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); - return noc_addr; -} - -uint64_t get_l1_noc_addr( - const uint32_t id, - const uint32_t page_size, - const uint32_t bank_base_address, - const uint32_t offset = 0, - uint8_t noc = noc_index) { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t addr = - (bank_offset_index * align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment())) + - bank_base_address + offset + bank_to_dram_offset[bank_index]; - uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); - return noc_addr; -} - -uint64_t get_system_memory_noc_addr( - const uint32_t id, - const uint32_t page_size, - const uint32_t base_addr, - const uint32_t offset = 0, - uint8_t noc = noc_index) { - uint64_t pcie_core_noc_encoding = - uint64_t(NOC_XY_PCIE_ENCODING(DYNAMIC_NOC_X(noc, PCIE_NOC_X), DYNAMIC_NOC_Y(noc, PCIE_NOC_Y))); - uint32_t addr = base_addr + page_size * id + offset; - uint64_t noc_addr = pcie_core_noc_encoding | addr; - return noc_addr; -} - -FORCE_INLINE -std::uint64_t get_noc_addr(std::uint32_t addr, uint8_t noc = noc_index) { - /* - Get an encoding which contains the address in L1 on the current core that you want to - read from/write to via the noc - */ - return 
NOC_XY_ADDR(my_x[noc], my_y[noc], addr); -} - // TODO: write docs // this issues only a single packet with size <= NOC_MAX_BURST_SIZE (ie maximum packet size) FORCE_INLINE @@ -748,9 +483,9 @@ void noc_async_read_one_packet( * | size | Size of data transfer in bytes | uint32_t | 0..1MB | Yes | */ // clang-format on -template +template inline void noc_async_read( - std::uint64_t src_noc_addr, std::uint32_t dst_local_l1_addr, std::uint32_t size, uint8_t noc = noc_index) { + std::uint64_t src_noc_addr, std::uint32_t dst_local_l1_addr, std::uint32_t size, uint8_t noc) { /* Read requests - use static VC Read responses - assigned VCs dynamically @@ -1035,344 +770,6 @@ FORCE_INLINE void noc_async_write_one_packet_with_state( } } -template -struct InterleavedAddrGen { - uint32_t bank_base_address; // Base address for the whole tensor. - const uint32_t page_size; // Num bytes in page. - const uint32_t aligned_page_size = align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment()); - - FORCE_INLINE - uint32_t get_addr( - const uint32_t id, - const uint32_t bank_offset_index, - const uint32_t bank_index, - const uint32_t offset = 0) const { - return (bank_offset_index * this->aligned_page_size) + this->bank_base_address + offset + - interleaved_addr_gen::get_bank_offset(bank_index); - } - - FORCE_INLINE - std::uint64_t get_noc_addr(const uint32_t id, const uint32_t offset = 0, uint8_t noc = noc_index) const { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t addr = this->get_addr(id, bank_offset_index, bank_index, offset); - uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - - uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); - return noc_addr; - } - - FORCE_INLINE - void noc_async_read_page( - const uint32_t id, const uint32_t dest_addr, const uint32_t offset = 0, uint8_t noc = noc_index) const { - 
noc_async_read(this->get_noc_addr(id, offset), dest_addr, page_size, noc); - } -}; - -template -struct InterleavedPow2AddrGen { - const uint32_t bank_base_address; - const uint32_t log_base_2_of_page_size; // WARNING: This struct is used for optimized get_noc_addr in which case - // you know that bank_unit_size is a power of 2 - static constexpr uint32_t log_base_2_of_allocator_alignment = - interleaved_addr_gen::get_log_base2_of_allocator_alignment(); - const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment - ? this->log_base_2_of_page_size - : log_base_2_of_allocator_alignment; - - FORCE_INLINE - uint32_t get_addr( - const uint32_t id, - const uint32_t bank_offset_index, - const uint32_t bank_index, - const uint32_t offset = 0) const { - return (bank_offset_index << this->aligned_log_base_2_of_page_size) + this->bank_base_address + offset + - interleaved_addr_gen::get_bank_offset(bank_index); - } - - FORCE_INLINE - std::uint64_t get_noc_addr(const uint32_t id, const uint32_t offset = 0, uint8_t noc = noc_index) const { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t addr = this->get_addr(id, bank_offset_index, bank_index, offset); - uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - - uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); - return noc_addr; - } -}; - -template -struct InterleavedAddrGenFast { - uint32_t bank_base_address; // Base address for the whole tensor. - // TODO: Remove page_size from argument list. This can be derived from data_format - uint32_t page_size; // Num bytes in bank unit. 
- DataFormat data_format; // Data format - - FORCE_INLINE - uint32_t get_addr( - const uint32_t id, - const uint32_t bank_offset_index, - const uint32_t bank_index, - const uint32_t offset = 0) const { - return MUL_WITH_TILE_SIZE((uint)this->data_format, bank_offset_index) + this->bank_base_address + - offset + interleaved_addr_gen::get_bank_offset(bank_index); - } - - FORCE_INLINE - std::uint64_t get_noc_addr(const uint32_t id, const uint32_t offset = 0, uint8_t noc = noc_index) const { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t addr = this->get_addr(id, bank_offset_index, bank_index, offset); - uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - - uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); - return noc_addr; - } - - FORCE_INLINE - void noc_async_read_tile( - const uint32_t id, uint32_t dest_addr, const uint32_t offset = 0, uint8_t noc = noc_index) const { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t src_addr = this->get_addr(id, bank_offset_index, bank_index, offset); - uint32_t src_noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - - WAYPOINT("NRTW"); - DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, this->page_size); - while (!noc_cmd_buf_ready(noc, read_cmd_buf)); - WAYPOINT("NRTD"); - - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32 - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_AT_LEN_BE, this->page_size); // len_bytes - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - 
noc_reads_num_issued[noc] += 1; - } - - FORCE_INLINE - void noc_async_write_tile(const uint32_t id, uint32_t src_addr, uint8_t noc = noc_index) const { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t dest_addr = this->get_addr(id, bank_offset_index, bank_index); - uint32_t dest_noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - - WAYPOINT("NWTW"); - DEBUG_SANITIZE_NOC_WRITE_TRANSACTION( - noc, get_noc_addr_helper(dest_noc_xy, dest_addr), src_addr, this->page_size); - while (!noc_cmd_buf_ready(noc, write_cmd_buf)); - WAYPOINT("NWTD"); - - constexpr uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | - NOC_CMD_STATIC_VC(NOC_UNICAST_WRITE_VC) | - 0x0 | // (linked ? NOC_CMD_VC_LINKED : 0x0) - 0x0 | // (mcast ? (NOC_CMD_PATH_RESERVE | NOC_CMD_BRCST_PACKET) : 0x0) - NOC_CMD_RESP_MARKED; - - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CTRL, noc_cmd_field); - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_TARG_ADDR_LO, src_addr); - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_LO, dest_addr); // (uint32_t)dest_addr - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_COORDINATE, dest_noc_xy); // dest_addr >> 32 - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_AT_LEN_BE, this->page_size); // len_bytes - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc); - } else { - noc_nonposted_writes_num_issued[noc] += 1; - noc_nonposted_writes_acked[noc] += 1; // num_dests - } - } -}; - -// TODO: add noc_async_write_page -// TODO: need static assert + host assert that page size <= 8192, hard constraint -template -struct InterleavedPow2AddrGenFast { - uint32_t bank_base_address; // Base address for the whole tensor. - const uint32_t log_base_2_of_page_size; // Num bytes in bank unit. 
- static constexpr uint32_t log_base_2_of_allocator_alignment = - interleaved_addr_gen::get_log_base2_of_allocator_alignment(); - const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment - ? this->log_base_2_of_page_size - : log_base_2_of_allocator_alignment; - - FORCE_INLINE - uint32_t get_addr( - const uint32_t id, - const uint32_t bank_offset_index, - const uint32_t bank_index, - const uint32_t offset = 0) const { - return (bank_offset_index << this->aligned_log_base_2_of_page_size) + this->bank_base_address + offset + - interleaved_addr_gen::get_bank_offset(bank_index); - } - - FORCE_INLINE - std::uint64_t get_noc_addr(const uint32_t id, const uint32_t offset = 0, uint8_t noc = noc_index) const { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t addr = this->get_addr(id, bank_offset_index, bank_index, offset); - uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - - uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); - return noc_addr; - } - - FORCE_INLINE - void noc_async_read_page( - const uint32_t id, uint32_t dest_addr, const uint32_t offset = 0, uint8_t noc = noc_index) const { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t src_addr = this->get_addr(id, bank_offset_index, bank_index, offset); - uint32_t src_noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - - WAYPOINT("NRPW"); - DEBUG_SANITIZE_NOC_READ_TRANSACTION( - noc, get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, 1 << this->aligned_log_base_2_of_page_size); - while (!noc_cmd_buf_ready(noc, read_cmd_buf)); - WAYPOINT("NRPD"); - - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, 
NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32 - NOC_CMD_BUF_WRITE_REG( - noc, read_cmd_buf, NOC_AT_LEN_BE, 1 << this->aligned_log_base_2_of_page_size); // len_bytes - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; - } - - FORCE_INLINE - void noc_async_read_partial_page( - const uint32_t id, - uint32_t dest_addr, - const uint32_t size, - const uint32_t offset, - uint8_t noc = noc_index) const { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t src_addr = this->get_addr(id, bank_offset_index, bank_index, offset); - uint32_t src_noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); - - WAYPOINT("RP1W"); - while (!noc_cmd_buf_ready(noc, read_cmd_buf)); - WAYPOINT("RP1D"); - DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, size); - - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32 - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_AT_LEN_BE, size); // len_bytes - NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; - } - - FORCE_INLINE - void noc_async_write_page( - const uint32_t id, - uint32_t src_addr, - const uint32_t write_size_bytes, - const uint32_t offset = 0, - uint8_t noc = noc_index) const { - uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); - uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); - uint32_t dest_addr = this->get_addr(id, bank_offset_index, bank_index, offset); - uint32_t dest_noc_xy = 
interleaved_addr_gen::get_noc_xy(bank_index, noc); - - WAYPOINT("NWPW"); - DEBUG_SANITIZE_NOC_WRITE_TRANSACTION( - noc, get_noc_addr_helper(dest_noc_xy, dest_addr), src_addr, write_size_bytes); - while (!noc_cmd_buf_ready(noc, write_cmd_buf)); - WAYPOINT("NWPD"); - - constexpr uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | - NOC_CMD_STATIC_VC(NOC_UNICAST_WRITE_VC) | - 0x0 | // (linked ? NOC_CMD_VC_LINKED : 0x0) - 0x0 | // (mcast ? (NOC_CMD_PATH_RESERVE | NOC_CMD_BRCST_PACKET) : 0x0) - NOC_CMD_RESP_MARKED; - - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CTRL, noc_cmd_field); - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_TARG_ADDR_LO, src_addr); - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_LO, dest_addr); // (uint32_t)dest_addr - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_COORDINATE, dest_noc_xy); // dest_addr >> 32 - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_AT_LEN_BE, write_size_bytes); // len_bytes - NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc); - } else { - noc_nonposted_writes_num_issued[noc] += 1; - noc_nonposted_writes_acked[noc] += 1; // num_dests - } - } -}; - -template -FORCE_INLINE std::uint64_t get_noc_addr( - const uint32_t id, const InterleavedAddrGen& s, uint32_t offset = 0, uint8_t noc = noc_index) { - /* - Alternative API for getting the noc address when we are reading using a swizzled - layout. This version assumes bank unit size can be arbitrary size. Use - get_noc_addr(const uint32_t id, InterleavedPow2AddrGen s) for optimized algorithm in which stick size - is a power of 2. - - id: Unique id for the bank_unit you want to read, assuming row major order. We use this to compute the - bank for this unit of data. - - InterleavedAddrGen: Check struct for attribute definitions. 
- */ - return s.get_noc_addr(id, offset, noc); -} - -template -FORCE_INLINE std::uint64_t get_noc_addr( - const uint32_t id, const InterleavedPow2AddrGen& s, uint32_t offset = 0, uint8_t noc = noc_index) { - /* - Alternative API for getting the noc address when we are reading using a swizzled - layout. This version assumes bank unit size is a power of 2. For arbitrary bank - unit size, use get_noc_addr(const uint32_t id, const InterleavedOffset s) - - id: Unique id for the bank_unit you want to read, assuming row major order. We use this to compute the - bank for this unit of data. - - InterleavedPow2AddrGen: Check struct for attribute definitions. - */ - - return s.get_noc_addr(id, offset, noc); -} - -template -FORCE_INLINE std::uint64_t get_noc_addr( - const uint32_t id, const InterleavedAddrGenFast& s, uint32_t offset = 0, uint8_t noc = noc_index) { - /* - Alternative API for getting the noc address when we are reading using a swizzled - layout. This version assumes bank unit size can be arbitrary size. Use - get_noc_addr(const uint32_t id, InterleavedPow2AddrGen s) for optimized algorithm in which stick size - is a power of 2. - - id: Unique id for the bank_unit you want to read, assuming row major order. We use this to compute the - bank for this unit of data. - - InterleavedAddrGen: Check struct for attribute definitions. - */ - return s.get_noc_addr(id, offset, noc); -} - -template -FORCE_INLINE std::uint64_t get_noc_addr( - const uint32_t id, const InterleavedPow2AddrGenFast& s, uint32_t offset = 0, uint8_t noc = noc_index) { - /* - Alternative API for getting the noc address when we are reading using a swizzled - layout. This version assumes bank unit size is a power of 2 and less than or equal to NOC_MAX_BURST_SIZE. - For arbitrary bank unit size, use get_noc_addr(const uint32_t id, const InterleavedOffset s) - - id: Unique id for the bank_unit you want to read, assuming row major order. We use this to compute the - bank for this unit of data. 
- - InterleavedPow2AddrGenFast: Check struct for attribute definitions. - */ - - return s.get_noc_addr(id, offset, noc); -} - template FORCE_INLINE void noc_async_read_page( const uint32_t id, @@ -2124,34 +1521,3 @@ void noc_async_write_barrier_with_trid(uint32_t trid, uint8_t noc = noc_index) { invalidate_l1_cache(); WAYPOINT("NWTD"); } - -template -FORCE_INLINE uint64_t -get_noc_addr_from_bank_id(uint32_t bank_id, uint32_t bank_address_offset, uint8_t noc = noc_index) { - // Use addrgen tables to convert bank_ids to physical NOC coordinates - uint64_t noc_addr = 0; - if constexpr (DRAM) { - noc_addr = dram_bank_to_noc_xy[noc_index][bank_id]; - bank_address_offset += bank_to_dram_offset[bank_id]; - } else { - noc_addr = l1_bank_to_noc_xy[noc_index][bank_id]; - } - return (noc_addr << NOC_ADDR_COORD_SHIFT) | (bank_address_offset); -} - -template -FORCE_INLINE auto get_interleaved_addr_gen(uint32_t base_addr) { - constexpr bool is_pow_2 = is_power_of_2(page_size); - if constexpr (is_pow_2) { - constexpr uint32_t log2_page_size = __builtin_ctz(page_size); - if constexpr (page_size <= NOC_MAX_BURST_SIZE) { - return InterleavedPow2AddrGenFast{ - .bank_base_address = base_addr, .log_base_2_of_page_size = log2_page_size}; - } else { - return InterleavedPow2AddrGen{ - .bank_base_address = base_addr, .log_base_2_of_page_size = log2_page_size}; - } - } else { - return InterleavedAddrGen{.bank_base_address = base_addr, .page_size = page_size}; - } -} diff --git a/tt_metal/hw/inc/dataflow_api_addrgen.h b/tt_metal/hw/inc/dataflow_api_addrgen.h new file mode 100644 index 00000000000..3b5e4433fe8 --- /dev/null +++ b/tt_metal/hw/inc/dataflow_api_addrgen.h @@ -0,0 +1,654 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "dataflow_cmd_bufs.h" +#include "debug/sanitize_noc.h" +#include "debug/waypoint.h" +#include "utils/utils.h" +#include "debug/assert.h" + +#if defined(COMPILE_FOR_BRISC) +constexpr uint8_t proc_type = static_cast>(TensixProcessorTypes::DM0); +#else +constexpr uint8_t proc_type = static_cast>(TensixProcessorTypes::DM1); +#endif +#if defined(KERNEL_BUILD) +constexpr uint8_t noc_index = NOC_INDEX; +constexpr uint8_t noc_mode = NOC_MODE; +#else + +extern uint8_t noc_index; +constexpr uint8_t noc_mode = DM_DEDICATED_NOC; +#endif +extern uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS]; +extern int32_t bank_to_dram_offset[NUM_DRAM_BANKS]; +extern uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS]; +extern int32_t bank_to_l1_offset[NUM_L1_BANKS]; + +extern uint32_t tt_l1_ptr* rta_l1_base; +extern uint32_t tt_l1_ptr* crta_l1_base; +extern uint32_t tt_l1_ptr* sem_l1_base[]; + +/** @file */ + +/** + * \private + */ + +// Use VC 1 for unicast writes, and VC 4 for mcast writes +#define NOC_UNICAST_WRITE_VC 1 +#define NOC_MULTICAST_WRITE_VC 4 +#define NOC_DISPATCH_MULTICAST_WRITE_VC 5 // Only to be used by the dispatch cores + +#define EXCLUDE_ENABLED 1 +#define EXCLUDE_ENABLED_OFFSET 22 +#define EXCLUDE_DIRECTION_Y_OFFSET 21 +#define EXCLUDE_DIRECTION_X_OFFSET 20 +#define EXCLUDE_START_Y_OFFSET 14 +#define EXCLUDE_START_X_OFFSET 8 +#define DYNAMIC_NOC_DIRECTION(noc, direction) (noc == 1 ? 1 - direction : direction) + +static_assert(NUM_NOCS == 2); +// "Scratch" in L1 has space allocated for 256 DRAM and L1 enteries, to store offsets and NOC XY data. 
+// (MEM_BANK_TO_NOC_XY_SCRATCH and MEM_BANK_OFFSET_SCRATCH) +static_assert((NUM_DRAM_BANKS + NUM_L1_BANKS) <= 256); + +namespace interleaved_addr_gen { + +template +FORCE_INLINE uint32_t get_bank_offset_index(uint32_t id) { + if constexpr (DRAM) { // DRAM +#ifdef IS_NOT_POW2_NUM_DRAM_BANKS + return udivsi3_const_divisor(id); +#else + return id >> LOG_BASE_2_OF_NUM_DRAM_BANKS; +#endif + } else { // L1 +#ifdef IS_NOT_POW2_NUM_L1_BANKS + return udivsi3_const_divisor(id); +#else + return id >> LOG_BASE_2_OF_NUM_L1_BANKS; +#endif + } +} + +template +FORCE_INLINE uint32_t get_bank_index(uint32_t id, uint32_t bank_offset_index) { + if constexpr (DRAM) { // DRAM + return id - bank_offset_index * NUM_DRAM_BANKS; + } else { // L1 + return id - bank_offset_index * NUM_L1_BANKS; + } +} + +template +FORCE_INLINE uint32_t get_noc_xy(uint32_t bank_index, uint8_t noc = noc_index) { + if constexpr (DRAM) { // DRAM + return dram_bank_to_noc_xy[noc][bank_index]; + } else { // L1 + return l1_bank_to_noc_xy[noc][bank_index]; + } +} + +template +FORCE_INLINE uint32_t get_bank_offset(uint32_t bank_index) { + if constexpr (DRAM) { // DRAM + return bank_to_dram_offset[bank_index]; + } else { // L1 + return bank_to_l1_offset[bank_index]; + } +} + +template +FORCE_INLINE constexpr uint32_t get_allocator_alignment() { + if constexpr (DRAM) { + return DRAM_ALIGNMENT; + } else { + return L1_ALIGNMENT; + } +} + +template +FORCE_INLINE constexpr uint32_t get_log_base2_of_allocator_alignment() { + if constexpr (DRAM) { + return LOG_BASE_2_OF_DRAM_ALIGNMENT; + } else { + return LOG_BASE_2_OF_L1_ALIGNMENT; + } +} +} // namespace interleaved_addr_gen + +template +FORCE_INLINE constexpr static std::uint32_t MUL_WITH_TILE_SIZE(uint format, uint index) { + constexpr uint8_t datum_shift = (tile_hw == 1024) ? 10 + : (tile_hw == 512) ? 9 + : (tile_hw == 256) ? 8 + : (tile_hw == 128) ? 7 + : (tile_hw == 64) ? 6 + : (tile_hw == 32) ? 5 + : (tile_hw == 16) ? 
4 + : 10; + + constexpr uint8_t exp_shift = (tile_hw == 1024) ? 6 + : (tile_hw == 512) ? 5 + : (tile_hw == 256) ? 4 + : (tile_hw == 128) ? 4 + : (tile_hw == 64) ? 4 + : (tile_hw == 32) ? 4 + : (tile_hw == 16) ? 4 + : 6; + switch (format & 0x1F) { + case ((uint8_t)DataFormat::UInt8): return (index << datum_shift); + case ((uint8_t)DataFormat::UInt16): + case ((uint8_t)DataFormat::Float16): + case ((uint8_t)DataFormat::Float16_b): return (index << (datum_shift + 1)); + case ((uint8_t)DataFormat::Int32): + case ((uint8_t)DataFormat::UInt32): + case ((uint8_t)DataFormat::Float32): return (index << (datum_shift + 2)); + case ((uint8_t)DataFormat::Bfp2): + case ((uint8_t)DataFormat::Bfp2_b): return ((index << (datum_shift - 2)) + (index << (exp_shift))); + case ((uint8_t)DataFormat::Bfp4): + case ((uint8_t)DataFormat::Bfp4_b): return ((index << (datum_shift - 1)) + (index << (exp_shift))); + case ((uint8_t)DataFormat::Bfp8): + case ((uint8_t)DataFormat::Bfp8_b): + // Keep default as Bfp8? + default: return ((index << datum_shift) + (index << (exp_shift))); + }; +} + +FORCE_INLINE +std::uint64_t get_noc_multicast_addr( + std::uint32_t noc_x_start, + std::uint32_t noc_y_start, + std::uint32_t noc_x_end, + std::uint32_t noc_y_end, + std::uint32_t addr, + uint8_t noc = noc_index) { + /* + Get an encoding which contains tensix core and address you want to + write to via the noc multicast + */ + return NOC_MULTICAST_ADDR( + DYNAMIC_NOC_X(noc, noc_x_start), + DYNAMIC_NOC_Y(noc, noc_y_start), + DYNAMIC_NOC_X(noc, noc_x_end), + DYNAMIC_NOC_Y(noc, noc_y_end), + addr); +} + +FORCE_INLINE +std::uint64_t get_noc_addr(std::uint32_t noc_x, std::uint32_t noc_y, std::uint32_t addr, uint8_t noc = noc_index) { + /* + Get an encoding which contains tensix core and address you want to + read from/write to via the noc + */ + + return NOC_XY_ADDR(DYNAMIC_NOC_X(noc, noc_x), DYNAMIC_NOC_Y(noc, noc_y), addr); +} + +/* + Need an alias to get_noc_addr so that the structs below don't confuse the
above get_noc_addr with + the struct variant +*/ +FORCE_INLINE +std::uint64_t get_noc_addr_helper(std::uint32_t noc_xy, std::uint32_t addr) { + /* + Get an encoding which combines an already-encoded NOC XY coordinate with + the address you want to read from/write to via the noc + */ + return ((uint64_t)(noc_xy) << NOC_ADDR_COORD_SHIFT) | addr; +} + +FORCE_INLINE +std::uint32_t get_noc_exclude_region( + std::uint32_t exclude_start_x, + std::uint32_t exclude_start_y, + std::uint32_t exclude_dir_x, + std::uint32_t exclude_dir_y, + uint8_t noc = noc_index) { + /* + Get an encoding which contains the definition of the exclusion area + */ + return ( + EXCLUDE_ENABLED << EXCLUDE_ENABLED_OFFSET | + DYNAMIC_NOC_DIRECTION(noc, exclude_dir_y) << EXCLUDE_DIRECTION_Y_OFFSET | + DYNAMIC_NOC_DIRECTION(noc, exclude_dir_x) << EXCLUDE_DIRECTION_X_OFFSET | + DYNAMIC_NOC_Y(noc, exclude_start_y) << EXCLUDE_START_Y_OFFSET | + DYNAMIC_NOC_X(noc, exclude_start_x) << EXCLUDE_START_X_OFFSET); +} + +uint64_t get_dram_noc_addr( + const uint32_t id, + const uint32_t page_size, + const uint32_t bank_base_address, + const uint32_t offset = 0, + uint8_t noc = noc_index) { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t addr = + (bank_offset_index * align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment())) + + bank_base_address + offset + bank_to_dram_offset[bank_index]; + uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); + return noc_addr; +} + +uint64_t get_l1_noc_addr( + const uint32_t id, + const uint32_t page_size, + const uint32_t bank_base_address, + const uint32_t offset = 0, + uint8_t noc = noc_index) { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t addr = +
(bank_offset_index * align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment())) + + bank_base_address + offset + bank_to_l1_offset[bank_index]; + uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); + return noc_addr; +} + +uint64_t get_system_memory_noc_addr( + const uint32_t id, + const uint32_t page_size, + const uint32_t base_addr, + const uint32_t offset = 0, + uint8_t noc = noc_index) { + uint64_t pcie_core_noc_encoding = + uint64_t(NOC_XY_PCIE_ENCODING(DYNAMIC_NOC_X(noc, PCIE_NOC_X), DYNAMIC_NOC_Y(noc, PCIE_NOC_Y))); + uint32_t addr = base_addr + page_size * id + offset; + uint64_t noc_addr = pcie_core_noc_encoding | addr; + return noc_addr; +} + +FORCE_INLINE +std::uint64_t get_noc_addr(std::uint32_t addr, uint8_t noc = noc_index) { + /* + Get an encoding which contains the address in L1 on the current core that you want to + read from/write to via the noc + */ + return NOC_XY_ADDR(my_x[noc], my_y[noc], addr); +} + +// Forward declare noc_async_read and default template, arg vals here to AVOID +// circular dependency between InterleavedAddrGen::noc_async_read (defined and +// implemented here) and free function noc_async_read() defined in +// dataflow_api.h +template +void noc_async_read( + std::uint64_t src_noc_addr, std::uint32_t dst_local_l1_addr, std::uint32_t size, uint8_t noc = noc_index); + +template +struct InterleavedAddrGen { + uint32_t bank_base_address; // Base address for the whole tensor. + const uint32_t page_size; // Num bytes in page.
+ const uint32_t aligned_page_size = + align_power_of_2(page_size, interleaved_addr_gen::get_allocator_alignment()); + + FORCE_INLINE + uint32_t get_addr( + const uint32_t id, + const uint32_t bank_offset_index, + const uint32_t bank_index, + const uint32_t offset = 0) const { + return (bank_offset_index * this->aligned_page_size) + this->bank_base_address + offset + + interleaved_addr_gen::get_bank_offset(bank_index); + } + + FORCE_INLINE + std::uint64_t get_noc_addr(const uint32_t id, const uint32_t offset = 0, uint8_t noc = noc_index) const { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t addr = this->get_addr(id, bank_offset_index, bank_index, offset); + uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + + uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); + return noc_addr; + } + + FORCE_INLINE + void noc_async_read_page( + const uint32_t id, const uint32_t dest_addr, const uint32_t offset = 0, uint8_t noc = noc_index) const { + noc_async_read(this->get_noc_addr(id, offset, noc), dest_addr, page_size, noc); + } +}; + +template +struct InterleavedPow2AddrGen { + const uint32_t bank_base_address; + const uint32_t log_base_2_of_page_size; // WARNING: This struct is used for optimized get_noc_addr in which case + // you know that bank_unit_size is a power of 2 + static constexpr uint32_t log_base_2_of_allocator_alignment = + interleaved_addr_gen::get_log_base2_of_allocator_alignment(); + const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment + ?
this->log_base_2_of_page_size + : log_base_2_of_allocator_alignment; + + FORCE_INLINE + uint32_t get_addr( + const uint32_t id, + const uint32_t bank_offset_index, + const uint32_t bank_index, + const uint32_t offset = 0) const { + return (bank_offset_index << this->aligned_log_base_2_of_page_size) + this->bank_base_address + offset + + interleaved_addr_gen::get_bank_offset(bank_index); + } + + FORCE_INLINE + std::uint64_t get_noc_addr(const uint32_t id, const uint32_t offset = 0, uint8_t noc = noc_index) const { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t addr = this->get_addr(id, bank_offset_index, bank_index, offset); + uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + + uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); + return noc_addr; + } +}; + +template +struct InterleavedAddrGenFast { + uint32_t bank_base_address; // Base address for the whole tensor. + // TODO: Remove page_size from argument list. This can be derived from data_format + uint32_t page_size; // Num bytes in bank unit. 
+ DataFormat data_format; // Data format + + FORCE_INLINE + uint32_t get_addr( + const uint32_t id, + const uint32_t bank_offset_index, + const uint32_t bank_index, + const uint32_t offset = 0) const { + return MUL_WITH_TILE_SIZE((uint)this->data_format, bank_offset_index) + this->bank_base_address + + offset + interleaved_addr_gen::get_bank_offset(bank_index); + } + + FORCE_INLINE + std::uint64_t get_noc_addr(const uint32_t id, const uint32_t offset = 0, uint8_t noc = noc_index) const { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t addr = this->get_addr(id, bank_offset_index, bank_index, offset); + uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + + uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); + return noc_addr; + } + + FORCE_INLINE + void noc_async_read_tile( + const uint32_t id, uint32_t dest_addr, const uint32_t offset = 0, uint8_t noc = noc_index) const { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t src_addr = this->get_addr(id, bank_offset_index, bank_index, offset); + uint32_t src_noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + + WAYPOINT("NRTW"); + DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, this->page_size); + while (!noc_cmd_buf_ready(noc, read_cmd_buf)); + WAYPOINT("NRTD"); + + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32 + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_AT_LEN_BE, this->page_size); // len_bytes + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); + 
noc_reads_num_issued[noc] += 1; + } + + FORCE_INLINE + void noc_async_write_tile(const uint32_t id, uint32_t src_addr, uint8_t noc = noc_index) const { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t dest_addr = this->get_addr(id, bank_offset_index, bank_index); + uint32_t dest_noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + + WAYPOINT("NWTW"); + DEBUG_SANITIZE_NOC_WRITE_TRANSACTION( + noc, get_noc_addr_helper(dest_noc_xy, dest_addr), src_addr, this->page_size); + while (!noc_cmd_buf_ready(noc, write_cmd_buf)); + WAYPOINT("NWTD"); + + constexpr uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | + NOC_CMD_STATIC_VC(NOC_UNICAST_WRITE_VC) | + 0x0 | // (linked ? NOC_CMD_VC_LINKED : 0x0) + 0x0 | // (mcast ? (NOC_CMD_PATH_RESERVE | NOC_CMD_BRCST_PACKET) : 0x0) + NOC_CMD_RESP_MARKED; + + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CTRL, noc_cmd_field); + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_TARG_ADDR_LO, src_addr); + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_LO, dest_addr); // (uint32_t)dest_addr + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_COORDINATE, dest_noc_xy); // dest_addr >> 32 + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_AT_LEN_BE, this->page_size); // len_bytes + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_nonposted_writes_acked(noc); + } else { + noc_nonposted_writes_num_issued[noc] += 1; + noc_nonposted_writes_acked[noc] += 1; // num_dests + } + } +}; + +// TODO: add noc_async_write_page +// TODO: need static assert + host assert that page size <= 8192, hard constraint +template +struct InterleavedPow2AddrGenFast { + uint32_t bank_base_address; // Base address for the whole tensor. + const uint32_t log_base_2_of_page_size; // Num bytes in bank unit. 
+ static constexpr uint32_t log_base_2_of_allocator_alignment = + interleaved_addr_gen::get_log_base2_of_allocator_alignment(); + const uint32_t aligned_log_base_2_of_page_size = this->log_base_2_of_page_size > log_base_2_of_allocator_alignment + ? this->log_base_2_of_page_size + : log_base_2_of_allocator_alignment; + + FORCE_INLINE + uint32_t get_addr( + const uint32_t id, + const uint32_t bank_offset_index, + const uint32_t bank_index, + const uint32_t offset = 0) const { + return (bank_offset_index << this->aligned_log_base_2_of_page_size) + this->bank_base_address + offset + + interleaved_addr_gen::get_bank_offset(bank_index); + } + + FORCE_INLINE + std::uint64_t get_noc_addr(const uint32_t id, const uint32_t offset = 0, uint8_t noc = noc_index) const { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t addr = this->get_addr(id, bank_offset_index, bank_index, offset); + uint32_t noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + + uint64_t noc_addr = get_noc_addr_helper(noc_xy, addr); + return noc_addr; + } + + FORCE_INLINE + void noc_async_read_page( + const uint32_t id, uint32_t dest_addr, const uint32_t offset = 0, uint8_t noc = noc_index) const { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t src_addr = this->get_addr(id, bank_offset_index, bank_index, offset); + uint32_t src_noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + + WAYPOINT("NRPW"); + DEBUG_SANITIZE_NOC_READ_TRANSACTION( + noc, get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, 1 << this->aligned_log_base_2_of_page_size); + while (!noc_cmd_buf_ready(noc, read_cmd_buf)); + WAYPOINT("NRPD"); + + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, 
NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32 + NOC_CMD_BUF_WRITE_REG( + noc, read_cmd_buf, NOC_AT_LEN_BE, 1 << this->aligned_log_base_2_of_page_size); // len_bytes + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); + noc_reads_num_issued[noc] += 1; + } + + FORCE_INLINE + void noc_async_read_partial_page( + const uint32_t id, + uint32_t dest_addr, + const uint32_t size, + const uint32_t offset, + uint8_t noc = noc_index) const { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t src_addr = this->get_addr(id, bank_offset_index, bank_index, offset); + uint32_t src_noc_xy = interleaved_addr_gen::get_noc_xy(bank_index, noc); + + WAYPOINT("RP1W"); + while (!noc_cmd_buf_ready(noc, read_cmd_buf)); + WAYPOINT("RP1D"); + DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, size); + + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32 + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_AT_LEN_BE, size); // len_bytes + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); + noc_reads_num_issued[noc] += 1; + } + + FORCE_INLINE + void noc_async_write_page( + const uint32_t id, + uint32_t src_addr, + const uint32_t write_size_bytes, + const uint32_t offset = 0, + uint8_t noc = noc_index) const { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + uint32_t dest_addr = this->get_addr(id, bank_offset_index, bank_index, offset); + uint32_t dest_noc_xy = 
interleaved_addr_gen::get_noc_xy(bank_index, noc); + + WAYPOINT("NWPW"); + DEBUG_SANITIZE_NOC_WRITE_TRANSACTION( + noc, get_noc_addr_helper(dest_noc_xy, dest_addr), src_addr, write_size_bytes); + while (!noc_cmd_buf_ready(noc, write_cmd_buf)); + WAYPOINT("NWPD"); + + constexpr uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | + NOC_CMD_STATIC_VC(NOC_UNICAST_WRITE_VC) | + 0x0 | // (linked ? NOC_CMD_VC_LINKED : 0x0) + 0x0 | // (mcast ? (NOC_CMD_PATH_RESERVE | NOC_CMD_BRCST_PACKET) : 0x0) + NOC_CMD_RESP_MARKED; + + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CTRL, noc_cmd_field); + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_TARG_ADDR_LO, src_addr); + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_LO, dest_addr); // (uint32_t)dest_addr + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_RET_ADDR_COORDINATE, dest_noc_xy); // dest_addr >> 32 + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_AT_LEN_BE, write_size_bytes); // len_bytes + NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_nonposted_writes_acked(noc); + } else { + noc_nonposted_writes_num_issued[noc] += 1; + noc_nonposted_writes_acked[noc] += 1; // num_dests + } + } +}; + +template +FORCE_INLINE std::uint64_t get_noc_addr( + const uint32_t id, const InterleavedAddrGen& s, uint32_t offset = 0, uint8_t noc = noc_index) { + /* + Alternative API for getting the noc address when we are reading using a swizzled + layout. This version assumes bank unit size can be arbitrary size. Use + get_noc_addr(const uint32_t id, InterleavedPow2AddrGen s) for optimized algorithm in which stick size + is a power of 2. + + id: Unique id for the bank_unit you want to read, assuming row major order. We use this to compute the + bank for this unit of data. + + InterleavedAddrGen: Check struct for attribute definitions. 
+ */ + return s.get_noc_addr(id, offset, noc); +} + +template +FORCE_INLINE std::uint64_t get_noc_addr( + const uint32_t id, const InterleavedPow2AddrGen& s, uint32_t offset = 0, uint8_t noc = noc_index) { + /* + Alternative API for getting the noc address when we are reading using a swizzled + layout. This version assumes bank unit size is a power of 2. For arbitrary bank + unit size, use get_noc_addr(const uint32_t id, const InterleavedOffset s) + + id: Unique id for the bank_unit you want to read, assuming row major order. We use this to compute the + bank for this unit of data. + + InterleavedPow2AddrGen: Check struct for attribute definitions. + */ + + return s.get_noc_addr(id, offset, noc); +} + +template +FORCE_INLINE std::uint64_t get_noc_addr( + const uint32_t id, const InterleavedAddrGenFast& s, uint32_t offset = 0, uint8_t noc = noc_index) { + /* + Alternative API for getting the noc address when we are reading using a swizzled + layout. This version assumes bank unit size can be arbitrary size. Use + get_noc_addr(const uint32_t id, InterleavedPow2AddrGen s) for optimized algorithm in which stick size + is a power of 2. + + id: Unique id for the bank_unit you want to read, assuming row major order. We use this to compute the + bank for this unit of data. + + InterleavedAddrGen: Check struct for attribute definitions. + */ + return s.get_noc_addr(id, offset, noc); +} + +template +FORCE_INLINE std::uint64_t get_noc_addr( + const uint32_t id, const InterleavedPow2AddrGenFast& s, uint32_t offset = 0, uint8_t noc = noc_index) { + /* + Alternative API for getting the noc address when we are reading using a swizzled + layout. This version assumes bank unit size is a power of 2 and less than or equal to NOC_MAX_BURST_SIZE. + For arbitrary bank unit size, use get_noc_addr(const uint32_t id, const InterleavedOffset s) + + id: Unique id for the bank_unit you want to read, assuming row major order. We use this to compute the + bank for this unit of data. 
+ + InterleavedPow2AddrGenFast: Check struct for attribute definitions. + */ + + return s.get_noc_addr(id, offset, noc); +} + +template +FORCE_INLINE uint64_t +get_noc_addr_from_bank_id(uint32_t bank_id, uint32_t bank_address_offset, uint8_t noc = noc_index) { + // Use addrgen tables to convert bank_ids to physical NOC coordinates for the requested noc + uint64_t noc_addr = 0; + if constexpr (DRAM) { + noc_addr = dram_bank_to_noc_xy[noc][bank_id]; + bank_address_offset += bank_to_dram_offset[bank_id]; + } else { + noc_addr = l1_bank_to_noc_xy[noc][bank_id]; + } + return (noc_addr << NOC_ADDR_COORD_SHIFT) | (bank_address_offset); +} + +template +FORCE_INLINE auto get_interleaved_addr_gen(uint32_t base_addr) { + constexpr bool is_pow_2 = is_power_of_2(page_size); + if constexpr (is_pow_2) { + constexpr uint32_t log2_page_size = __builtin_ctz(page_size); + if constexpr (page_size <= NOC_MAX_BURST_SIZE) { + return InterleavedPow2AddrGenFast{ + .bank_base_address = base_addr, .log_base_2_of_page_size = log2_page_size}; + } else { + return InterleavedPow2AddrGen{ + .bank_base_address = base_addr, .log_base_2_of_page_size = log2_page_size}; + } + } else { + return InterleavedAddrGen{.bank_base_address = base_addr, .page_size = page_size}; + } +} diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 55a95d76ff3..18fb9c2d75b 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -9,7 +9,7 @@ #if defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_ERISC) || \ defined(COMPILE_FOR_IDLE_ERISC) #include "risc_common.h" -#include "dataflow_api.h" +#include "dataflow_api_addrgen.h" #else #include "ckernel.h" #endif @@ -192,6 +192,10 @@ inline void __attribute__((always_inline)) noc_async_write_posted( noc, write_cmd_buf, src_local_l1_addr, dst_noc_addr, size, NOC_UNICAST_WRITE_VC, false, false, 1, true, true); WAYPOINT("NAWD"); } + +FORCE_INLINE +void
noc_async_flush_posted_writes(uint8_t noc = noc_index) { while (!ncrisc_noc_posted_writes_sent(noc)); } + #endif __attribute__((noinline)) void finish_profiler() { @@ -253,7 +257,7 @@ __attribute__((noinline)) void finish_profiler() { } } - noc_async_posted_writes_flushed(); + noc_async_flush_posted_writes(); profiler_control_buffer[RUN_COUNTER]++; profiler_control_buffer[PROFILER_DONE] = 1; #endif @@ -298,7 +302,7 @@ __attribute__((noinline)) void quick_push() { dram_bank_dst_noc_addr, wIndex * sizeof(uint32_t)); - noc_async_posted_writes_flushed(); + noc_async_flush_posted_writes(); profiler_control_buffer[HOST_BUFFER_END_INDEX_BR_ER + myRiscID] = currEndIndex; } else {