diff --git a/tests/scripts/run_tools_tests.sh b/tests/scripts/run_tools_tests.sh
index 0e80ac969a1..ec9fdfba632 100755
--- a/tests/scripts/run_tools_tests.sh
+++ b/tests/scripts/run_tools_tests.sh
@@ -7,8 +7,10 @@ if [[ -z "$TT_METAL_HOME" ]]; then
    exit 1
 fi
 
-# For now, only test watcher dump tool here.
 if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then
+    # Watcher dump tool testing
+    echo "Running watcher dump tool tests..."
+
     # Run a test that populates basic fields but not watcher fields
     ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*PrintHanging
@@ -30,4 +32,23 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then
     # Remove created files.
     rm tmp.log
     rm generated/watcher/watcher.log
+    echo "Watcher dump tool tests finished..."
+
+    # Clean init testing
+    echo "Running clean init tests - FD-on-Tensix"
+    echo "First run, no teardown"
+    ./build/test/tt_metal/test_clean_init --skip-teardown || { echo "Above failure is expected."; }
+    echo "Second run, expect clean init"
+    timeout 10 ./build/test/tt_metal/test_clean_init || { echo "Error: second run timed out, clean init (FD-on-Tensix) failed."; exit 1; }
+    echo "Clean init tests - FD-on-Tensix passed!"
+
+    if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
+        echo "Running clean init tests - FD-on-Eth"
+        echo "First run, no teardown"
+        env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./build/test/tt_metal/test_clean_init --skip-teardown || { echo "Above failure is expected."; }
+        echo "Second run, expect clean init"
+        timeout 10 env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./build/test/tt_metal/test_clean_init || { echo "Error: second run timed out, clean init (FD-on-Eth) failed."; exit 1; }
+        echo "Clean init tests - FD-on-Eth passed!"
+    fi
 fi
diff --git a/tests/tt_metal/tt_metal/CMakeLists.txt b/tests/tt_metal/tt_metal/CMakeLists.txt
index e0bfbb80569..8a0d57cdd39 100644
--- a/tests/tt_metal/tt_metal/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/CMakeLists.txt
@@ -37,6 +37,7 @@ set (TT_METAL_TESTS
     test_core_range_set
     test_compile_sets_kernel_binaries
     test_compile_program
+    test_clean_init
 )
 
 foreach (TEST ${TT_METAL_TESTS})
diff --git a/tests/tt_metal/tt_metal/test_clean_init.cpp b/tests/tt_metal/tt_metal/test_clean_init.cpp
new file mode 100644
index 00000000000..35f9e7d25f4
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_clean_init.cpp
@@ -0,0 +1,151 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+#include "common/bfloat16.hpp"
+#include <chrono>
+
+/*
+ * Similar to the loopback programming example, except run on all devices and skip device teardown to check if we can
+ * recover from a "bad" state.
+*/
+
+using namespace tt::tt_metal;
+
+int main(int argc, char **argv) {
+
+    if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) {
+        TT_THROW("Test not supported w/ slow dispatch, exiting");
+    }
+
+    // Any arg means that we shouldn't do teardown.
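+    // (run_tools_tests.sh above passes --skip-teardown on the first run and expects the second run to recover.)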
+    bool skip_teardown = (argc > 1);
+    if (skip_teardown)
+        tt::log_info("Running loopback test with no teardown, to see if we can recover next run.");
+    else
+        tt::log_info("Running loopback test with proper teardown");
+
+    bool pass = true;
+    auto num_devices = tt::tt_metal::GetNumAvailableDevices();
+    vector<chip_id_t> ids;
+    for (unsigned int id = 0; id < num_devices; id++) {
+        ids.push_back(id);
+    }
+    tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE);
+    std::vector<Device *> devices = tt::DevicePool::instance().get_all_active_devices();
+
+    for (int device_id = 0; device_id < num_devices; device_id++) {
+        try {
+            /*
+             * Silicon accelerator setup
+             */
+            Device *device = devices[device_id];
+
+            /*
+             * Setup program and command queue to execute along with its buffers and kernels to use
+             */
+            CommandQueue& cq = device->command_queue();
+            Program program = CreateProgram();
+
+            constexpr CoreCoord core = {0, 0};
+
+            KernelHandle dram_copy_kernel_id = CreateKernel(
+                program,
+                "tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp",
+                core,
+                DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}
+            );
+
+            constexpr uint32_t single_tile_size = 2 * (32 * 32);
+            constexpr uint32_t num_tiles = 50;
+            constexpr uint32_t dram_buffer_size = single_tile_size * num_tiles;
+
+            tt::tt_metal::InterleavedBufferConfig dram_config{
+                .device = device,
+                .size = dram_buffer_size,
+                .page_size = dram_buffer_size,
+                .buffer_type = tt::tt_metal::BufferType::DRAM
+            };
+            tt::tt_metal::InterleavedBufferConfig l1_config{
+                .device = device,
+                .size = dram_buffer_size,
+                .page_size = dram_buffer_size,
+                .buffer_type = tt::tt_metal::BufferType::L1
+            };
+
+            auto l1_buffer = CreateBuffer(l1_config);
+
+            auto input_dram_buffer = CreateBuffer(dram_config);
+            const uint32_t input_dram_buffer_addr = input_dram_buffer->address();
+
+            auto output_dram_buffer = CreateBuffer(dram_config);
+            const uint32_t output_dram_buffer_addr = output_dram_buffer->address();
+
+            /*
+             * Create input data and runtime arguments, then execute
+             */
+            std::vector<uint32_t> input_vec = create_random_vector_of_bfloat16(
+                dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count());
+            EnqueueWriteBuffer(cq, input_dram_buffer, input_vec, false);
+
+            const std::vector<uint32_t> runtime_args = {
+                l1_buffer->address(),
+                input_dram_buffer->address(),
+                static_cast<uint32_t>(input_dram_buffer->noc_coordinates().x),
+                static_cast<uint32_t>(input_dram_buffer->noc_coordinates().y),
+                output_dram_buffer->address(),
+                static_cast<uint32_t>(output_dram_buffer->noc_coordinates().x),
+                static_cast<uint32_t>(output_dram_buffer->noc_coordinates().y),
+                l1_buffer->size()
+            };
+
+            SetRuntimeArgs(
+                program,
+                dram_copy_kernel_id,
+                core,
+                runtime_args
+            );
+
+            EnqueueProgram(cq, program, false);
+            tt::log_info("Started program");
+            Finish(cq);
+            tt::log_info("Finished program");
+
+            /*
+             * Validation & Teardown
+             */
+            std::vector<uint32_t> result_vec;
+            EnqueueReadBuffer(cq, output_dram_buffer, result_vec, true);
+
+            pass &= input_vec == result_vec;
+
+        } catch (const std::exception &e) {
+            tt::log_error(tt::LogTest, "Test failed with exception!");
+            tt::log_error(tt::LogTest, "{}", e.what());
+
+            throw;
+        }
+    }
+
+    if (pass) {
+        tt::log_info(tt::LogTest, "Test Passed");
+    } else {
+        TT_THROW("Test Failed");
+    }
+
+    // Skip teardown by throwing.
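+    // Throwing from main skips CloseDevice(), so dispatch kernels keep running and the next run must recover.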
+    if (skip_teardown) {
+        TT_FATAL(false, "Skip teardown by throwing");
+    } else {
+        for (auto device : devices) {
+            pass &= CloseDevice(device);
+        }
+    }
+
+    // Error out with non-zero return code if we don't detect a pass
+    TT_FATAL(pass);
+
+    return 0;
+}
diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h
index 9200114e393..b1c1902fe81 100644
--- a/tt_metal/hw/inc/dataflow_api.h
+++ b/tt_metal/hw/inc/dataflow_api.h
@@ -1485,10 +1485,10 @@ void noc_async_write_barrier() {
  */
 FORCE_INLINE
 void noc_async_writes_flushed() {
-    DEBUG_STATUS("NWBW");
+    DEBUG_STATUS("NWFW");
     while (!ncrisc_noc_nonposted_writes_sent(noc_index))
         ;
-    DEBUG_STATUS("NWBD");
+    DEBUG_STATUS("NWFD");
 }
 
 /**
diff --git a/tt_metal/hw/inc/debug/assert.h b/tt_metal/hw/inc/debug/assert.h
index bba33d7c7e7..e8eaf04102e 100644
--- a/tt_metal/hw/inc/debug/assert.h
+++ b/tt_metal/hw/inc/debug/assert.h
@@ -17,6 +17,10 @@ void assert_and_hang(uint32_t line_num) {
         v->which = debug_get_which_riscv();
     }
 
+    // Update launch msg to show that we've exited.
+    tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch);
+    launch_msg->run = RUN_MSG_DONE;
+
     // Hang, or in the case of erisc, early exit.
 #if defined(COMPILE_FOR_ERISC)
     internal_::disable_erisc_app();
diff --git a/tt_metal/hw/inc/debug/sanitize_noc.h b/tt_metal/hw/inc/debug/sanitize_noc.h
index be4770ac827..6ffe7f8317a 100644
--- a/tt_metal/hw/inc/debug/sanitize_noc.h
+++ b/tt_metal/hw/inc/debug/sanitize_noc.h
@@ -81,6 +81,10 @@ inline void debug_sanitize_post_noc_addr_and_hang(
         v[noc_index].invalid = invalid;
     }
 
+    // Update launch msg to show that we've exited.
+    tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch);
+    launch_msg->run = RUN_MSG_DONE;
+
 #if defined(COMPILE_FOR_ERISC)
     // For erisc, we can't hang the kernel/fw, because the core doesn't get restarted when a new
     // kernel is written. In this case we'll do an early exit back to base FW.
@@ -88,11 +92,7 @@ inline void debug_sanitize_post_noc_addr_and_hang(
     erisc_early_exit(eth_l1_mem::address_map::ERISC_MEM_MAILBOX_STACK_SAVE);
 #endif
 
-    while (1) {
-#if defined(COMPILE_FOR_ERISC)
-        internal_::risc_context_switch();
-#endif
-    }
+    while (1) { ; }
 }
 
 // Return value is the alignment mask for the type of core the noc address points
diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h
index c63ac707050..a44a0442b68 100644
--- a/tt_metal/hw/inc/dev_msgs.h
+++ b/tt_metal/hw/inc/dev_msgs.h
@@ -75,6 +75,7 @@ struct launch_msg_t {  // must be cacheline aligned
     volatile uint8_t max_cb_index;
     volatile uint8_t dispatch_core_x;
     volatile uint8_t dispatch_core_y;
+    volatile uint8_t exit_erisc_kernel;
     volatile uint8_t run;  // must be in last cacheline of this msg
 };
 
diff --git a/tt_metal/impl/debug/watcher_server.cpp b/tt_metal/impl/debug/watcher_server.cpp
index 78822e16bdd..9abc78532d4 100644
--- a/tt_metal/impl/debug/watcher_server.cpp
+++ b/tt_metal/impl/debug/watcher_server.cpp
@@ -675,6 +675,10 @@ static void dump_core(
             // requested explicitly
             dump_sync_regs(f, device, core);
         }
+    } else {
+        fprintf(f, "rmsg:");
+        dump_run_state(f, core, &mbox_data->launch, mbox_data->launch.run);
+        fprintf(f, " ");
     }
 
     // Eth core only reports erisc kernel id, uses the brisc field
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 65636dba1d3..8fb7e12f1ff 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include
+#include
 #include "tt_metal/impl/device/device.hpp"
 #include "tt_metal/impl/trace/trace.hpp"
 #include "tt_metal/common/core_descriptor.hpp"
@@ -35,6 +36,112 @@ Device::Device(
     this->initialize(num_hw_cqs, l1_small_size, trace_region_size, l1_bank_remap, minimal);
 }
 
+/* Get all dispatch cores associated with this device. On return, my_dispatch_cores contains dispatch cores used by
+ * this device (split between cores on this device itself and, if this is a remote device, the mmio device dispatch
+ * cores being used by this device). On return, other_dispatch_cores contains dispatch cores on this device that are
+ * used by other (remote) devices.
+*/
+void Device::get_associated_dispatch_phys_cores(
+    std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> &my_dispatch_cores,
+    std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> &other_dispatch_cores) {
+    if (this->is_mmio_capable()) {
+        for (const chip_id_t &device_id : tt::Cluster::instance().get_devices_controlled_by_mmio_device(this->id_)) {
+            uint8_t curr_num_hw_cqs = device_id == this->id_ ? this->num_hw_cqs() : 1;
+            uint16_t curr_channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
+            CoreType dispatch_core_type = dispatch_core_manager::get(curr_num_hw_cqs).get_dispatch_core_type(device_id);
+            for (uint8_t cq_id = 0; cq_id < curr_num_hw_cqs; cq_id++) {
+                if (device_id == this->id_) {
+                    // mmio device.
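+                    // These cores service this device itself, so they are tracked in my_dispatch_cores.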
+                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) {
+                        tt_cxy_pair dispatch_location = dispatch_core_manager::get(curr_num_hw_cqs).dispatcher_core(device_id, curr_channel, cq_id);
+                        CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type);
+                        my_dispatch_cores[this->id_].insert(phys_core);
+                        log_debug(tt::LogMetal, "MMIO Device Dispatch core: Logical: {} - Physical: {}", dispatch_location.str(), phys_core.str());
+                    }
+                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) {
+                        tt_cxy_pair prefetch_location = dispatch_core_manager::get(curr_num_hw_cqs).prefetcher_core(device_id, curr_channel, cq_id);
+                        CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type);
+                        my_dispatch_cores[this->id_].insert(phys_core);
+                        log_debug(tt::LogMetal, "MMIO Device Prefetch core: Logical: {} - Physical: {}", prefetch_location.str(), phys_core.str());
+                    }
+                } else if (tt::DevicePool::instance().is_device_active(device_id)) {
+                    // non mmio devices serviced by this mmio capable device.
+                    // skip remote dispatch cores only if respective remote device is active.
+                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) {
+                        tt_cxy_pair dispatch_location = dispatch_core_manager::get(curr_num_hw_cqs).dispatcher_core(device_id, curr_channel, cq_id);
+                        CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type);
+                        other_dispatch_cores[this->id_].insert(phys_core);
+                        log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will keep running on MMIO Device.", dispatch_location.str(), phys_core.str());
+                    }
+                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) {
+                        tt_cxy_pair prefetch_location = dispatch_core_manager::get(curr_num_hw_cqs).prefetcher_core(device_id, curr_channel, cq_id);
+                        CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type);
+                        other_dispatch_cores[this->id_].insert(phys_core);
+                        log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will keep running on MMIO Device.", prefetch_location.str(), phys_core.str());
+                    }
+                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_mux_core_allocated(device_id, curr_channel, cq_id)) {
+                        tt_cxy_pair mux_location = dispatch_core_manager::get(curr_num_hw_cqs).mux_core(device_id, curr_channel, cq_id);
+                        CoreCoord phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type);
+                        other_dispatch_cores[this->id_].insert(phys_core);
+                        log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will keep running on MMIO Device.", mux_location.str(), phys_core.str());
+                    }
+                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_demux_core_allocated(device_id, curr_channel, cq_id)) {
+                        tt_cxy_pair demux_location = dispatch_core_manager::get(curr_num_hw_cqs).demux_core(device_id, curr_channel, cq_id);
+                        CoreCoord phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type);
+                        other_dispatch_cores[this->id_].insert(phys_core);
+                        log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will keep running on MMIO Device.", demux_location.str(), phys_core.str());
+                    }
+                }
+            }
+        }
+    } else {
+        // remote device that is active
+        uint8_t curr_num_hw_cqs = 1;
+        auto device_id = this->id_;
+        uint16_t curr_channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
+        CoreType dispatch_core_type = dispatch_core_manager::get(curr_num_hw_cqs).get_dispatch_core_type(device_id);
+        for (uint8_t cq_id = 0; cq_id < curr_num_hw_cqs; cq_id++) {
+            if (dispatch_core_manager::get(curr_num_hw_cqs).is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) {
+                tt_cxy_pair dispatch_location = dispatch_core_manager::get(curr_num_hw_cqs).dispatcher_core(device_id, curr_channel, cq_id);
+                CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type);
+                my_dispatch_cores[dispatch_location.chip].insert(phys_core);
+                log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will be reset on MMIO Device.", dispatch_location.str(), phys_core.str());
+            }
+            if (dispatch_core_manager::get(curr_num_hw_cqs).is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) {
+                tt_cxy_pair prefetch_location = dispatch_core_manager::get(curr_num_hw_cqs).prefetcher_core(device_id, curr_channel, cq_id);
+                CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type);
+                my_dispatch_cores[prefetch_location.chip].insert(phys_core);
+                log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will be reset on MMIO Device.", prefetch_location.str(), phys_core.str());
+            }
+            if (dispatch_core_manager::get(curr_num_hw_cqs).is_mux_core_allocated(device_id, curr_channel, cq_id)) {
+                tt_cxy_pair mux_location = dispatch_core_manager::get(curr_num_hw_cqs).mux_core(device_id, curr_channel, cq_id);
+                CoreCoord phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type);
+                my_dispatch_cores[mux_location.chip].insert(phys_core);
+                log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will be reset on MMIO Device.", mux_location.str(), phys_core.str());
+            }
+            if (dispatch_core_manager::get(curr_num_hw_cqs).is_demux_core_allocated(device_id, curr_channel, cq_id)) {
+                tt_cxy_pair demux_location = dispatch_core_manager::get(curr_num_hw_cqs).demux_core(device_id, curr_channel, cq_id);
+                CoreCoord phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type);
+                my_dispatch_cores[demux_location.chip].insert(phys_core);
+                log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will be reset on MMIO Device.", demux_location.str(), phys_core.str());
+            }
+            CoreCoord phys_core;
+            tt_cxy_pair dispatch_location = dispatch_core_manager::get(curr_num_hw_cqs).dispatcher_d_core(device_id, curr_channel, cq_id);
+            phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type);
+            my_dispatch_cores[dispatch_location.chip].insert(phys_core);
+            tt_cxy_pair prefetch_location = dispatch_core_manager::get(curr_num_hw_cqs).prefetcher_d_core(device_id, curr_channel, cq_id);
+            phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type);
+            my_dispatch_cores[dispatch_location.chip].insert(phys_core);
+            tt_cxy_pair mux_location = dispatch_core_manager::get(curr_num_hw_cqs).mux_d_core(device_id, curr_channel, cq_id);
+            phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type);
+            my_dispatch_cores[dispatch_location.chip].insert(phys_core);
+            tt_cxy_pair demux_location = dispatch_core_manager::get(curr_num_hw_cqs).demux_d_core(device_id, curr_channel, cq_id);
+            phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type);
+            my_dispatch_cores[dispatch_location.chip].insert(phys_core);
+        }
+    }
+}
+
 void Device::initialize_cluster() {
     ZoneScoped;
     if (llrt::OptionsG.get_clear_l1()) {
@@ -190,6 +297,91 @@ void Device::initialize_firmware(CoreCoord phys_core, launch_msg_t *launch_msg)
     llrt::write_launch_msg_to_core(this->id(), phys_core, launch_msg);
 }
 
+void Device::reset_cores() {
+    ZoneScoped;
+
+    auto kernel_still_running = [](launch_msg_t *launch_msg) {
+        return launch_msg->run == RUN_MSG_GO && launch_msg->exit_erisc_kernel == 0;
+    };
+
+    auto mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id_);
+    // Assert worker cores + dispatch cores, in case they were in a bad state from before.
+    std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> dispatch_cores, other_dispatch_cores, device_to_early_exit_cores;
+    for (const auto &eth_core : this->get_active_ethernet_cores()) {
+        CoreCoord physical_core = this->ethernet_core_from_logical_core(eth_core);
+        std::vector<uint32_t> data(sizeof(launch_msg_t) / sizeof(uint32_t));
+        data = tt::llrt::read_hex_vec_from_core(
+            this->id(), physical_core, GET_ETH_MAILBOX_ADDRESS_HOST(launch), sizeof(launch_msg_t));
+        launch_msg_t *launch_msg = (launch_msg_t *)(&data[0]);
+        if (kernel_still_running(launch_msg)) {
+            log_info(
+                tt::LogMetal,
+                "While initializing Device {}, ethernet tunneler core {} on Device {} detected as still running, issuing exit signal.",
+                this->id(),
+                physical_core.str(),
+                this->id());
+            launch_msg->exit_erisc_kernel = 1;
+            llrt::write_launch_msg_to_core(this->id(), physical_core, launch_msg);
+            device_to_early_exit_cores[this->id()].insert(physical_core);
+        }
+    }
+
+    this->get_associated_dispatch_phys_cores(dispatch_cores, other_dispatch_cores);
+    // Ignore other_dispatch_cores, they will be reset by the devices that use them.
+    for (auto &id_and_cores : dispatch_cores) {
+        for (auto it = id_and_cores.second.begin(); it != id_and_cores.second.end(); it++) {
+            const auto &phys_core = *it;
+            // Only need to manually reset ethernet dispatch cores, tensix cores are all reset below.
+            if (llrt::is_ethernet_core(phys_core, id_and_cores.first)) {
+                // Ethernet cores won't be reset, so just signal the dispatch cores to early exit.
+                std::vector<uint32_t> data(sizeof(launch_msg_t) / sizeof(uint32_t));
+                data = tt::llrt::read_hex_vec_from_core(
+                    id_and_cores.first, phys_core, GET_IERISC_MAILBOX_ADDRESS_HOST(launch), sizeof(launch_msg_t));
+                launch_msg_t *launch_msg = (launch_msg_t *)(&data[0]);
+                if (kernel_still_running(launch_msg)) {
+                    log_info(
+                        tt::LogMetal,
+                        "While initializing device {}, ethernet dispatch core {} on Device {} detected as still running, issuing exit signal.",
+                        this->id(),
+                        phys_core.str(),
+                        id_and_cores.first);
+                    launch_msg->exit_erisc_kernel = 1;
+                    llrt::write_launch_msg_to_core(id_and_cores.first, phys_core, launch_msg);
+                    device_to_early_exit_cores[id_and_cores.first].insert(phys_core);
+                }
+            }
+        }
+    }
+
+    // Early exiting dispatch cores should show RUN_MSG_DONE when they exit.
+    for (auto &id_and_cores : device_to_early_exit_cores) {
+        const int timeout_ms = 10000;  // 10 seconds for now
+        if (!id_and_cores.second.empty()) {
+            try {
+                llrt::internal_::wait_until_cores_done(id_and_cores.first, RUN_MSG_GO, id_and_cores.second, timeout_ms);
+            } catch (std::runtime_error &e) {
+                TT_THROW("Device {} init: failed to reset cores! Try resetting the board.", this->id());
+            }
+        }
+    }
+
+    // Reset Tensix cores
+    CoreCoord grid_size = this->logical_grid_size();
+    for (uint32_t y = 0; y < grid_size.y; y++) {
+        for (uint32_t x = 0; x < grid_size.x; x++) {
+            CoreCoord logical_core(x, y);
+            CoreCoord worker_core = this->worker_core_from_logical_core(logical_core);
+
+            // Don't reset dispatch cores for other devices, in case they're still running.
+            if (other_dispatch_cores[this->id_].find(worker_core) == other_dispatch_cores[this->id_].end()) {
+                if (this->storage_only_cores_.find(logical_core) == this->storage_only_cores_.end()) {
+                    tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), worker_core));
+                }
+            }
+        }
+    }
+}
+
 void Device::initialize_and_launch_firmware() {
     ZoneScoped;
 
@@ -270,19 +462,13 @@ void Device::clear_l1_state() {
         }
     }
 
-    for (const auto &eth_core : this->get_inactive_ethernet_cores()) {
-        CoreCoord physical_core = this->ethernet_core_from_logical_core(eth_core);
-        std::vector<uint32_t> zero_vec_mailbox(128 / sizeof(uint32_t), 0);
-        llrt::write_hex_vec_to_core(this->id(), physical_core, zero_vec_mailbox, MEM_IERISC_MAILBOX_BASE);
-    }
-
     // These L1 ranges are restricted becase UMD base routing FW uses L1 below FIRMWARE_BASE and
     // between TILE_HEADER_BUFFER_BASE to COMMAND_Q_BASE
     std::vector<uint32_t> zero_vec_above_tile_header_buffer(
-        (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::TILE_HEADER_BUFFER_BASE) /
-            sizeof(uint32_t),
+        (eth_l1_mem::address_map::SEMAPHORE_BASE - eth_l1_mem::address_map::TILE_HEADER_BUFFER_BASE) / sizeof(uint32_t),
         0);
 
+    // Clear erisc sync info
     for (const auto &eth_core : this->get_active_ethernet_cores()) {
         CoreCoord physical_core = this->ethernet_core_from_logical_core(eth_core);
@@ -1567,7 +1753,7 @@ void Device::configure_command_queue_programs() {
         }
     }
 }
 
-void Device::initialize_command_queue() {
+void Device::init_command_queue_host() {
     TT_ASSERT(this->is_mmio_capable() or (not this->is_mmio_capable() and this->num_hw_cqs() == 1), "Only support one hardware command queue for fast dispatch on remote device");
     using_fast_dispatch = true;
     this->sysmem_manager_ = std::make_unique<SystemMemoryManager>(this->id_, this->num_hw_cqs());
@@ -1577,6 +1763,9 @@ void Device::init_command_queue_host() {
         // Need to do this since CommandQueue constructor is private
         sw_command_queues_.push_back(std::unique_ptr<CommandQueue>(new CommandQueue(this, cq_id)));
     }
+}
+
+void Device::init_command_queue_device() {
 
     this->compile_command_queue_programs();
     if (this->is_mmio_capable()) {
@@ -1666,95 +1855,12 @@ bool Device::close() {
 
     this->deallocate_buffers();
 
-    std::unordered_set<CoreCoord> not_done_dispatch_cores;
-    std::unordered_set<CoreCoord> cores_to_skip;
-
-    if (this->is_mmio_capable()) {
-        for (const chip_id_t &device_id : tt::Cluster::instance().get_devices_controlled_by_mmio_device(this->id_)) {
-            uint8_t curr_num_hw_cqs = device_id == this->id_ ? this->num_hw_cqs() : 1;
-            uint16_t curr_channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
-            CoreType dispatch_core_type = dispatch_core_manager::get(curr_num_hw_cqs).get_dispatch_core_type(device_id);
-            for (uint8_t cq_id = 0; cq_id < curr_num_hw_cqs; cq_id++) {
-                if (device_id == this->id_) {
-                    //mmio device.
-                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) {
-                        tt_cxy_pair dispatch_location = dispatch_core_manager::get(curr_num_hw_cqs).dispatcher_core(device_id, curr_channel, cq_id);
-                        CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type);
-                        not_done_dispatch_cores.insert(phys_core);
-                        log_debug(tt::LogMetal, "MMIO Device Dispatch core: Logical: {} - Physical: {}", dispatch_location.str(), phys_core.str());
-                    }
-                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) {
-                        tt_cxy_pair prefetch_location = dispatch_core_manager::get(curr_num_hw_cqs).prefetcher_core(device_id, curr_channel, cq_id);
-                        CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type);
-                        not_done_dispatch_cores.insert(phys_core);
-                        log_debug(tt::LogMetal, "MMIO Device Prefetch core: Logical: {} - Physical: {}", prefetch_location.str(), phys_core.str());
-                    }
-                } else if (tt::DevicePool::instance().is_device_active(device_id)) {
-                    //non mmio devices serviced by this mmio capable device.
-                    //skip remote dispatch cores only if respective remote device is active.
-                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) {
-                        tt_cxy_pair dispatch_location = dispatch_core_manager::get(curr_num_hw_cqs).dispatcher_core(device_id, curr_channel, cq_id);
-                        CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type);
-                        cores_to_skip.insert(phys_core);
-                        log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will keep running on MMIO Device.", dispatch_location.str(), phys_core.str());
-                    }
-                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) {
-                        tt_cxy_pair prefetch_location = dispatch_core_manager::get(curr_num_hw_cqs).prefetcher_core(device_id, curr_channel, cq_id);
-                        CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type);
-                        cores_to_skip.insert(phys_core);
-                        log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will keep running on MMIO Device.", prefetch_location.str(), phys_core.str());
-                    }
-                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_mux_core_allocated(device_id, curr_channel, cq_id)) {
-                        tt_cxy_pair mux_location = dispatch_core_manager::get(curr_num_hw_cqs).mux_core(device_id, curr_channel, cq_id);
-                        CoreCoord phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type);
-                        cores_to_skip.insert(phys_core);
-                        log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will keep running on MMIO Device.", mux_location.str(), phys_core.str());
-                    }
-                    if (dispatch_core_manager::get(curr_num_hw_cqs).is_demux_core_allocated(device_id, curr_channel, cq_id)) {
-                        tt_cxy_pair demux_location = dispatch_core_manager::get(curr_num_hw_cqs).demux_core(device_id, curr_channel, cq_id);
-                        CoreCoord phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type);
-                        cores_to_skip.insert(phys_core);
-                        log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will keep running on MMIO Device.", demux_location.str(), phys_core.str());
-                    }
-                }
-            }
-        }
-    } else {
-        //remote device that is active
-        uint8_t curr_num_hw_cqs = 1;
-        auto device_id = this->id_;
-        uint16_t curr_channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
-        CoreType dispatch_core_type = dispatch_core_manager::get(curr_num_hw_cqs).get_dispatch_core_type(device_id);
-        for (uint8_t cq_id = 0; cq_id < curr_num_hw_cqs; cq_id++) {
-            if (dispatch_core_manager::get(curr_num_hw_cqs).is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) {
-                tt_cxy_pair dispatch_location = dispatch_core_manager::get(curr_num_hw_cqs).dispatcher_core(device_id, curr_channel, cq_id);
-                CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type);
-                not_done_dispatch_cores.insert(phys_core);
-                log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will be reset on MMIO Device.", dispatch_location.str(), phys_core.str());
-            }
-            if (dispatch_core_manager::get(curr_num_hw_cqs).is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) {
-                tt_cxy_pair prefetch_location = dispatch_core_manager::get(curr_num_hw_cqs).prefetcher_core(device_id, curr_channel, cq_id);
-                CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type);
-                not_done_dispatch_cores.insert(phys_core);
-                log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will be reset on MMIO Device.", prefetch_location.str(), phys_core.str());
-            }
-            if (dispatch_core_manager::get(curr_num_hw_cqs).is_mux_core_allocated(device_id, curr_channel, cq_id)) {
-                tt_cxy_pair mux_location = dispatch_core_manager::get(curr_num_hw_cqs).mux_core(device_id, curr_channel, cq_id);
-                CoreCoord phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type);
-                not_done_dispatch_cores.insert(phys_core);
-                log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will be reset on MMIO Device.", mux_location.str(), phys_core.str());
-            }
-            if (dispatch_core_manager::get(curr_num_hw_cqs).is_demux_core_allocated(device_id, curr_channel, cq_id)) {
-                tt_cxy_pair demux_location = dispatch_core_manager::get(curr_num_hw_cqs).demux_core(device_id, curr_channel, cq_id);
-                CoreCoord phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type);
-                not_done_dispatch_cores.insert(phys_core);
-                log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will be reset on MMIO Device.", demux_location.str(), phys_core.str());
-            }
-        }
-    }
+    std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> not_done_dispatch_cores;
+    std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> cores_to_skip;
+    this->get_associated_dispatch_phys_cores(not_done_dispatch_cores, cores_to_skip);
 
     auto mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id_);
-    std::unordered_set<CoreCoord> wait_for_cores = not_done_dispatch_cores;
+    std::unordered_set<CoreCoord> wait_for_cores = not_done_dispatch_cores[mmio_device_id];
 
     llrt::internal_::wait_until_cores_done(mmio_device_id, RUN_MSG_GO, wait_for_cores);
@@ -1768,7 +1874,7 @@ bool Device::close() {
             CoreCoord logical_core(x, y);
             CoreCoord worker_core = this->worker_core_from_logical_core(logical_core);
 
-            if (cores_to_skip.find(worker_core) == cores_to_skip.end()) {
+            if (cores_to_skip[mmio_device_id].find(worker_core) == cores_to_skip[mmio_device_id].end()) {
                 if (this->storage_only_cores_.find(logical_core) == this->storage_only_cores_.end()) {
                     tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), worker_core));
                 }
@@ -1779,7 +1885,7 @@ bool Device::close() {
     }
 
     if (this->id_ != mmio_device_id) {
-        for (auto it = not_done_dispatch_cores.begin(); it != not_done_dispatch_cores.end(); it++) {
+        for (auto it = not_done_dispatch_cores[mmio_device_id].begin(); it != not_done_dispatch_cores[mmio_device_id].end(); it++) {
             const auto &phys_core = *it;
             if(llrt::is_ethernet_core(phys_core, this->id_)) {
                 log_debug(tt::LogMetal, "Ethernet dispatch core {} on Device {} is idle. Closing Device {}", phys_core.str(), mmio_device_id, this->id());
diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp
index 35a962273e9..f3171259c78 100644
--- a/tt_metal/impl/device/device.hpp
+++ b/tt_metal/impl/device/device.hpp
@@ -215,14 +215,19 @@ class Device {
     void initialize_build();
     void build_firmware();
     void initialize_firmware(CoreCoord phys_core, launch_msg_t *launch_msg);
+    void reset_cores();
     void initialize_and_launch_firmware();
-    void initialize_command_queue();
+    void init_command_queue_host();
+    void init_command_queue_device();
     void initialize_synchronous_sw_cmd_queue();
     void configure_kernel_variant(Program& program, string path, std::vector<uint32_t> compile_args, CoreCoord kernel_core, CoreCoord Kernel_physical_core, CoreType dispatch_core_type, CoreCoord upstream_physical_core, CoreCoord downstream_physical_core, std::map<string, string> defines_in, NOC noc_index, bool is_active_eth_core = false);
     void compile_command_queue_programs();
     void configure_command_queue_programs();
     void clear_l1_state();
+    void get_associated_dispatch_phys_cores(
+        std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> &my_dispatch_cores,
+        std::unordered_map<chip_id_t, std::unordered_set<CoreCoord>> &other_dispatch_cores);
     std::pair<int, int> build_processor_type_to_index(JitBuildProcessorType t) const;
 
     // Puts device into reset
diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp
index 38ec6efa00b..0c8ba918bcf 100644
--- a/tt_metal/impl/device/device_pool.cpp
+++ b/tt_metal/impl/device/device_pool.cpp
@@ -107,23 +107,31 @@ DevicePool* DevicePool::_inst = nullptr;
 
 void DevicePool::initialize_device(Device* dev) const {
     detail::ClearProfilerControlBuffer(dev);
-    // TODO: as optimization, investigate removing all thisi call for already initialized devivces
-    dev->initialize_and_launch_firmware();
-
-    DprintServerAttach(dev);
-    watcher_init(dev);
-    watcher_attach(dev);
 
     // Create system memory writer for this device to have an associated interface to hardware command queue (i.e.
-    // hugepage)
-    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+    // hugepage). Need to do this before FW init so we know what dispatch cores to reset.
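+    // Order below: host-side CQ state, then core reset, then firmware launch, then device-side CQ setup.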
+    bool using_fast_dispatch = (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr);
+    if (using_fast_dispatch) {
         detail::DispatchStateCheck(true);
-        dev->initialize_command_queue();
+        dev->init_command_queue_host();
     } else {
         detail::DispatchStateCheck(false);
         dev->initialize_synchronous_sw_cmd_queue();
         TT_ASSERT(dev->num_hw_cqs() == 1, "num_hw_cqs must be 1 in slow dispatch");
     }
+
+    DprintServerAttach(dev);
+    watcher_init(dev);
+
+    // TODO: as optimization, investigate removing this call for already initialized devices
+    dev->reset_cores();
+    dev->initialize_and_launch_firmware();
+
+    watcher_attach(dev);
+
+    // Set up HW command queues on device for FD
+    if (using_fast_dispatch)
+        dev->init_command_queue_device();
+
     detail::InitDeviceProfiler(dev);
 }
diff --git a/tt_metal/impl/dispatch/kernels/cq_common.hpp b/tt_metal/impl/dispatch/kernels/cq_common.hpp
index 6a417a9ba95..daa7bc376bf 100644
--- a/tt_metal/impl/dispatch/kernels/cq_common.hpp
+++ b/tt_metal/impl/dispatch/kernels/cq_common.hpp
@@ -8,6 +8,7 @@
 #include "dataflow_api.h"
 #include "debug/dprint.h"
 #include "debug/ring_buffer.h"
+#include "cq_helpers.hpp"
 
 FORCE_INLINE
 uint32_t round_up_pow2(uint32_t v, uint32_t pow2_size) {
@@ -110,7 +111,10 @@ FORCE_INLINE
 void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0) {
     DEBUG_STATUS("NSIW");
-    while (!noc_cmd_buf_ready(noc_index, NCRISC_WR_CMD_BUF));
+    uint32_t heartbeat = 0;
+    while (!noc_cmd_buf_ready(noc_index, NCRISC_WR_CMD_BUF)) {
+        IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
+    }
     DEBUG_STATUS("NSID");
 
     constexpr bool multicast_path_reserve = false;
@@ -155,7 +159,10 @@ void cb_acquire_pages(uint32_t n) {
     DEBUG_STATUS("DAPW");
     // Use a wrapping compare here to compare distance
     // Required for trace which steals downstream credits and may make the value negative
-    while (wrap_gt(n, *sem_addr));
+    uint32_t heartbeat = 0;
+    while (wrap_gt(n, *sem_addr)) {
+        IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
+    }
     DEBUG_STATUS("DAPD");
     noc_semaphore_inc(get_noc_addr_helper(noc_xy, (uint32_t)sem_addr), -n);
 }
@@ -183,8 +190,10 @@ uint32_t cb_acquire_pages(uint32_t cb_fence,
         // Ensure last sem_inc has landed
         noc_async_atomic_barrier();
 
-        DEBUG_STATUS("UAPW");
-        while ((available = *sem_addr) == 0);
+        uint32_t heartbeat = 0;
+        while ((available = *sem_addr) == 0) {
+            IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat, 0);
+        }
         DEBUG_STATUS("UAPD");
     }
diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
index 42ddb5f36e0..7540ca5ff82 100644
--- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
+++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
@@ -806,14 +806,10 @@ static void process_wait() {
     DEBUG_STATUS("PWW");
     volatile tt_l1_ptr uint32_t *sem_addr = reinterpret_cast<volatile tt_l1_ptr uint32_t *>(addr);
     DPRINT << " DISPATCH WAIT " << HEX() << addr << DEC() << " count " << count << ENDL();
-#if defined(COMPILE_FOR_IDLE_ERISC)
     uint32_t heartbeat = 0;
-#endif
     if (wait) {
         while (!wrap_ge(*sem_addr, count)) {
-#if defined(COMPILE_FOR_IDLE_ERISC)
-            RISC_POST_HEARTBEAT(heartbeat);
-#endif
+            IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
         }
     }
     DEBUG_STATUS("PWD");
@@ -1012,6 +1008,7 @@
         cq_write_interface.completion_fifo_wr_toggle = completion_queue_wr_ptr_and_toggle >> 31;
     }
     bool done = false;
+    uint32_t heartbeat = 0;
     while (!done) {
         DeviceZoneScopedND("CQ-DISPATCH", block_noc_writes_to_clear, rd_block_idx );
         if (cmd_ptr == cb_fence) {
@@ -1024,6 +1021,8 @@
                 cmd_ptr, cb_fence, block_noc_writes_to_clear, block_next_start_addr, rd_block_idx);
         }
+        IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
+
         done = is_d_variant ? process_cmd_d(cmd_ptr) : process_cmd_h(cmd_ptr);
 
         // Move to next page
@@ -1048,11 +1047,6 @@
         noc_semaphore_inc(get_noc_addr_helper(upstream_noc_xy, get_semaphore(upstream_dispatch_cb_sem_id)), 0x80000000);
     }
 
-#if defined(COMPILE_FOR_IDLE_ERISC)
-    uint32_t heartbeat = 0;
-    RISC_POST_HEARTBEAT(heartbeat);
-#endif
-
     // Release any held pages from the last block
     if (rd_block_idx != wr_block_idx) {
         // We're 1 block behind
diff --git a/tt_metal/impl/dispatch/kernels/cq_helpers.hpp b/tt_metal/impl/dispatch/kernels/cq_helpers.hpp
new file mode 100644
index 00000000000..62b6ae9a92a
--- /dev/null
+++ b/tt_metal/impl/dispatch/kernels/cq_helpers.hpp
@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "risc_attribs.h"
+#include "dataflow_api.h"
+
+// Macros for determining if an early exit is signalled, ERISC only.
+#if defined(COMPILE_FOR_IDLE_ERISC)
+// Helper function to determine if the dispatch kernel needs to early exit, only valid for IERISC.
+FORCE_INLINE bool early_exit() {
+    tt_l1_ptr mailboxes_t * const mailbox = (tt_l1_ptr mailboxes_t *)(MEM_IERISC_MAILBOX_BASE);
+    return mailbox->launch.exit_erisc_kernel;
+}
+
+#define IDLE_ERISC_RETURN(...) \
+    if (early_exit()) { return __VA_ARGS__; }
+
+#define IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat, ...) \
+    RISC_POST_HEARTBEAT(heartbeat); \
+    IDLE_ERISC_RETURN(__VA_ARGS__);
+
+#else
+
+#define IDLE_ERISC_RETURN(...)
+#define IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat, ...)
+
+#endif
diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
index 53d8f0bdafe..bb8723021d5 100644
--- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
+++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
@@ -279,7 +279,10 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
         // By here, prefetch_q_ready must be false
         // Nothing to fetch, nothing pending, nothing available, stall on host
         DEBUG_STATUS("HQW");
-        while ((fetch_size = *prefetch_q_rd_ptr) == 0);
+        uint32_t heartbeat = 0;
+        while ((fetch_size = *prefetch_q_rd_ptr) == 0) {
+            IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
+        }
         fetch_q_get_cmds(fence, cmd_ptr, pcie_read_ptr);
         DEBUG_STATUS("HQD");
     }
@@ -842,13 +845,9 @@ uint32_t process_stall(uint32_t cmd_ptr) {
 
     DEBUG_STATUS("PSW");
     volatile tt_l1_ptr uint32_t* sem_addr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(downstream_sync_sem_id));
-#if defined(COMPILE_FOR_IDLE_ERISC)
     uint32_t heartbeat = 0;
-#endif
     while (*sem_addr != count) {
-#if defined(COMPILE_FOR_IDLE_ERISC)
-        RISC_POST_HEARTBEAT(heartbeat);
-#endif
+        IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat, CQ_PREFETCH_CMD_BARE_MIN_SIZE);
     }
     DEBUG_STATUS("PSD");
@@ -1253,6 +1252,7 @@ inline uint32_t relay_cb_get_cmds(uint32_t& fence, uint32_t& data_ptr) {
             block_noc_writes_to_clear,
             block_next_start_addr,
             rd_block_idx);
+        IDLE_ERISC_RETURN(length - sizeof(CQPrefetchHToPrefetchDHeader));
     }
 
     data_ptr += sizeof(CQPrefetchHToPrefetchDHeader);
@@ -1266,9 +1266,12 @@ void kernel_main_h() {
     uint32_t fence = cmddat_q_base;
 
     bool done = false;
+    uint32_t heartbeat = 0;
     while (!done) {
         fetch_q_get_cmds(fence, cmd_ptr, pcie_read_ptr);
+        IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
+
         volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)(cmd_ptr + sizeof(CQPrefetchHToPrefetchDHeader));
         uint32_t cmd_id = cmd->base.cmd_id;
         // Infer that an exec_buf command is to be executed based on the stall state.
@@ -1281,10 +1284,6 @@ void kernel_main_h() {
             DPRINT << "prefetch terminating_10" << ENDL();
             done = true;
         }
-#if defined(COMPILE_FOR_IDLE_ERISC)
-        uint32_t heartbeat = 0;
-        RISC_POST_HEARTBEAT(heartbeat);
-#endif
     }
 }
@@ -1303,11 +1302,14 @@ void kernel_main_d() {
     uint32_t fence = cmddat_q_base;
 
     bool done = false;
+    uint32_t heartbeat = 0;
     while (!done) {
         // cmds come in packed batches based on HostQ reads in prefetch_h
         // once a packed batch ends, we need to jump to the next page
         uint32_t length = relay_cb_get_cmds(fence, cmd_ptr);
+        IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
+
         uint32_t amt_processed = 0;
         while (length > amt_processed) {
             uint32_t stride;
@@ -1335,10 +1337,6 @@ void kernel_main_d() {
 
         // Move to next page
         cmd_ptr = round_up_pow2(cmd_ptr, cmddat_q_page_size);
-#if defined(COMPILE_FOR_IDLE_ERISC)
-        uint32_t heartbeat = 0;
-        RISC_POST_HEARTBEAT(heartbeat);
-#endif
     }
 
     // Set upstream semaphore MSB to signal completion and path teardown
@@ -1354,20 +1352,19 @@ void kernel_main_hd() {
     uint32_t cmd_ptr = cmddat_q_base;
     uint32_t fence = cmddat_q_base;
 
     bool done = false;
+    uint32_t heartbeat = 0;
     while (!done) {
         DeviceZoneScopedND("KERNEL-MAIN-HD", block_noc_writes_to_clear, rd_block_idx );
         constexpr uint32_t preamble_size = 0;
         fetch_q_get_cmds(fence, cmd_ptr, pcie_read_ptr);
+        IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
+
         volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr;
 
         uint32_t stride;
         done = process_cmd(cmd_ptr, downstream_data_ptr, stride);
         cmd_ptr += stride;
-
-#if defined(COMPILE_FOR_IDLE_ERISC)
-        uint32_t heartbeat = 0;
-        RISC_POST_HEARTBEAT(heartbeat);
-#endif
     }
 }
@@ -1386,6 +1383,7 @@ void kernel_main() {
     } else {
         ASSERT(0);
     }
+    IDLE_ERISC_RETURN();
 
     // Confirm expected number of pages, spinning here is a leak
     cb_wait_all_pages(downstream_cb_pages);
diff --git a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp
index 284d17e363a..38a543a8082 100644
--- a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp
+++ b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp
@@ -5,6 +5,7 @@
 // clang-format off
 #include "dataflow_api.h"
 #include "debug/dprint.h"
+#include "debug/ring_buffer.h"
 #include "tt_metal/impl/dispatch/kernels/packet_queue.hpp"
 #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp"
 // clang-format on
@@ -151,6 +152,10 @@ void kernel_main() {
             all_outputs_finished &= output_finished;
         }
 
+        tt_l1_ptr launch_msg_t * const launch_msg = GET_MAILBOX_ADDRESS_DEV(launch);
+        if (launch_msg->exit_erisc_kernel) {
+            return;
+        }
         // need to optimize this.
         // context switch to base fw is very costly.
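+        // (The exit_erisc_kernel check above gives the host a way to break this loop during clean init.)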
         internal_::risc_context_switch();
diff --git a/tt_metal/impl/dispatch/kernels/packet_demux.cpp b/tt_metal/impl/dispatch/kernels/packet_demux.cpp
index 589ff65db19..01cb0e379b0 100644
--- a/tt_metal/impl/dispatch/kernels/packet_demux.cpp
+++ b/tt_metal/impl/dispatch/kernels/packet_demux.cpp
@@ -5,9 +5,9 @@
 #include "dataflow_api.h"
 #include "debug/dprint.h"
 #include "tt_metal/impl/dispatch/kernels/packet_queue.hpp"
+#include "tt_metal/impl/dispatch/kernels/cq_helpers.hpp"
 #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp"
 
-
 packet_input_queue_state_t input_queue;
 packet_output_queue_state_t output_queues[MAX_SWITCH_FAN_OUT];
@@ -223,7 +223,9 @@ void kernel_main() {
     uint64_t iter = 0;
     uint64_t start_timestamp = get_timestamp();
     uint32_t progress_timestamp = start_timestamp & 0xFFFFFFFF;
+    uint32_t heartbeat = 0;
     while (!all_outputs_finished && !timeout) {
+        IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
         iter++;
         if (timeout_cycles > 0) {
             uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp;
diff --git a/tt_metal/impl/dispatch/kernels/packet_mux.cpp b/tt_metal/impl/dispatch/kernels/packet_mux.cpp
index 8127f40a678..726b8f7cc30 100644
--- a/tt_metal/impl/dispatch/kernels/packet_mux.cpp
+++ b/tt_metal/impl/dispatch/kernels/packet_mux.cpp
@@ -5,6 +5,7 @@
 #include "dataflow_api.h"
 #include "debug/dprint.h"
 #include "tt_metal/impl/dispatch/kernels/packet_queue.hpp"
+#include "tt_metal/impl/dispatch/kernels/cq_helpers.hpp"
 
 packet_input_queue_state_t input_queues[MAX_SWITCH_FAN_IN];
 packet_output_queue_state_t output_queue;
@@ -175,7 +176,9 @@ void kernel_main() {
     uint64_t iter = 0;
     uint64_t start_timestamp = get_timestamp();
     uint32_t progress_timestamp = start_timestamp & 0xFFFFFFFF;
+    uint32_t heartbeat = 0;
     while (!dest_finished && !timeout) {
+        IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
         iter++;
         if (timeout_cycles > 0) {
             uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp;
diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
index 9b054d4abec..01b736c0f3c 100644
--- a/tt_metal/impl/program/program.cpp
+++ b/tt_metal/impl/program/program.cpp
@@ -172,6 +172,7 @@ KernelGroup::KernelGroup(
         this->launch_msg.watcher_kernel_ids[DISPATCH_CLASS_ETH_DM0] = kernel->get_watcher_kernel_id();
     }
 
+    this->launch_msg.exit_erisc_kernel = false;
     this->launch_msg.max_cb_index = last_cb_index + 1;
     this->launch_msg.run = RUN_MSG_GO;
 }
diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp
index bc54c1c697b..b4445c914e1 100644
--- a/tt_metal/llrt/llrt.cpp
+++ b/tt_metal/llrt/llrt.cpp
@@ -339,13 +339,25 @@ static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreC
     return get_mailbox_is_done(run_mailbox_addr);
 }
 
-void wait_until_cores_done(chip_id_t device_id,
-                           int run_state,
-                           std::unordered_set<CoreCoord>& not_done_phys_cores) {
-
+void wait_until_cores_done(
+    chip_id_t device_id, int run_state, std::unordered_set<CoreCoord> &not_done_phys_cores, int timeout_ms) {
     // poll the cores until the set of not done cores is empty
     int loop_count = 1;
+    auto start = std::chrono::high_resolution_clock::now();
     while (!not_done_phys_cores.empty()) {
+        if (timeout_ms > 0) {
+            auto now = std::chrono::high_resolution_clock::now();
+            auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
+            if (elapsed > timeout_ms) {
+                std::string cores = fmt::format("{}", fmt::join(not_done_phys_cores, ", "));
+                TT_THROW(
+                    "Device {}: Timeout ({} ms) waiting for physical cores to finish: {}.",
+                    device_id,
+                    timeout_ms,
+                    cores);
+            }
+        }
+
         // Print not-done cores
         if (loop_count % 1000 == 0) {
             string not_done_cores_str = "Not done phys cores: ";
diff --git a/tt_metal/llrt/llrt.hpp b/tt_metal/llrt/llrt.hpp
index 693ed16f02c..33ddb6a44aa 100644
--- a/tt_metal/llrt/llrt.hpp
+++ b/tt_metal/llrt/llrt.hpp
@@ -114,7 +114,8 @@ CoreCoord get_core_for_dram_channel(int dram_channel_id, chip_id_t chip_id = 0);
 
 namespace internal_ {
 
-void wait_until_cores_done(chip_id_t device_id, int run_state, std::unordered_set<CoreCoord> &not_done_phys_cores);
+void wait_until_cores_done(
+    chip_id_t device_id, int run_state, std::unordered_set<CoreCoord> &not_done_phys_cores, int timeout_ms = 0);
 
 }  // namespace internal_
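
For reference, the early-exit handshake this patch implements can be condensed into the following host-side sketch. This is illustrative only and not part of the patch; read_launch_msg() and write_launch_msg() are hypothetical stand-ins for the tt::llrt::read_hex_vec_from_core() / llrt::write_launch_msg_to_core() calls that Device::reset_cores() uses above.

    // Illustrative sketch of the clean-init handshake (hypothetical helpers, see note above).
    launch_msg_t launch = read_launch_msg(device_id, phys_core);  // read the launch mailbox out of L1
    if (launch.run == RUN_MSG_GO && launch.exit_erisc_kernel == 0) {
        launch.exit_erisc_kernel = 1;                   // observed by IDLE_ERISC_RETURN in the kernel's wait loops
        write_launch_msg(device_id, phys_core, launch); // write the mailbox back
        // The kernel sets run = RUN_MSG_DONE on its way out; reset_cores() allows 10 s for this.
        std::unordered_set<CoreCoord> cores = {phys_core};
        llrt::internal_::wait_until_cores_done(device_id, RUN_MSG_GO, cores, 10000);
    }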