Skip to content

Commit

Permalink
#6430: Fix reset-based hangs for WH
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-dma committed Jul 8, 2024
1 parent b217738 commit 9c21cd2
Show file tree
Hide file tree
Showing 21 changed files with 509 additions and 154 deletions.
23 changes: 22 additions & 1 deletion tests/scripts/run_tools_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ if [[ -z "$TT_METAL_HOME" ]]; then
exit 1
fi

# For now, only test watcher dump tool here.
if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then
# Watcher dump tool testing
echo "Running watcher dump tool tests..."

# Run a test that populates basic fields but not watcher fields
./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*PrintHanging

Expand All @@ -30,4 +32,23 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then
# Remove created files.
rm tmp.log
rm generated/watcher/watcher.log
echo "Watcher dump tool tests finished..."


# Clean init testing
echo "Running clean init tests - FD-on-Tensix"
echo "First run, no teardown"
./build/test/tt_metal/test_clean_init --skip-teardown || { echo "Above failure is expected."; }
echo "Second run, expect clean init"
timeout 10 ./build/test/tt_metal/test_clean_init || { echo "Error: second run timed out, clean init (FD-on-Tensix) failed."; exit 1; }
echo "Clean init tests - FD-on-Tensix passed!"

if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
echo "Running clean init tests - FD-on-Eth"
echo "First run, no teardown"
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./build/test/tt_metal/test_clean_init --skip-teardown || { echo "Above failure is expected."; }
echo "Second run, expect clean init"
timeout 10 env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./build/test/tt_metal/test_clean_init || { echo "Error: second run timed out, clean init (FD-on-Eth) failed."; exit 1; }
echo "Clean init tests - FD-on-Eth passed!"
fi
fi
1 change: 1 addition & 0 deletions tests/tt_metal/tt_metal/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ set (TT_METAL_TESTS
test_core_range_set
test_compile_sets_kernel_binaries
test_compile_program
test_clean_init
)

foreach (TEST ${TT_METAL_TESTS})
Expand Down
151 changes: 151 additions & 0 deletions tests/tt_metal/tt_metal/test_clean_init.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "common/bfloat16.hpp"
#include <chrono>

/*
* Similar to loopback programming example, except run on al devices and skip device teardown to check if we can
* recover from a "bad" state.
*/

using namespace tt::tt_metal;

int main(int argc, char **argv) {

if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) {
TT_THROW("Test not supported w/ slow dispatch, exiting");
}

// Any arg means that we shouldn't do teardown.
bool skip_teardown = (argc > 1);
if (skip_teardown)
tt::log_info("Running loopback test with no teardown, to see if we can recover next run.");
else
tt::log_info("Running loopback test with proper teardown");

bool pass = true;
auto num_devices = tt::tt_metal::GetNumAvailableDevices();
vector<chip_id_t> ids;
for (unsigned int id = 0; id < num_devices; id++) {
ids.push_back(id);
}
tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE);
std::vector<Device *> devices = tt::DevicePool::instance().get_all_active_devices();

for (int device_id = 0; device_id < num_devices; device_id++) {
try {
/*
* Silicon accelerator setup
*/
Device *device = devices[device_id];

/*
* Setup program and command queue to execute along with its buffers and kernels to use
*/
CommandQueue& cq = device->command_queue();
Program program = CreateProgram();

constexpr CoreCoord core = {0, 0};

KernelHandle dram_copy_kernel_id = CreateKernel(
program,
"tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp",
core,
DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}
);

constexpr uint32_t single_tile_size = 2 * (32 * 32);
constexpr uint32_t num_tiles = 50;
constexpr uint32_t dram_buffer_size = single_tile_size * num_tiles;

tt::tt_metal::InterleavedBufferConfig dram_config{
.device= device,
.size = dram_buffer_size,
.page_size = dram_buffer_size,
.buffer_type = tt::tt_metal::BufferType::DRAM
};
tt::tt_metal::InterleavedBufferConfig l1_config{
.device= device,
.size = dram_buffer_size,
.page_size = dram_buffer_size,
.buffer_type = tt::tt_metal::BufferType::L1
};

auto l1_buffer = CreateBuffer(l1_config);

auto input_dram_buffer = CreateBuffer(dram_config);
const uint32_t input_dram_buffer_addr = input_dram_buffer->address();

auto output_dram_buffer = CreateBuffer(dram_config);
const uint32_t output_dram_buffer_addr = output_dram_buffer->address();

/*
* Create input data and runtime arguments, then execute
*/
std::vector<uint32_t> input_vec = create_random_vector_of_bfloat16(
dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count());
EnqueueWriteBuffer(cq, input_dram_buffer, input_vec, false);

const std::vector<uint32_t> runtime_args = {
l1_buffer->address(),
input_dram_buffer->address(),
static_cast<uint32_t>(input_dram_buffer->noc_coordinates().x),
static_cast<uint32_t>(input_dram_buffer->noc_coordinates().y),
output_dram_buffer->address(),
static_cast<uint32_t>(output_dram_buffer->noc_coordinates().x),
static_cast<uint32_t>(output_dram_buffer->noc_coordinates().y),
l1_buffer->size()
};

SetRuntimeArgs(
program,
dram_copy_kernel_id,
core,
runtime_args
);

EnqueueProgram(cq, program, false);
tt::log_info("Started program");
Finish(cq);
tt::log_info("Finished program");

/*
* Validation & Teardown
*/
std::vector<uint32_t> result_vec;
EnqueueReadBuffer(cq, output_dram_buffer, result_vec, true);

pass &= input_vec == result_vec;

} catch (const std::exception &e) {
tt::log_error(tt::LogTest, "Test failed with exception!");
tt::log_error(tt::LogTest, "{}", e.what());

throw;
}
}

if (pass) {
tt::log_info(tt::LogTest, "Test Passed");
} else {
TT_THROW("Test Failed");
}

// Skip teardown by throwing.
if (skip_teardown) {
TT_FATAL(false, "Skip teardown by throwing");
} else {
for (auto device : devices) {
pass &= CloseDevice(device);
}
}

// Error out with non-zero return code if we don't detect a pass
TT_FATAL(pass);

return 0;
}
4 changes: 2 additions & 2 deletions tt_metal/hw/inc/dataflow_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1485,10 +1485,10 @@ void noc_async_write_barrier() {
*/
FORCE_INLINE
void noc_async_writes_flushed() {
DEBUG_STATUS("NWBW");
DEBUG_STATUS("NWFW");
while (!ncrisc_noc_nonposted_writes_sent(noc_index))
;
DEBUG_STATUS("NWBD");
DEBUG_STATUS("NWFD");
}

/**
Expand Down
4 changes: 4 additions & 0 deletions tt_metal/hw/inc/debug/assert.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ void assert_and_hang(uint32_t line_num) {
v->which = debug_get_which_riscv();
}

// Update launch msg to show that we've exited.
tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch);
launch_msg->run = RUN_MSG_DONE;

// Hang, or in the case of erisc, early exit.
#if defined(COMPILE_FOR_ERISC)
internal_::disable_erisc_app();
Expand Down
10 changes: 5 additions & 5 deletions tt_metal/hw/inc/debug/sanitize_noc.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,18 +81,18 @@ inline void debug_sanitize_post_noc_addr_and_hang(
v[noc_index].invalid = invalid;
}

// Update launch msg to show that we've exited.
tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch);
launch_msg->run = RUN_MSG_DONE;

#if defined(COMPILE_FOR_ERISC)
// For erisc, we can't hang the kernel/fw, because the core doesn't get restarted when a new
// kernel is written. In this case we'll do an early exit back to base FW.
internal_::disable_erisc_app();
erisc_early_exit(eth_l1_mem::address_map::ERISC_MEM_MAILBOX_STACK_SAVE);
#endif

while (1) {
#if defined(COMPILE_FOR_ERISC)
internal_::risc_context_switch();
#endif
}
while (1) { ; }
}

// Return value is the alignment mask for the type of core the noc address points
Expand Down
1 change: 1 addition & 0 deletions tt_metal/hw/inc/dev_msgs.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ struct launch_msg_t { // must be cacheline aligned
volatile uint8_t max_cb_index;
volatile uint8_t dispatch_core_x;
volatile uint8_t dispatch_core_y;
volatile uint8_t exit_erisc_kernel;
volatile uint8_t run; // must be in last cacheline of this msg
};

Expand Down
4 changes: 4 additions & 0 deletions tt_metal/impl/debug/watcher_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,10 @@ static void dump_core(
// requested explicitly
dump_sync_regs(f, device, core);
}
} else {
fprintf(f, "rmsg:");
dump_run_state(f, core, &mbox_data->launch, mbox_data->launch.run);
fprintf(f, " ");
}

// Eth core only reports erisc kernel id, uses the brisc field
Expand Down
Loading

0 comments on commit 9c21cd2

Please sign in to comment.