Skip to content

Commit

Permalink
#753: Working 8-chip sync
Browse files Browse the repository at this point in the history
  • Loading branch information
mo-tenstorrent committed May 3, 2024
1 parent d569105 commit e35a490
Show file tree
Hide file tree
Showing 18 changed files with 493 additions and 45 deletions.
2 changes: 2 additions & 0 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ namespace kernel_profiler {
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

void enable_power_management() {
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/hw/firmware/src/erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ namespace kernel_profiler {
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

uint8_t noc_index = 0; // TODO: remove hardcoding
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ namespace kernel_profiler {
uint32_t stackSize __attribute__((used));
uint32_t sums[SUM_COUNT] __attribute__((used));
uint32_t sumIDs[SUM_COUNT] __attribute__((used));
bool resultsPushed __attribute__((used));
uint16_t core_flat_id __attribute__((used));
}

inline void RISC_POST_STATUS(uint32_t status) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ void kernel_main() {
uint32_t remote_issue_cmd_slots = 0;

while (routing_info->routing_enabled) {
DeviceZoneScopedMainN("ETH-TUNNEL");
// Implement yielding if SENDER is not ISSUE, this may help with devices getting commands first
while (routing_info->routing_enabled && eth_src_db_semaphore_addr[0] == 0 &&
routing_info->fd_buffer_msgs[buffer_id].bytes_sent != 1 &&
Expand Down
1 change: 0 additions & 1 deletion tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ void kernel_main() {
completion_queue_push_back(completion_data_size, completion_queue_start_addr, host_completion_queue_write_ptr_addr);
record_last_completed_event(header->event);
{
DeviceZoneScopedN("CQ-NOTIFY_PROC");
// notify producer that it has completed a command
noc_semaphore_inc(producer_noc_encoding | get_semaphore(0), 1);
db_buf_switch = not db_buf_switch;
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/impl/dispatch/kernels/cq_dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ void kernel_main() {
db_cb_config_t* db_cb_config = get_local_db_cb_config(CQ_CONSUMER_CB_BASE);
uint32_t l1_consumer_fifo_limit = (db_cb_config->rd_ptr_16B << 4) + (db_cb_config->total_size_16B << 4);
while (true) {
//DeviceZoneScopedMainN("CQ-DISPATCHER");
DeviceZoneScopedMainN("CQ-DISPATCHER");
// Wait for producer to supply a command
uint32_t command_start_addr = get_command_slot_addr<cmd_base_address, 0>(db_buf_switch);
uint32_t program_transfer_start_addr = command_start_addr + ((DeviceCommand::NUM_ENTRIES_IN_COMMAND_HEADER + DeviceCommand::NUM_ENTRIES_PER_BUFFER_TRANSFER_INSTRUCTION * DeviceCommand::NUM_POSSIBLE_BUFFER_TRANSFERS) * sizeof(uint32_t));
Expand Down
5 changes: 5 additions & 0 deletions tt_metal/llrt/rtoptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,16 @@ RunTimeOptions::RunTimeOptions() {
test_mode_enabled = false;

profiler_enabled = false;
profiler_sync_enabled = false;
#if defined(PROFILER)
const char *profiler_enabled_str = std::getenv("TT_METAL_DEVICE_PROFILER");
if (profiler_enabled_str != nullptr && profiler_enabled_str[0] == '1') {
profiler_enabled = true;
}
const char *profiler_sync_enabled_str = std::getenv("TT_METAL_PROFILER_SYNC");
if (profiler_enabled && profiler_sync_enabled_str != nullptr && profiler_sync_enabled_str[0] == '1') {
profiler_sync_enabled = true;
}
#endif
TT_FATAL(!(get_dprint_enabled() && get_profiler_enabled()), "Cannot enable both debug printing and profiling");

Expand Down
2 changes: 2 additions & 0 deletions tt_metal/llrt/rtoptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class RunTimeOptions {
bool test_mode_enabled = false;

bool profiler_enabled = false;
bool profiler_sync_enabled = false;

bool null_kernels = false;

Expand Down Expand Up @@ -124,6 +125,7 @@ class RunTimeOptions {
inline void set_test_mode_enabled(bool enable) { test_mode_enabled = enable; }

inline bool get_profiler_enabled() { return profiler_enabled; }
inline bool get_profiler_sync_enabled() { return profiler_sync_enabled; }

inline void set_kernels_nullified(bool v) { null_kernels = v; }
inline bool get_kernels_nullified() { return null_kernels; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <cstdint>

#include "debug/dprint.h"
/**
* LOOP_COUNT and LOOP_SIZE provide the ability to decide how many cycles this kernel takes.
* With a large enough LOOP_COUNT and a LOOP_SIZEs within icache size, cycle count will be
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,19 @@
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"

#include <chrono>
#include <thread>

using namespace tt;

bool RunCustomCycle(tt_metal::Device *device, int loop_count, bool dumpProfile = false)
bool RunCustomCycle(tt_metal::Device *device, int loop_count, bool lastCall = false)
{
bool pass = true;

CoreCoord compute_with_storage_size = device->compute_with_storage_grid_size();
CoreCoord start_core = {0, 0};
CoreCoord end_core = {compute_with_storage_size.x - 1, compute_with_storage_size.y - 1};
CoreCoord end_core = {0, 0};
//CoreCoord end_core = {compute_with_storage_size.x - 1, compute_with_storage_size.y - 1};
CoreRange all_cores(start_core, end_core);

tt_metal::Program program = tt_metal::CreateProgram();
Expand Down Expand Up @@ -43,7 +47,7 @@ bool RunCustomCycle(tt_metal::Device *device, int loop_count, bool dumpProfile =
);

EnqueueProgram(device->command_queue(), program, false);
tt_metal::DumpDeviceProfileResults(device, program);
tt_metal::detail::DumpDeviceProfileResults(device, lastCall);

return pass;
}
Expand All @@ -61,6 +65,16 @@ int main(int argc, char **argv) {

int loop_count = 2000;
pass &= RunCustomCycle(device, loop_count);
std::this_thread::sleep_for(std::chrono::milliseconds(600));
pass &= RunCustomCycle(device, loop_count);
std::this_thread::sleep_for(std::chrono::milliseconds(600));
pass &= RunCustomCycle(device, loop_count);
std::this_thread::sleep_for(std::chrono::milliseconds(600));
pass &= RunCustomCycle(device, loop_count);
std::this_thread::sleep_for(std::chrono::milliseconds(600));
pass &= RunCustomCycle(device, loop_count);
std::this_thread::sleep_for(std::chrono::milliseconds(600));
pass &= RunCustomCycle(device, loop_count, true);

pass &= tt_metal::CloseDevice(device);

Expand Down
30 changes: 15 additions & 15 deletions tt_metal/tools/profiler/kernel_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,21 @@ namespace kernel_profiler{
extern uint32_t sums[SUM_COUNT];
extern uint32_t sumIDs[SUM_COUNT];

constexpr int WALL_CLOCK_HIGH_INDEX = 1;
constexpr int WALL_CLOCK_LOW_INDEX = 0;

#if defined(COMPILE_FOR_BRISC)
constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_BR;
constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_BR;
volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_CONTROL);
uint16_t core_flat_id;
extern uint16_t core_flat_id;
extern bool resultsPushed;
#elif defined(COMPILE_FOR_ERISC)
constexpr uint32_t profilerBuffer = eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER;
constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_ER;
volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(eth_l1_mem::address_map::PROFILER_L1_BUFFER_CONTROL);
uint16_t core_flat_id;
extern uint16_t core_flat_id;
extern bool resultsPushed;
#elif defined(COMPILE_FOR_NCRISC)
constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_NC;
constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_NC;
Expand Down Expand Up @@ -83,6 +88,7 @@ namespace kernel_profiler{

#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC)
uint32_t runCounter = profiler_control_buffer[RUN_COUNTER];
resultsPushed = false;

#if defined(COMPILE_FOR_ERISC)
volatile tt_l1_ptr uint32_t *eriscBuffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER);
Expand Down Expand Up @@ -164,6 +170,7 @@ namespace kernel_profiler{
trisc1Buffer[ID_LL] = runCounter;
trisc2Buffer[ID_LL] = runCounter;


#endif //BRISC_INIT
#endif
}
Expand All @@ -182,8 +189,8 @@ namespace kernel_profiler{
{
volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(kernel_profiler::profilerBuffer);
volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast<volatile tt_reg_ptr uint32_t *> (RISCV_DEBUG_REG_WALL_CLOCK_L);
buffer[index] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[1] & 0xFFF);
buffer[index+1] = p_reg[0];
buffer[index] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF);
buffer[index+1] = p_reg[WALL_CLOCK_LOW_INDEX];
}

PROFILER_INLINE void mark_padding()
Expand All @@ -197,15 +204,6 @@ namespace kernel_profiler{
}
}

PROFILER_INLINE void mark_BR_fw_first_start()
{
uint32_t time_L = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
uint32_t time_H = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);

profiler_control_buffer[FW_RESET_L] = time_L;
profiler_control_buffer[FW_RESET_H] = time_H;
}

inline __attribute__((always_inline)) void risc_finished_profiling()
{
for (int i = 0; i < SUM_COUNT; i ++)
Expand Down Expand Up @@ -235,6 +233,7 @@ namespace kernel_profiler{
risc_finished_profiling();
#if (defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_BRISC))

if (resultsPushed) return;
uint32_t pageSize =
PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * PROFILER_RISC_COUNT * profiler_core_count_per_dram;

Expand Down Expand Up @@ -317,6 +316,7 @@ namespace kernel_profiler{
#endif
noc_async_write_barrier();
profiler_control_buffer[RUN_COUNTER] ++;
resultsPushed = true;
#endif
}

Expand Down Expand Up @@ -392,12 +392,12 @@ namespace kernel_profiler{

inline __attribute__((always_inline)) profileScopeAccumulate ()
{
start_time = ((uint64_t)p_reg[1] << 32) | p_reg[0];
start_time = ((uint64_t)p_reg[WALL_CLOCK_HIGH_INDEX] << 32) | p_reg[WALL_CLOCK_LOW_INDEX];
}
inline __attribute__((always_inline)) ~profileScopeAccumulate ()
{
sumIDs[index] = timer_id;
sums[index] += (((uint64_t)p_reg[1] << 32) | p_reg[0]) - start_time;
sums[index] += (((uint64_t)p_reg[WALL_CLOCK_HIGH_INDEX] << 32) | p_reg[WALL_CLOCK_LOW_INDEX]) - start_time;
}
};
}
Expand Down
133 changes: 133 additions & 0 deletions tt_metal/tools/profiler/process_sync_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env python3

# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

import os
import sys
import subprocess
from io import StringIO
import contextlib
from loguru import logger

import pandas as pd
from sklearn.linear_model import TheilSenRegressor, LinearRegression
from sklearn.datasets import make_regression

from tt_metal.tools.profiler.common import (
TT_METAL_HOME,
PROFILER_DEVICE_SIDE_LOG,
PROFILER_BIN_DIR,
TRACY_FILE_NAME,
TRACY_CSVEXPROT_TOOL,
)

# RUN_TYPE = "runs_2min_apart_syncs"
RUN_TYPE = "runs_first_last_10ms_sync"

lsCmd = subprocess.run(
f"cd {TT_METAL_HOME / RUN_TYPE}; ls",
shell=True,
check=True,
capture_output=True,
)

runsList = lsCmd.stdout.decode().split("\n")[:-1]

runData = {}

pd.options.display.float_format = "{:,.2f}".format
pd.options.display.max_rows = 250

finalReport = pd.DataFrame()
for run in runsList:
tracyCsv = ""
logsFolder = TT_METAL_HOME / RUN_TYPE / run / ".logs"
tracyFile = logsFolder / TRACY_FILE_NAME
deviceCsvFile = logsFolder / PROFILER_DEVICE_SIDE_LOG
syncInfoFile = logsFolder / "sync_device_info.csv"

filesExist = True
for file in [tracyFile, deviceCsvFile, syncInfoFile]:
filesExist &= os.path.isfile(file)

testPoints = 2
if filesExist:
tracyDF = pd.DataFrame()
for zoneName in ["CQ-CONSUMER-MAIN", "HWCommandQueue_finish"]:
csvExportCmd = subprocess.run(
f"{PROFILER_BIN_DIR / TRACY_CSVEXPROT_TOOL} -u -f {zoneName} {tracyFile}",
shell=True,
check=True,
capture_output=True,
)

tracyCsv = csvExportCmd.stdout.decode()
if not tracyDF.empty:
tracyDF = pd.concat([tracyDF, pd.read_csv(StringIO(tracyCsv))])
else:
tracyDF = pd.read_csv(StringIO(tracyCsv))

tracyDF = tracyDF.reset_index()
tracyDF = tracyDF.drop(list(range(0, testPoints * 2, 2))) # remove CONSUMER_MAIN that is not relevant
tracyDF = tracyDF.reset_index()
tracyDF = tracyDF.drop(columns=["level_0", "index"])
tracyDF["ns_end_time"] = tracyDF["ns_since_start"] + tracyDF["exec_time_ns"]

syncinfoDF = pd.read_csv(syncInfoFile)

tracyBaseTime_ns = syncinfoDF.iloc[0]["tracy_base_time"] * syncinfoDF.iloc[0]["tracy_ratio"]

syncinfoDF["host_time_global [ns]"] = (syncinfoDF["host_tracy"] + syncinfoDF["host_start"]) * syncinfoDF[
"tracy_ratio"
]

X = [[x] for x in list(syncinfoDF["host_time_global [ns]"])]
y = list(syncinfoDF["device"])
with contextlib.redirect_stderr(StringIO()) as f:
# reg = TheilSenRegressor(random_state=0).fit(X, y)
reg = LinearRegression().fit(X, y)

frequency = reg.coef_[0]
delayCycle = reg.intercept_

print(frequency, delayCycle)

deviceDF = pd.read_csv(deviceCsvFile, skiprows=[0])
deviceDF = deviceDF.loc[(deviceDF[" zone name"] == "CQ-CONSUMER-MAIN") & (deviceDF[" zone phase"] == "end")]
deviceDF = deviceDF.reset_index()
deviceDF = deviceDF.drop(list(range(0, testPoints * 2, 2))) # remove CONSUMER_MAIN that is not relevant
deviceDF = deviceDF.reset_index()

deviceDF = deviceDF[[" time[cycles since reset]"]]
deviceDF["time[ns since reset]"] = deviceDF[" time[cycles since reset]"] / frequency
deviceDF["time[cycles host synced]"] = deviceDF[" time[cycles since reset]"] - delayCycle
deviceDF["time[ns host synced]"] = deviceDF["time[cycles host synced]"] / frequency
# deviceDF["corrected device time [ns]"] = round(deviceDF["time[ns host synced]"] + hostStart - tracyBaseTime_ns)

testDF = deviceDF[["time[ns host synced]"]].copy()
testDF["tracy device time [ns]"] = (
tracyDF.loc[(tracyDF["name"] == "CQ-CONSUMER-MAIN")]["ns_end_time"] + tracyBaseTime_ns
)
testDF["tracy host time [ns]"] = (
tracyDF.loc[(tracyDF["name"] == "HWCommandQueue_finish")].reset_index()["ns_end_time"] + tracyBaseTime_ns
)
testDF["original host device diff [ns]"] = testDF["tracy host time [ns]"] - testDF["tracy device time [ns]"]
testDF["new host device diff [ns]"] = testDF["tracy host time [ns]"] - testDF["time[ns host synced]"]
finalReportTmp = pd.DataFrame()
finalReportTmp["C++ diff 1"] = [testDF.iloc[0]["original host device diff [ns]"]]
finalReportTmp["C++ diff 2"] = testDF.iloc[1]["original host device diff [ns]"]
finalReportTmp["Python diff 1"] = testDF.iloc[0]["new host device diff [ns]"]
finalReportTmp["Python diff 2"] = testDF.iloc[1]["new host device diff [ns]"]
finalReportTmp["frequency init"] = syncinfoDF.iloc[5]["frequency"] * 1e9
finalReportTmp["frequency init + dump"] = syncinfoDF.iloc[260]["frequency"] * 1e9
finalReportTmp["delay init"] = syncinfoDF.iloc[5]["delay"]
finalReportTmp["delay init + dump"] = syncinfoDF.iloc[260]["delay"]

finalReport = pd.concat([finalReport, finalReportTmp])

print(finalReportTmp)

finalReport.reindex()
finalReport.to_excel("final_report.xlsx")
Loading

0 comments on commit e35a490

Please sign in to comment.