Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#753: Syncing device host times for tracy profiler #8101

Merged
merged 2 commits into from
Jun 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion tests/tt_metal/tools/profiler/test_device_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@
import inspect
import pytest

import pandas as pd

from tt_metal.tools.profiler.common import (
TT_METAL_HOME,
PROFILER_HOST_DEVICE_SYNC_INFO,
PROFILER_SCRIPTS_ROOT,
PROFILER_ARTIFACTS_DIR,
PROFILER_LOGS_DIR,
Expand All @@ -21,8 +24,10 @@
PROG_EXMP_DIR = "programming_examples/profiler"


def run_device_profiler_test(doubleRun=False, setup=False):
def run_device_profiler_test(testName=None, doubleRun=False, setup=False):
name = inspect.stack()[1].function
if testName:
name = testName
clear_profiler_runtime_artifacts()
profilerRun = os.system(f"cd {TT_METAL_HOME} && " f"build/{PROG_EXMP_DIR}/{name}")
assert profilerRun == 0
Expand Down Expand Up @@ -171,3 +176,21 @@ def test_dispatch_cores():
if statType in stat:
statTypesSet.remove(statType)
assert len(statTypesSet) == 0


def test_profiler_host_device_sync():
TOLERANCE = 0.1

os.environ["TT_METAL_PROFILER_SYNC"] = "1"
deviceData = run_device_profiler_test(testName="test_custom_cycle_count")
reprotedFreq = deviceData["data"]["deviceInfo"]["freq"] * 1e6

syncInfoFile = PROFILER_LOGS_DIR / PROFILER_HOST_DEVICE_SYNC_INFO

assert os.path.isfile(syncInfoFile)

syncinfoDF = pd.read_csv(syncInfoFile)
freq = float(syncinfoDF.iloc[-1]["frequency"]) * 1e9

assert freq < (reprotedFreq * (1 + TOLERANCE)), "Frequency too large on this {ENV_VAR_ARCH_NAME}"
assert freq > (reprotedFreq * (1 - TOLERANCE)), "Frequency too small on this {ENV_VAR_ARCH_NAME}"
5 changes: 5 additions & 0 deletions tt_metal/llrt/rtoptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ RunTimeOptions::RunTimeOptions() {

profiler_enabled = false;
profile_dispatch_cores = false;
profiler_sync_enabled = false;
#if defined(PROFILER)
const char *profiler_enabled_str = std::getenv("TT_METAL_DEVICE_PROFILER");
if (profiler_enabled_str != nullptr && profiler_enabled_str[0] == '1') {
Expand All @@ -56,6 +57,10 @@ RunTimeOptions::RunTimeOptions() {
if (profile_dispatch_str != nullptr && profile_dispatch_str[0] == '1') {
profile_dispatch_cores = true;
}
const char *profiler_sync_enabled_str = std::getenv("TT_METAL_PROFILER_SYNC");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Man another env var 🥴
@TT-billteng

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These env vars will be hidden behind tracy.py cli options pretty soon. User never need to now about them. I need this ticket in before I can add more CLI options.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you pass this via function parameters somehow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a runtime variable. We are deciding at runtime.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can pass runtime variables from the top all the way down through the stack?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In tracy.py I can bring tt-lib bindings for this switch and set it through that. That way I can avoid the env_var

if (profiler_enabled && profiler_sync_enabled_str != nullptr && profiler_sync_enabled_str[0] == '1') {
profiler_sync_enabled = true;
}
}
#endif
TT_FATAL(
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/llrt/rtoptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class RunTimeOptions {

bool profiler_enabled = false;
bool profile_dispatch_cores = false;
bool profiler_sync_enabled = false;

bool null_kernels = false;

Expand Down Expand Up @@ -220,6 +221,7 @@ class RunTimeOptions {

inline bool get_profiler_enabled() { return profiler_enabled; }
inline bool get_profiler_do_dispatch_cores() { return profile_dispatch_cores; }
inline bool get_profiler_sync_enabled() { return profiler_sync_enabled; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these functions should all be marked with const

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are return bools by value, does const mean much here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wow this class is a lot bigger than I thought; maybe it isn't worthwhile to stay consistent with rest of class
usually you should mark member functions const wherever you aren't modifying member variables


inline void set_kernels_nullified(bool v) { null_kernels = v; }
inline bool get_kernels_nullified() { return null_kernels; }
Expand Down
1 change: 1 addition & 0 deletions tt_metal/tools/profiler/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

PROFILER_DEVICE_SIDE_LOG = "profile_log_device.csv"
PROFILER_HOST_SIDE_LOG = "profile_log_host.csv"
PROFILER_HOST_DEVICE_SYNC_INFO = "sync_device_info.csv"

PROFILER_SCRIPTS_ROOT = TT_METAL_HOME / "tt_metal/tools/profiler"
PROFILER_ARTIFACTS_DIR = TT_METAL_HOME / "generated/profiler"
Expand Down
22 changes: 8 additions & 14 deletions tt_metal/tools/profiler/kernel_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ namespace kernel_profiler{
#endif

constexpr uint32_t QUICK_PUSH_MARKER_COUNT = 2;
constexpr int WALL_CLOCK_HIGH_INDEX = 1;
constexpr int WALL_CLOCK_LOW_INDEX = 0;

#if defined(COMPILE_FOR_BRISC)
constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_BR;
Expand Down Expand Up @@ -191,6 +193,7 @@ namespace kernel_profiler{
trisc1Buffer[ID_LL] = runCounter;
trisc2Buffer[ID_LL] = runCounter;


#endif //BRISC_INIT
#endif
}
Expand All @@ -209,8 +212,8 @@ namespace kernel_profiler{
{
volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(kernel_profiler::profilerBuffer);
volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast<volatile tt_reg_ptr uint32_t *> (RISCV_DEBUG_REG_WALL_CLOCK_L);
buffer[index] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[1] & 0xFFF);
buffer[index+1] = p_reg[0];
buffer[index] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF);
buffer[index+1] = p_reg[WALL_CLOCK_LOW_INDEX];
}

inline __attribute__((always_inline)) void mark_start_at_index_inlined(uint32_t index)
Expand Down Expand Up @@ -240,22 +243,13 @@ namespace kernel_profiler{
}
}

PROFILER_INLINE void mark_BR_fw_first_start()
{
uint32_t time_L = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
uint32_t time_H = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);

profiler_control_buffer[FW_RESET_L] = time_L;
profiler_control_buffer[FW_RESET_H] = time_H;
}


inline __attribute__((always_inline)) void mark_dropped_timestamps(uint32_t index)
{
uint32_t curr = profiler_control_buffer[DROPPED_ZONES];
profiler_control_buffer[DROPPED_ZONES] = (1 << index) | curr;
}


inline __attribute__((always_inline)) void risc_finished_profiling()
{
for (int i = 0; i < SUM_COUNT; i ++)
Expand Down Expand Up @@ -521,12 +515,12 @@ namespace kernel_profiler{

inline __attribute__((always_inline)) profileScopeAccumulate ()
{
start_time = ((uint64_t)p_reg[1] << 32) | p_reg[0];
start_time = ((uint64_t)p_reg[WALL_CLOCK_HIGH_INDEX] << 32) | p_reg[WALL_CLOCK_LOW_INDEX];
}
inline __attribute__((always_inline)) ~profileScopeAccumulate ()
{
sumIDs[index] = timer_id;
sums[index] += (((uint64_t)p_reg[1] << 32) | p_reg[0]) - start_time;
sums[index] += (((uint64_t)p_reg[WALL_CLOCK_HIGH_INDEX] << 32) | p_reg[WALL_CLOCK_LOW_INDEX]) - start_time;
}
};
}
Expand Down
37 changes: 34 additions & 3 deletions tt_metal/tools/profiler/profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ void DeviceProfiler::readRiscProfilerResults(
riscNumRead = profile_buffer[index] & 0x7;
coreFlatIDRead = (profile_buffer[index] >> 3) & 0xFF;
runCounterRead = profile_buffer[index + 1];

}
else
{
Expand Down Expand Up @@ -356,11 +357,13 @@ void DeviceProfiler::generateZoneSourceLocationsHashes()
}
}


void DeviceProfiler::dumpResults (
Device *device,
const vector<CoreCoord> &worker_cores){
#if defined(PROFILER)
ZoneScoped;

auto device_id = device->id();
device_core_frequency = tt::Cluster::instance().get_device_aiclk(device_id);

Expand All @@ -379,7 +382,6 @@ void DeviceProfiler::dumpResults (
worker_core);

}

}
else
{
Expand All @@ -404,18 +406,48 @@ void DeviceProfiler::pushTracyDeviceResults()
}
}

double delay = 0;
double frequency = 0;
uint64_t cpuTime = 0;

for (auto& device_core: device_cores)
{
int device_id = device_core.first;
CoreCoord worker_core = device_core.second;

if (device_core_sync_info.find(worker_core) != device_core_sync_info.end())
{
cpuTime = get<0>(device_core_sync_info.at(worker_core));
delay = get<1>(device_core_sync_info.at(worker_core));
frequency = get<2>(device_core_sync_info.at(worker_core));
log_info("Device {} sync info are, frequency {} GHz, delay {} cycles and, sync point {} seconds",
device_id,
frequency,
delay,
cpuTime);
}
}

for (auto& device_core: device_cores)
{
int device_id = device_core.first;
CoreCoord worker_core = device_core.second;

if (delay == 0.0 || frequency == 0.0)
{
delay = smallest_timestamp;
frequency = device_core_frequency/1000.0;
cpuTime = TracyGetCpuTime();
log_warning("For device {}, core {},{} default frequency was used and its zones will be out of sync", device_id, worker_core.x, worker_core.y);
}


if (device_tracy_contexts.find(device_core) == device_tracy_contexts.end())
{
auto tracyCtx = TracyTTContext();
std::string tracyTTCtxName = fmt::format("Device: {}, Core ({},{})", device_id, worker_core.x, worker_core.y);

TracyTTContextPopulate(tracyCtx, smallest_timestamp, 1000.f / (float)device_core_frequency);
TracyTTContextPopulate(tracyCtx, cpuTime, delay, frequency);

TracyTTContextName(tracyCtx, tracyTTCtxName.c_str(), tracyTTCtxName.size());

Expand All @@ -429,7 +461,6 @@ void DeviceProfiler::pushTracyDeviceResults()
for (auto& event: device_events)
{
std::pair<uint32_t, CoreCoord> device_core = {event.chip_id, (CoreCoord){event.core_x,event.core_y}};

if (event.zone_phase == tracy::TTDeviceEventPhase::begin)
{
TracyTTPushStartZone(device_tracy_contexts[device_core], event);
Expand Down
6 changes: 5 additions & 1 deletion tt_metal/tools/profiler/profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <filesystem>

#include "tt_metal/impl/buffers/buffer.hpp"
#include "tt_metal/impl/program/program.hpp"
#include "llrt/llrt.hpp"
#include "tools/profiler/profiler_state.hpp"
#include "tools/profiler/common.hpp"
Expand Down Expand Up @@ -50,7 +51,7 @@ class DeviceProfiler {
// Device-Core tracy context
std::map<std::pair<uint16_t,CoreCoord>, TracyTTCtx> device_tracy_contexts;

// Device-Core tracy context
// Device events
std::set<tracy::TTDeviceEvent> device_events;

// Hash to zone source locations
Expand Down Expand Up @@ -101,7 +102,10 @@ class DeviceProfiler {

//DRAM buffer for device side results
std::shared_ptr<tt::tt_metal::Buffer> output_dram_buffer = nullptr;
std::shared_ptr<tt::tt_metal::Program> sync_program = nullptr;

// Device-core Syncdata
std::map<CoreCoord, std::tuple<double,double,double>> device_core_sync_info;

//Set the device side file flag
void setNewLogFlag(bool new_log_flag);
Expand Down
46 changes: 46 additions & 0 deletions tt_metal/tools/profiler/sync/sync_kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>

void kernel_main() {
DeviceZoneScopedMainN("SYNC-MAIN");
volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast<volatile tt_reg_ptr uint32_t *> (RISCV_DEBUG_REG_WALL_CLOCK_L);
volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_CONTROL);
volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(PROFILER_L1_BUFFER_BR + kernel_profiler::CUSTOM_MARKERS * sizeof(uint32_t));

uint32_t syncTimeBufferIndex = 0;

constexpr int FIRST_READ_COUNT = 2;


while ( syncTimeBufferIndex < FIRST_READ_COUNT) {
uint32_t deviceTime = p_reg[kernel_profiler::WALL_CLOCK_LOW_INDEX];

uint32_t hostTime = profiler_control_buffer[kernel_profiler::FW_RESET_L];
if (hostTime > 0)
{
briscBuffer[syncTimeBufferIndex++] = p_reg[kernel_profiler::WALL_CLOCK_HIGH_INDEX];
briscBuffer[syncTimeBufferIndex++] = deviceTime;
briscBuffer[syncTimeBufferIndex++] = deviceTime;
briscBuffer[syncTimeBufferIndex++] = hostTime;
profiler_control_buffer[kernel_profiler::FW_RESET_L] = 0;
}
}

{
DeviceZoneScopedMainChildN("SYNC-LOOP");
while ( syncTimeBufferIndex < ((SAMPLE_COUNT + 1) * 2) ) {
uint32_t deviceTime = p_reg[kernel_profiler::WALL_CLOCK_LOW_INDEX];

uint32_t hostTime = profiler_control_buffer[kernel_profiler::FW_RESET_L];
if (hostTime > 0)
{
briscBuffer[syncTimeBufferIndex++] = deviceTime;
briscBuffer[syncTimeBufferIndex++] = hostTime;
profiler_control_buffer[kernel_profiler::FW_RESET_L] = 0;
}
}
}
}
Loading
Loading