From 6cbbb6ea65bad8099ef512455e3870e65b8acca3 Mon Sep 17 00:00:00 2001 From: Mo Memarian Date: Tue, 16 Apr 2024 01:36:32 +0000 Subject: [PATCH 1/2] #753: Profiler device-host time sync This data is only used by tracy GUI to align device and host zones. It is disabled by default. It is enabled by setting TT_METAL_PROFILER_SYNC=1. --- .../tools/profiler/test_device_profiler.py | 25 ++- tt_metal/llrt/rtoptions.cpp | 5 + tt_metal/llrt/rtoptions.hpp | 2 + tt_metal/third_party/tracy | 2 +- tt_metal/tools/profiler/common.py | 1 + tt_metal/tools/profiler/kernel_profiler.hpp | 22 +- tt_metal/tools/profiler/profiler.cpp | 37 +++- tt_metal/tools/profiler/profiler.hpp | 4 +- tt_metal/tools/profiler/sync/sync_kernel.cpp | 46 +++++ tt_metal/tools/profiler/tt_metal_profiler.cpp | 194 +++++++++++++++++- tt_metal/tt_metal.cpp | 3 + 11 files changed, 318 insertions(+), 23 deletions(-) create mode 100644 tt_metal/tools/profiler/sync/sync_kernel.cpp diff --git a/tests/tt_metal/tools/profiler/test_device_profiler.py b/tests/tt_metal/tools/profiler/test_device_profiler.py index 1e6705abb40..3ef485f3cfe 100644 --- a/tests/tt_metal/tools/profiler/test_device_profiler.py +++ b/tests/tt_metal/tools/profiler/test_device_profiler.py @@ -8,8 +8,11 @@ import inspect import pytest +import pandas as pd + from tt_metal.tools.profiler.common import ( TT_METAL_HOME, + PROFILER_HOST_DEVICE_SYNC_INFO, PROFILER_SCRIPTS_ROOT, PROFILER_ARTIFACTS_DIR, PROFILER_LOGS_DIR, @@ -21,8 +24,10 @@ PROG_EXMP_DIR = "programming_examples/profiler" -def run_device_profiler_test(doubleRun=False, setup=False): +def run_device_profiler_test(testName=None, doubleRun=False, setup=False): name = inspect.stack()[1].function + if testName: + name = testName clear_profiler_runtime_artifacts() profilerRun = os.system(f"cd {TT_METAL_HOME} && " f"build/{PROG_EXMP_DIR}/{name}") assert profilerRun == 0 @@ -171,3 +176,21 @@ def test_dispatch_cores(): if statType in stat: statTypesSet.remove(statType) assert len(statTypesSet) == 0 + + +def test_profiler_host_device_sync(): + TOLERANCE = 0.1 + + os.environ["TT_METAL_PROFILER_SYNC"] = "1" + deviceData = run_device_profiler_test(testName="test_custom_cycle_count") + reprotedFreq = deviceData["data"]["deviceInfo"]["freq"] * 1e6 + + syncInfoFile = PROFILER_LOGS_DIR / PROFILER_HOST_DEVICE_SYNC_INFO + + assert os.path.isfile(syncInfoFile) + + syncinfoDF = pd.read_csv(syncInfoFile) + freq = float(syncinfoDF.iloc[-1]["frequency"]) * 1e9 + + assert freq < (reprotedFreq * (1 + TOLERANCE)), "Frequency too large on this {ENV_VAR_ARCH_NAME}" + assert freq > (reprotedFreq * (1 - TOLERANCE)), "Frequency too small on this {ENV_VAR_ARCH_NAME}" diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index 010cb4c1c15..804803a8070 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -48,6 +48,7 @@ RunTimeOptions::RunTimeOptions() { profiler_enabled = false; profile_dispatch_cores = false; + profiler_sync_enabled = false; #if defined(PROFILER) const char *profiler_enabled_str = std::getenv("TT_METAL_DEVICE_PROFILER"); if (profiler_enabled_str != nullptr && profiler_enabled_str[0] == '1') { @@ -56,6 +57,10 @@ RunTimeOptions::RunTimeOptions() { if (profile_dispatch_str != nullptr && profile_dispatch_str[0] == '1') { profile_dispatch_cores = true; } + const char *profiler_sync_enabled_str = std::getenv("TT_METAL_PROFILER_SYNC"); + if (profiler_enabled && profiler_sync_enabled_str != nullptr && profiler_sync_enabled_str[0] == '1') { + profiler_sync_enabled = true; + } } #endif TT_FATAL( diff --git a/tt_metal/llrt/rtoptions.hpp b/tt_metal/llrt/rtoptions.hpp index a8576550c60..68a63c688ff 100644 --- a/tt_metal/llrt/rtoptions.hpp +++ b/tt_metal/llrt/rtoptions.hpp @@ -87,6 +87,7 @@ class RunTimeOptions { bool profiler_enabled = false; bool profile_dispatch_cores = false; + bool profiler_sync_enabled = false; bool null_kernels = false; @@ -220,6 +221,7 @@ class RunTimeOptions { inline bool get_profiler_enabled() { return profiler_enabled; } inline bool get_profiler_do_dispatch_cores() { return profile_dispatch_cores; } + inline bool get_profiler_sync_enabled() { return profiler_sync_enabled; } inline void set_kernels_nullified(bool v) { null_kernels = v; } inline bool get_kernels_nullified() { return null_kernels; } diff --git a/tt_metal/third_party/tracy b/tt_metal/third_party/tracy index 77f94cbb6f6..2591e70eaca 160000 --- a/tt_metal/third_party/tracy +++ b/tt_metal/third_party/tracy @@ -1 +1 @@ -Subproject commit 77f94cbb6f6725b6768668b5907a95e9e1e8d6ab +Subproject commit 2591e70eaca0a12705ea23cbe4059e086c9a2a9f diff --git a/tt_metal/tools/profiler/common.py b/tt_metal/tools/profiler/common.py index 2039b66220f..284193b9f09 100644 --- a/tt_metal/tools/profiler/common.py +++ b/tt_metal/tools/profiler/common.py @@ -16,6 +16,7 @@ PROFILER_DEVICE_SIDE_LOG = "profile_log_device.csv" PROFILER_HOST_SIDE_LOG = "profile_log_host.csv" +PROFILER_HOST_DEVICE_SYNC_INFO = "sync_device_info.csv" PROFILER_SCRIPTS_ROOT = TT_METAL_HOME / "tt_metal/tools/profiler" PROFILER_ARTIFACTS_DIR = TT_METAL_HOME / "generated/profiler" diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 366c49933cf..e01331703ed 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -49,6 +49,8 @@ namespace kernel_profiler{ #endif constexpr uint32_t QUICK_PUSH_MARKER_COUNT = 2; + constexpr int WALL_CLOCK_HIGH_INDEX = 1; + constexpr int WALL_CLOCK_LOW_INDEX = 0; #if defined(COMPILE_FOR_BRISC) constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_BR; @@ -191,6 +193,7 @@ namespace kernel_profiler{ trisc1Buffer[ID_LL] = runCounter; trisc2Buffer[ID_LL] = runCounter; + #endif //BRISC_INIT #endif } @@ -209,8 +212,8 @@ namespace kernel_profiler{ { volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); - buffer[index] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[1] & 0xFFF); - buffer[index+1] = p_reg[0]; + buffer[index] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF); + buffer[index+1] = p_reg[WALL_CLOCK_LOW_INDEX]; } inline __attribute__((always_inline)) void mark_start_at_index_inlined(uint32_t index) @@ -240,22 +243,13 @@ namespace kernel_profiler{ } } - PROFILER_INLINE void mark_BR_fw_first_start() - { - uint32_t time_L = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); - uint32_t time_H = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); - - profiler_control_buffer[FW_RESET_L] = time_L; - profiler_control_buffer[FW_RESET_H] = time_H; - } - - inline __attribute__((always_inline)) void mark_dropped_timestamps(uint32_t index) { uint32_t curr = profiler_control_buffer[DROPPED_ZONES]; profiler_control_buffer[DROPPED_ZONES] = (1 << index) | curr; } + inline __attribute__((always_inline)) void risc_finished_profiling() { for (int i = 0; i < SUM_COUNT; i ++) @@ -521,12 +515,12 @@ namespace kernel_profiler{ inline __attribute__((always_inline)) profileScopeAccumulate () { - start_time = ((uint64_t)p_reg[1] << 32) | p_reg[0]; + start_time = ((uint64_t)p_reg[WALL_CLOCK_HIGH_INDEX] << 32) | p_reg[WALL_CLOCK_LOW_INDEX]; } inline __attribute__((always_inline)) ~profileScopeAccumulate () { sumIDs[index] = timer_id; - sums[index] += (((uint64_t)p_reg[1] << 32) | p_reg[0]) - start_time; + sums[index] += (((uint64_t)p_reg[WALL_CLOCK_HIGH_INDEX] << 32) | p_reg[WALL_CLOCK_LOW_INDEX]) - start_time; } }; } diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index f444731c287..c2078b324af 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -109,6 +109,7 @@ void DeviceProfiler::readRiscProfilerResults( riscNumRead = profile_buffer[index] & 0x7; coreFlatIDRead = (profile_buffer[index] >> 3) & 0xFF; runCounterRead = profile_buffer[index + 1]; + } else { @@ -356,11 +357,13 @@ void DeviceProfiler::generateZoneSourceLocationsHashes() } } + void DeviceProfiler::dumpResults ( Device *device, const vector &worker_cores){ #if defined(PROFILER) ZoneScoped; + auto device_id = device->id(); device_core_frequency = tt::Cluster::instance().get_device_aiclk(device_id); @@ -379,7 +382,6 @@ void DeviceProfiler::dumpResults ( worker_core); } - } else { @@ -404,18 +406,48 @@ void DeviceProfiler::pushTracyDeviceResults() } } + double delay = 0; + double frequency = 0; + uint64_t cpuTime = 0; + + for (auto& device_core: device_cores) + { + int device_id = device_core.first; + CoreCoord worker_core = device_core.second; + + if (device_core_sync_info.find(worker_core) != device_core_sync_info.end()) + { + cpuTime = get<0>(device_core_sync_info.at(worker_core)); + delay = get<1>(device_core_sync_info.at(worker_core)); + frequency = get<2>(device_core_sync_info.at(worker_core)); + log_info("Device {} sync info are, frequency {} GHz, delay {} cycles and, sync point {} seconds", + device_id, + frequency, + delay, + cpuTime); + } + } + for (auto& device_core: device_cores) { int device_id = device_core.first; CoreCoord worker_core = device_core.second; + if (delay == 0.0 || frequency == 0.0) + { + delay = smallest_timestamp; + frequency = device_core_frequency/1000.0; + cpuTime = TracyGetCpuTime(); + log_warning("For device {}, core {},{} default frequency was used and its zones will be out of sync", device_id, worker_core.x, worker_core.y); + } + if (device_tracy_contexts.find(device_core) == device_tracy_contexts.end()) { auto tracyCtx = TracyTTContext(); std::string tracyTTCtxName = fmt::format("Device: {}, Core ({},{})", device_id, worker_core.x, worker_core.y); - TracyTTContextPopulate(tracyCtx, smallest_timestamp, 1000.f / (float)device_core_frequency); + TracyTTContextPopulate(tracyCtx, cpuTime, delay, frequency); TracyTTContextName(tracyCtx, tracyTTCtxName.c_str(), tracyTTCtxName.size()); @@ -429,7 +461,6 @@ void DeviceProfiler::pushTracyDeviceResults() for (auto& event: device_events) { std::pair device_core = {event.chip_id, (CoreCoord){event.core_x,event.core_y}}; - if (event.zone_phase == tracy::TTDeviceEventPhase::begin) { TracyTTPushStartZone(device_tracy_contexts[device_core], event); diff --git a/tt_metal/tools/profiler/profiler.hpp b/tt_metal/tools/profiler/profiler.hpp index 129c4e6cc85..78227d0e2c7 100644 --- a/tt_metal/tools/profiler/profiler.hpp +++ b/tt_metal/tools/profiler/profiler.hpp @@ -50,7 +50,7 @@ class DeviceProfiler { // Device-Core tracy context std::map, TracyTTCtx> device_tracy_contexts; - // Device-Core tracy context + // Device events std::set device_events; // Hash to zone source locations @@ -102,6 +102,8 @@ class DeviceProfiler { //DRAM buffer for device side results std::shared_ptr output_dram_buffer = nullptr; + // Device-core Syncdata + std::map> device_core_sync_info; //Set the device side file flag void setNewLogFlag(bool new_log_flag); diff --git a/tt_metal/tools/profiler/sync/sync_kernel.cpp b/tt_metal/tools/profiler/sync/sync_kernel.cpp new file mode 100644 index 00000000000..12e1abe72cd --- /dev/null +++ b/tt_metal/tools/profiler/sync/sync_kernel.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +void kernel_main() { + DeviceZoneScopedMainN("SYNC-MAIN"); + volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); + volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); + volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR + kernel_profiler::CUSTOM_MARKERS * sizeof(uint32_t)); + + uint32_t syncTimeBufferIndex = 0; + + constexpr int FIRST_READ_COUNT = 2; + + + while ( syncTimeBufferIndex < FIRST_READ_COUNT) { + uint32_t deviceTime = p_reg[kernel_profiler::WALL_CLOCK_LOW_INDEX]; + + uint32_t hostTime = profiler_control_buffer[kernel_profiler::FW_RESET_L]; + if (hostTime > 0) + { + briscBuffer[syncTimeBufferIndex++] = p_reg[kernel_profiler::WALL_CLOCK_HIGH_INDEX]; + briscBuffer[syncTimeBufferIndex++] = deviceTime; + briscBuffer[syncTimeBufferIndex++] = deviceTime; + briscBuffer[syncTimeBufferIndex++] = hostTime; + profiler_control_buffer[kernel_profiler::FW_RESET_L] = 0; + } + } + + { + DeviceZoneScopedMainChildN("SYNC-LOOP"); + while ( syncTimeBufferIndex < ((SAMPLE_COUNT + 1) * 2) ) { + uint32_t deviceTime = p_reg[kernel_profiler::WALL_CLOCK_LOW_INDEX]; + + uint32_t hostTime = profiler_control_buffer[kernel_profiler::FW_RESET_L]; + if (hostTime > 0) + { + briscBuffer[syncTimeBufferIndex++] = deviceTime; + briscBuffer[syncTimeBufferIndex++] = hostTime; + profiler_control_buffer[kernel_profiler::FW_RESET_L] = 0; + } + } + } +} diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index 1237ca64c59..9920dd40ad2 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -1,9 +1,9 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 - -#include #include +#include +#include #include "tt_metal/host_api.hpp" #include "impl/debug/dprint_server.hpp" @@ -37,19 +37,197 @@ namespace detail { std::map tt_metal_device_profiler_map; +std::unordered_map >> deviceHostTimePair; +std::unordered_map smallestHostime; + + +constexpr CoreCoord SYNC_CORE = {0,0}; + +void syncDeviceHost(Device *device, CoreCoord logical_core, bool doHeader) +{ + if (!tt::llrt::OptionsG.get_profiler_sync_enabled()) return; + ZoneScopedC(tracy::Color::Tomato3); + auto core = device->worker_core_from_logical_core(logical_core); + auto device_id = device->id(); + + deviceHostTimePair.emplace(device_id, (std::vector >){}); + smallestHostime.emplace(device_id, 0); + + tt_metal::Program program = tt_metal::CreateProgram(); + + constexpr uint16_t sampleCount = 249; + std::map kernel_defines = { + {"SAMPLE_COUNT", std::to_string(sampleCount)}, + }; + + tt_metal::KernelHandle brisc_kernel = tt_metal::CreateKernel( + program, "tt_metal/tools/profiler/sync/sync_kernel.cpp", + logical_core, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .defines = kernel_defines} + ); + + EnqueueProgram(device->command_queue(), program, false); + + std::filesystem::path output_dir = std::filesystem::path(string(PROFILER_RUNTIME_ROOT_DIR) + string(PROFILER_LOGS_DIR_NAME)); + std::filesystem::path log_path = output_dir / "sync_device_info.csv"; + std::ofstream log_file; + + int64_t writeSum = 0; + + constexpr int millisecond_wait = 4; + + const double tracyToSecRatio = TracyGetTimerMul(); + const int64_t tracyBaseTime = TracyGetBaseTime(); + const int64_t hostStartTime = TracyGetCpuTime(); + std::vector writeTimes(sampleCount); + + for (int i = 0; i < sampleCount; i++) + { + ZoneScopedC(tracy::Color::Tomato2); + std::this_thread::sleep_for(std::chrono::milliseconds(millisecond_wait)); + int64_t writeStart = TracyGetCpuTime(); + uint32_t sinceStart = writeStart - hostStartTime; + tt::Cluster::instance().write_reg(&sinceStart, tt_cxy_pair(device_id, core) , PROFILER_L1_BUFFER_CONTROL + kernel_profiler::FW_RESET_L * sizeof(uint32_t)); + writeTimes[i] = (TracyGetCpuTime() - writeStart); + } + + Finish(device->command_queue()); + + log_info ("SYNC PROGRAM FINISH IS DONE ON {}",device_id); + if ((smallestHostime[device_id] == 0) || (smallestHostime[device_id] > hostStartTime)) + { + smallestHostime[device_id] = hostStartTime; + } + + for (auto writeTime : writeTimes) + { + writeSum += writeTime; + } + double writeOverhead = (double)writeSum / sampleCount; + vector sync_times = tt::llrt::read_hex_vec_from_core( + device_id, + core, + PROFILER_L1_BUFFER_BR + kernel_profiler::CUSTOM_MARKERS * sizeof(uint32_t), + (sampleCount + 1) * 2 * sizeof(uint32_t)); + + uint32_t preDeviceTime = 0; + uint32_t preHostTime = 0; + bool firstSample = true; + + uint64_t deviceStartTime = (uint64_t(sync_times[0] & 0xFFF) << 32) | sync_times[1]; + uint32_t deviceStartTime_H = sync_times[0] & 0xFFF; + uint32_t deviceStartTime_L = sync_times[1]; + preDeviceTime = deviceStartTime_L; + + uint32_t hostStartTime_H = 0; + + uint64_t preDeviceTimeLarge = 0; + uint64_t preHostTimeLarge = 0; + uint64_t firstDeviceTimeLarge = 0; + uint64_t firstHostTimeLarge = 0; + + for (int i = 2; i < 2 * (sampleCount + 1); i += 2) + { + + uint32_t deviceTime = sync_times[i]; + if (deviceTime < preDeviceTime) deviceStartTime_H ++; + preDeviceTime = deviceTime; + uint64_t deviceTimeLarge = (uint64_t(deviceStartTime_H) << 32) | deviceTime; + + uint32_t hostTime = sync_times[i + 1] + writeTimes[i/2 - 1]; + if (hostTime < preHostTime) hostStartTime_H ++; + preHostTime = hostTime; + uint64_t hostTimeLarge = hostStartTime - smallestHostime[device_id] + ((uint64_t(hostStartTime_H) << 32) | hostTime); + + deviceHostTimePair[device_id].push_back(std::pair {deviceTimeLarge,hostTimeLarge}); + + if (firstSample) + { + firstDeviceTimeLarge = deviceTimeLarge; + firstHostTimeLarge = hostTimeLarge; + firstSample = false; + } + + preDeviceTimeLarge = deviceTimeLarge; + preHostTimeLarge = hostTimeLarge; + } + + double hostSum = 0; + double deviceSum = 0; + double hostSquaredSum = 0; + double hostDeviceProductSum = 0; + + for (auto& deviceHostTime : deviceHostTimePair[device_id]) + { + double deviceTime = deviceHostTime.first; + double hostTime = deviceHostTime.second; + + deviceSum += deviceTime; + hostSum += hostTime; + hostSquaredSum += (hostTime * hostTime); + hostDeviceProductSum += (hostTime * deviceTime); + } + + uint16_t accumulateSampleCount = deviceHostTimePair[device_id].size(); + + double frequencyFit = (hostDeviceProductSum * accumulateSampleCount - hostSum * deviceSum) / ((hostSquaredSum * accumulateSampleCount - hostSum * hostSum) * tracyToSecRatio); + + double delay = (deviceSum - frequencyFit * hostSum * tracyToSecRatio) / accumulateSampleCount; + + log_file.open(log_path, std::ios_base::app); + if (doHeader) + { + log_file << fmt::format("device id,core_x, core_y,device,host_tracy,host_real,write_overhead,host_start,delay,frequency,tracy_ratio,tracy_base_time") << std::endl; + } + int init = deviceHostTimePair[device_id].size() - sampleCount; + for (int i = init ;i < deviceHostTimePair[device_id].size(); i++) + { + log_file << fmt::format( + "{:5},{:5},{:5},{:20},{:20},{:20.2f},{:20},{:20},{:20.2f},{:20.15f},{:20.15f},{:20}", + device_id, + core.x, + core.y, + deviceHostTimePair[device_id][i].first, + deviceHostTimePair[device_id][i].second, + (double) deviceHostTimePair[device_id][i].second * tracyToSecRatio, + writeTimes[i - init], + smallestHostime[device_id], + delay, + frequencyFit, + tracyToSecRatio, + tracyBaseTime + ) + << std::endl; + } + + log_info("Sync data for device: {}, c:{}, d:{}, f:{}",device_id, smallestHostime[device_id], delay, frequencyFit); + + tt_metal_device_profiler_map.at(device_id).device_core_sync_info.emplace(core, std::make_tuple(smallestHostime[device_id], delay, frequencyFit)); + tt_metal_device_profiler_map.at(device_id).device_core_sync_info[core] = std::make_tuple(smallestHostime[device_id], delay, frequencyFit); +} + + void InitDeviceProfiler(Device *device){ #if defined(PROFILER) ZoneScoped; - TracySetCpuTime(); + TracySetCpuTime (TracyGetCpuTime()); + + bool doSync = false; auto device_id = device->id(); if (getDeviceProfilerState()) { static std::atomic firstInit = true; + bool doHeader = firstInit; auto device_id = device->id(); + if (tt_metal_device_profiler_map.find(device_id) == tt_metal_device_profiler_map.end()) { + doSync = true; if (firstInit.exchange(false)) { tt_metal_device_profiler_map.emplace(device_id, DeviceProfiler(true)); @@ -59,6 +237,7 @@ void InitDeviceProfiler(Device *device){ tt_metal_device_profiler_map.emplace(device_id, DeviceProfiler(false)); } } + uint32_t dramBankCount = tt::Cluster::instance().get_soc_desc(device_id).get_num_dram_channels(); uint32_t coreCountPerDram = tt::Cluster::instance().get_soc_desc(device_id).profiler_ceiled_core_count_perf_dram_bank; @@ -80,6 +259,7 @@ void InitDeviceProfiler(Device *device){ std::vector control_buffer(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); control_buffer[kernel_profiler::DRAM_PROFILER_ADDRESS] = tt_metal_device_profiler_map.at(device_id).output_dram_buffer->address(); + const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device_id); auto ethCores = soc_d.get_physical_ethernet_cores() ; @@ -105,6 +285,10 @@ void InitDeviceProfiler(Device *device){ std::vector inputs_DRAM(tt_metal_device_profiler_map.at(device_id).output_dram_buffer->size()/sizeof(uint32_t), 0); tt_metal::detail::WriteToBuffer(tt_metal_device_profiler_map.at(device_id).output_dram_buffer, inputs_DRAM); + if (doSync) + { + syncDeviceHost (device, SYNC_CORE, doHeader); + } } #endif } @@ -220,6 +404,10 @@ void DumpDeviceProfileResults(Device *device, std::vector &worker_cor auto device_id = device->id(); if (tt_metal_device_profiler_map.find(device_id) != tt_metal_device_profiler_map.end()) { + if (!lastDump) + { + syncDeviceHost (device, SYNC_CORE, false); + } tt_metal_device_profiler_map.at(device_id).setDeviceArchitecture(device->arch()); tt_metal_device_profiler_map.at(device_id).dumpResults(device, worker_cores); if (lastDump) diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index e1a4a510843..3fabec43b4b 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -304,6 +304,9 @@ std::map CreateDevices( } // TODO: need to only enable routing for used mmio chips tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(true); + for (auto &active_device: active_devices){ + detail::InitDeviceProfiler(active_device.second); + } return active_devices; } From 46c7f2fd8935fec88c7513f5f0c94343dcc6dda5 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Fri, 7 Jun 2024 13:47:53 +0000 Subject: [PATCH 2/2] #9174: Cache profiler sync program to enable use with trace --- tt_metal/tools/profiler/profiler.hpp | 2 + tt_metal/tools/profiler/tt_metal_profiler.cpp | 42 ++++++++++--------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/tt_metal/tools/profiler/profiler.hpp b/tt_metal/tools/profiler/profiler.hpp index 78227d0e2c7..79bbf2a3777 100644 --- a/tt_metal/tools/profiler/profiler.hpp +++ b/tt_metal/tools/profiler/profiler.hpp @@ -11,6 +11,7 @@ #include #include "tt_metal/impl/buffers/buffer.hpp" +#include "tt_metal/impl/program/program.hpp" #include "llrt/llrt.hpp" #include "tools/profiler/profiler_state.hpp" #include "tools/profiler/common.hpp" @@ -101,6 +102,7 @@ class DeviceProfiler { //DRAM buffer for device side results std::shared_ptr output_dram_buffer = nullptr; + std::shared_ptr sync_program = nullptr; // Device-core Syncdata std::map> device_core_sync_info; diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index 9920dd40ad2..0653a7c79c0 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -43,7 +43,7 @@ std::unordered_map smallestHostime; constexpr CoreCoord SYNC_CORE = {0,0}; -void syncDeviceHost(Device *device, CoreCoord logical_core, bool doHeader) +void syncDeviceHost(Device *device, CoreCoord logical_core, std::shared_ptr &sync_program, bool doHeader) { if (!tt::llrt::OptionsG.get_profiler_sync_enabled()) return; ZoneScopedC(tracy::Color::Tomato3); @@ -53,23 +53,24 @@ void syncDeviceHost(Device *device, CoreCoord logical_core, bool doHeader) deviceHostTimePair.emplace(device_id, (std::vector >){}); smallestHostime.emplace(device_id, 0); - tt_metal::Program program = tt_metal::CreateProgram(); - constexpr uint16_t sampleCount = 249; - std::map kernel_defines = { - {"SAMPLE_COUNT", std::to_string(sampleCount)}, - }; - - tt_metal::KernelHandle brisc_kernel = tt_metal::CreateKernel( - program, "tt_metal/tools/profiler/sync/sync_kernel.cpp", - logical_core, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .defines = kernel_defines} - ); - - EnqueueProgram(device->command_queue(), program, false); + if (sync_program == nullptr) { + sync_program = std::make_shared(); + + std::map kernel_defines = { + {"SAMPLE_COUNT", std::to_string(sampleCount)}, + }; + + tt_metal::KernelHandle brisc_kernel = tt_metal::CreateKernel( + *sync_program, "tt_metal/tools/profiler/sync/sync_kernel.cpp", + logical_core, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .defines = kernel_defines} + ); + } + EnqueueProgram(device->command_queue(), sync_program, false); std::filesystem::path output_dir = std::filesystem::path(string(PROFILER_RUNTIME_ROOT_DIR) + string(PROFILER_LOGS_DIR_NAME)); std::filesystem::path log_path = output_dir / "sync_device_info.csv"; @@ -287,7 +288,7 @@ void InitDeviceProfiler(Device *device){ tt_metal::detail::WriteToBuffer(tt_metal_device_profiler_map.at(device_id).output_dram_buffer, inputs_DRAM); if (doSync) { - syncDeviceHost (device, SYNC_CORE, doHeader); + syncDeviceHost (device, SYNC_CORE, tt_metal_device_profiler_map.at(device_id).sync_program, doHeader); } } #endif @@ -406,15 +407,16 @@ void DumpDeviceProfileResults(Device *device, std::vector &worker_cor { if (!lastDump) { - syncDeviceHost (device, SYNC_CORE, false); + syncDeviceHost (device, SYNC_CORE, tt_metal_device_profiler_map.at(device_id).sync_program, false); } tt_metal_device_profiler_map.at(device_id).setDeviceArchitecture(device->arch()); tt_metal_device_profiler_map.at(device_id).dumpResults(device, worker_cores); if (lastDump) { // Process is ending, no more device dumps are coming, reset your ref on the buffer so deallocate is the last - // owner. + // owner. Sync program also contains a buffer so it is safter to release it here tt_metal_device_profiler_map.at(device_id).output_dram_buffer.reset(); + tt_metal_device_profiler_map.at(device_id).sync_program.reset(); } else {