diff --git a/tt_metal/detail/reports/compilation_reporter.cpp b/tt_metal/detail/reports/compilation_reporter.cpp index 2940e7fe8794..9a681cbca0b8 100644 --- a/tt_metal/detail/reports/compilation_reporter.cpp +++ b/tt_metal/detail/reports/compilation_reporter.cpp @@ -79,13 +79,13 @@ std::string kernel_attributes_str(std::shared_ptr kernel) { return attr_str; } -void CompilationReporter::add_kernel_compile_stats(const Program &program, std::shared_ptr kernel, bool cache_hit, size_t kernel_hash) { +void CompilationReporter::add_kernel_compile_stats(uint64_t program_id, std::shared_ptr kernel, bool cache_hit, size_t kernel_hash) { std::unique_lock lock(mutex_); if (cache_hit) { - this->program_id_to_cache_hit_counter_[program.get_id()].hits++; + this->program_id_to_cache_hit_counter_[program_id].hits++; } else { - this->program_id_to_cache_hit_counter_[program.get_id()].misses++; + this->program_id_to_cache_hit_counter_[program_id].misses++; } std::string kernel_stats = "," + kernel->name() + ","; std::string cache_status = cache_hit ? "cache hit" : "cache miss"; @@ -99,13 +99,13 @@ void CompilationReporter::add_kernel_compile_stats(const Program &program, std:: } index++; } - this->program_id_to_kernel_stats_[program.get_id()].push_back(kernel_stats); + this->program_id_to_kernel_stats_[program_id].push_back(kernel_stats); } -void CompilationReporter::flush_program_entry(const Program &program, bool persistent_compilation_cache_enabled) { +void CompilationReporter::flush_program_entry(uint64_t program_id, size_t num_kernels, std::function(size_t)> get_kernel, bool persistent_compilation_cache_enabled) { std::unique_lock lock(mutex_); - auto num_cache_misses = this->program_id_to_cache_hit_counter_.at(program.get_id()).misses; - auto num_cache_hits = this->program_id_to_cache_hit_counter_.at(program.get_id()).hits; + auto num_cache_misses = this->program_id_to_cache_hit_counter_.at(program_id).misses; + auto num_cache_hits = this->program_id_to_cache_hit_counter_.at(program_id).hits; if (this->total_num_compile_programs_ == 0) { this->init_reports(); } @@ -113,8 +113,8 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi auto get_num_compute_and_data_movement_kernels = [&]() { uint32_t num_compute = 0; uint32_t num_data_movement = 0; - for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { - const auto kernel = detail::GetKernel(program, kernel_id); + for (size_t kernel_id = 0; kernel_id < num_kernels; kernel_id++) { + const auto kernel = get_kernel(kernel_id); if (kernel->processor() == tt::RISCV::BRISC or kernel->processor() == tt::RISCV::NCRISC) { num_data_movement++; } else { @@ -126,14 +126,14 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi auto [num_compute_kernels, num_data_movement_kernels] = get_num_compute_and_data_movement_kernels(); - this->summary_report_ << program.get_id() << ", " + this->summary_report_ << program_id << ", " << num_compute_kernels << ", " << num_data_movement_kernels << ", " << (persistent_compilation_cache_enabled ? "Y" : "N") << ", " << num_cache_misses << ", " << num_cache_hits << "\n"; - this->detailed_report_ << "Compiling Program: " << program.get_id() << "\n"; + this->detailed_report_ << "Compiling Program: " << program_id << "\n"; this->detailed_report_ << "\n,Kernel Creation Report:\n"; this->detailed_report_ << ",,Number of Compute CreateKernel API calls: " << num_compute_kernels << "\n"; this->detailed_report_ << ",,Number of Datamovement CreateKernel API calls: " << num_data_movement_kernels << "\n"; @@ -144,7 +144,7 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi this->detailed_report_ << ",,Total number of kernel compile cache hits: " << num_cache_hits << "\n"; this->detailed_report_ << "\n,Kernel File Name, Core Range, Cache Hit, Kernel Attributes, Hash\n"; - auto kernel_stats_vec = this->program_id_to_kernel_stats_.at(program.get_id()); + auto kernel_stats_vec = this->program_id_to_kernel_stats_.at(program_id); for (const auto &kernel_stats : kernel_stats_vec) { this->detailed_report_ << kernel_stats; } diff --git a/tt_metal/detail/reports/compilation_reporter.hpp b/tt_metal/detail/reports/compilation_reporter.hpp index c976bf5c8bc5..23707b8eff34 100644 --- a/tt_metal/detail/reports/compilation_reporter.hpp +++ b/tt_metal/detail/reports/compilation_reporter.hpp @@ -45,9 +45,9 @@ class CompilationReporter { CompilationReporter(const CompilationReporter&) = delete; CompilationReporter(CompilationReporter&& other) noexcept = delete; - void add_kernel_compile_stats(const Program &program, std::shared_ptr kernel, bool cache_hit, size_t kernel_hash); + void add_kernel_compile_stats(uint64_t program_id, std::shared_ptr kernel, bool cache_hit, size_t kernel_hash); - void flush_program_entry(const Program &program, bool persistent_compilation_cache_enabled); + void flush_program_entry(uint64_t program_id, size_t num_kernels, std::function(size_t)> get_kernel, bool persistent_compilation_cache_enabled); static CompilationReporter& inst(); static void toggle (bool state); static bool enabled (); diff --git a/tt_metal/detail/reports/memory_reporter.cpp b/tt_metal/detail/reports/memory_reporter.cpp index 1bffc4421ddf..5275d438742d 100644 --- a/tt_metal/detail/reports/memory_reporter.cpp +++ b/tt_metal/detail/reports/memory_reporter.cpp @@ -100,14 +100,14 @@ void populate_reports(const Device *device, std::ofstream &memory_usage_summary_ write_memory_usage(device, BufferType::L1, memory_usage_summary_report, detailed_memory_usage_report, l1_usage_summary_report); } -void MemoryReporter::flush_program_memory_usage(const Program &program, const Device *device) { +void MemoryReporter::flush_program_memory_usage(uint64_t program_id, const Device *device) { if (not this->program_memory_usage_summary_report_.is_open()) { this->init_reports(); } - this->program_memory_usage_summary_report_ << program.get_id(); - this->program_l1_usage_summary_report_ << program.get_id(); - this->program_detailed_memory_usage_report_ << program.get_id(); + this->program_memory_usage_summary_report_ << program_id; + this->program_l1_usage_summary_report_ << program_id; + this->program_detailed_memory_usage_report_ << program_id; populate_reports(device, this->program_memory_usage_summary_report_, this->program_detailed_memory_usage_report_, this->program_l1_usage_summary_report_); } diff --git a/tt_metal/detail/reports/memory_reporter.hpp b/tt_metal/detail/reports/memory_reporter.hpp index e5138f02a352..217f64905228 100644 --- a/tt_metal/detail/reports/memory_reporter.hpp +++ b/tt_metal/detail/reports/memory_reporter.hpp @@ -60,7 +60,7 @@ class MemoryReporter { MemoryReporter(const MemoryReporter&) = delete; MemoryReporter(MemoryReporter&& other) noexcept = delete; - void flush_program_memory_usage(const Program &program, const Device *device); + void flush_program_memory_usage(uint64_t program_id, const Device *device); void dump_memory_usage_state(const Device *device, std::string prefix="") const; diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 010ad9e43599..d212bfac0d9f 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -730,8 +730,9 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size(); + auto &program_transfer_info = program.get_program_transfer_info(); // Multicast Semaphore Cmd - uint32_t num_multicast_semaphores = program.program_transfer_info.multicast_semaphores.size(); + uint32_t num_multicast_semaphores = program_transfer_info.multicast_semaphores.size(); std::vector> multicast_sem_sub_cmds(num_multicast_semaphores); std::vector>> multicast_sem_data(num_multicast_semaphores); std::vector>> multicast_sem_payload(num_multicast_semaphores); @@ -739,7 +740,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro multicast_sem_dst_size.reserve(num_multicast_semaphores); if (num_multicast_semaphores > 0) { uint32_t i = 0; - for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.multicast_semaphores) { + for (const auto& [dst, transfer_info_vec] : program_transfer_info.multicast_semaphores) { // TODO: loop over things inside transfer_info[i] uint32_t write_packed_len = transfer_info_vec[0].data.size(); multicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); @@ -768,7 +769,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro } // Unicast Semaphore Cmd - uint32_t num_unicast_semaphores = program.program_transfer_info.unicast_semaphores.size(); + uint32_t num_unicast_semaphores = program_transfer_info.unicast_semaphores.size(); std::vector> unicast_sem_sub_cmds(num_unicast_semaphores); std::vector>> unicast_sem_data(num_unicast_semaphores); std::vector>> unicast_sem_payload(num_unicast_semaphores); @@ -776,7 +777,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro unicast_sem_dst_size.reserve(num_unicast_semaphores); if (num_unicast_semaphores > 0) { uint32_t i = 0; - for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.unicast_semaphores) { + for (const auto& [dst, transfer_info_vec] : program_transfer_info.unicast_semaphores) { // TODO: loop over things inside transfer_info[i] uint32_t write_packed_len = transfer_info_vec[0].data.size(); unicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t))); @@ -876,7 +877,8 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro const uint32_t max_length_per_sub_cmd = dispatch_constants::get(this->dispatch_core_type).scratch_db_size() / 2; const uint32_t max_paged_length_per_sub_cmd = max_length_per_sub_cmd / HostMemDeviceCommand::PROGRAM_PAGE_SIZE * HostMemDeviceCommand::PROGRAM_PAGE_SIZE; - for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program.program_transfer_info.kernel_bins) { + const auto &kernels_buffer = program.get_kernels_buffer(); + for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program_transfer_info.kernel_bins) { bool write_linear; uint32_t noc_encoding; std::visit( @@ -913,14 +915,14 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro uint32_t base_address, page_offset; if (kg_transfer_info.page_offsets[kernel_idx] > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) { - const uint32_t num_banks = this->device->num_banks(this->program.kernels_buffer->buffer_type()); + const uint32_t num_banks = this->device->num_banks(kernels_buffer->buffer_type()); page_offset = kg_transfer_info.page_offsets[kernel_idx] % num_banks; uint32_t num_full_pages_written_per_bank = kg_transfer_info.page_offsets[kernel_idx] / num_banks; - base_address = this->program.kernels_buffer->address() + - num_full_pages_written_per_bank * this->program.kernels_buffer->page_size(); + base_address = kernels_buffer->address() + + num_full_pages_written_per_bank * kernels_buffer->page_size(); } else { - base_address = this->program.kernels_buffer->address(); + base_address = kernels_buffer->address(); page_offset = kg_transfer_info.page_offsets[kernel_idx]; } @@ -928,11 +930,11 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro true, // is_dram page_offset, base_address, - this->program.kernels_buffer->page_size(), - relayed_bytes / this->program.kernels_buffer->page_size(), + kernels_buffer->page_size(), + relayed_bytes / kernels_buffer->page_size(), length_adjust); } else { - uint32_t base_address = this->program.kernels_buffer->address(); + uint32_t base_address = kernels_buffer->address(); uint32_t page_offset = kg_transfer_info.page_offsets[kernel_idx]; // TODO: pack all these writes into 1 linear write @@ -1070,7 +1072,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro } // if dispatch_s is enabled have dispatch_d send a semaphore update to dispatch_s (this will include a write barrier on dispatch_d if program is active) // if not, check if the program is active on workers. If active, have dispatch_d issue a write barrier - cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program.program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE; + cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE; // either dispatch_s or dispatch_d will send the go signal (go_signal_mcast command) cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE; @@ -1257,11 +1259,11 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER; if (this->device->dispatch_s_enabled()) { // dispatch_d signals dispatch_s to send the go signal, use a barrier if there are cores active - device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program.program_transfer_info.num_active_cores > 0); + device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program_transfer_info.num_active_cores > 0); dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE; } else { // Wait Noc Write Barrier, wait for binaries/configs and launch_msg to be written to worker cores - if (program.program_transfer_info.num_active_cores > 0) { + if (program_transfer_info.num_active_cores > 0) { device_command_sequence.add_dispatch_wait(true, this->dispatch_message_addr, 0, 0, false, false); } } @@ -1464,7 +1466,7 @@ void EnqueueProgramCommand::write_program_command_sequence(const ProgramCommandS void EnqueueProgramCommand::process() { const std::pair&> reservation = - this->manager.get_config_buffer_mgr().reserve(program.program_config_sizes_); + this->manager.get_config_buffer_mgr().reserve(program.get_program_config_sizes()); bool stall_first = reservation.first.need_sync; // Note: since present implementation always stalls, we always free up to "now" this->manager.get_config_buffer_mgr().free(reservation.first.sync_count); @@ -1486,8 +1488,8 @@ void EnqueueProgramCommand::process() { // If cache has a program entry but the program is not finalized, then the cache is stale // Currently this is mapped by device, but will be mapped by multiple values in the future uint64_t command_hash = this->device->id(); - auto cached_cmd_iter = this->program.cached_program_command_sequences_.find(command_hash); - bool is_cached = program.is_cached() && cached_cmd_iter != this->program.cached_program_command_sequences_.end(); + auto cached_cmd_iter = this->program.get_cached_program_command_sequences().find(command_hash); + bool is_cached = program.is_cached() && cached_cmd_iter != this->program.get_cached_program_command_sequences().end(); // Calculate all commands size and determine how many fetch q entries to use // Preamble, some waits and stalls @@ -1507,7 +1509,7 @@ void EnqueueProgramCommand::process() { this->assemble_device_commands(program_command_sequence, kernel_config_addrs); this->write_program_command_sequence(program_command_sequence, stall_first); this->assemble_stall_commands(program_command_sequence, false); - this->program.cached_program_command_sequences_.insert({command_hash, std::move(program_command_sequence)}); + this->program.get_cached_program_command_sequences().insert({command_hash, std::move(program_command_sequence)}); program.set_cached(); } else { static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count)); @@ -2233,21 +2235,20 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { if (not program.is_finalized()) { program.finalize(device); TT_FATAL(!this->manager.get_bypass_mode(), "Tracing should only be used when programs have been cached"); - if (program.kernels_buffer != nullptr) { + if (const auto &kernels_buffer = program.get_kernels_buffer()) { this->enqueue_write_buffer( - *program.kernels_buffer, program.program_transfer_info.binary_data.data(), false); + *kernels_buffer, program.get_program_transfer_info().binary_data.data(), false); } } #ifdef DEBUG if (tt::llrt::OptionsG.get_validate_kernel_binaries()) { TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries"); - if (program.kernels_buffer != nullptr) { - const auto& buffer = program.kernels_buffer; + if (const auto &buffer = program.get_kernels_buffer()) { std::vector read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t)); - this->enqueue_read_buffer(*program.kernels_buffer, read_data.data(), true); + this->enqueue_read_buffer(*buffer, read_data.data(), true); TT_FATAL( - program.program_transfer_info.binary_data == read_data, + program.get_program_transfer_info().binary_data == read_data, "Binary for program to be executed is corrupted. Another program likely corrupted this binary"); } } @@ -2297,12 +2298,11 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { #ifdef DEBUG if (tt::llrt::OptionsG.get_validate_kernel_binaries()) { TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries"); - if (program.kernels_buffer != nullptr) { - const auto& buffer = program.kernels_buffer; + if (const auto& buffer = program.get_kernels_buffer()) { std::vector read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t)); - this->enqueue_read_buffer(*program.kernels_buffer, read_data.data(), true); + this->enqueue_read_buffer(*buffer, read_data.data(), true); TT_FATAL( - program.program_transfer_info.binary_data == read_data, + program.get_program_transfer_info().binary_data == read_data, "Binary for program that executed is corrupted. This program likely corrupted its own binary."); } } @@ -2311,7 +2311,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { log_trace( tt::LogMetal, "Created EnqueueProgramCommand (active_cores: {} bypass_mode: {} expected_workers_completed: {})", - program.program_transfer_info.num_active_cores, + program.get_program_transfer_info().num_active_cores, this->manager.get_bypass_mode(), expected_workers_completed); } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index ffb8fb6a4999..abb371f675c2 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -78,8 +78,176 @@ size_t KernelCompileHash(const std::shared_ptr kernel, JitBuildOptions & } // namespace namespace detail { +class Program_ { + public: + Program_(); + + Program_(const Program_ &other) = delete; + Program_& operator=(const Program_ &other) = delete; + + Program_(Program_ &&other) = default; + Program_& operator=(Program_ &&other) = default; + + void set_runtime_id(uint64_t id); + ~Program_() noexcept = default; + + uint64_t get_id() const; + uint64_t get_runtime_id() const; + + size_t num_kernels() const; + + const std::vector> &circular_buffers() const; + + const std::vector< Semaphore > & semaphores() const; + + KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index); + std::vector& get_kernel_groups(uint32_t programmable_core_type_index); + void add_buffer(std::shared_ptr buf); + void release_buffers(); + std::vector> circular_buffers_on_core(const CoreCoord &core) const; + + std::vector> circular_buffers_on_corerange(const CoreRange &cr) const; + + std::vector circular_buffers_unique_coreranges() const; + + std::vector> semaphores_on_core(const CoreCoord &core) const; + + size_t num_semaphores () const; + void init_semaphores ( const Device & device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const; + // XXXXX TODO: this should return a const reference + std::vector> logical_cores() const; + + void compile(Device * device, bool fd_bootloader_mode = false); + + void invalidate_circular_buffer_allocation(); + + void allocate_circular_buffers(const Device *device); + + bool is_finalized() const; + bool is_cached() const; + void set_cached(); + void finalize(Device *device); + std::shared_ptr get_kernel(KernelHandle kernel_id) const; + + ProgramConfig& get_program_config(uint32_t programmable_core_type_index); + + // debug/test + uint32_t get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const; + uint32_t get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const; + uint32_t get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const; + uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const; + + private: + void populate_dispatch_data(Device *device); + + // Buffers temporarily owned by the program + std::vector> owned_buffer_pool = {}; + + // The buffer that holds the kernel/binaries/etc for this program + std::shared_ptr kernels_buffer = nullptr; + ProgramTransferInfo program_transfer_info; + + bool finalized_; + bool cached_; + + struct CircularBufferAllocator { + CircularBufferAllocator(const CoreRange &core_range_) : core_range(core_range_) {} + + // Circular buffers are created and allocated at core range granularity + CoreRange core_range; + + // Holds vector of addresses where circular buffers are allocated [start, end) + // There are multiple ranges because per core L1 regions are not in lockstep but circular buffers spanning multiple cores must share the same address + // To enable this, circular buffer address is the maximum address amongst all of its target cores + // This vector is sorted from lower to higher address spaces + std::vector> l1_regions; + + // Returns address for next circular buffer + // Circular buffers are placed sequentially on a core so the next available address gets appended to the last L1 region + uint64_t get_cb_region_end() const { + return this->l1_regions.empty() ? 0 : this->l1_regions.back().second; + } + + // If address is the end of the last L1 region, the last region is extended by size bytes, + // otherwise address must be higher than existing regions and a new L1 region [address, size) is added + void mark_address(uint64_t address, uint64_t size, uint64_t base_address); + + // Reset when circular buffer allocation is invalidated + void reset_available_addresses() { this->l1_regions.clear(); } + }; + + uint64_t id; // Need to make non-const due to move constructor + uint64_t runtime_id; + static std::atomic program_counter; + std::vector >> kernels_; + std::vector grid_extent_; + + std::vector> circular_buffers_; + std::unordered_map> circular_buffer_by_id_; + // Tracks which circular buffer indices are being used + std::unordered_map> per_core_cb_indices_; + // Used to generate circular buffer addresses. There is one CircularBufferAllocator per unique CoreRange + std::vector cb_allocators_; + + std::vector semaphores_; + + std::unordered_set compiled_; + bool local_circular_buffer_allocation_needed_; + + static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff; + std::vector> kernel_groups_; + std::vector> core_to_kernel_group_index_table_; + uint32_t tensix_go_signal_count_; + + std::vector> config_buffers_; + + std::vector program_configs_; + std::vector program_config_sizes_; + + std::unordered_map cached_program_command_sequences_; + + friend std::shared_ptr GetCircularBuffer(const Program &program, CBHandle id); + friend void ValidateCircularBufferRegion(const Program &program, const Device *device); + + friend KernelHandle AddKernel(Program &program, std::shared_ptr kernel, const HalProgrammableCoreType core_type); + + KernelHandle add_kernel(std::shared_ptr kernel, const HalProgrammableCoreType &core_type); + + CBHandle add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config); + std::shared_ptr get_circular_buffer(CBHandle cb_id) const; + + void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type); + + friend void AddConfigBuffer(Program &program, std::shared_ptr config_buffer); + void add_config_buffer(std::shared_ptr config_buffer); + + // Ensures that statically allocated circular buffers do not grow into L1 buffer space + void validate_circular_buffer_region(const Device *device) const; + + void set_cb_data_fmt( Device *device, const std::vector & crs, JitBuildOptions& build_options) const; + + void set_cb_tile_dims( Device *device, const std::vector & crs, JitBuildOptions& build_options) const; + + void update_kernel_groups(uint32_t programmable_core_type_index); + + uint32_t& get_program_config_size(uint32_t programmable_core_type_index); + + uint32_t finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset); + uint32_t finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset); + uint32_t finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset); + uint32_t finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset); + void set_launch_msg_sem_offsets(); + + bool runs_on_noc_unicast_only_cores(); + bool runs_on_noc_multicast_only_cores(); + + friend HWCommandQueue; + friend EnqueueProgramCommand; + friend Program; +}; + KernelHandle AddKernel (Program &program, std::shared_ptr kernel, const HalProgrammableCoreType core_type) { - return program.add_kernel(kernel, core_type); + return program.pimpl_->add_kernel(std::move(kernel), core_type); } std::shared_ptr GetKernel(const Program &program, KernelHandle kernel_id) { @@ -87,16 +255,16 @@ std::shared_ptr GetKernel(const Program &program, KernelHandle kernel_id } std::shared_ptr GetCircularBuffer(const Program &program, CBHandle id) { - return program.get_circular_buffer(id); + return program.pimpl_->get_circular_buffer(id); } // Checks that circular buffers do not grow into L1 buffer space void ValidateCircularBufferRegion(const Program &program, const Device *device) { - program.validate_circular_buffer_region(device); + program.pimpl_->validate_circular_buffer_region(device); } void AddConfigBuffer(Program &program, std::shared_ptr config_buffer) { - program.add_config_buffer(config_buffer); + program.pimpl_->add_config_buffer(std::move(config_buffer)); } void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; } @@ -104,12 +272,11 @@ void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; } void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; } } // namespace detail -std::atomic Program::program_counter = 0; +std::atomic detail::Program_::program_counter = 0; -Program::Program() : +detail::Program_::Program_() : id(program_counter++), runtime_id(0), - worker_crs_(), local_circular_buffer_allocation_needed_(false), finalized_(false), cached_(false) { @@ -126,7 +293,11 @@ Program::Program() : program_config_sizes_.resize(programmable_core_count); } -KernelHandle Program::add_kernel(std::shared_ptr kernel, const HalProgrammableCoreType &programmable_core_type) { +Program::Program() : + pimpl_(std::make_unique()) { +} + +KernelHandle detail::Program_::add_kernel(std::shared_ptr kernel, const HalProgrammableCoreType &programmable_core_type) { TT_FATAL(this->compiled_.empty(), "Cannot add kernel to an already compiled program {}", this->id); // Id is unique across all kernels on all core types KernelHandle id = this->num_kernels(); @@ -137,7 +308,7 @@ KernelHandle Program::add_kernel(std::shared_ptr kernel, const HalProgra return id; } -std::shared_ptr Program::get_kernel(KernelHandle kernel_id) const { +std::shared_ptr detail::Program_::get_kernel(KernelHandle kernel_id) const { // TT_ASSERT(kernel_id < this->kernels_.size(), "Expected Kernel with ID {} to be in Program {}", kernel_id, // this->id); // find coretype based on kernel_id @@ -151,10 +322,14 @@ std::shared_ptr Program::get_kernel(KernelHandle kernel_id) const { return nullptr; } +std::shared_ptr Program::get_kernel(KernelHandle kernel_id) const { + return pimpl_->get_kernel(kernel_id); +} + KernelGroup::KernelGroup() : core_ranges(CoreRangeSet()) {} KernelGroup::KernelGroup( - const Program &program, + const void *program_ptr, uint32_t programmable_core_type_index, kernel_id_array_t kernel_ids, bool erisc_is_idle, @@ -179,7 +354,7 @@ KernelGroup::KernelGroup( for (int class_id = 0; class_id < processor_classes; class_id++) { auto& optional_id = kernel_ids[class_id]; if (optional_id) { - const auto kernel = program.get_kernel(optional_id.value()); + const auto kernel = static_cast(program_ptr)->get_kernel(optional_id.value()); this->launch_msg.kernel_config.watcher_kernel_ids[class_id] = kernel->get_watcher_kernel_id(); this->launch_msg.kernel_config.enables |= 1 << class_id; @@ -222,12 +397,16 @@ CoreType KernelGroup::get_core_type() const { return hal.get_core_type(this->programmable_core_type_index); }; -std::vector &Program::get_kernel_groups(uint32_t programmable_core_type_index) { +std::vector &detail::Program_::get_kernel_groups(uint32_t programmable_core_type_index) { update_kernel_groups(programmable_core_type_index); return kernel_groups_[programmable_core_type_index]; } -KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { +std::vector &Program::get_kernel_groups(uint32_t programmable_core_type_index) { + return pimpl_->get_kernel_groups(programmable_core_type_index); +} + +KernelGroup *detail::Program_::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { update_kernel_groups(programmable_core_type_index); if (core.x >= grid_extent_[programmable_core_type_index].x || core.y >= grid_extent_[programmable_core_type_index].y) return nullptr; @@ -235,6 +414,10 @@ KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmab return (index == core_to_kernel_group_invalid_index) ? nullptr : &kernel_groups_[programmable_core_type_index].at(index); } +KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) { + return pimpl_->kernels_on_core(core, programmable_core_type_index); +} + struct KernelGroupInt { bool valid; kernel_id_array_t kernel_ids; @@ -265,7 +448,7 @@ struct KernelGroupIntHasher { } }; -void Program::update_kernel_groups(uint32_t programmable_core_type_index) { +void detail::Program_::update_kernel_groups(uint32_t programmable_core_type_index) { if (core_to_kernel_group_index_table_[programmable_core_type_index].size() == 0) { bool erisc_is_idle = false; @@ -343,7 +526,7 @@ void Program::update_kernel_groups(uint32_t programmable_core_type_index) { } kernel_groups_[programmable_core_type_index].push_back(KernelGroup( - *this, + this, programmable_core_type_index, kg_to_cores.first.kernel_ids, erisc_is_idle, @@ -354,7 +537,7 @@ void Program::update_kernel_groups(uint32_t programmable_core_type_index) { } } -void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t size, uint64_t base_address) { +void detail::Program_::CircularBufferAllocator::mark_address(uint64_t address, uint64_t size, uint64_t base_address) { if (this->l1_regions.empty()) { this->l1_regions.emplace_back(base_address, base_address); } @@ -373,7 +556,7 @@ void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t s } } -CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) { +CBHandle detail::Program_::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) { TT_FATAL(this->compiled_.empty(), "Cannot add circular buffer to an already compiled program {}", this->id); std::shared_ptr circular_buffer = std::make_shared(core_range_set, config); // Globally allocated circular buffer do not invalidate allocation because their addresses are tracked by memory @@ -424,14 +607,18 @@ CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const return circular_buffer->id(); } -std::shared_ptr Program::get_circular_buffer(CBHandle cb_id) const { +CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) { + return pimpl_->add_circular_buffer(core_range_set, config); +} + +std::shared_ptr detail::Program_::get_circular_buffer(CBHandle cb_id) const { if (this->circular_buffer_by_id_.find(cb_id) == this->circular_buffer_by_id_.end()) { TT_THROW("No circular buffer with id {} exists in Program {}", cb_id, this->id); } return this->circular_buffer_by_id_.at(cb_id); } -const std::vector> Program::circular_buffers_on_core(const CoreCoord &core) const { +std::vector> detail::Program_::circular_buffers_on_core(const CoreCoord &core) const { std::vector> cbs_on_core; for (auto circular_buffer : circular_buffers_) { if (circular_buffer->is_on_logical_core(core)) { @@ -441,7 +628,11 @@ const std::vector> Program::circular_buffers_on_ return cbs_on_core; } -const std::vector> Program::circular_buffers_on_corerange(const CoreRange &cr) const { +std::vector> Program::circular_buffers_on_core(const CoreCoord &core) const { + return pimpl_->circular_buffers_on_core(core); +} + +std::vector> detail::Program_::circular_buffers_on_corerange(const CoreRange &cr) const { std::vector> cbs_on_core; for (auto circular_buffer : circular_buffers_) { if (circular_buffer->is_on_logical_corerange(cr)) { @@ -451,7 +642,11 @@ const std::vector> Program::circular_buffers_on_ return cbs_on_core; } -const std::vector Program::circular_buffers_unique_coreranges() const { +std::vector> Program::circular_buffers_on_corerange(const CoreRange &cr) const { + return pimpl_->circular_buffers_on_corerange(cr); +} + +std::vector detail::Program_::circular_buffers_unique_coreranges() const { std::vector core_ranges; for (auto circular_buffer : circular_buffers_) { for (const CoreRange &core_range : circular_buffer->core_ranges().ranges()) { @@ -463,7 +658,11 @@ const std::vector Program::circular_buffers_unique_coreranges() const return core_ranges; } -void Program::invalidate_circular_buffer_allocation() { +std::vector Program::circular_buffers_unique_coreranges() const { + return pimpl_->circular_buffers_unique_coreranges(); +} + +void detail::Program_::invalidate_circular_buffer_allocation() { if (this->local_circular_buffer_allocation_needed_) { return; } @@ -473,7 +672,11 @@ void Program::invalidate_circular_buffer_allocation() { this->local_circular_buffer_allocation_needed_ = true; } -void Program::allocate_circular_buffers(const Device *device) { +void Program::invalidate_circular_buffer_allocation() { + pimpl_->invalidate_circular_buffer_allocation(); +} + +void detail::Program_::allocate_circular_buffers(const Device *device) { ZoneScoped; if (not this->local_circular_buffer_allocation_needed_) { return; @@ -515,7 +718,11 @@ void Program::allocate_circular_buffers(const Device *device) { this->local_circular_buffer_allocation_needed_ = false; } -void Program::validate_circular_buffer_region(const Device *device) const { +void Program::allocate_circular_buffers(const Device *device) { + pimpl_->allocate_circular_buffers(device); +} + +void detail::Program_::validate_circular_buffer_region(const Device *device) const { ZoneScoped; // Banks are in lockstep so we only need to get lowest L1 address of one compute and storage core @@ -552,9 +759,13 @@ void Program::validate_circular_buffer_region(const Device *device) const { size_t Program::num_semaphores(const CoreCoord &core) const { return semaphores_on_core(core).size(); } -size_t Program::num_semaphores() const { return semaphores_.size(); } +size_t detail::Program_::num_semaphores() const {return semaphores_.size(); } -void Program::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const { +size_t Program::num_semaphores() const { + return pimpl_->num_semaphores(); +} + +void detail::Program_::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const { auto semaphores_on_core = this->semaphores_on_core(logical_core); uint64_t kernel_config_base = hal.get_dev_addr(programmable_core_type_index, HalL1MemAddrType::KERNEL_CONFIG); @@ -569,14 +780,22 @@ void Program::init_semaphores(const Device &device, const CoreCoord &logical_cor } } -void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) { +void Program::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const { + pimpl_->init_semaphores(device, logical_core, programmable_core_type_index); +} + +void detail::Program_::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) { TT_FATAL(this->compiled_.empty(), "Cannot add semaphore to an already compiled program {}", this->id); semaphores_.emplace_back(Semaphore(crs, semaphore_id, init_value, core_type)); } -void Program::add_config_buffer(std::shared_ptr config_buffer) { config_buffers_.emplace_back(config_buffer); } +void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) { + pimpl_->add_semaphore(crs, semaphore_id, init_value, core_type); +} -std::vector> Program::logical_cores() const { +void detail::Program_::add_config_buffer(std::shared_ptr config_buffer) { config_buffers_.emplace_back(config_buffer); } + +std::vector> detail::Program_::logical_cores() const { std::vector> cores_in_program; std::vector> unique_cores; for (uint32_t programmable_core_type_index = 0; programmable_core_type_index < kernels_.size(); programmable_core_type_index++) { @@ -596,17 +815,11 @@ std::vector> Program::logical_cores() const { return cores_in_program; } -void Program::construct_core_range_set_for_worker_cores() { - bool found_kernels = false; - uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - for (auto [id, kernel] : kernels_[index]) { - this->worker_crs_ = this->worker_crs_.merge(kernel->core_range_set()); - found_kernels = true; - } - TT_ASSERT(!found_kernels || this->worker_crs_.ranges().size() >= 1, "Invalid core range set"); +std::vector> Program::logical_cores() const { + return pimpl_->logical_cores(); } -void Program::set_cb_data_fmt(Device *device, const std::vector &crs, JitBuildOptions &build_options) const { +void detail::Program_::set_cb_data_fmt(Device *device, const std::vector &crs, JitBuildOptions &build_options) const { ZoneScoped; for (auto logical_cr : crs) { auto cbs_on_core = this->circular_buffers_on_corerange(logical_cr); @@ -619,7 +832,7 @@ void Program::set_cb_data_fmt(Device *device, const std::vector &crs, } } -void Program::set_cb_tile_dims(Device *device, const std::vector &crs, JitBuildOptions &build_options) const { +void detail::Program_::set_cb_tile_dims(Device *device, const std::vector &crs, JitBuildOptions &build_options) const { ZoneScoped; for (const auto &logical_cr : crs) { auto cbs_on_core = this->circular_buffers_on_corerange(logical_cr); @@ -650,7 +863,7 @@ void Program::set_cb_tile_dims(Device *device, const std::vector &crs } } -void Program::populate_dispatch_data(Device *device) { +void detail::Program_::populate_dispatch_data(Device *device) { auto extract_dst_noc_unicast_info = [&device](const auto &ranges, const CoreType core_type) -> std::vector> { // This API extracts all the pairs of noc multicast encodings given a set of core ranges @@ -823,7 +1036,7 @@ void Program::populate_dispatch_data(Device *device) { return; } -uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) { +uint32_t detail::Program_::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) { // Iterate over kernels in the program and "level" the number of RTAs based on the max // Unique RTAs are packed across dispatch classes @@ -846,7 +1059,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 max_rtas[dispatch_class] = 0; auto& optional_id = kg.kernel_ids[dispatch_class]; if (optional_id) { - auto kernel = detail::GetKernel(*this, optional_id.value()); + auto kernel = get_kernel(optional_id.value()); for (const CoreRange &core_range : kg.core_ranges.ranges()) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { @@ -864,7 +1077,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 auto& optional_id = kg.kernel_ids[dispatch_class]; kg.rta_sizes[dispatch_class] = max_rtas[dispatch_class] * sizeof(uint32_t); if (optional_id) { - auto kernel = detail::GetKernel(*this, optional_id.value()); + auto kernel = get_kernel(optional_id.value()); kernel->set_runtime_args_count(kg.core_ranges, max_rtas[dispatch_class]); kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = base_offset + offset; offset += max_rtas[dispatch_class] * sizeof(uint32_t); @@ -883,7 +1096,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 } // Find the max # common RTAs across all kernels for each dispatch class for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) { - auto kernel = detail::GetKernel(*this, kernel_id); + auto kernel = get_kernel(kernel_id); // TODO: kernels should be stored by programmable core type if (core_type == kernel->get_kernel_core_type() && (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) { @@ -906,7 +1119,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 // Set the runtime_args_data sizing info based on the shared max for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) { - auto kernel = detail::GetKernel(*this, kernel_id); + auto kernel = get_kernel(kernel_id); // TODO: as above, fix when kernels are stored by programmable core type if (core_type == kernel->get_kernel_core_type() && (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) { @@ -929,11 +1142,15 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 return max_unique_rta_size + total_crta_size; } -ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index) { +ProgramConfig& detail::Program_::get_program_config(uint32_t programmable_core_type_index) { return this->program_configs_[programmable_core_type_index]; } -uint32_t Program::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) { +ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index) { + return pimpl_->get_program_config(programmable_core_type_index); +} + +uint32_t detail::Program_::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) { int max_id = -1; CoreType core_type = hal.get_core_type(programmable_core_type_index); @@ -951,7 +1168,7 @@ uint32_t Program::finalize_sems(uint32_t programmable_core_type_index, uint32_t return base_offset + sem_size; } -void Program::set_launch_msg_sem_offsets() { +void detail::Program_::set_launch_msg_sem_offsets() { for (uint32_t kg_type_index = 0; kg_type_index < hal.get_programmable_core_type_count(); kg_type_index++) { for (auto& kg : this->get_kernel_groups(kg_type_index)) { @@ -963,7 +1180,7 @@ void Program::set_launch_msg_sem_offsets() { } } -uint32_t Program::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) { +uint32_t detail::Program_::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) { int count = 0; @@ -985,7 +1202,7 @@ uint32_t Program::finalize_cbs(uint32_t programmable_core_type_index, uint32_t b return base_offset + cb_size; } -uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) { +uint32_t detail::Program_::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) { uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); @@ -1057,14 +1274,11 @@ uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_cor return max_offset; } -uint32_t& Program::get_program_config_size(uint32_t programmable_core_type_index) { +uint32_t& detail::Program_::get_program_config_size(uint32_t programmable_core_type_index) { return this->program_config_sizes_[programmable_core_type_index]; } -void Program::finalize(Device *device) { - - this->construct_core_range_set_for_worker_cores(); - +void detail::Program_::finalize(Device *device) { // Store the number of tensix "go signals" for use by CQ // CQ iterates over these to update runtime addresses, needs to know when eth begins (after tensix) // TODO: should store all the counts @@ -1113,7 +1327,11 @@ void Program::finalize(Device *device) { finalized_ = true; } -void Program::compile(Device *device, bool fd_bootloader_mode) { +void Program::finalize(Device *device) { + pimpl_->finalize(device); +} + +void detail::Program_::compile(Device *device, bool fd_bootloader_mode) { ZoneScoped; if (compiled_.contains(device->id())) { return; @@ -1193,7 +1411,7 @@ void Program::compile(Device *device, bool fd_bootloader_mode) { } if (detail::CompilationReporter::enabled()) { detail::CompilationReporter::inst().add_kernel_compile_stats( - *this, kernel, cache_hit, kernel_hash); + get_id(), kernel, cache_hit, kernel_hash); } kernel->set_binary_path(build_options.path); }, @@ -1211,17 +1429,27 @@ void Program::compile(Device *device, bool fd_bootloader_mode) { sync_build_step(events); if (detail::CompilationReporter::enabled()) { - detail::CompilationReporter::inst().flush_program_entry(*this, enable_persistent_kernel_cache); + detail::CompilationReporter::inst().flush_program_entry(get_id(), num_kernels(), [this](size_t kernel_id) { + return get_kernel(kernel_id); + }, enable_persistent_kernel_cache); } if (detail::MemoryReporter::enabled()) { - detail::MemoryReporter::inst().flush_program_memory_usage(*this, device); + detail::MemoryReporter::inst().flush_program_memory_usage(get_id(), device); } compiled_.insert(device->id()); } -void Program::set_runtime_id(uint64_t id) { this->runtime_id = id; } +void Program::compile(Device *device, bool fd_bootloader_mode) { + pimpl_->compile(device, fd_bootloader_mode); +} + +void detail::Program_::set_runtime_id(uint64_t id) { this->runtime_id = id; } -uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { +void Program::set_runtime_id(uint64_t id) { + pimpl_->set_runtime_id(id); +} + +uint32_t detail::Program_::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); @@ -1234,7 +1462,11 @@ uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, Core return base_addr + this->program_configs_[index].sem_offset; } -uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { +uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { + return pimpl_->get_sem_base_addr(device, logical_core, core_type); +} + +uint32_t detail::Program_::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); @@ -1247,7 +1479,11 @@ uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreT return base_addr + this->program_configs_[index].cb_offset; } -uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const { +uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { + return pimpl_->get_cb_base_addr(device, logical_core, core_type); +} + +uint32_t detail::Program_::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); @@ -1256,7 +1492,11 @@ uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType return this->program_configs_[index].sem_size; } -uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const { +uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const { + return pimpl_->get_sem_size(device, logical_core, core_type); +} + +uint32_t detail::Program_::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); @@ -1265,17 +1505,130 @@ uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType c return this->program_configs_[index].cb_size; } +uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const { + return pimpl_->get_cb_size(device, logical_core, core_type); +} + // TODO: Too low level for program.cpp. Move this to HAL, once we have support. -bool Program::runs_on_noc_unicast_only_cores() { +bool detail::Program_::runs_on_noc_unicast_only_cores() { return (hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH) != -1 and - this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)).size()); + not this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)).empty()); +} + +bool Program::runs_on_noc_unicast_only_cores() { + return pimpl_->runs_on_noc_unicast_only_cores(); } // TODO: Too low level for program.cpp. Move this to HAL, once we have support. -bool Program::runs_on_noc_multicast_only_cores() { +bool detail::Program_::runs_on_noc_multicast_only_cores() { return (hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX) != -1 and - this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)).size()); + not this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)).empty()); +} + +bool Program::runs_on_noc_multicast_only_cores() { + return pimpl_->runs_on_noc_multicast_only_cores(); +} + +Program::Program(Program &&other) noexcept = default; + +Program& Program::operator=(Program &&other) noexcept = default; + +Program::~Program() noexcept = default; + +uint64_t detail::Program_::get_id() const { return this->id; } + +uint64_t Program::get_id() const { + return pimpl_->get_id(); +} + +uint64_t detail::Program_::get_runtime_id() const { return this->runtime_id; } + +uint64_t Program::get_runtime_id() const { + return pimpl_->get_runtime_id(); +} + +size_t detail::Program_::num_kernels() const { + size_t count = 0; + for (const auto& kernels : kernels_) { + count += kernels.size(); + } + return count; +} + +size_t Program::num_kernels() const { + return pimpl_->num_kernels(); +} + +const std::vector> &detail::Program_::circular_buffers() const { return circular_buffers_; } + +const std::vector> &Program::circular_buffers() const { + return pimpl_->circular_buffers(); +} + +const std::vector< Semaphore > & detail::Program_::semaphores() const { return semaphores_; } + +const std::vector< Semaphore > & Program::semaphores() const { + return pimpl_->semaphores(); +} + +void detail::Program_::add_buffer(std::shared_ptr buf) { owned_buffer_pool.push_back(std::move(buf)); } + +void Program::add_buffer(std::shared_ptr buf) { + pimpl_->add_buffer(std::move(buf)); +} + +void detail::Program_::release_buffers() { owned_buffer_pool = {}; } + +void Program::release_buffers() { + pimpl_->release_buffers(); +} + +std::vector> detail::Program_::semaphores_on_core(const CoreCoord &core) const { + std::vector> semaphores; + for ( const Semaphore & s : this->semaphores_) { + if (s.initialized_on_logical_core(core)) { + semaphores.emplace_back(std::cref(s)); + } + } + return semaphores; +} + +std::vector> Program::semaphores_on_core(const CoreCoord &core) const { + return pimpl_->semaphores_on_core(core); +} + +bool detail::Program_::is_finalized() const { return this->finalized_; } + +bool Program::is_finalized() const { + return pimpl_->is_finalized(); +} + +bool detail::Program_::is_cached() const { return this->cached_; } + +bool Program::is_cached() const { + return pimpl_->is_cached(); +} + +void detail::Program_::set_cached() { this->cached_ = true; } + +void Program::set_cached() { + pimpl_->set_cached(); +} + +const ProgramTransferInfo &Program::get_program_transfer_info() const noexcept { + return pimpl_->program_transfer_info; +} + +const std::shared_ptr &Program::get_kernels_buffer() const noexcept { + return pimpl_->kernels_buffer; +} + +const std::vector &Program::get_program_config_sizes() const noexcept { + return pimpl_->program_config_sizes_; +} + +std::unordered_map &Program::get_cached_program_command_sequences() noexcept { + return pimpl_->cached_program_command_sequences_; } -Program::~Program() {} } // namespace tt::tt_metal diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 8e1dbb587ada..10c4ac63d461 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -35,6 +35,8 @@ class EnqueueProgramCommand; class HWCommandQueue; class JitBuildOptions; namespace detail{ + class Program_; + void ValidateCircularBufferRegion(const Program &program, const Device *device); KernelHandle AddKernel (Program &program, std::shared_ptr kernel, const HalProgrammableCoreType core_type); std::shared_ptr GetKernel(const Program &program, KernelHandle kernel_id); @@ -57,7 +59,7 @@ struct KernelGroup { KernelGroup(); KernelGroup( - const Program &program, + const void *program, uint32_t programmable_core_type_index, kernel_id_array_t kernel_ids, bool erisc_is_idle, @@ -91,48 +93,32 @@ class Program { Program(const Program &other) = delete; Program& operator=(const Program &other) = delete; - Program(Program &&other) = default; - Program& operator=(Program &&other) = default; + Program(Program &&other) noexcept; + Program& operator=(Program &&other) noexcept; void set_runtime_id(uint64_t id); - ~Program(); - - void construct_core_range_set_for_worker_cores(); + ~Program() noexcept; - const uint64_t get_id() const { return this->id; } - const uint64_t get_runtime_id() const { return this->runtime_id; } + uint64_t get_id() const; + uint64_t get_runtime_id() const; - size_t num_kernels() const { - size_t count = 0; - for (const auto& kernels : kernels_) { - count += kernels.size(); - } - return count; - } + size_t num_kernels() const; - const std::vector> &circular_buffers() const { return circular_buffers_; } + const std::vector> &circular_buffers() const; - const std::vector< Semaphore > & semaphores() const { return semaphores_; } + const std::vector< Semaphore > & semaphores() const; KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index); std::vector& get_kernel_groups(uint32_t programmable_core_type_index); - inline void add_buffer(std::shared_ptr buf) { owned_buffer_pool.push_back(buf); } - inline void release_buffers() { owned_buffer_pool = {}; } - const std::vector> circular_buffers_on_core(const CoreCoord &core) const; + void add_buffer(std::shared_ptr buf); + void release_buffers(); + std::vector> circular_buffers_on_core(const CoreCoord &core) const; - const std::vector> circular_buffers_on_corerange(const CoreRange &cr) const; + std::vector> circular_buffers_on_corerange(const CoreRange &cr) const; - const std::vector circular_buffers_unique_coreranges() const; + std::vector circular_buffers_unique_coreranges() const; - auto semaphores_on_core(const CoreCoord &core) const { - std::vector> semaphores; - for ( const Semaphore & s : this->semaphores_) { - if (s.initialized_on_logical_core(core)) { - semaphores.emplace_back(std::cref(s)); - } - } - return semaphores; - } + std::vector> semaphores_on_core(const CoreCoord &core) const; size_t num_semaphores ( const CoreCoord & core ) const; size_t num_semaphores () const; @@ -140,18 +126,15 @@ class Program { // XXXXX TODO: this should return a const reference std::vector> logical_cores() const; - // Is worker_crs_ used anywhere? - const CoreRangeSet& get_worker_core_range_set() const { return worker_crs_; }; - void compile(Device * device, bool fd_bootloader_mode = false); void invalidate_circular_buffer_allocation(); void allocate_circular_buffers(const Device *device); - bool is_finalized() const { return this->finalized_; } - bool is_cached() const { return this->cached_; } - void set_cached() { this->cached_ = true; } + bool is_finalized() const; + bool is_cached() const; + void set_cached(); void finalize(Device *device); std::shared_ptr get_kernel(KernelHandle kernel_id) const; @@ -164,74 +147,7 @@ class Program { uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const; private: - void populate_dispatch_data(Device *device); - - // Buffers temporarily owned by the program - std::vector> owned_buffer_pool = {}; - - // The buffer that holds the kernel/binaries/etc for this program - std::shared_ptr kernels_buffer = nullptr; - ProgramTransferInfo program_transfer_info; - - bool finalized_; - bool cached_; - - struct CircularBufferAllocator { - CircularBufferAllocator(const CoreRange &core_range_) : core_range(core_range_) {} - - // Circular buffers are created and allocated at core range granularity - CoreRange core_range; - - // Holds vector of addresses where circular buffers are allocated [start, end) - // There are multiple ranges because per core L1 regions are not in lockstep but circular buffers spanning multiple cores must share the same address - // To enable this, circular buffer address is the maximum address amongst all of its target cores - // This vector is sorted from lower to higher address spaces - std::vector> l1_regions; - - // Returns address for next circular buffer - // Circular buffers are placed sequentially on a core so the next available address gets appended to the last L1 region - uint64_t get_cb_region_end() const { - return this->l1_regions.empty() ? 0 : this->l1_regions.back().second; - } - - // If address is the end of the last L1 region, the last region is extended by size bytes, - // otherwise address must be higher than existing regions and a new L1 region [address, size) is added - void mark_address(uint64_t address, uint64_t size, uint64_t base_address); - - // Reset when circular buffer allocation is invalidated - void reset_available_addresses() { this->l1_regions.clear(); } - }; - - uint64_t id; // Need to make non-const due to move constructor - uint64_t runtime_id; - static std::atomic program_counter; - std::vector >> kernels_; - std::vector grid_extent_; - - std::vector> circular_buffers_; - std::unordered_map> circular_buffer_by_id_; - // Tracks which circular buffer indices are being used - std::unordered_map> per_core_cb_indices_; - // Used to generate circular buffer addresses. There is one CircularBufferAllocator per unique CoreRange - std::vector cb_allocators_; - - std::vector semaphores_; - - CoreRangeSet worker_crs_; - std::unordered_set compiled_; - bool local_circular_buffer_allocation_needed_; - - static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff; - std::vector> kernel_groups_; - std::vector> core_to_kernel_group_index_table_; - uint32_t tensix_go_signal_count_; - - std::vector> config_buffers_; - - std::vector program_configs_; - std::vector program_config_sizes_; - - std::unordered_map cached_program_command_sequences_; + std::unique_ptr pimpl_; friend CBHandle CreateCircularBuffer(Program &program, const std::variant &core_spec, const CircularBufferConfig &config); friend std::shared_ptr detail::GetCircularBuffer(const Program &program, CBHandle id); @@ -241,38 +157,23 @@ class Program { friend std::shared_ptr detail::GetKernel(const Program &program, KernelHandle kernel_id); friend uint32_t CreateSemaphore(Program &program, const std::variant &core_spec, uint32_t initial_value, CoreType core_type); - KernelHandle add_kernel(std::shared_ptr kernel, const HalProgrammableCoreType &core_type); CBHandle add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config); - std::shared_ptr get_circular_buffer(CBHandle cb_id) const; void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type); friend void detail::AddConfigBuffer(Program &program, std::shared_ptr config_buffer); - void add_config_buffer(std::shared_ptr config_buffer); - - // Ensures that statically allocated circular buffers do not grow into L1 buffer space - void validate_circular_buffer_region(const Device *device) const; - - void set_cb_data_fmt( Device *device, const std::vector & crs, JitBuildOptions& build_options) const; - - void set_cb_tile_dims( Device *device, const std::vector & crs, JitBuildOptions& build_options) const; - - void update_kernel_groups(uint32_t programmable_core_type_index); - - uint32_t& get_program_config_size(uint32_t programmable_core_type_index); - - uint32_t finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset); - uint32_t finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset); - uint32_t finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset); - uint32_t finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset); - void set_launch_msg_sem_offsets(); bool runs_on_noc_unicast_only_cores(); bool runs_on_noc_multicast_only_cores(); friend HWCommandQueue; friend EnqueueProgramCommand; + + const ProgramTransferInfo &get_program_transfer_info() const noexcept; + const std::shared_ptr &get_kernels_buffer() const noexcept; + const std::vector &get_program_config_sizes() const noexcept; + std::unordered_map &get_cached_program_command_sequences() noexcept; }; } // namespace v0