diff --git a/tt_metal/detail/reports/compilation_reporter.cpp b/tt_metal/detail/reports/compilation_reporter.cpp
index 2940e7fe8794..9a681cbca0b8 100644
--- a/tt_metal/detail/reports/compilation_reporter.cpp
+++ b/tt_metal/detail/reports/compilation_reporter.cpp
@@ -79,13 +79,13 @@ std::string kernel_attributes_str(std::shared_ptr<Kernel> kernel) {
     return attr_str;
 }
 
-void CompilationReporter::add_kernel_compile_stats(const Program &program, std::shared_ptr<Kernel> kernel, bool cache_hit, size_t kernel_hash) {
+void CompilationReporter::add_kernel_compile_stats(uint64_t program_id, std::shared_ptr<Kernel> kernel, bool cache_hit, size_t kernel_hash) {
     std::unique_lock<std::mutex> lock(mutex_);
 
     if (cache_hit) {
-        this->program_id_to_cache_hit_counter_[program.get_id()].hits++;
+        this->program_id_to_cache_hit_counter_[program_id].hits++;
     } else {
-        this->program_id_to_cache_hit_counter_[program.get_id()].misses++;
+        this->program_id_to_cache_hit_counter_[program_id].misses++;
     }
     std::string kernel_stats = "," + kernel->name() + ",";
     std::string cache_status = cache_hit ? "cache hit" : "cache miss";
@@ -99,13 +99,13 @@ void CompilationReporter::add_kernel_compile_stats(const Program &program, std::
         }
         index++;
     }
-    this->program_id_to_kernel_stats_[program.get_id()].push_back(kernel_stats);
+    this->program_id_to_kernel_stats_[program_id].push_back(kernel_stats);
 }
 
-void CompilationReporter::flush_program_entry(const Program &program, bool persistent_compilation_cache_enabled) {
+void CompilationReporter::flush_program_entry(uint64_t program_id, size_t num_kernels, std::function<std::shared_ptr<Kernel>(size_t)> get_kernel, bool persistent_compilation_cache_enabled) {
     std::unique_lock<std::mutex> lock(mutex_);
-    auto num_cache_misses = this->program_id_to_cache_hit_counter_.at(program.get_id()).misses;
-    auto num_cache_hits = this->program_id_to_cache_hit_counter_.at(program.get_id()).hits;
+    auto num_cache_misses = this->program_id_to_cache_hit_counter_.at(program_id).misses;
+    auto num_cache_hits = this->program_id_to_cache_hit_counter_.at(program_id).hits;
     if (this->total_num_compile_programs_ == 0) {
         this->init_reports();
     }
@@ -113,8 +113,8 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi
     auto get_num_compute_and_data_movement_kernels = [&]() {
         uint32_t num_compute = 0;
         uint32_t num_data_movement = 0;
-        for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) {
-            const auto kernel = detail::GetKernel(program, kernel_id);
+        for (size_t kernel_id = 0; kernel_id < num_kernels; kernel_id++) {
+            const auto kernel = get_kernel(kernel_id);
             if (kernel->processor() == tt::RISCV::BRISC or kernel->processor() == tt::RISCV::NCRISC) {
                 num_data_movement++;
             } else {
@@ -126,14 +126,14 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi
 
     auto [num_compute_kernels, num_data_movement_kernels] = get_num_compute_and_data_movement_kernels();
 
-    this->summary_report_ << program.get_id() << ", "
+    this->summary_report_ << program_id << ", "
                             << num_compute_kernels << ", "
                             << num_data_movement_kernels << ", "
                             << (persistent_compilation_cache_enabled ? "Y" : "N") << ", "
                             << num_cache_misses << ", "
                             << num_cache_hits << "\n";
 
-    this->detailed_report_ << "Compiling Program: " << program.get_id() << "\n";
+    this->detailed_report_ << "Compiling Program: " << program_id << "\n";
     this->detailed_report_ << "\n,Kernel Creation Report:\n";
     this->detailed_report_ << ",,Number of Compute CreateKernel API calls: " << num_compute_kernels << "\n";
     this->detailed_report_ << ",,Number of Datamovement CreateKernel API calls: " << num_data_movement_kernels << "\n";
@@ -144,7 +144,7 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi
     this->detailed_report_ << ",,Total number of kernel compile cache hits: " << num_cache_hits << "\n";
 
     this->detailed_report_ << "\n,Kernel File Name, Core Range, Cache Hit, Kernel Attributes, Hash\n";
-    auto kernel_stats_vec = this->program_id_to_kernel_stats_.at(program.get_id());
+    auto kernel_stats_vec = this->program_id_to_kernel_stats_.at(program_id);
     for (const auto &kernel_stats : kernel_stats_vec) {
         this->detailed_report_ << kernel_stats;
     }
diff --git a/tt_metal/detail/reports/compilation_reporter.hpp b/tt_metal/detail/reports/compilation_reporter.hpp
index c976bf5c8bc5..23707b8eff34 100644
--- a/tt_metal/detail/reports/compilation_reporter.hpp
+++ b/tt_metal/detail/reports/compilation_reporter.hpp
@@ -45,9 +45,9 @@ class CompilationReporter {
     CompilationReporter(const CompilationReporter&) = delete;
     CompilationReporter(CompilationReporter&& other) noexcept = delete;
 
-    void add_kernel_compile_stats(const Program &program, std::shared_ptr<Kernel> kernel, bool cache_hit, size_t kernel_hash);
+    void add_kernel_compile_stats(uint64_t program_id, std::shared_ptr<Kernel> kernel, bool cache_hit, size_t kernel_hash);
 
-    void flush_program_entry(const Program &program, bool persistent_compilation_cache_enabled);
+    void flush_program_entry(uint64_t program_id, size_t num_kernels, std::function<std::shared_ptr<Kernel>(size_t)> get_kernel, bool persistent_compilation_cache_enabled);
     static CompilationReporter& inst();
     static void toggle (bool state);
     static bool enabled ();
diff --git a/tt_metal/detail/reports/memory_reporter.cpp b/tt_metal/detail/reports/memory_reporter.cpp
index 1bffc4421ddf..5275d438742d 100644
--- a/tt_metal/detail/reports/memory_reporter.cpp
+++ b/tt_metal/detail/reports/memory_reporter.cpp
@@ -100,14 +100,14 @@ void populate_reports(const Device *device, std::ofstream &memory_usage_summary_
     write_memory_usage(device, BufferType::L1, memory_usage_summary_report, detailed_memory_usage_report, l1_usage_summary_report);
 }
 
-void MemoryReporter::flush_program_memory_usage(const Program &program, const Device *device) {
+void MemoryReporter::flush_program_memory_usage(uint64_t program_id, const Device *device) {  // Only the program's id is needed here: it is written to the three program reports below
     if (not this->program_memory_usage_summary_report_.is_open()) {
         this->init_reports();
     }
 
-    this->program_memory_usage_summary_report_ << program.get_id();
-    this->program_l1_usage_summary_report_ << program.get_id();
-    this->program_detailed_memory_usage_report_ << program.get_id();
+    this->program_memory_usage_summary_report_ << program_id;  // The id is streamed to each report first; populate_reports() is then handed the same three streams
+    this->program_l1_usage_summary_report_ << program_id;
+    this->program_detailed_memory_usage_report_ << program_id;
 
     populate_reports(device, this->program_memory_usage_summary_report_, this->program_detailed_memory_usage_report_, this->program_l1_usage_summary_report_);
 }
diff --git a/tt_metal/detail/reports/memory_reporter.hpp b/tt_metal/detail/reports/memory_reporter.hpp
index e5138f02a352..217f64905228 100644
--- a/tt_metal/detail/reports/memory_reporter.hpp
+++ b/tt_metal/detail/reports/memory_reporter.hpp
@@ -60,7 +60,7 @@ class MemoryReporter {
     MemoryReporter(const MemoryReporter&) = delete;
     MemoryReporter(MemoryReporter&& other) noexcept = delete;
 
-    void flush_program_memory_usage(const Program &program, const Device *device);
+    void flush_program_memory_usage(uint64_t program_id, const Device *device);
 
     void dump_memory_usage_state(const Device *device, std::string prefix="") const;
 
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index 010ad9e43599..d212bfac0d9f 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -730,8 +730,9 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     const uint32_t max_prefetch_command_size =
         dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
 
+    auto &program_transfer_info = program.get_program_transfer_info();
     // Multicast Semaphore Cmd
-    uint32_t num_multicast_semaphores = program.program_transfer_info.multicast_semaphores.size();
+    uint32_t num_multicast_semaphores = program_transfer_info.multicast_semaphores.size();
     std::vector<std::vector<CQDispatchWritePackedMulticastSubCmd>> multicast_sem_sub_cmds(num_multicast_semaphores);
     std::vector<std::vector<std::pair<const void*, uint32_t>>> multicast_sem_data(num_multicast_semaphores);
     std::vector<std::vector<std::pair<uint32_t, uint32_t>>> multicast_sem_payload(num_multicast_semaphores);
@@ -739,7 +740,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     multicast_sem_dst_size.reserve(num_multicast_semaphores);
     if (num_multicast_semaphores > 0) {
         uint32_t i = 0;
-        for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.multicast_semaphores) {
+        for (const auto& [dst, transfer_info_vec] : program_transfer_info.multicast_semaphores) {
             // TODO: loop over things inside transfer_info[i]
             uint32_t write_packed_len = transfer_info_vec[0].data.size();
             multicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t)));
@@ -768,7 +769,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     }
 
     // Unicast Semaphore Cmd
-    uint32_t num_unicast_semaphores = program.program_transfer_info.unicast_semaphores.size();
+    uint32_t num_unicast_semaphores = program_transfer_info.unicast_semaphores.size();
     std::vector<std::vector<CQDispatchWritePackedUnicastSubCmd>> unicast_sem_sub_cmds(num_unicast_semaphores);
     std::vector<std::vector<std::pair<const void*, uint32_t>>> unicast_sem_data(num_unicast_semaphores);
     std::vector<std::vector<std::pair<uint32_t, uint32_t>>> unicast_sem_payload(num_unicast_semaphores);
@@ -776,7 +777,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     unicast_sem_dst_size.reserve(num_unicast_semaphores);
     if (num_unicast_semaphores > 0) {
         uint32_t i = 0;
-        for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.unicast_semaphores) {
+        for (const auto& [dst, transfer_info_vec] : program_transfer_info.unicast_semaphores) {
             // TODO: loop over things inside transfer_info[i]
             uint32_t write_packed_len = transfer_info_vec[0].data.size();
             unicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t)));
@@ -876,7 +877,8 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     const uint32_t max_length_per_sub_cmd = dispatch_constants::get(this->dispatch_core_type).scratch_db_size() / 2;
     const uint32_t max_paged_length_per_sub_cmd =
         max_length_per_sub_cmd / HostMemDeviceCommand::PROGRAM_PAGE_SIZE * HostMemDeviceCommand::PROGRAM_PAGE_SIZE;
-    for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program.program_transfer_info.kernel_bins) {
+    const auto &kernels_buffer = program.get_kernels_buffer();
+    for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program_transfer_info.kernel_bins) {
         bool write_linear;
         uint32_t noc_encoding;
         std::visit(
@@ -913,14 +915,14 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
 
                 uint32_t base_address, page_offset;
                 if (kg_transfer_info.page_offsets[kernel_idx] > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) {
-                    const uint32_t num_banks = this->device->num_banks(this->program.kernels_buffer->buffer_type());
+                    const uint32_t num_banks = this->device->num_banks(kernels_buffer->buffer_type());
                     page_offset = kg_transfer_info.page_offsets[kernel_idx] % num_banks;
                     uint32_t num_full_pages_written_per_bank =
                         kg_transfer_info.page_offsets[kernel_idx] / num_banks;
-                    base_address = this->program.kernels_buffer->address() +
-                                    num_full_pages_written_per_bank * this->program.kernels_buffer->page_size();
+                    base_address = kernels_buffer->address() +
+                                    num_full_pages_written_per_bank * kernels_buffer->page_size();
                 } else {
-                    base_address = this->program.kernels_buffer->address();
+                    base_address = kernels_buffer->address();
                     page_offset = kg_transfer_info.page_offsets[kernel_idx];
                 }
 
@@ -928,11 +930,11 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
                     true,  // is_dram
                     page_offset,
                     base_address,
-                    this->program.kernels_buffer->page_size(),
-                    relayed_bytes / this->program.kernels_buffer->page_size(),
+                    kernels_buffer->page_size(),
+                    relayed_bytes / kernels_buffer->page_size(),
                     length_adjust);
             } else {
-                uint32_t base_address = this->program.kernels_buffer->address();
+                uint32_t base_address = kernels_buffer->address();
                 uint32_t page_offset = kg_transfer_info.page_offsets[kernel_idx];
 
                 // TODO: pack all these writes into 1 linear write
@@ -1070,7 +1072,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     }
     // if dispatch_s is enabled have dispatch_d send a semaphore update to dispatch_s (this will include a write barrier on dispatch_d if program is active)
     // if not,  check if the program is active on workers. If active, have dispatch_d issue a write barrier
-    cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program.program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE;
+    cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE;
 
     // either dispatch_s or dispatch_d will send the go signal (go_signal_mcast command)
     cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE;
@@ -1257,11 +1259,11 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER;
     if (this->device->dispatch_s_enabled()) {
         // dispatch_d signals dispatch_s to send the go signal, use a barrier if there are cores active
-        device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program.program_transfer_info.num_active_cores > 0);
+        device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program_transfer_info.num_active_cores > 0);
         dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE;
     } else {
         // Wait Noc Write Barrier, wait for binaries/configs and launch_msg to be written to worker cores
-        if (program.program_transfer_info.num_active_cores > 0) {
+        if (program_transfer_info.num_active_cores > 0) {
             device_command_sequence.add_dispatch_wait(true, this->dispatch_message_addr, 0, 0, false, false);
         }
     }
@@ -1464,7 +1466,7 @@ void EnqueueProgramCommand::write_program_command_sequence(const ProgramCommandS
 void EnqueueProgramCommand::process() {
 
     const std::pair<ConfigBufferSync, std::vector<ConfigBufferEntry>&> reservation =
-        this->manager.get_config_buffer_mgr().reserve(program.program_config_sizes_);
+        this->manager.get_config_buffer_mgr().reserve(program.get_program_config_sizes());
     bool stall_first = reservation.first.need_sync;
     // Note: since present implementation always stalls, we always free up to "now"
     this->manager.get_config_buffer_mgr().free(reservation.first.sync_count);
@@ -1486,8 +1488,8 @@ void EnqueueProgramCommand::process() {
     // If cache has a program entry but the program is not finalized, then the cache is stale
     // Currently this is mapped by device, but will be mapped by multiple values in the future
     uint64_t command_hash = this->device->id();
-    auto cached_cmd_iter = this->program.cached_program_command_sequences_.find(command_hash);
-    bool is_cached = program.is_cached() && cached_cmd_iter != this->program.cached_program_command_sequences_.end();
+    auto cached_cmd_iter = this->program.get_cached_program_command_sequences().find(command_hash);
+    bool is_cached = program.is_cached() && cached_cmd_iter != this->program.get_cached_program_command_sequences().end();
 
     // Calculate all commands size and determine how many fetch q entries to use
     // Preamble, some waits and stalls
@@ -1507,7 +1509,7 @@ void EnqueueProgramCommand::process() {
         this->assemble_device_commands(program_command_sequence, kernel_config_addrs);
         this->write_program_command_sequence(program_command_sequence, stall_first);
         this->assemble_stall_commands(program_command_sequence, false);
-        this->program.cached_program_command_sequences_.insert({command_hash, std::move(program_command_sequence)});
+        this->program.get_cached_program_command_sequences().insert({command_hash, std::move(program_command_sequence)});
         program.set_cached();
     } else {
         static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count));
@@ -2233,21 +2235,20 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) {
     if (not program.is_finalized()) {
         program.finalize(device);
         TT_FATAL(!this->manager.get_bypass_mode(), "Tracing should only be used when programs have been cached");
-        if (program.kernels_buffer != nullptr) {
+        if (const auto &kernels_buffer = program.get_kernels_buffer()) {
             this->enqueue_write_buffer(
-                *program.kernels_buffer, program.program_transfer_info.binary_data.data(), false);
+                *kernels_buffer, program.get_program_transfer_info().binary_data.data(), false);
         }
     }
 
 #ifdef DEBUG
     if (tt::llrt::OptionsG.get_validate_kernel_binaries()) {
         TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries");
-        if (program.kernels_buffer != nullptr) {
-            const auto& buffer = program.kernels_buffer;
+        if (const auto &buffer = program.get_kernels_buffer()) {
             std::vector<uint32_t> read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t));
-            this->enqueue_read_buffer(*program.kernels_buffer, read_data.data(), true);
+            this->enqueue_read_buffer(*buffer, read_data.data(), true);
             TT_FATAL(
-                program.program_transfer_info.binary_data == read_data,
+                program.get_program_transfer_info().binary_data == read_data,
                 "Binary for program to be executed is corrupted. Another program likely corrupted this binary");
         }
     }
@@ -2297,12 +2298,11 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) {
 #ifdef DEBUG
     if (tt::llrt::OptionsG.get_validate_kernel_binaries()) {
         TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries");
-        if (program.kernels_buffer != nullptr) {
-            const auto& buffer = program.kernels_buffer;
+        if (const auto& buffer = program.get_kernels_buffer()) {
             std::vector<uint32_t> read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t));
-            this->enqueue_read_buffer(*program.kernels_buffer, read_data.data(), true);
+            this->enqueue_read_buffer(*buffer, read_data.data(), true);
             TT_FATAL(
-                program.program_transfer_info.binary_data == read_data,
+                program.get_program_transfer_info().binary_data == read_data,
                 "Binary for program that executed is corrupted. This program likely corrupted its own binary.");
         }
     }
@@ -2311,7 +2311,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) {
     log_trace(
         tt::LogMetal,
         "Created EnqueueProgramCommand (active_cores: {} bypass_mode: {} expected_workers_completed: {})",
-        program.program_transfer_info.num_active_cores,
+        program.get_program_transfer_info().num_active_cores,
         this->manager.get_bypass_mode(),
         expected_workers_completed);
 }
diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
index ffb8fb6a4999..abb371f675c2 100644
--- a/tt_metal/impl/program/program.cpp
+++ b/tt_metal/impl/program/program.cpp
@@ -78,8 +78,176 @@ size_t KernelCompileHash(const std::shared_ptr<Kernel> kernel, JitBuildOptions &
 }  // namespace
 namespace detail {
 
+// Private implementation object behind the public Program handle (Program holds it as pimpl_). It carries the
+// kernels, circular buffers, semaphores and dispatch data of one program. Move-only. The class is defined in
+// program.cpp, so its member order can be adjusted here without touching the public Program header.
+class Program_ {
+   public:
+    Program_();  // Takes the next id from program_counter; starts with runtime_id 0, not finalized, not cached
+
+    Program_(const Program_ &other) = delete;  // Copying is disabled; only moves are allowed
+    Program_& operator=(const Program_ &other) = delete;
+
+    Program_(Program_ &&other) = default;
+    Program_& operator=(Program_ &&other) = default;
+
+    void set_runtime_id(uint64_t id);
+    ~Program_() noexcept = default;
+
+    uint64_t get_id() const;
+    uint64_t get_runtime_id() const;
+
+    size_t num_kernels() const;
+
+    const std::vector<std::shared_ptr<CircularBuffer>> &circular_buffers() const;
+
+    const std::vector< Semaphore > & semaphores() const;
+
+    KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index);  // Calls update_kernel_groups() first
+    std::vector<KernelGroup>& get_kernel_groups(uint32_t programmable_core_type_index);  // Calls update_kernel_groups() first, then returns the stored groups
+    void add_buffer(std::shared_ptr<Buffer> buf);
+    void release_buffers();
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_core(const CoreCoord &core) const;
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_corerange(const CoreRange &cr) const;
+    std::vector<CoreRange> circular_buffers_unique_coreranges() const;
+    std::vector<std::reference_wrapper<const Semaphore>> semaphores_on_core(const CoreCoord &core) const;
+
+    size_t num_semaphores () const;
+    void init_semaphores ( const Device & device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const;
+    // TODO: this should return a const reference (it currently returns the nested vector by value)
+    std::vector<std::vector<CoreCoord>> logical_cores() const;
+
+    void compile(Device * device, bool fd_bootloader_mode = false);
+
+    void invalidate_circular_buffer_allocation();
+
+    void allocate_circular_buffers(const Device *device);
+
+    bool is_finalized() const;
+    bool is_cached() const;
+    void set_cached();
+    void finalize(Device *device);
+    std::shared_ptr<Kernel> get_kernel(KernelHandle kernel_id) const;
+
+    ProgramConfig& get_program_config(uint32_t programmable_core_type_index);
+
+    // debug/test
+    uint32_t get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const;
+    uint32_t get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const;
+    uint32_t get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const;
+    uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const;
+
+   private:
+    void populate_dispatch_data(Device *device);
+
+    // Buffers temporarily owned by the program
+    std::vector<std::shared_ptr<Buffer>> owned_buffer_pool = {};
+
+    // The buffer that holds the kernel/binaries/etc for this program
+    std::shared_ptr<Buffer> kernels_buffer = nullptr;
+    ProgramTransferInfo program_transfer_info;
+
+    struct CircularBufferAllocator {
+        CircularBufferAllocator(const CoreRange &core_range_) : core_range(core_range_) {}
+
+        // Circular buffers are created and allocated at core range granularity
+        CoreRange core_range;
+
+        // Holds vector of addresses where circular buffers are allocated [start, end)
+        // There are multiple ranges because per core L1 regions are not in lockstep but circular buffers spanning multiple cores must share the same address
+        // To enable this, circular buffer address is the maximum address amongst all of its target cores
+        // This vector is sorted from lower to higher address spaces
+        std::vector<std::pair<uint64_t, uint64_t>> l1_regions;
+
+        // Returns address for next circular buffer
+        // Circular buffers are placed sequentially on a core so the next available address gets appended to the last L1 region
+        uint64_t get_cb_region_end() const {
+            return this->l1_regions.empty() ? 0 : this->l1_regions.back().second;
+        }
+
+        // If address is the end of the last L1 region, the last region is extended by size bytes,
+        //  otherwise address must be higher than existing regions and a new L1 region [address, address + size) is added
+        void mark_address(uint64_t address, uint64_t size, uint64_t base_address);
+
+        // Reset when circular buffer allocation is invalidated
+        void reset_available_addresses() { this->l1_regions.clear(); }
+    };
+
+    uint64_t id; // Need to make non-const due to move constructor
+    uint64_t runtime_id;
+    static std::atomic<uint64_t> program_counter;  // Post-incremented by the constructor to hand out ids
+    std::vector<std::unordered_map<KernelHandle, std::shared_ptr<Kernel> >> kernels_;
+    std::vector<CoreCoord> grid_extent_;
+
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_;
+    std::unordered_map<CBHandle,  std::shared_ptr<CircularBuffer>> circular_buffer_by_id_;
+    // Tracks which circular buffer indices are being used
+    std::unordered_map<CoreCoord, std::bitset<NUM_CIRCULAR_BUFFERS>> per_core_cb_indices_;
+    // Used to generate circular buffer addresses. There is one CircularBufferAllocator per unique CoreRange
+    std::vector<CircularBufferAllocator> cb_allocators_;
+
+    std::vector<Semaphore> semaphores_;
+
+    std::unordered_set<chip_id_t> compiled_;  // add_kernel() asserts this is empty: no kernels may be added once the program is compiled
+    bool local_circular_buffer_allocation_needed_;
+    // Declared after the members above so that declaration order matches the constructor's mem-initializer list (-Wreorder)
+    bool finalized_;
+    bool cached_;
+
+    static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff;
+    std::vector<std::vector<KernelGroup>> kernel_groups_;
+    std::vector<std::vector<uint8_t>> core_to_kernel_group_index_table_;
+    uint32_t tensix_go_signal_count_;
+
+    std::vector<std::shared_ptr<Buffer>> config_buffers_;
+
+    std::vector<ProgramConfig> program_configs_;
+    std::vector<uint32_t> program_config_sizes_;
+
+    std::unordered_map<uint64_t, ProgramCommandSequence> cached_program_command_sequences_;  // NOTE(review): presumably the map EnqueueProgramCommand::process keys by command hash (currently the device id) - confirm against the Program getter
+
+    friend std::shared_ptr<CircularBuffer> GetCircularBuffer(const Program &program, CBHandle id);
+    friend void ValidateCircularBufferRegion(const Program &program, const Device *device);
+
+    friend KernelHandle AddKernel(Program &program, std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType core_type);
+
+    KernelHandle add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &core_type);
+
+    CBHandle add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config);
+    std::shared_ptr<CircularBuffer> get_circular_buffer(CBHandle cb_id) const;
+
+    void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type);
+
+    friend void AddConfigBuffer(Program &program, std::shared_ptr<Buffer> config_buffer);
+    void add_config_buffer(std::shared_ptr<Buffer> config_buffer);
+
+    // Ensures that statically allocated circular buffers do not grow into L1 buffer space
+    void validate_circular_buffer_region(const Device *device) const;
+
+    void set_cb_data_fmt( Device *device, const std::vector<CoreRange> & crs, JitBuildOptions& build_options) const;
+
+    void set_cb_tile_dims( Device *device, const std::vector<CoreRange> & crs, JitBuildOptions& build_options) const;
+
+    void update_kernel_groups(uint32_t programmable_core_type_index);
+
+    uint32_t& get_program_config_size(uint32_t programmable_core_type_index);
+
+    uint32_t finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset);
+    uint32_t finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset);
+    uint32_t finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset);
+    uint32_t finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset);
+    void set_launch_msg_sem_offsets();
+
+    bool runs_on_noc_unicast_only_cores();
+    bool runs_on_noc_multicast_only_cores();
+
+    friend HWCommandQueue;
+    friend EnqueueProgramCommand;
+    friend Program;
+};
+
 KernelHandle AddKernel (Program &program, std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType core_type) {
-    return program.add_kernel(kernel, core_type);
+    return program.pimpl_->add_kernel(std::move(kernel), core_type);  // Forwards to the implementation object; kernel is a by-value parameter, so it is moved on rather than copied again
 }
 
 std::shared_ptr<Kernel> GetKernel(const Program &program, KernelHandle kernel_id) {
@@ -87,16 +255,16 @@ std::shared_ptr<Kernel> GetKernel(const Program &program, KernelHandle kernel_id
 }
 
 std::shared_ptr<CircularBuffer> GetCircularBuffer(const Program &program, CBHandle id) {
-    return program.get_circular_buffer(id);
+    return program.pimpl_->get_circular_buffer(id);  // get_circular_buffer is private in detail::Program_; this function is declared a friend of that class
 }
 
 // Checks that circular buffers do not grow into L1 buffer space
 void ValidateCircularBufferRegion(const Program &program, const Device *device) {
-    program.validate_circular_buffer_region(device);
+    program.pimpl_->validate_circular_buffer_region(device);  // Delegates to the private check on detail::Program_ (this function is a friend of that class)
 }
 
 void AddConfigBuffer(Program &program, std::shared_ptr<Buffer> config_buffer) {
-    program.add_config_buffer(config_buffer);
+    program.pimpl_->add_config_buffer(std::move(config_buffer));  // Forwards to the implementation object; the by-value shared_ptr is moved on instead of being copied a second time
 }
 
 void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; }
@@ -104,12 +272,11 @@ void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; }
 void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; }
 }  // namespace detail
 
-std::atomic<uint64_t> Program::program_counter = 0;
+std::atomic<uint64_t> detail::Program_::program_counter = 0;
 
-Program::Program() :
+detail::Program_::Program_() :
     id(program_counter++),
     runtime_id(0),
-    worker_crs_(),
     local_circular_buffer_allocation_needed_(false),
     finalized_(false),
     cached_(false) {
@@ -126,7 +293,11 @@ Program::Program() :
     program_config_sizes_.resize(programmable_core_count);
 }
 
-KernelHandle Program::add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &programmable_core_type) {
+Program::Program() :
+    pimpl_(std::make_unique<detail::Program_>()) {  // Allocates the detail::Program_ implementation object that Program's forwarding members delegate to
+}
+
+KernelHandle detail::Program_::add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &programmable_core_type) {
     TT_FATAL(this->compiled_.empty(), "Cannot add kernel to an already compiled program {}", this->id);
     // Id is unique across all kernels on all core types
     KernelHandle id = this->num_kernels();
@@ -137,7 +308,7 @@ KernelHandle Program::add_kernel(std::shared_ptr<Kernel> kernel, const HalProgra
     return id;
 }
 
-std::shared_ptr<Kernel> Program::get_kernel(KernelHandle kernel_id) const {
+std::shared_ptr<Kernel> detail::Program_::get_kernel(KernelHandle kernel_id) const {
     // TT_ASSERT(kernel_id < this->kernels_.size(), "Expected Kernel with ID {} to be in Program {}", kernel_id,
     // this->id);
     //  find coretype based on kernel_id
@@ -151,10 +322,14 @@ std::shared_ptr<Kernel> Program::get_kernel(KernelHandle kernel_id) const {
     return nullptr;
 }
 
+std::shared_ptr<Kernel> Program::get_kernel(KernelHandle kernel_id) const {
+    return pimpl_->get_kernel(kernel_id);
+}
+
 KernelGroup::KernelGroup() : core_ranges(CoreRangeSet()) {}
 
 KernelGroup::KernelGroup(
-    const Program &program,
+    const void *program_ptr,
     uint32_t programmable_core_type_index,
     kernel_id_array_t kernel_ids,
     bool erisc_is_idle,
@@ -179,7 +354,7 @@ KernelGroup::KernelGroup(
     for (int class_id = 0; class_id < processor_classes; class_id++) {
         auto& optional_id = kernel_ids[class_id];
         if (optional_id) {
-            const auto kernel = program.get_kernel(optional_id.value());
+            const auto kernel = static_cast<const detail::Program_ *>(program_ptr)->get_kernel(optional_id.value());
             this->launch_msg.kernel_config.watcher_kernel_ids[class_id] = kernel->get_watcher_kernel_id();
             this->launch_msg.kernel_config.enables |= 1 << class_id;
 
@@ -222,12 +397,16 @@ CoreType KernelGroup::get_core_type() const {
     return hal.get_core_type(this->programmable_core_type_index);
 };
 
-std::vector<KernelGroup> &Program::get_kernel_groups(uint32_t programmable_core_type_index) {
+std::vector<KernelGroup> &detail::Program_::get_kernel_groups(uint32_t programmable_core_type_index) {
     update_kernel_groups(programmable_core_type_index);
     return kernel_groups_[programmable_core_type_index];
 }
 
-KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) {
+std::vector<KernelGroup> &Program::get_kernel_groups(uint32_t programmable_core_type_index) {
+    return pimpl_->get_kernel_groups(programmable_core_type_index);
+}
+
+KernelGroup *detail::Program_::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) {
     update_kernel_groups(programmable_core_type_index);
     if (core.x >= grid_extent_[programmable_core_type_index].x || core.y >= grid_extent_[programmable_core_type_index].y)
         return nullptr;
@@ -235,6 +414,10 @@ KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmab
     return (index == core_to_kernel_group_invalid_index) ? nullptr : &kernel_groups_[programmable_core_type_index].at(index);
 }
 
+KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) {
+    return pimpl_->kernels_on_core(core, programmable_core_type_index);
+}
+
 struct KernelGroupInt {
     bool valid;
     kernel_id_array_t kernel_ids;
@@ -265,7 +448,7 @@ struct KernelGroupIntHasher {
     }
 };
 
-void Program::update_kernel_groups(uint32_t programmable_core_type_index) {
+void detail::Program_::update_kernel_groups(uint32_t programmable_core_type_index) {
     if (core_to_kernel_group_index_table_[programmable_core_type_index].size() == 0) {
         bool erisc_is_idle = false;
 
@@ -343,7 +526,7 @@ void Program::update_kernel_groups(uint32_t programmable_core_type_index) {
             }
 
             kernel_groups_[programmable_core_type_index].push_back(KernelGroup(
-                *this,
+                this,
                 programmable_core_type_index,
                 kg_to_cores.first.kernel_ids,
                 erisc_is_idle,
@@ -354,7 +537,7 @@ void Program::update_kernel_groups(uint32_t programmable_core_type_index) {
     }
 }
 
-void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t size, uint64_t base_address) {
+void detail::Program_::CircularBufferAllocator::mark_address(uint64_t address, uint64_t size, uint64_t base_address) {
     if (this->l1_regions.empty()) {
         this->l1_regions.emplace_back(base_address, base_address);
     }
@@ -373,7 +556,7 @@ void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t s
     }
 }
 
-CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) {
+CBHandle detail::Program_::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) {
     TT_FATAL(this->compiled_.empty(), "Cannot add circular buffer to an already compiled program {}", this->id);
     std::shared_ptr<CircularBuffer> circular_buffer = std::make_shared<CircularBuffer>(core_range_set, config);
     // Globally allocated circular buffer do not invalidate allocation because their addresses are tracked by memory
@@ -424,14 +607,18 @@ CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const
     return circular_buffer->id();
 }
 
-std::shared_ptr<CircularBuffer> Program::get_circular_buffer(CBHandle cb_id) const {
+CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) {
+    return pimpl_->add_circular_buffer(core_range_set, config);
+}
+
+std::shared_ptr<CircularBuffer> detail::Program_::get_circular_buffer(CBHandle cb_id) const {
     if (this->circular_buffer_by_id_.find(cb_id) == this->circular_buffer_by_id_.end()) {
         TT_THROW("No circular buffer with id {} exists in Program {}", cb_id, this->id);
     }
     return this->circular_buffer_by_id_.at(cb_id);
 }
 
-const std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_core(const CoreCoord &core) const {
+std::vector<std::shared_ptr<CircularBuffer>> detail::Program_::circular_buffers_on_core(const CoreCoord &core) const {
     std::vector<std::shared_ptr<CircularBuffer>> cbs_on_core;
     for (auto circular_buffer : circular_buffers_) {
         if (circular_buffer->is_on_logical_core(core)) {
@@ -441,7 +628,11 @@ const std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_
     return cbs_on_core;
 }
 
-const std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_corerange(const CoreRange &cr) const {
+std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_core(const CoreCoord &core) const {
+    return pimpl_->circular_buffers_on_core(core);
+}
+
+std::vector<std::shared_ptr<CircularBuffer>> detail::Program_::circular_buffers_on_corerange(const CoreRange &cr) const {
     std::vector<std::shared_ptr<CircularBuffer>> cbs_on_core;
     for (auto circular_buffer : circular_buffers_) {
         if (circular_buffer->is_on_logical_corerange(cr)) {
@@ -451,7 +642,11 @@ const std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_
     return cbs_on_core;
 }
 
-const std::vector<CoreRange> Program::circular_buffers_unique_coreranges() const {
+std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_corerange(const CoreRange &cr) const {
+    return pimpl_->circular_buffers_on_corerange(cr);
+}
+
+std::vector<CoreRange> detail::Program_::circular_buffers_unique_coreranges() const {
     std::vector<CoreRange> core_ranges;
     for (auto circular_buffer : circular_buffers_) {
         for (const CoreRange &core_range : circular_buffer->core_ranges().ranges()) {
@@ -463,7 +658,11 @@ const std::vector<CoreRange> Program::circular_buffers_unique_coreranges() const
     return core_ranges;
 }
 
-void Program::invalidate_circular_buffer_allocation() {
+std::vector<CoreRange> Program::circular_buffers_unique_coreranges() const {
+    return pimpl_->circular_buffers_unique_coreranges();
+}
+
+void detail::Program_::invalidate_circular_buffer_allocation() {
     if (this->local_circular_buffer_allocation_needed_) {
         return;
     }
@@ -473,7 +672,11 @@ void Program::invalidate_circular_buffer_allocation() {
     this->local_circular_buffer_allocation_needed_ = true;
 }
 
-void Program::allocate_circular_buffers(const Device *device) {
+void Program::invalidate_circular_buffer_allocation() {
+    pimpl_->invalidate_circular_buffer_allocation();
+}
+
+void detail::Program_::allocate_circular_buffers(const Device *device) {
     ZoneScoped;
     if (not this->local_circular_buffer_allocation_needed_) {
         return;
@@ -515,7 +718,11 @@ void Program::allocate_circular_buffers(const Device *device) {
     this->local_circular_buffer_allocation_needed_ = false;
 }
 
-void Program::validate_circular_buffer_region(const Device *device) const {
+void Program::allocate_circular_buffers(const Device *device) {
+    pimpl_->allocate_circular_buffers(device);
+}
+
+void detail::Program_::validate_circular_buffer_region(const Device *device) const {
     ZoneScoped;
 
     // Banks are in lockstep so we only need to get lowest L1 address of one compute and storage core
@@ -552,9 +759,13 @@ void Program::validate_circular_buffer_region(const Device *device) const {
 
 size_t Program::num_semaphores(const CoreCoord &core) const { return semaphores_on_core(core).size(); }
 
-size_t Program::num_semaphores() const { return semaphores_.size(); }
+size_t detail::Program_::num_semaphores() const { return semaphores_.size(); }
 
-void Program::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const {
+size_t Program::num_semaphores() const {
+    return pimpl_->num_semaphores();
+}
+
+void detail::Program_::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const {
     auto semaphores_on_core = this->semaphores_on_core(logical_core);
 
     uint64_t kernel_config_base = hal.get_dev_addr(programmable_core_type_index, HalL1MemAddrType::KERNEL_CONFIG);
@@ -569,14 +780,22 @@ void Program::init_semaphores(const Device &device, const CoreCoord &logical_cor
     }
 }
 
-void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) {
+void Program::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const {
+    pimpl_->init_semaphores(device, logical_core, programmable_core_type_index);
+}
+
+void detail::Program_::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) {
     TT_FATAL(this->compiled_.empty(), "Cannot add semaphore to an already compiled program {}", this->id);
     semaphores_.emplace_back(Semaphore(crs, semaphore_id, init_value, core_type));
 }
 
-void Program::add_config_buffer(std::shared_ptr<Buffer> config_buffer) { config_buffers_.emplace_back(config_buffer); }
+void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) {
+    pimpl_->add_semaphore(crs, semaphore_id, init_value, core_type);
+}
 
-std::vector<std::vector<CoreCoord>> Program::logical_cores() const {
+void detail::Program_::add_config_buffer(std::shared_ptr<Buffer> config_buffer) { config_buffers_.emplace_back(std::move(config_buffer)); }
+
+std::vector<std::vector<CoreCoord>> detail::Program_::logical_cores() const {
     std::vector<std::vector<CoreCoord>> cores_in_program;
     std::vector<std::set<CoreCoord>> unique_cores;
     for (uint32_t programmable_core_type_index = 0; programmable_core_type_index < kernels_.size(); programmable_core_type_index++) {
@@ -596,17 +815,11 @@ std::vector<std::vector<CoreCoord>> Program::logical_cores() const {
     return cores_in_program;
 }
 
-void Program::construct_core_range_set_for_worker_cores() {
-    bool found_kernels = false;
-    uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX);
-    for (auto [id, kernel] : kernels_[index]) {
-        this->worker_crs_ = this->worker_crs_.merge(kernel->core_range_set());
-        found_kernels = true;
-    }
-    TT_ASSERT(!found_kernels || this->worker_crs_.ranges().size() >= 1, "Invalid core range set");
+std::vector<std::vector<CoreCoord>> Program::logical_cores() const {
+    return pimpl_->logical_cores();
 }
 
-void Program::set_cb_data_fmt(Device *device, const std::vector<CoreRange> &crs, JitBuildOptions &build_options) const {
+void detail::Program_::set_cb_data_fmt(Device *device, const std::vector<CoreRange> &crs, JitBuildOptions &build_options) const {
     ZoneScoped;
     for (auto logical_cr : crs) {
         auto cbs_on_core = this->circular_buffers_on_corerange(logical_cr);
@@ -619,7 +832,7 @@ void Program::set_cb_data_fmt(Device *device, const std::vector<CoreRange> &crs,
     }
 }
 
-void Program::set_cb_tile_dims(Device *device, const std::vector<CoreRange> &crs, JitBuildOptions &build_options) const {
+void detail::Program_::set_cb_tile_dims(Device *device, const std::vector<CoreRange> &crs, JitBuildOptions &build_options) const {
     ZoneScoped;
     for (const auto &logical_cr : crs) {
         auto cbs_on_core = this->circular_buffers_on_corerange(logical_cr);
@@ -650,7 +863,7 @@ void Program::set_cb_tile_dims(Device *device, const std::vector<CoreRange> &crs
     }
 }
 
-void Program::populate_dispatch_data(Device *device) {
+void detail::Program_::populate_dispatch_data(Device *device) {
     auto extract_dst_noc_unicast_info =
         [&device](const auto &ranges, const CoreType core_type) -> std::vector<std::pair<transfer_info_cores, uint32_t>> {
         // This API extracts all the pairs of noc multicast encodings given a set of core ranges
@@ -823,7 +1036,7 @@ void Program::populate_dispatch_data(Device *device) {
     return;
 }
 
-uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) {
+uint32_t detail::Program_::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) {
 
     // Iterate over kernels in the program and "level" the number of RTAs based on the max
     // Unique RTAs are packed across dispatch classes
@@ -846,7 +1059,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
             max_rtas[dispatch_class] = 0;
             auto& optional_id = kg.kernel_ids[dispatch_class];
             if (optional_id) {
-                auto kernel = detail::GetKernel(*this, optional_id.value());
+                auto kernel = get_kernel(optional_id.value());
                 for (const CoreRange &core_range : kg.core_ranges.ranges()) {
                     for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) {
                         for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) {
@@ -864,7 +1077,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
             auto& optional_id = kg.kernel_ids[dispatch_class];
             kg.rta_sizes[dispatch_class] = max_rtas[dispatch_class] * sizeof(uint32_t);
             if (optional_id) {
-                auto kernel = detail::GetKernel(*this, optional_id.value());
+                auto kernel = get_kernel(optional_id.value());
                 kernel->set_runtime_args_count(kg.core_ranges, max_rtas[dispatch_class]);
                 kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = base_offset + offset;
                 offset += max_rtas[dispatch_class] * sizeof(uint32_t);
@@ -883,7 +1096,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
     }
     // Find the max # common RTAs across all kernels for each dispatch class
     for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) {
-        auto kernel = detail::GetKernel(*this, kernel_id);
+        auto kernel = get_kernel(kernel_id);
         // TODO: kernels should be stored by programmable core type
         if (core_type == kernel->get_kernel_core_type() &&
             (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) {
@@ -906,7 +1119,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
 
     // Set the runtime_args_data sizing info based on the shared max
     for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) {
-        auto kernel = detail::GetKernel(*this, kernel_id);
+        auto kernel = get_kernel(kernel_id);
         // TODO: as above, fix when kernels are stored by programmable core type
         if (core_type == kernel->get_kernel_core_type() &&
             (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) {
@@ -929,11 +1142,15 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
     return max_unique_rta_size + total_crta_size;
 }
 
-ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index) {
+ProgramConfig& detail::Program_::get_program_config(uint32_t programmable_core_type_index) {
     return this->program_configs_[programmable_core_type_index];
 }
 
-uint32_t Program::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) {
+ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index) {
+    return pimpl_->get_program_config(programmable_core_type_index);
+}
+
+uint32_t detail::Program_::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) {
 
     int max_id = -1;
     CoreType core_type = hal.get_core_type(programmable_core_type_index);
@@ -951,7 +1168,7 @@ uint32_t Program::finalize_sems(uint32_t programmable_core_type_index, uint32_t
     return base_offset + sem_size;
 }
 
-void Program::set_launch_msg_sem_offsets() {
+void detail::Program_::set_launch_msg_sem_offsets() {
 
     for (uint32_t kg_type_index = 0; kg_type_index < hal.get_programmable_core_type_count(); kg_type_index++) {
         for (auto& kg : this->get_kernel_groups(kg_type_index)) {
@@ -963,7 +1180,7 @@ void Program::set_launch_msg_sem_offsets() {
     }
 }
 
-uint32_t Program::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) {
+uint32_t detail::Program_::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) {
 
     int count = 0;
 
@@ -985,7 +1202,7 @@ uint32_t Program::finalize_cbs(uint32_t programmable_core_type_index, uint32_t b
     return base_offset + cb_size;
 }
 
-uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) {
+uint32_t detail::Program_::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) {
 
     uint32_t l1_alignment = hal.get_alignment(HalMemType::L1);
 
@@ -1057,14 +1274,11 @@ uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_cor
     return max_offset;
 }
 
-uint32_t& Program::get_program_config_size(uint32_t programmable_core_type_index) {
+uint32_t& detail::Program_::get_program_config_size(uint32_t programmable_core_type_index) {
     return this->program_config_sizes_[programmable_core_type_index];
 }
 
-void Program::finalize(Device *device) {
-
-    this->construct_core_range_set_for_worker_cores();
-
+void detail::Program_::finalize(Device *device) {
     // Store the number of tensix "go signals" for use by CQ
     // CQ iterates over these to update runtime addresses, needs to know when eth begins (after tensix)
     // TODO: should store all the counts
@@ -1113,7 +1327,11 @@ void Program::finalize(Device *device) {
     finalized_ = true;
 }
 
-void Program::compile(Device *device, bool fd_bootloader_mode) {
+void Program::finalize(Device *device) {
+    pimpl_->finalize(device);
+}
+
+void detail::Program_::compile(Device *device, bool fd_bootloader_mode) {
     ZoneScoped;
     if (compiled_.contains(device->id())) {
         return;
@@ -1193,7 +1411,7 @@ void Program::compile(Device *device, bool fd_bootloader_mode) {
                     }
                     if (detail::CompilationReporter::enabled()) {
                         detail::CompilationReporter::inst().add_kernel_compile_stats(
-                            *this, kernel, cache_hit, kernel_hash);
+                            get_id(), kernel, cache_hit, kernel_hash);
                     }
                     kernel->set_binary_path(build_options.path);
                 },
@@ -1211,17 +1429,27 @@ void Program::compile(Device *device, bool fd_bootloader_mode) {
     sync_build_step(events);
 
     if (detail::CompilationReporter::enabled()) {
-        detail::CompilationReporter::inst().flush_program_entry(*this, enable_persistent_kernel_cache);
+        detail::CompilationReporter::inst().flush_program_entry(get_id(), num_kernels(), [this](size_t kernel_id) {
+            return get_kernel(kernel_id);
+        }, enable_persistent_kernel_cache);
     }
     if (detail::MemoryReporter::enabled()) {
-        detail::MemoryReporter::inst().flush_program_memory_usage(*this, device);
+        detail::MemoryReporter::inst().flush_program_memory_usage(get_id(), device);
     }
     compiled_.insert(device->id());
 }
 
-void Program::set_runtime_id(uint64_t id) { this->runtime_id = id; }
+void Program::compile(Device *device, bool fd_bootloader_mode) {
+    pimpl_->compile(device, fd_bootloader_mode);
+}
+
+void detail::Program_::set_runtime_id(uint64_t id) { this->runtime_id = id; }
 
-uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
+void Program::set_runtime_id(uint64_t id) {
+    pimpl_->set_runtime_id(id);
+}
+
+uint32_t detail::Program_::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
 
     CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type);
     HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core);
@@ -1234,7 +1462,11 @@ uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, Core
     return base_addr + this->program_configs_[index].sem_offset;
 }
 
-uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
+uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
+    return pimpl_->get_sem_base_addr(device, logical_core, core_type);
+}
+
+uint32_t detail::Program_::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
 
     CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type);
     HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core);
@@ -1247,7 +1479,11 @@ uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreT
     return base_addr + this->program_configs_[index].cb_offset;
 }
 
-uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
+uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
+    return pimpl_->get_cb_base_addr(device, logical_core, core_type);
+}
+
+uint32_t detail::Program_::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
 
     CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type);
     HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core);
@@ -1256,7 +1492,11 @@ uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType
     return this->program_configs_[index].sem_size;
 }
 
-uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
+uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
+    return pimpl_->get_sem_size(device, logical_core, core_type);
+}
+
+uint32_t detail::Program_::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
 
     CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type);
     HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core);
@@ -1265,17 +1505,130 @@ uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType c
     return this->program_configs_[index].cb_size;
 }
 
+uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
+    return pimpl_->get_cb_size(device, logical_core, core_type);
+}
+
 // TODO: Too low level for program.cpp. Move this to HAL, once we have support.
-bool Program::runs_on_noc_unicast_only_cores() {
+bool detail::Program_::runs_on_noc_unicast_only_cores() {
     return (hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH) != -1 and
-            this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)).size());
+            not this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)).empty());
+}
+
+bool Program::runs_on_noc_unicast_only_cores() {
+    return pimpl_->runs_on_noc_unicast_only_cores();
 }
 
 // TODO: Too low level for program.cpp. Move this to HAL, once we have support.
-bool Program::runs_on_noc_multicast_only_cores() {
+bool detail::Program_::runs_on_noc_multicast_only_cores() {
     return (hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX) != -1 and
-            this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)).size());
+            not this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)).empty());
+}
+
+bool Program::runs_on_noc_multicast_only_cores() {
+    return pimpl_->runs_on_noc_multicast_only_cores();
+}
+
+Program::Program(Program &&other) noexcept = default;
+
+Program& Program::operator=(Program &&other) noexcept = default;
+
+Program::~Program() noexcept = default;
+
+uint64_t detail::Program_::get_id() const { return this->id; }
+
+uint64_t Program::get_id() const {
+    return pimpl_->get_id();
+}
+
+uint64_t detail::Program_::get_runtime_id() const { return this->runtime_id; }
+
+uint64_t Program::get_runtime_id() const {
+    return pimpl_->get_runtime_id();
+}
+
+size_t detail::Program_::num_kernels() const {
+    size_t count = 0;
+    for (const auto& kernels : kernels_) {
+        count += kernels.size();
+    }
+    return count;
+}
+
+size_t Program::num_kernels() const {
+    return pimpl_->num_kernels();
+}
+
+const std::vector<std::shared_ptr<CircularBuffer>> &detail::Program_::circular_buffers() const { return circular_buffers_; }
+
+const std::vector<std::shared_ptr<CircularBuffer>> &Program::circular_buffers() const {
+    return pimpl_->circular_buffers();
+}
+
+const std::vector< Semaphore > & detail::Program_::semaphores() const { return semaphores_; }
+
+const std::vector< Semaphore > & Program::semaphores() const {
+    return pimpl_->semaphores();
+}
+
+void detail::Program_::add_buffer(std::shared_ptr<Buffer> buf) { owned_buffer_pool.push_back(std::move(buf)); }
+
+void Program::add_buffer(std::shared_ptr<Buffer> buf) {
+    pimpl_->add_buffer(std::move(buf));
+}
+
+void detail::Program_::release_buffers() { owned_buffer_pool = {}; }
+
+void Program::release_buffers() {
+    pimpl_->release_buffers();
+}
+
+std::vector<std::reference_wrapper<const Semaphore>> detail::Program_::semaphores_on_core(const CoreCoord &core) const {
+    std::vector<std::reference_wrapper<const Semaphore>> semaphores;
+    for ( const Semaphore & s : this->semaphores_) {
+        if (s.initialized_on_logical_core(core)) {
+            semaphores.emplace_back(std::cref(s));
+        }
+    }
+    return semaphores;
+}
+
+std::vector<std::reference_wrapper<const Semaphore>> Program::semaphores_on_core(const CoreCoord &core) const {
+    return pimpl_->semaphores_on_core(core);
+}
+
+bool detail::Program_::is_finalized() const { return this->finalized_; }
+
+bool Program::is_finalized() const {
+    return pimpl_->is_finalized();
+}
+
+bool detail::Program_::is_cached() const { return this->cached_; }
+
+bool Program::is_cached() const {
+    return pimpl_->is_cached();
+}
+
+void detail::Program_::set_cached() { this->cached_ = true; }
+
+void Program::set_cached() {
+    pimpl_->set_cached();
+}
+
+const ProgramTransferInfo &Program::get_program_transfer_info() const noexcept {
+    return pimpl_->program_transfer_info;
+}
+
+const std::shared_ptr<Buffer> &Program::get_kernels_buffer() const noexcept {
+    return pimpl_->kernels_buffer;
+}
+
+const std::vector<uint32_t> &Program::get_program_config_sizes() const noexcept {
+    return pimpl_->program_config_sizes_;
+}
+
+std::unordered_map<uint64_t, ProgramCommandSequence> &Program::get_cached_program_command_sequences() noexcept {
+    return pimpl_->cached_program_command_sequences_;
 }
 
-Program::~Program() {}
 }  // namespace tt::tt_metal
diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp
index 8e1dbb587ada..10c4ac63d461 100644
--- a/tt_metal/impl/program/program.hpp
+++ b/tt_metal/impl/program/program.hpp
@@ -35,6 +35,8 @@ class EnqueueProgramCommand;
 class HWCommandQueue;
 class JitBuildOptions;
 namespace detail{
+    class Program_;
+
     void ValidateCircularBufferRegion(const Program &program, const Device *device);
     KernelHandle AddKernel (Program &program, std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType core_type);
     std::shared_ptr<Kernel> GetKernel(const Program &program, KernelHandle kernel_id);
@@ -57,7 +59,7 @@ struct KernelGroup {
 
     KernelGroup();
     KernelGroup(
-        const Program &program,
+        const void *program,
         uint32_t programmable_core_type_index,
         kernel_id_array_t kernel_ids,
         bool erisc_is_idle,
@@ -91,48 +93,32 @@ class Program {
     Program(const Program &other) = delete;
     Program& operator=(const Program &other) = delete;
 
-    Program(Program &&other) = default;
-    Program& operator=(Program &&other) = default;
+    Program(Program &&other) noexcept;
+    Program& operator=(Program &&other) noexcept;
 
     void set_runtime_id(uint64_t id);
-    ~Program();
-
-    void construct_core_range_set_for_worker_cores();
+    ~Program() noexcept;
 
-    const uint64_t get_id() const { return this->id; }
-    const uint64_t get_runtime_id() const { return this->runtime_id; }
+    uint64_t get_id() const;
+    uint64_t get_runtime_id() const;
 
-    size_t num_kernels() const {
-      size_t count = 0;
-      for (const auto& kernels : kernels_) {
-        count += kernels.size();
-      }
-      return count;
-    }
+    size_t num_kernels() const;
 
-    const std::vector<std::shared_ptr<CircularBuffer>> &circular_buffers() const { return circular_buffers_; }
+    const std::vector<std::shared_ptr<CircularBuffer>> &circular_buffers() const;
 
-    const std::vector< Semaphore > & semaphores() const { return semaphores_; }
+    const std::vector< Semaphore > & semaphores() const;
 
     KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index);
     std::vector<KernelGroup>& get_kernel_groups(uint32_t programmable_core_type_index);
-    inline void add_buffer(std::shared_ptr<Buffer> buf) { owned_buffer_pool.push_back(buf); }
-    inline void release_buffers() { owned_buffer_pool = {}; }
-    const std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_core(const CoreCoord &core) const;
+    void add_buffer(std::shared_ptr<Buffer> buf);
+    void release_buffers();
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_core(const CoreCoord &core) const;
 
-    const std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_corerange(const CoreRange &cr) const;
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_corerange(const CoreRange &cr) const;
 
-    const std::vector<CoreRange> circular_buffers_unique_coreranges() const;
+    std::vector<CoreRange> circular_buffers_unique_coreranges() const;
 
-    auto semaphores_on_core(const CoreCoord &core) const {
-        std::vector<std::reference_wrapper<const Semaphore>> semaphores;
-        for ( const Semaphore & s : this->semaphores_) {
-            if (s.initialized_on_logical_core(core)) {
-                semaphores.emplace_back(std::cref(s));
-            }
-        }
-        return semaphores;
-    }
+    std::vector<std::reference_wrapper<const Semaphore>> semaphores_on_core(const CoreCoord &core) const;
 
     size_t num_semaphores ( const CoreCoord & core ) const;
     size_t num_semaphores () const;
@@ -140,18 +126,15 @@ class Program {
     // XXXXX TODO: this should return a const reference
     std::vector<std::vector<CoreCoord>> logical_cores() const;
 
-    // Is worker_crs_ used anywhere?
-    const CoreRangeSet& get_worker_core_range_set() const { return worker_crs_; };
-
     void compile(Device * device, bool fd_bootloader_mode = false);
 
     void invalidate_circular_buffer_allocation();
 
     void allocate_circular_buffers(const Device *device);
 
-    bool is_finalized() const { return this->finalized_; }
-    bool is_cached() const { return this->cached_; }
-    void set_cached() { this->cached_ = true; }
+    bool is_finalized() const;
+    bool is_cached() const;
+    void set_cached();
     void finalize(Device *device);
     std::shared_ptr<Kernel> get_kernel(KernelHandle kernel_id) const;
 
@@ -164,74 +147,7 @@ class Program {
     uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const;
 
    private:
-    void populate_dispatch_data(Device *device);
-
-    // Buffers temporarily owned by the program
-    std::vector<std::shared_ptr<Buffer>> owned_buffer_pool = {};
-
-    // The buffer that holds the kernel/binaries/etc for this program
-    std::shared_ptr<Buffer> kernels_buffer = nullptr;
-    ProgramTransferInfo program_transfer_info;
-
-    bool finalized_;
-    bool cached_;
-
-    struct CircularBufferAllocator {
-        CircularBufferAllocator(const CoreRange &core_range_) : core_range(core_range_) {}
-
-        // Circular buffers are created and allocated at core range granularity
-        CoreRange core_range;
-
-        // Holds vector of addresses where circular buffers are allocated [start, end)
-        // There are multiple ranges because per core L1 regions are not in lockstep but circular buffers spanning multiple cores must share the same address
-        // To enable this, circular buffer address is the maximum address amongst all of its target cores
-        // This vector is sorted from lower to higher address spaces
-        std::vector<std::pair<uint64_t, uint64_t>> l1_regions;
-
-        // Returns address for next circular buffer
-        // Circular buffers are placed sequentially on a core so the next available address gets appended to the last L1 region
-        uint64_t get_cb_region_end() const {
-            return this->l1_regions.empty() ? 0 : this->l1_regions.back().second;
-        }
-
-        // If address is the end of the last L1 region, the last region is extended by size bytes,
-        //  otherwise address must be higher than existing regions and a new L1 region [address, size) is added
-        void mark_address(uint64_t address, uint64_t size, uint64_t base_address);
-
-        // Reset when circular buffer allocation is invalidated
-        void reset_available_addresses() { this->l1_regions.clear(); }
-    };
-
-    uint64_t id; // Need to make non-const due to move constructor
-    uint64_t runtime_id;
-    static std::atomic<uint64_t> program_counter;
-    std::vector<std::unordered_map<KernelHandle, std::shared_ptr<Kernel> >> kernels_;
-    std::vector<CoreCoord> grid_extent_;
-
-    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_;
-    std::unordered_map<CBHandle,  std::shared_ptr<CircularBuffer>> circular_buffer_by_id_;
-    // Tracks which circular buffer indices are being used
-    std::unordered_map<CoreCoord, std::bitset<NUM_CIRCULAR_BUFFERS>> per_core_cb_indices_;
-    // Used to generate circular buffer addresses. There is one CircularBufferAllocator per unique CoreRange
-    std::vector<CircularBufferAllocator> cb_allocators_;
-
-    std::vector<Semaphore> semaphores_;
-
-    CoreRangeSet worker_crs_;
-    std::unordered_set<chip_id_t> compiled_;
-    bool local_circular_buffer_allocation_needed_;
-
-    static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff;
-    std::vector<std::vector<KernelGroup>> kernel_groups_;
-    std::vector<std::vector<uint8_t>> core_to_kernel_group_index_table_;
-    uint32_t tensix_go_signal_count_;
-
-    std::vector<std::shared_ptr<Buffer>> config_buffers_;
-
-    std::vector<ProgramConfig> program_configs_;
-    std::vector<uint32_t> program_config_sizes_;
-
-    std::unordered_map<uint64_t, ProgramCommandSequence> cached_program_command_sequences_;
+    std::unique_ptr<detail::Program_> pimpl_;
 
     friend CBHandle CreateCircularBuffer(Program &program, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, const CircularBufferConfig &config);
     friend std::shared_ptr<CircularBuffer> detail::GetCircularBuffer(const Program &program, CBHandle id);
@@ -241,38 +157,23 @@ class Program {
     friend std::shared_ptr<Kernel> detail::GetKernel(const Program &program, KernelHandle kernel_id);
 
     friend uint32_t CreateSemaphore(Program &program, const std::variant<CoreRange,CoreRangeSet> &core_spec, uint32_t initial_value, CoreType core_type);
-    KernelHandle add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &core_type);
 
     CBHandle add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config);
-    std::shared_ptr<CircularBuffer> get_circular_buffer(CBHandle cb_id) const;
 
     void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type);
 
     friend void detail::AddConfigBuffer(Program &program, std::shared_ptr<Buffer> config_buffer);
-    void add_config_buffer(std::shared_ptr<Buffer> config_buffer);
-
-    // Ensures that statically allocated circular buffers do not grow into L1 buffer space
-    void validate_circular_buffer_region(const Device *device) const;
-
-    void set_cb_data_fmt( Device *device, const std::vector<CoreRange> & crs, JitBuildOptions& build_options) const;
-
-    void set_cb_tile_dims( Device *device, const std::vector<CoreRange> & crs, JitBuildOptions& build_options) const;
-
-    void update_kernel_groups(uint32_t programmable_core_type_index);
-
-    uint32_t& get_program_config_size(uint32_t programmable_core_type_index);
-
-    uint32_t finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset);
-    uint32_t finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset);
-    uint32_t finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset);
-    uint32_t finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset);
-    void set_launch_msg_sem_offsets();
 
     bool runs_on_noc_unicast_only_cores();
     bool runs_on_noc_multicast_only_cores();
 
     friend HWCommandQueue;
     friend EnqueueProgramCommand;
+
+    const ProgramTransferInfo &get_program_transfer_info() const noexcept;
+    const std::shared_ptr<Buffer> &get_kernels_buffer() const noexcept;
+    const std::vector<uint32_t> &get_program_config_sizes() const noexcept;
+    std::unordered_map<uint64_t, ProgramCommandSequence> &get_cached_program_command_sequences() noexcept;
 };
 
 }  // namespace v0