From 74cc5d9c6486d081ab37d8e010579708b22e9035 Mon Sep 17 00:00:00 2001
From: David Ma <davidma@tenstorrent.com>
Date: Fri, 24 Jan 2025 23:22:35 +0000
Subject: [PATCH] #0: Add a way to specify custom dispatch topology

---
 tt_metal/impl/dispatch/topology.cpp | 71 ++++++++++++++---------------
 tt_metal/impl/dispatch/topology.hpp | 27 ++++++++++-
 2 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/tt_metal/impl/dispatch/topology.cpp b/tt_metal/impl/dispatch/topology.cpp
index ce15c23a345..277023cd5ce 100644
--- a/tt_metal/impl/dispatch/topology.cpp
+++ b/tt_metal/impl/dispatch/topology.cpp
@@ -15,28 +15,12 @@
 #include "kernel_config/eth_router.hpp"
 #include "kernel_config/eth_tunneler.hpp"
 
-#define DISPATCH_MAX_UPSTREAM 4
-#define DISPATCH_MAX_DOWNSTREAM 4
-
-using namespace tt::tt_metal;
-
-typedef struct {
-    int id;
-    chip_id_t device_id;                          // Device that this kernel is located on
-    chip_id_t servicing_device_id;                // Remote device that this kernel services, used for kernels on MMIO
-    uint8_t cq_id;                                // CQ this kernel implements
-    DispatchWorkerType kernel_type;               // Type of dispatch kernel this is
-    int upstream_ids[DISPATCH_MAX_UPSTREAM];      // Upstream dispatch kernels
-    int downstream_ids[DISPATCH_MAX_DOWNSTREAM];  // Downstream dispatch kernels
-    NOC my_noc;                                   // NOC this kernel uses to dispatch kernels
-    NOC upstream_noc;                             // NOC used to communicate upstream
-    NOC downstream_noc;                           // NOC used to communicate downstream
-} dispatch_kernel_node_t;
+namespace tt::tt_metal {
 
 // For readablity, unset = x = -1
 #define x -1
 
-void increment_node_ids(dispatch_kernel_node_t& node, uint32_t inc) {
+void increment_node_ids(DispatchKernelNode& node, uint32_t inc) {
     node.id += inc;
     for (int& id : node.upstream_ids) {
         if (id != x) {
@@ -50,20 +34,20 @@ void increment_node_ids(dispatch_kernel_node_t& node, uint32_t inc) {
     }
 }
 
-static const std::vector<dispatch_kernel_node_t> single_chip_arch_1cq = {
+static const std::vector<DispatchKernelNode> single_chip_arch_1cq = {
     {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
     {2, 0, 0, 0, DISPATCH_S, {0, x, x, x}, {1, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
 };
 
-static const std::vector<dispatch_kernel_node_t> single_chip_arch_2cq = {
+static const std::vector<DispatchKernelNode> single_chip_arch_2cq = {
     {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
     {2, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {3, 0, 0, 1, DISPATCH_HD, {2, x, x, x}, {x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
 };
 
-static const std::vector<dispatch_kernel_node_t> single_chip_arch_2cq_dispatch_s = {
+static const std::vector<DispatchKernelNode> single_chip_arch_2cq_dispatch_s = {
     {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 4, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {4, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
     {2, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, 5, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
@@ -72,7 +56,7 @@ static const std::vector<dispatch_kernel_node_t> single_chip_arch_2cq_dispatch_s
     {5, 0, 0, 1, DISPATCH_S, {2, x, x, x}, {3, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
 };
 
-static const std::vector<dispatch_kernel_node_t> two_chip_arch_1cq = {
+static const std::vector<DispatchKernelNode> two_chip_arch_1cq = {
     {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
     {2, 0, 0, 0, DISPATCH_S, {0, x, x, x}, {1, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
@@ -93,7 +77,7 @@ static const std::vector<dispatch_kernel_node_t> two_chip_arch_1cq = {
     {13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
 };
 
-static const std::vector<dispatch_kernel_node_t> two_chip_arch_2cq = {
+static const std::vector<DispatchKernelNode> two_chip_arch_2cq = {
     {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {1, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {2, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
@@ -119,7 +103,7 @@ static const std::vector<dispatch_kernel_node_t> two_chip_arch_2cq = {
 
 };
 
-static const std::vector<dispatch_kernel_node_t> galaxy_nine_chip_arch_1cq = {
+static const std::vector<DispatchKernelNode> galaxy_nine_chip_arch_1cq = {
     // For MMIO chip, TODO: investigate removing these, they aren't needed
     {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
@@ -232,7 +216,7 @@ static const std::vector<dispatch_kernel_node_t> galaxy_nine_chip_arch_1cq = {
     {88, 8, x, 0, DISPATCH_S, {86, x, x, x}, {87, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
 };
 
-static const std::vector<dispatch_kernel_node_t> galaxy_nine_chip_arch_2cq = {
+static const std::vector<DispatchKernelNode> galaxy_nine_chip_arch_2cq = {
     // For MMIO chip
     {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {2, 4, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
     {1, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, 5, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
@@ -396,8 +380,8 @@ static const std::vector<dispatch_kernel_node_t> galaxy_nine_chip_arch_2cq = {
 
 std::vector<FDKernel*> node_id_to_kernel;
 
-// Helper function to get the nodes for this platform
-std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_ids, uint32_t num_hw_cqs) {
+// Helper function to automatically generate dispatch nodes given devices + num hw CQs + detection of card type.
+std::vector<DispatchKernelNode> generate_nodes(const std::set<chip_id_t>& device_ids, uint32_t num_hw_cqs) {
     // Select/generate the right input table, depends on (1) board [detected from total # of devices], and (2) number
     // of active devices. TODO: read this out of YAML instead of the structs above?
     uint32_t total_devices = tt::Cluster::instance().number_of_devices();
@@ -407,7 +391,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
     uint32_t num_devices = device_ids.size();
     TT_ASSERT(num_devices > 0, "Can't determine dispatch architecture with no active devices.");
     TT_ASSERT(num_devices <= total_devices);
-    std::vector<dispatch_kernel_node_t> nodes;
+    std::vector<DispatchKernelNode> nodes;
 
     std::set<chip_id_t> mmio_devices;
     std::set<chip_id_t> remote_devices;
@@ -436,7 +420,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
 
     if (remote_devices.empty()) {
         // MMIO devices only, just replicate a single chip arch for each
-        std::vector<dispatch_kernel_node_t> nodes_for_one_mmio = populate_single_device();
+        std::vector<DispatchKernelNode> nodes_for_one_mmio = populate_single_device();
         uint32_t index_offset = 0;
         for (auto id : mmio_devices) {
             for (auto node : nodes_for_one_mmio) {
@@ -451,7 +435,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
         // Need to handle N300/T3000 separately from TG/TGG since they have different templates/tunnel depths
         if (tt::Cluster::instance().is_galaxy_cluster()) {
             // For Galaxy, we always init all remote devices associated with an mmio device.
-            const std::vector<dispatch_kernel_node_t>* nodes_for_one_mmio =
+            const std::vector<DispatchKernelNode>* nodes_for_one_mmio =
                 (num_hw_cqs == 1) ? &galaxy_nine_chip_arch_1cq : &galaxy_nine_chip_arch_2cq;
             uint32_t index_offset = 0;
             for (auto mmio_device_id : mmio_devices) {
@@ -468,7 +452,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
                 }
 
                 // Pull nodes from the template, updating their index and device id
-                for (dispatch_kernel_node_t node : *nodes_for_one_mmio) {
+                for (DispatchKernelNode node : *nodes_for_one_mmio) {
                     node.device_id = template_id_to_device_id[node.device_id];
                     node.servicing_device_id = template_id_to_device_id[node.servicing_device_id];
                     increment_node_ids(node, index_offset);
@@ -481,7 +465,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
             TT_ASSERT(
                 mmio_devices.size() == remote_devices.size() or remote_devices.empty(),
                 "N300/T3K expects devices in mmio/remote pairs.");
-            const std::vector<dispatch_kernel_node_t>* nodes_for_one_mmio =
+            const std::vector<DispatchKernelNode>* nodes_for_one_mmio =
                 (num_hw_cqs == 1) ? &two_chip_arch_1cq : &two_chip_arch_2cq;
             uint32_t index_offset = 0;
             for (auto mmio_device_id : mmio_devices) {
@@ -498,7 +482,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
                 TT_ASSERT(found_remote, "Couldn't find paired remote chip for device {}", mmio_device_id);
 
                 // Add dispatch kernels for the mmio/remote pair
-                for (dispatch_kernel_node_t node : *nodes_for_one_mmio) {
+                for (DispatchKernelNode node : *nodes_for_one_mmio) {
                     TT_ASSERT(node.device_id == 0 || node.device_id == 1);
                     if (node.device_id == 0) {
                         node.device_id = mmio_device_id;
@@ -525,6 +509,9 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
 // Device until fields are populated, (2) need to be connected to kernel objects for devices that aren't created yet,
 // and (3) the table to choose depends on total number of devices, not know at Device creation.
 void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_cqs) {
+    populate_fd_kernels(generate_nodes(device_ids, num_hw_cqs));
+}
+void populate_fd_kernels(const std::vector<DispatchKernelNode>& nodes) {
     // If we already had nodes from a previous run, clear them (since we could have a different # of devices or CQs).
     if (!node_id_to_kernel.empty()) {
         for (int idx = 0; idx < node_id_to_kernel.size(); idx++) {
@@ -533,8 +520,9 @@ void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_
         node_id_to_kernel.clear();
     }
 
-    // Read the input table, create configs for each node
-    std::vector<dispatch_kernel_node_t> nodes = get_nodes(device_ids, num_hw_cqs);
+    // Read the input table, create configs for each node + track mmio devices and number of cqs.
+    std::unordered_set<chip_id_t> mmio_device_ids;
+    std::unordered_set<uint8_t> hw_cq_ids;
     for (const auto& node : nodes) {
         TT_ASSERT(node_id_to_kernel.size() == node.id);
         node_id_to_kernel.push_back(FDKernel::Generate(
@@ -544,16 +532,21 @@ void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_
             node.cq_id,
             {node.my_noc, node.upstream_noc, node.downstream_noc},
             node.kernel_type));
+        if (tt::Cluster::instance().get_associated_mmio_device(node.device_id) == node.device_id) {
+            mmio_device_ids.insert(node.device_id);
+        }
+        hw_cq_ids.insert(node.cq_id);
     }
+    uint32_t num_hw_cqs = hw_cq_ids.size();
 
     // Connect the graph with upstream/downstream kernels
     for (const auto& node : nodes) {
-        for (int idx = 0; idx < DISPATCH_MAX_UPSTREAM; idx++) {
+        for (int idx = 0; idx < DISPATCH_MAX_UPSTREAM_KERNELS; idx++) {
             if (node.upstream_ids[idx] >= 0) {
                 node_id_to_kernel.at(node.id)->AddUpstreamKernel(node_id_to_kernel.at(node.upstream_ids[idx]));
             }
         }
-        for (int idx = 0; idx < DISPATCH_MAX_DOWNSTREAM; idx++) {
+        for (int idx = 0; idx < DISPATCH_MAX_DOWNSTREAM_KERNELS; idx++) {
             if (node.downstream_ids[idx] >= 0) {
                 node_id_to_kernel.at(node.id)->AddDownstreamKernel(node_id_to_kernel.at(node.downstream_ids[idx]));
             }
@@ -564,7 +557,7 @@ void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_
     std::map<chip_id_t, uint32_t> device_id_to_tunnel_stop;
     std::map<chip_id_t, std::vector<chip_id_t>> mmio_device_id_to_serviced_devices;
     uint32_t tunnel_depth;
-    for (auto mmio_device_id : device_ids) {
+    for (auto mmio_device_id : mmio_device_ids) {
         if (tt::Cluster::instance().get_associated_mmio_device(mmio_device_id) != mmio_device_id) {
             continue;
         }
@@ -753,3 +746,5 @@ void configure_dispatch_cores(IDevice* device) {
         }
     }
 }
+
+}  // namespace tt::tt_metal
diff --git a/tt_metal/impl/dispatch/topology.hpp b/tt_metal/impl/dispatch/topology.hpp
index d1e032d329b..956c0b6644b 100644
--- a/tt_metal/impl/dispatch/topology.hpp
+++ b/tt_metal/impl/dispatch/topology.hpp
@@ -4,12 +4,35 @@
 #pragma once
 #include <device.hpp>
 
+namespace tt::tt_metal {
+
+// Max number of upstream/downstream dispatch kernels that can be connected to a single dispatch kernel.
+#define DISPATCH_MAX_UPSTREAM_KERNELS 4
+#define DISPATCH_MAX_DOWNSTREAM_KERNELS 4
+
+struct DispatchKernelNode {
+    int id;
+    chip_id_t device_id;             // Device that this kernel is located on
+    chip_id_t servicing_device_id;   // Remote device that this kernel services, used for kernels on MMIO
+    uint8_t cq_id;                   // CQ this kernel implements
+    DispatchWorkerType kernel_type;  // Type of dispatch kernel this is
+    int upstream_ids[DISPATCH_MAX_UPSTREAM_KERNELS];      // Upstream dispatch kernels
+    int downstream_ids[DISPATCH_MAX_DOWNSTREAM_KERNELS];  // Downstream dispatch kernels
+    NOC my_noc;                                           // NOC this kernel uses to dispatch kernels
+    NOC upstream_noc;                                     // NOC used to communicate upstream
+    NOC downstream_noc;                                   // NOC used to communicate downstream
+};
+
 // Create FD kernels for all given device ids. Creates all objects, but need to call create_and_compile_cq_program() use
-// a created Device to fill out the settings.
+// a created Device to fill out the settings. First version automatically generates the topology based on devices, num
+// cqs, and detected board. Second version uses the topology passed in.
 void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_cqs);
+void populate_fd_kernels(const std::vector<DispatchKernelNode>& nodes);
 
 // Fill out all settings for FD kernels on the given device, and add them to a Program and return it.
 std::unique_ptr<tt::tt_metal::Program> create_and_compile_cq_program(tt::tt_metal::IDevice* device);
 
-// Performa additional configuration (writing to specific L1 addresses, etc.) for FD kernels on this device.
+// Perform additional configuration (writing to specific L1 addresses, etc.) for FD kernels on this device.
 void configure_dispatch_cores(tt::tt_metal::IDevice* device);
+
+}  // namespace tt::tt_metal