From 74cc5d9c6486d081ab37d8e010579708b22e9035 Mon Sep 17 00:00:00 2001 From: David Ma Date: Fri, 24 Jan 2025 23:22:35 +0000 Subject: [PATCH] #0: Add a way to specify custom dispatch topology --- tt_metal/impl/dispatch/topology.cpp | 71 ++++++++++++++--------------- tt_metal/impl/dispatch/topology.hpp | 27 ++++++++++- 2 files changed, 58 insertions(+), 40 deletions(-) diff --git a/tt_metal/impl/dispatch/topology.cpp b/tt_metal/impl/dispatch/topology.cpp index ce15c23a345..277023cd5ce 100644 --- a/tt_metal/impl/dispatch/topology.cpp +++ b/tt_metal/impl/dispatch/topology.cpp @@ -15,28 +15,12 @@ #include "kernel_config/eth_router.hpp" #include "kernel_config/eth_tunneler.hpp" -#define DISPATCH_MAX_UPSTREAM 4 -#define DISPATCH_MAX_DOWNSTREAM 4 - -using namespace tt::tt_metal; - -typedef struct { - int id; - chip_id_t device_id; // Device that this kernel is located on - chip_id_t servicing_device_id; // Remote device that this kernel services, used for kernels on MMIO - uint8_t cq_id; // CQ this kernel implements - DispatchWorkerType kernel_type; // Type of dispatch kernel this is - int upstream_ids[DISPATCH_MAX_UPSTREAM]; // Upstream dispatch kernels - int downstream_ids[DISPATCH_MAX_DOWNSTREAM]; // Downstream dispatch kernels - NOC my_noc; // NOC this kernel uses to dispatch kernels - NOC upstream_noc; // NOC used to communicate upstream - NOC downstream_noc; // NOC used to communicate downstream -} dispatch_kernel_node_t; +namespace tt::tt_metal { // For readablity, unset = x = -1 #define x -1 -void increment_node_ids(dispatch_kernel_node_t& node, uint32_t inc) { +void increment_node_ids(DispatchKernelNode& node, uint32_t inc) { node.id += inc; for (int& id : node.upstream_ids) { if (id != x) { @@ -50,20 +34,20 @@ void increment_node_ids(dispatch_kernel_node_t& node, uint32_t inc) { } } -static const std::vector single_chip_arch_1cq = { +static const std::vector single_chip_arch_1cq = { {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, {2, 0, 0, 0, DISPATCH_S, {0, x, x, x}, {1, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, }; -static const std::vector single_chip_arch_2cq = { +static const std::vector single_chip_arch_2cq = { {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, {2, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {3, 0, 0, 1, DISPATCH_HD, {2, x, x, x}, {x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, }; -static const std::vector single_chip_arch_2cq_dispatch_s = { +static const std::vector single_chip_arch_2cq_dispatch_s = { {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 4, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {4, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, {2, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, 5, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, @@ -72,7 +56,7 @@ static const std::vector single_chip_arch_2cq_dispatch_s {5, 0, 0, 1, DISPATCH_S, {2, x, x, x}, {3, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, }; -static const std::vector two_chip_arch_1cq = { +static const std::vector two_chip_arch_1cq = { {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, {2, 0, 0, 0, DISPATCH_S, {0, x, x, x}, {1, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, @@ -93,7 +77,7 @@ static const std::vector two_chip_arch_1cq = { {13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, }; -static const std::vector two_chip_arch_2cq = { +static const std::vector two_chip_arch_2cq = { {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {1, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {2, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, @@ -119,7 +103,7 @@ static const std::vector two_chip_arch_2cq = { }; -static const std::vector galaxy_nine_chip_arch_1cq = { +static const std::vector galaxy_nine_chip_arch_1cq = { // For MMIO chip, TODO: investigate removing these, they aren't needed {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, @@ -232,7 +216,7 @@ static const std::vector galaxy_nine_chip_arch_1cq = { {88, 8, x, 0, DISPATCH_S, {86, x, x, x}, {87, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, }; -static const std::vector galaxy_nine_chip_arch_2cq = { +static const std::vector galaxy_nine_chip_arch_2cq = { // For MMIO chip {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {2, 4, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {1, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, 5, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, @@ -396,8 +380,8 @@ static const std::vector galaxy_nine_chip_arch_2cq = { std::vector node_id_to_kernel; -// Helper function to get the nodes for this platform -std::vector get_nodes(const std::set& device_ids, uint32_t num_hw_cqs) { +// Helper function to automatically generate dispatch nodes given devices + num hw CQs + detection of card type. +std::vector generate_nodes(const std::set& device_ids, uint32_t num_hw_cqs) { // Select/generate the right input table, depends on (1) board [detected from total # of devices], and (2) number // of active devices. TODO: read this out of YAML instead of the structs above? uint32_t total_devices = tt::Cluster::instance().number_of_devices(); @@ -407,7 +391,7 @@ std::vector get_nodes(const std::set& device_ uint32_t num_devices = device_ids.size(); TT_ASSERT(num_devices > 0, "Can't determine dispatch architecture with no active devices."); TT_ASSERT(num_devices <= total_devices); - std::vector nodes; + std::vector nodes; std::set mmio_devices; std::set remote_devices; @@ -436,7 +420,7 @@ std::vector get_nodes(const std::set& device_ if (remote_devices.empty()) { // MMIO devices only, just replicate a single chip arch for each - std::vector nodes_for_one_mmio = populate_single_device(); + std::vector nodes_for_one_mmio = populate_single_device(); uint32_t index_offset = 0; for (auto id : mmio_devices) { for (auto node : nodes_for_one_mmio) { @@ -451,7 +435,7 @@ std::vector get_nodes(const std::set& device_ // Need to handle N300/T3000 separately from TG/TGG since they have different templates/tunnel depths if (tt::Cluster::instance().is_galaxy_cluster()) { // For Galaxy, we always init all remote devices associated with an mmio device. - const std::vector* nodes_for_one_mmio = + const std::vector* nodes_for_one_mmio = (num_hw_cqs == 1) ? &galaxy_nine_chip_arch_1cq : &galaxy_nine_chip_arch_2cq; uint32_t index_offset = 0; for (auto mmio_device_id : mmio_devices) { @@ -468,7 +452,7 @@ std::vector get_nodes(const std::set& device_ } // Pull nodes from the template, updating their index and device id - for (dispatch_kernel_node_t node : *nodes_for_one_mmio) { + for (DispatchKernelNode node : *nodes_for_one_mmio) { node.device_id = template_id_to_device_id[node.device_id]; node.servicing_device_id = template_id_to_device_id[node.servicing_device_id]; increment_node_ids(node, index_offset); @@ -481,7 +465,7 @@ std::vector get_nodes(const std::set& device_ TT_ASSERT( mmio_devices.size() == remote_devices.size() or remote_devices.empty(), "N300/T3K expects devices in mmio/remote pairs."); - const std::vector* nodes_for_one_mmio = + const std::vector* nodes_for_one_mmio = (num_hw_cqs == 1) ? &two_chip_arch_1cq : &two_chip_arch_2cq; uint32_t index_offset = 0; for (auto mmio_device_id : mmio_devices) { @@ -498,7 +482,7 @@ std::vector get_nodes(const std::set& device_ TT_ASSERT(found_remote, "Couldn't find paired remote chip for device {}", mmio_device_id); // Add dispatch kernels for the mmio/remote pair - for (dispatch_kernel_node_t node : *nodes_for_one_mmio) { + for (DispatchKernelNode node : *nodes_for_one_mmio) { TT_ASSERT(node.device_id == 0 || node.device_id == 1); if (node.device_id == 0) { node.device_id = mmio_device_id; @@ -525,6 +509,9 @@ std::vector get_nodes(const std::set& device_ // Device until fields are populated, (2) need to be connected to kernel objects for devices that aren't created yet, // and (3) the table to choose depends on total number of devices, not know at Device creation. void populate_fd_kernels(const std::set& device_ids, uint32_t num_hw_cqs) { + populate_fd_kernels(generate_nodes(device_ids, num_hw_cqs)); +} +void populate_fd_kernels(const std::vector& nodes) { // If we already had nodes from a previous run, clear them (since we could have a different # of devices or CQs). if (!node_id_to_kernel.empty()) { for (int idx = 0; idx < node_id_to_kernel.size(); idx++) { @@ -533,8 +520,9 @@ void populate_fd_kernels(const std::set& device_ids, uint32_t num_hw_ node_id_to_kernel.clear(); } - // Read the input table, create configs for each node - std::vector nodes = get_nodes(device_ids, num_hw_cqs); + // Read the input table, create configs for each node + track mmio devices and number of cqs. + std::unordered_set mmio_device_ids; + std::unordered_set hw_cq_ids; for (const auto& node : nodes) { TT_ASSERT(node_id_to_kernel.size() == node.id); node_id_to_kernel.push_back(FDKernel::Generate( @@ -544,16 +532,21 @@ void populate_fd_kernels(const std::set& device_ids, uint32_t num_hw_ node.cq_id, {node.my_noc, node.upstream_noc, node.downstream_noc}, node.kernel_type)); + if (tt::Cluster::instance().get_associated_mmio_device(node.device_id) == node.device_id) { + mmio_device_ids.insert(node.device_id); + } + hw_cq_ids.insert(node.cq_id); } + uint32_t num_hw_cqs = hw_cq_ids.size(); // Connect the graph with upstream/downstream kernels for (const auto& node : nodes) { - for (int idx = 0; idx < DISPATCH_MAX_UPSTREAM; idx++) { + for (int idx = 0; idx < DISPATCH_MAX_UPSTREAM_KERNELS; idx++) { if (node.upstream_ids[idx] >= 0) { node_id_to_kernel.at(node.id)->AddUpstreamKernel(node_id_to_kernel.at(node.upstream_ids[idx])); } } - for (int idx = 0; idx < DISPATCH_MAX_DOWNSTREAM; idx++) { + for (int idx = 0; idx < DISPATCH_MAX_DOWNSTREAM_KERNELS; idx++) { if (node.downstream_ids[idx] >= 0) { node_id_to_kernel.at(node.id)->AddDownstreamKernel(node_id_to_kernel.at(node.downstream_ids[idx])); } @@ -564,7 +557,7 @@ void populate_fd_kernels(const std::set& device_ids, uint32_t num_hw_ std::map device_id_to_tunnel_stop; std::map> mmio_device_id_to_serviced_devices; uint32_t tunnel_depth; - for (auto mmio_device_id : device_ids) { + for (auto mmio_device_id : mmio_device_ids) { if (tt::Cluster::instance().get_associated_mmio_device(mmio_device_id) != mmio_device_id) { continue; } @@ -753,3 +746,5 @@ void configure_dispatch_cores(IDevice* device) { } } } + +} // namespace tt::tt_metal diff --git a/tt_metal/impl/dispatch/topology.hpp b/tt_metal/impl/dispatch/topology.hpp index d1e032d329b..956c0b6644b 100644 --- a/tt_metal/impl/dispatch/topology.hpp +++ b/tt_metal/impl/dispatch/topology.hpp @@ -4,12 +4,35 @@ #pragma once #include +namespace tt::tt_metal { + +// Max number of upstream/downstream dispatch kernels that can be connected to a single dispatch kernel. +#define DISPATCH_MAX_UPSTREAM_KERNELS 4 +#define DISPATCH_MAX_DOWNSTREAM_KERNELS 4 + +struct DispatchKernelNode { + int id; + chip_id_t device_id; // Device that this kernel is located on + chip_id_t servicing_device_id; // Remote device that this kernel services, used for kernels on MMIO + uint8_t cq_id; // CQ this kernel implements + DispatchWorkerType kernel_type; // Type of dispatch kernel this is + int upstream_ids[DISPATCH_MAX_UPSTREAM_KERNELS]; // Upstream dispatch kernels + int downstream_ids[DISPATCH_MAX_DOWNSTREAM_KERNELS]; // Downstream dispatch kernels + NOC my_noc; // NOC this kernel uses to dispatch kernels + NOC upstream_noc; // NOC used to communicate upstream + NOC downstream_noc; // NOC used to communicate downstream +}; + // Create FD kernels for all given device ids. Creates all objects, but need to call create_and_compile_cq_program() use -// a created Device to fill out the settings. +// a created Device to fill out the settings. First version automatically generates the topology based on devices, num +// cqs, and detected board. Second version uses the topology passed in. void populate_fd_kernels(const std::set& device_ids, uint32_t num_hw_cqs); +void populate_fd_kernels(const std::vector& nodes); // Fill out all settings for FD kernels on the given device, and add them to a Program and return it. std::unique_ptr create_and_compile_cq_program(tt::tt_metal::IDevice* device); -// Performa additional configuration (writing to specific L1 addresses, etc.) for FD kernels on this device. +// Perform additional configuration (writing to specific L1 addresses, etc.) for FD kernels on this device. void configure_dispatch_cores(tt::tt_metal::IDevice* device); + +} // namespace tt::tt_metal