Skip to content

Commit

Permalink
#0: Add a way to specify custom dispatch topology
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-dma committed Jan 27, 2025
1 parent dc5def7 commit 74cc5d9
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 40 deletions.
71 changes: 33 additions & 38 deletions tt_metal/impl/dispatch/topology.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,12 @@
#include "kernel_config/eth_router.hpp"
#include "kernel_config/eth_tunneler.hpp"

#define DISPATCH_MAX_UPSTREAM 4
#define DISPATCH_MAX_DOWNSTREAM 4

using namespace tt::tt_metal;

typedef struct {
int id;
chip_id_t device_id; // Device that this kernel is located on
chip_id_t servicing_device_id; // Remote device that this kernel services, used for kernels on MMIO
uint8_t cq_id; // CQ this kernel implements
DispatchWorkerType kernel_type; // Type of dispatch kernel this is
int upstream_ids[DISPATCH_MAX_UPSTREAM]; // Upstream dispatch kernels
int downstream_ids[DISPATCH_MAX_DOWNSTREAM]; // Downstream dispatch kernels
NOC my_noc; // NOC this kernel uses to dispatch kernels
NOC upstream_noc; // NOC used to communicate upstream
NOC downstream_noc; // NOC used to communicate downstream
} dispatch_kernel_node_t;
namespace tt::tt_metal {

// For readablity, unset = x = -1
#define x -1

void increment_node_ids(dispatch_kernel_node_t& node, uint32_t inc) {
void increment_node_ids(DispatchKernelNode& node, uint32_t inc) {
node.id += inc;
for (int& id : node.upstream_ids) {
if (id != x) {
Expand All @@ -50,20 +34,20 @@ void increment_node_ids(dispatch_kernel_node_t& node, uint32_t inc) {
}
}

static const std::vector<dispatch_kernel_node_t> single_chip_arch_1cq = {
static const std::vector<DispatchKernelNode> single_chip_arch_1cq = {
{0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{2, 0, 0, 0, DISPATCH_S, {0, x, x, x}, {1, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
};

static const std::vector<dispatch_kernel_node_t> single_chip_arch_2cq = {
static const std::vector<DispatchKernelNode> single_chip_arch_2cq = {
{0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{2, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{3, 0, 0, 1, DISPATCH_HD, {2, x, x, x}, {x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
};

static const std::vector<dispatch_kernel_node_t> single_chip_arch_2cq_dispatch_s = {
static const std::vector<DispatchKernelNode> single_chip_arch_2cq_dispatch_s = {
{0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 4, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {4, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{2, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, 5, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
Expand All @@ -72,7 +56,7 @@ static const std::vector<dispatch_kernel_node_t> single_chip_arch_2cq_dispatch_s
{5, 0, 0, 1, DISPATCH_S, {2, x, x, x}, {3, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
};

static const std::vector<dispatch_kernel_node_t> two_chip_arch_1cq = {
static const std::vector<DispatchKernelNode> two_chip_arch_1cq = {
{0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
{2, 0, 0, 0, DISPATCH_S, {0, x, x, x}, {1, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
Expand All @@ -93,7 +77,7 @@ static const std::vector<dispatch_kernel_node_t> two_chip_arch_1cq = {
{13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
};

static const std::vector<dispatch_kernel_node_t> two_chip_arch_2cq = {
static const std::vector<DispatchKernelNode> two_chip_arch_2cq = {
{0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{1, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{2, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {x, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
Expand All @@ -119,7 +103,7 @@ static const std::vector<dispatch_kernel_node_t> two_chip_arch_2cq = {

};

static const std::vector<dispatch_kernel_node_t> galaxy_nine_chip_arch_1cq = {
static const std::vector<DispatchKernelNode> galaxy_nine_chip_arch_1cq = {
// For MMIO chip, TODO: investigate removing these, they aren't needed
{0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0},
Expand Down Expand Up @@ -232,7 +216,7 @@ static const std::vector<dispatch_kernel_node_t> galaxy_nine_chip_arch_1cq = {
{88, 8, x, 0, DISPATCH_S, {86, x, x, x}, {87, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1},
};

static const std::vector<dispatch_kernel_node_t> galaxy_nine_chip_arch_2cq = {
static const std::vector<DispatchKernelNode> galaxy_nine_chip_arch_2cq = {
// For MMIO chip
{0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {2, 4, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
{1, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, 5, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0},
Expand Down Expand Up @@ -396,8 +380,8 @@ static const std::vector<dispatch_kernel_node_t> galaxy_nine_chip_arch_2cq = {

std::vector<FDKernel*> node_id_to_kernel;

// Helper function to get the nodes for this platform
std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_ids, uint32_t num_hw_cqs) {
// Helper function to automatically generate dispatch nodes given devices + num hw CQs + detection of card type.
std::vector<DispatchKernelNode> generate_nodes(const std::set<chip_id_t>& device_ids, uint32_t num_hw_cqs) {
// Select/generate the right input table, depends on (1) board [detected from total # of devices], and (2) number
// of active devices. TODO: read this out of YAML instead of the structs above?
uint32_t total_devices = tt::Cluster::instance().number_of_devices();
Expand All @@ -407,7 +391,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
uint32_t num_devices = device_ids.size();
TT_ASSERT(num_devices > 0, "Can't determine dispatch architecture with no active devices.");
TT_ASSERT(num_devices <= total_devices);
std::vector<dispatch_kernel_node_t> nodes;
std::vector<DispatchKernelNode> nodes;

std::set<chip_id_t> mmio_devices;
std::set<chip_id_t> remote_devices;
Expand Down Expand Up @@ -436,7 +420,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_

if (remote_devices.empty()) {
// MMIO devices only, just replicate a single chip arch for each
std::vector<dispatch_kernel_node_t> nodes_for_one_mmio = populate_single_device();
std::vector<DispatchKernelNode> nodes_for_one_mmio = populate_single_device();
uint32_t index_offset = 0;
for (auto id : mmio_devices) {
for (auto node : nodes_for_one_mmio) {
Expand All @@ -451,7 +435,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
// Need to handle N300/T3000 separately from TG/TGG since they have different templates/tunnel depths
if (tt::Cluster::instance().is_galaxy_cluster()) {
// For Galaxy, we always init all remote devices associated with an mmio device.
const std::vector<dispatch_kernel_node_t>* nodes_for_one_mmio =
const std::vector<DispatchKernelNode>* nodes_for_one_mmio =
(num_hw_cqs == 1) ? &galaxy_nine_chip_arch_1cq : &galaxy_nine_chip_arch_2cq;
uint32_t index_offset = 0;
for (auto mmio_device_id : mmio_devices) {
Expand All @@ -468,7 +452,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
}

// Pull nodes from the template, updating their index and device id
for (dispatch_kernel_node_t node : *nodes_for_one_mmio) {
for (DispatchKernelNode node : *nodes_for_one_mmio) {
node.device_id = template_id_to_device_id[node.device_id];
node.servicing_device_id = template_id_to_device_id[node.servicing_device_id];
increment_node_ids(node, index_offset);
Expand All @@ -481,7 +465,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
TT_ASSERT(
mmio_devices.size() == remote_devices.size() or remote_devices.empty(),
"N300/T3K expects devices in mmio/remote pairs.");
const std::vector<dispatch_kernel_node_t>* nodes_for_one_mmio =
const std::vector<DispatchKernelNode>* nodes_for_one_mmio =
(num_hw_cqs == 1) ? &two_chip_arch_1cq : &two_chip_arch_2cq;
uint32_t index_offset = 0;
for (auto mmio_device_id : mmio_devices) {
Expand All @@ -498,7 +482,7 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
TT_ASSERT(found_remote, "Couldn't find paired remote chip for device {}", mmio_device_id);

// Add dispatch kernels for the mmio/remote pair
for (dispatch_kernel_node_t node : *nodes_for_one_mmio) {
for (DispatchKernelNode node : *nodes_for_one_mmio) {
TT_ASSERT(node.device_id == 0 || node.device_id == 1);
if (node.device_id == 0) {
node.device_id = mmio_device_id;
Expand All @@ -525,6 +509,9 @@ std::vector<dispatch_kernel_node_t> get_nodes(const std::set<chip_id_t>& device_
// Device until fields are populated, (2) need to be connected to kernel objects for devices that aren't created yet,
// and (3) the table to choose depends on total number of devices, not know at Device creation.
void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_cqs) {
populate_fd_kernels(generate_nodes(device_ids, num_hw_cqs));
}
void populate_fd_kernels(const std::vector<DispatchKernelNode>& nodes) {
// If we already had nodes from a previous run, clear them (since we could have a different # of devices or CQs).
if (!node_id_to_kernel.empty()) {
for (int idx = 0; idx < node_id_to_kernel.size(); idx++) {
Expand All @@ -533,8 +520,9 @@ void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_
node_id_to_kernel.clear();
}

// Read the input table, create configs for each node
std::vector<dispatch_kernel_node_t> nodes = get_nodes(device_ids, num_hw_cqs);
// Read the input table, create configs for each node + track mmio devices and number of cqs.
std::unordered_set<chip_id_t> mmio_device_ids;
std::unordered_set<uint8_t> hw_cq_ids;
for (const auto& node : nodes) {
TT_ASSERT(node_id_to_kernel.size() == node.id);
node_id_to_kernel.push_back(FDKernel::Generate(
Expand All @@ -544,16 +532,21 @@ void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_
node.cq_id,
{node.my_noc, node.upstream_noc, node.downstream_noc},
node.kernel_type));
if (tt::Cluster::instance().get_associated_mmio_device(node.device_id) == node.device_id) {
mmio_device_ids.insert(node.device_id);
}
hw_cq_ids.insert(node.cq_id);
}
uint32_t num_hw_cqs = hw_cq_ids.size();

// Connect the graph with upstream/downstream kernels
for (const auto& node : nodes) {
for (int idx = 0; idx < DISPATCH_MAX_UPSTREAM; idx++) {
for (int idx = 0; idx < DISPATCH_MAX_UPSTREAM_KERNELS; idx++) {
if (node.upstream_ids[idx] >= 0) {
node_id_to_kernel.at(node.id)->AddUpstreamKernel(node_id_to_kernel.at(node.upstream_ids[idx]));
}
}
for (int idx = 0; idx < DISPATCH_MAX_DOWNSTREAM; idx++) {
for (int idx = 0; idx < DISPATCH_MAX_DOWNSTREAM_KERNELS; idx++) {
if (node.downstream_ids[idx] >= 0) {
node_id_to_kernel.at(node.id)->AddDownstreamKernel(node_id_to_kernel.at(node.downstream_ids[idx]));
}
Expand All @@ -564,7 +557,7 @@ void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_
std::map<chip_id_t, uint32_t> device_id_to_tunnel_stop;
std::map<chip_id_t, std::vector<chip_id_t>> mmio_device_id_to_serviced_devices;
uint32_t tunnel_depth;
for (auto mmio_device_id : device_ids) {
for (auto mmio_device_id : mmio_device_ids) {
if (tt::Cluster::instance().get_associated_mmio_device(mmio_device_id) != mmio_device_id) {
continue;
}
Expand Down Expand Up @@ -753,3 +746,5 @@ void configure_dispatch_cores(IDevice* device) {
}
}
}

} // namespace tt::tt_metal
27 changes: 25 additions & 2 deletions tt_metal/impl/dispatch/topology.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,35 @@
#pragma once
#include <device.hpp>

namespace tt::tt_metal {

// Max number of upstream/downstream dispatch kernels that can be connected to a single dispatch kernel.
#define DISPATCH_MAX_UPSTREAM_KERNELS 4
#define DISPATCH_MAX_DOWNSTREAM_KERNELS 4

struct DispatchKernelNode {
int id;
chip_id_t device_id; // Device that this kernel is located on
chip_id_t servicing_device_id; // Remote device that this kernel services, used for kernels on MMIO
uint8_t cq_id; // CQ this kernel implements
DispatchWorkerType kernel_type; // Type of dispatch kernel this is
int upstream_ids[DISPATCH_MAX_UPSTREAM_KERNELS]; // Upstream dispatch kernels
int downstream_ids[DISPATCH_MAX_DOWNSTREAM_KERNELS]; // Downstream dispatch kernels
NOC my_noc; // NOC this kernel uses to dispatch kernels
NOC upstream_noc; // NOC used to communicate upstream
NOC downstream_noc; // NOC used to communicate downstream
};

// Create FD kernels for all given device ids. Creates all objects, but need to call create_and_compile_cq_program() use
// a created Device to fill out the settings.
// a created Device to fill out the settings. First version automatically generates the topology based on devices, num
// cqs, and detected board. Second version uses the topology passed in.
void populate_fd_kernels(const std::set<chip_id_t>& device_ids, uint32_t num_hw_cqs);
void populate_fd_kernels(const std::vector<DispatchKernelNode>& nodes);

// Fill out all settings for FD kernels on the given device, and add them to a Program and return it.
std::unique_ptr<tt::tt_metal::Program> create_and_compile_cq_program(tt::tt_metal::IDevice* device);

// Performa additional configuration (writing to specific L1 addresses, etc.) for FD kernels on this device.
// Perform additional configuration (writing to specific L1 addresses, etc.) for FD kernels on this device.
void configure_dispatch_cores(tt::tt_metal::IDevice* device);

} // namespace tt::tt_metal

0 comments on commit 74cc5d9

Please sign in to comment.