Skip to content

Commit

Permalink
Fix parsing cluster descriptor for Blackhole and Grayskull(#280)
Browse files Browse the repository at this point in the history
---------

Co-authored-by: Almeet Bhullar <abhullar@tenstorrent.com>
  • Loading branch information
pjanevskiTT and abhullar-tt authored Nov 9, 2024
1 parent a44f5f8 commit 4b5dffd
Show file tree
Hide file tree
Showing 12 changed files with 71 additions and 92 deletions.
25 changes: 0 additions & 25 deletions blackhole_1chip_cluster.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion device/hugepage.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ std::string find_hugepage_dir(std::size_t pagesize);
// Today we assume there's only one pipeline running within the system.
// One hugepage per device such that each device gets unique memory.
int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uint16_t channel);
}
}
6 changes: 6 additions & 0 deletions device/libs/create_ethernet_map.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/

// External function to create the cluster descriptor yaml file.
extern "C" {
int create_ethernet_map(char *file);
Expand Down
21 changes: 17 additions & 4 deletions device/tt_cluster_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -523,16 +523,21 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto
}

void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) {

for (YAML::const_iterator node = yaml["arch"].begin(); node != yaml["arch"].end(); ++node) {
chip_id_t chip_id = node->first.as<int>();
desc.all_chips.insert(chip_id);
}

for (YAML::const_iterator node = yaml["chips"].begin(); node != yaml["chips"].end(); ++node) {
chip_id_t chip_id = node->first.as<int>();
std::vector<int> chip_rack_coords = node->second.as<std::vector<int>>();
log_assert(chip_rack_coords.size() == 4, "Galaxy (x, y, rack, shelf) coords must be size 4");
eth_coord_t chip_location{
chip_rack_coords.at(0), chip_rack_coords.at(1), chip_rack_coords.at(2), chip_rack_coords.at(3)};

desc.chip_locations.insert({chip_id, chip_location});
desc.coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = chip_id;
desc.all_chips.insert(chip_id);
}

for(const auto& chip : yaml["chips_with_mmio"]) {
Expand All @@ -558,7 +563,7 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y
std::get<3>(chip_location));
}

if (yaml["boardtype"]) {
if (yaml["boardtype"]) {
for (const auto& chip_board_type : yaml["boardtype"].as<std::map<int, std::string>>()) {
auto &chip = chip_board_type.first;
BoardType board_type;
Expand All @@ -568,7 +573,13 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y
board_type = BoardType::N300;
} else if (chip_board_type.second == "GALAXY") {
board_type = BoardType::GALAXY;
} else if (chip_board_type.second == "e150") {
board_type = BoardType::E150;
}
else if (chip_board_type.second == "p150A") {
board_type = BoardType::P150A;
} else {
log_warning(LogSiliconDriver, "Unknown board type for chip {}. This might happen because chip is running old firmware. Defaulting to DEFAULT", chip);
board_type = BoardType::DEFAULT;
}
desc.chip_board_type.insert({chip, board_type});
Expand Down Expand Up @@ -622,7 +633,7 @@ const std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::

const std::unordered_map<chip_id_t, eth_coord_t>& tt_ClusterDescriptor::get_chip_locations() const {
static auto locations = std::unordered_map<chip_id_t, eth_coord_t>();
if (locations.empty()) {
if (locations.empty() and !this->chip_locations.empty()) {
for (auto chip_id : this->enabled_active_chips) {
locations[chip_id] = chip_locations.at(chip_id);
}
Expand All @@ -632,6 +643,7 @@ const std::unordered_map<chip_id_t, eth_coord_t>& tt_ClusterDescriptor::get_chip
}

chip_id_t tt_ClusterDescriptor::get_shelf_local_physical_chip_coords(chip_id_t virtual_coord) {
log_assert(!this->chip_locations.empty(), "Getting physical chip coordinates is only valid for systems where chips have coordinates");
// Physical coordinates of chip inside a single rack. Calculated based on Galaxy topology.
// See: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png
int x = std::get<0>(get_chip_locations().at(virtual_coord));
Expand Down Expand Up @@ -667,6 +679,7 @@ const std::unordered_map<chip_id_t, bool>& tt_ClusterDescriptor::get_noc_transla
std::size_t tt_ClusterDescriptor::get_number_of_chips() const { return this->enabled_active_chips.size(); }

// Returns the ethernet link distance between two chips, computed from their
// entries in chip_locations via get_ethernet_link_coord_distance.
//
// Asserts that chip_locations is non-empty before looking anything up. Both
// lookups use .at(), so a chip id without a chip_locations entry throws
// std::out_of_range.
int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const {
    log_assert(!this->chip_locations.empty(), "Getting physical chip coordinates is only valid for systems where chips have coordinates");

    // Resolve both endpoints first, then hand them to the coordinate-based overload.
    const auto &location_a = chip_locations.at(chip_a);
    const auto &location_b = chip_locations.at(chip_b);
    return get_ethernet_link_coord_distance(location_a, location_b);
}

Expand Down
14 changes: 10 additions & 4 deletions device/tt_cluster_descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ namespace YAML { class Node; }
enum BoardType : uint32_t {
N150 = 0,
N300 = 1,
GALAXY = 2,
DEFAULT = 3,
E150 = 2,
P150A = 3,
GALAXY = 4,
DEFAULT = 5,
};

class tt_ClusterDescriptor {
Expand Down Expand Up @@ -88,9 +90,13 @@ class tt_ClusterDescriptor {
// get_cluster_descriptor_file_path will create ethernet map in the background.
static std::string get_cluster_descriptor_file_path();
static std::unique_ptr<tt_ClusterDescriptor> create_from_yaml(const std::string &cluster_descriptor_file_path);

// TODO: This function is used to create mock cluster descriptor yaml files, for example for simulation.
// The name of the function is kept to not gate the changes regarding create-ethernet-map.
// It should be renamed to something like create_mock_cluster_descriptor and changed in tt-metal/tt-debuda.
static std::unique_ptr<tt_ClusterDescriptor> create_for_grayskull_cluster(
const std::set<chip_id_t> &logical_mmio_device_ids,
const std::vector<chip_id_t> &physical_mmio_device_ids);
const std::set<chip_id_t> &logical_mmio_device_ids,
const std::vector<chip_id_t> &physical_mmio_device_ids);

const std::unordered_map<chip_id_t, std::uint32_t>& get_harvesting_info() const;
const std::unordered_map<chip_id_t, bool>& get_noc_translation_table_en() const;
Expand Down
12 changes: 6 additions & 6 deletions device/tt_silicon_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,14 +302,14 @@ tt_SiliconDevice::tt_SiliconDevice(const std::string &sdesc_path, const std::str
log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids);
log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices);
}

if (ndesc_path == "") {
ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, available_device_ids);
}
else {
ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path);

std::string cluster_descriptor_path = ndesc_path;
if (cluster_descriptor_path == "") {
cluster_descriptor_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path();
}

ndesc = tt_ClusterDescriptor::create_from_yaml(cluster_descriptor_path);

for (auto &d: target_devices){
if (ndesc->is_chip_mmio_capable(d)){
target_mmio_device_ids.insert(d);
Expand Down
15 changes: 4 additions & 11 deletions tests/api/test_chip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,9 @@ inline std::unique_ptr<tt_ClusterDescriptor> get_cluster_desc() {

// TODO: Remove different branch for different archs
std::unique_ptr<tt_ClusterDescriptor> cluster_desc;
if (device_arch == tt::ARCH::GRAYSKULL) {
cluster_desc = tt_ClusterDescriptor::create_for_grayskull_cluster(pci_device_ids_set, pci_device_ids);
} else if (device_arch == tt::ARCH::BLACKHOLE) {
std::string yaml_path = test_utils::GetAbsPath("blackhole_1chip_cluster.yaml");
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);
} else {
// TODO: remove getting manually cluster descriptor from yaml.
std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path();
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);
}
// TODO: remove getting manually cluster descriptor from yaml.
std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path();
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);

return cluster_desc;
}
Expand Down Expand Up @@ -116,7 +109,7 @@ inline std::unique_ptr<Cluster> get_cluster() {


// TODO: Don't pass each of these arguments.
return std::unique_ptr<Cluster>(new Cluster(soc_path, device_arch == tt::ARCH::GRAYSKULL ? "" : yaml_path, detected_num_chips_set));
return std::unique_ptr<Cluster>(new Cluster(soc_path, tt_ClusterDescriptor::get_cluster_descriptor_file_path(), detected_num_chips_set));
}

// TODO: Once default auto TLB setup is in, check it is setup properly.
Expand Down
16 changes: 4 additions & 12 deletions tests/api/test_cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,10 @@ inline std::unique_ptr<tt_ClusterDescriptor> get_cluster_desc() {
return nullptr;
}

// TODO: Remove different branch for different archs
std::unique_ptr<tt_ClusterDescriptor> cluster_desc;
if (device_arch == tt::ARCH::GRAYSKULL) {
cluster_desc = tt_ClusterDescriptor::create_for_grayskull_cluster(pci_device_ids_set, pci_device_ids);
} else if (device_arch == tt::ARCH::BLACKHOLE) {
std::string yaml_path = test_utils::GetAbsPath("blackhole_1chip_cluster.yaml");
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);
} else {
// TODO: remove getting manually cluster descriptor from yaml.
std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path();
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);
}
// TODO: remove getting manually cluster descriptor from yaml.
std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path();
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);

return cluster_desc;
}
Expand Down Expand Up @@ -126,7 +118,7 @@ inline std::unique_ptr<Cluster> get_cluster() {

// TODO: Don't pass each of these arguments.
return std::unique_ptr<Cluster>(
new Cluster(soc_path, device_arch == tt::ARCH::GRAYSKULL ? "" : yaml_path, detected_num_chips_set));
new Cluster(soc_path, tt_ClusterDescriptor::get_cluster_descriptor_file_path(), detected_num_chips_set));
}

// TODO: Should not be wormhole specific.
Expand Down
13 changes: 3 additions & 10 deletions tests/api/test_cluster_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,9 @@ inline std::unique_ptr<tt_ClusterDescriptor> get_cluster_desc() {

// TODO: Remove different branch for different archs
std::unique_ptr<tt_ClusterDescriptor> cluster_desc;
if (device_arch == tt::ARCH::GRAYSKULL) {
cluster_desc = tt_ClusterDescriptor::create_for_grayskull_cluster(pci_device_ids_set, pci_device_ids);
} else if (device_arch == tt::ARCH::BLACKHOLE) {
std::string yaml_path = test_utils::GetAbsPath("blackhole_1chip_cluster.yaml");
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);
} else {
// TODO: remove getting manually cluster descriptor from yaml.
std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path();
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);
}
// TODO: remove getting manually cluster descriptor from yaml.
std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path();
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);

return cluster_desc;
}
Expand Down
2 changes: 1 addition & 1 deletion tests/api/test_mockup_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ TEST(ApiMockupTest, CreateDevice) {
auto device_driver = std::make_unique<tt_MockupDevice>(get_soc_descriptor_file(arch));
}

} // namespace test::mockup_device
} // namespace test::mockup_device
18 changes: 9 additions & 9 deletions tests/blackhole/test_silicon_driver_bh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ std::int32_t get_static_tlb_index(tt_xy_pair target) {

std::set<chip_id_t> get_target_devices() {
std::set<chip_id_t> target_devices;
std::unique_ptr<tt_ClusterDescriptor> cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"));
std::unique_ptr<tt_ClusterDescriptor> cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path());
for (int i = 0; i < cluster_desc_uniq->get_number_of_chips(); i++) {
target_devices.insert(i);
}
Expand All @@ -72,7 +72,7 @@ TEST(SiliconDriverBH, CreateDestroy) {
tt_device_params default_params;
// Initialize the driver with a 1x1 descriptor and explicitly do not perform harvesting
for(int i = 0; i < 50; i++) {
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, false, true, false);
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false);
set_params_for_remote_txn(device);
device.start_device(default_params);
device.deassert_risc_reset();
Expand Down Expand Up @@ -205,7 +205,7 @@ TEST(SiliconDriverBH, UnalignedStaticTLB_RW) {

uint32_t num_host_mem_ch_per_mmio_device = 1;

tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
set_params_for_remote_txn(device);
auto mmio_devices = device.get_target_mmio_device_ids();

Expand Down Expand Up @@ -264,7 +264,7 @@ TEST(SiliconDriverBH, StaticTLB_RW) {

uint32_t num_host_mem_ch_per_mmio_device = 1;

tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
set_params_for_remote_txn(device);
auto mmio_devices = device.get_target_mmio_device_ids();

Expand Down Expand Up @@ -314,7 +314,7 @@ TEST(SiliconDriverBH, DynamicTLB_RW) {
std::set<chip_id_t> target_devices = get_target_devices();

uint32_t num_host_mem_ch_per_mmio_device = 1;
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);

set_params_for_remote_txn(device);

Expand Down Expand Up @@ -378,7 +378,7 @@ TEST(SiliconDriverBH, MultiThreadedDevice) {
std::set<chip_id_t> target_devices = get_target_devices();

uint32_t num_host_mem_ch_per_mmio_device = 1;
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);

set_params_for_remote_txn(device);

Expand Down Expand Up @@ -437,7 +437,7 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) {
uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE;
uint32_t num_host_mem_ch_per_mmio_device = 1;

tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
set_params_for_remote_txn(device);
for(int i = 0; i < target_devices.size(); i++) {
// Iterate over devices and only setup static TLBs for functional worker cores
Expand Down Expand Up @@ -539,7 +539,7 @@ TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/e

uint32_t num_host_mem_ch_per_mmio_device = 1;

tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
set_params_for_remote_txn(device);
auto mmio_devices = device.get_target_mmio_device_ids();

Expand Down Expand Up @@ -596,7 +596,7 @@ TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as

uint32_t num_host_mem_ch_per_mmio_device = 1;

tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true);
set_params_for_remote_txn(device);
auto mmio_devices = device.get_target_mmio_device_ids();

Expand Down
Loading

0 comments on commit 4b5dffd

Please sign in to comment.