Title: New NCCL Collectives Latency Estimator
Description: This PR introduces a new analytical latency estimator for NCCL collectives, enabled via the following flags:
--xla_gpu_enable_analytical_sol_latency_estimator \
--xla_gpu_analytical_latency_estimator_options='nccl_op_launch_us=<value>,nic_speed_gbps=<value>,chunk_prep_us=<value>,rtt_us=<value>,gpus_per_node=<value>,chunk_size_bytes=<value>'
Replace <value> with an appropriate number for your system (e.g., nccl_op_launch_us=XX). This estimator should improve accuracy and performance, especially for large-scale distributed training.
PiperOrigin-RevId: 707261072
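For illustration only, a filled-in invocation with placeholder numbers (these are not recommended defaults and must be tuned per system):
--xla_gpu_enable_analytical_sol_latency_estimator \
--xla_gpu_analytical_latency_estimator_options='nccl_op_launch_us=10,nic_speed_gbps=50,chunk_prep_us=1,rtt_us=20,gpus_per_node=8,chunk_size_bytes=4194304'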
1 parent 2445c22 · commit 09bc536
Showing 11 changed files with 748 additions and 1 deletion.
@@ -0,0 +1,189 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "xla/service/gpu/model/sol_gpu_cost_model.h" | ||
|
||
#include <cmath> | ||
#include <cstdint> | ||
#include <string> | ||
|
||
#include "absl/log/check.h" | ||
#include "absl/log/log.h" | ||
#include "absl/numeric/bits.h" | ||
#include "absl/strings/numbers.h" | ||
#include "absl/strings/string_view.h" | ||
#include "absl/time/time.h" | ||
#include "xla/hlo/ir/hlo_module.h" | ||
|
||
namespace xla { | ||
namespace gpu { | ||
namespace { | ||
// Constants for NCCL SoL model | ||
constexpr double kHeaderOverhead = 0.025; | ||
constexpr absl::string_view kNcclOpLaunchUs = "nccl_op_launch_us"; | ||
constexpr absl::string_view kNicSpeedGbps = "nic_speed_gbps"; | ||
constexpr absl::string_view kChunkPrepUs = "chunk_prep_us"; | ||
constexpr absl::string_view kRttUs = "rtt_us"; | ||
constexpr absl::string_view kGpusPerNode = "gpus_per_node"; | ||
constexpr absl::string_view kChunkSizeBytes = "chunk_size_bytes"; | ||
|
||
// Returns the number of communicators in the mask. | ||
// For example, if the mask is 0x0, this function returns 1. If the mask is 0x7, | ||
// this function returns 8. | ||
int NumCommunicators(const absl::string_view mask) { | ||
// Assuming the mask is a hexadecimal number | ||
uint64_t mask_value = std::stoul(std::string(mask), nullptr, 16); | ||
int bit_count = absl::popcount(mask_value); // Count set bits | ||
return static_cast<int>(std::pow(2, bit_count)); | ||
} | ||
|
||
// Returns the number of rounds for the given collective type. | ||
int NumRounds(const SolGPUCostModel::CollectiveType& coll_type) { | ||
// AllReduce requires ReduceScatter and AllGather, so it has 2 rounds. | ||
return coll_type == SolGPUCostModel::CollectiveType::kAllReduce ? 2 : 1; | ||
} | ||
|
||
} // namespace | ||
|
||
SolGPUCostModel::Config GetConfig(const HloModule* module) {
  SolGPUCostModel::Config config;
  const auto& extra_options =
      module->config()
          .debug_options()
          .xla_gpu_analytical_latency_estimator_options();
  for (const auto& [option_name, option_value] : extra_options) {
    int64_t value;
    double value_d;
    VLOG(2) << "[SoL] option: " << option_name << " is " << option_value;
    if (option_name == kNcclOpLaunchUs &&
        absl::SimpleAtoi(option_value, &value)) {
      config.nccl_op_launch_time = absl::Microseconds(value);
    } else if (option_name == kNicSpeedGbps &&
               absl::SimpleAtod(option_value, &value_d)) {
      config.nic_speed_gbps = value_d;
    } else if (option_name == kChunkPrepUs &&
               absl::SimpleAtoi(option_value, &value)) {
      config.chunk_prep_time = absl::Microseconds(value);
    } else if (option_name == kRttUs &&
               absl::SimpleAtoi(option_value, &value)) {
      config.rtt = absl::Microseconds(value);
    } else if (option_name == kGpusPerNode &&
               absl::SimpleAtoi(option_value, &value)) {
      config.gpus_per_node = value;
    } else if (option_name == kChunkSizeBytes &&
               absl::SimpleAtoi(option_value, &value)) {
      config.chunk_size_bytes = value;
    }
  }
  return config;
}

SolGPUCostModel::SolGPUCostModel(const Config& sys_config)
    : xla_flag_config_(sys_config) {
  VLOG(2) << "[SoL] NIC speed: " << xla_flag_config_.nic_speed_gbps;
  VLOG(2) << "[SoL] RTT: " << xla_flag_config_.rtt;
  VLOG(2) << "[SoL] Chunk preparation time: "
          << xla_flag_config_.chunk_prep_time;
  VLOG(2) << "[SoL] NCCL op launch time: "
          << xla_flag_config_.nccl_op_launch_time;
  VLOG(2) << "[SoL] GPUs per node: " << xla_flag_config_.gpus_per_node;
}

// This is an insignificant term, and we are making it consistent
// with the existing formula.
absl::Duration SolGPUCostModel::ChunkPrepLatency(
    const int64_t per_gpu_msg_size_bytes) const {
  return std::ceil(static_cast<double>(per_gpu_msg_size_bytes) /
                   xla_flag_config_.chunk_size_bytes) *
         xla_flag_config_.chunk_prep_time;
}

absl::Duration SolGPUCostModel::TransferDuration(
    const int64_t per_gpu_msg_size_bytes) const {
  // x1e6 to convert seconds to microseconds;
  // x1024*1024*1024 to convert Gbytes/sec to bytes/sec
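  // For example, at nic_speed_gbps = 50 (an illustrative value), a 1 MiB
  // message takes 1048576 * 1e6 / (1024^3 * 50) ≈ 19.5 us, or ~20.0 us once
  // the 2.5% header overhead is applied.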
  const long double ret =
      (1e6 * static_cast<long double>(per_gpu_msg_size_bytes)) /
      (std::pow(1024.0, 3) * xla_flag_config_.nic_speed_gbps);
  return absl::Microseconds(ret * (1 + kHeaderOverhead));
}

absl::Duration SolGPUCostModel::RingLatency(
    const int64_t buff_size_bytes, const int num_nodes,
    const CollectiveType& coll_type, const absl::string_view mask) const {
  const int num_gpus = NumGpusPerComm(num_nodes, coll_type, mask);

  int64_t per_gpu_msg_size_bytes;
  if (coll_type == CollectiveType::kSendRecv) {
    per_gpu_msg_size_bytes = buff_size_bytes;
  } else {
    per_gpu_msg_size_bytes = buff_size_bytes / num_gpus;
  }

  // This is the number of GPUs per communicator per node. We assume that each
  // GPU has a NIC, and this is also the number of NICs per communicator per
  // node.
  // Note that this happens to be the correct value (i.e. 1) for SendRecv.
  int num_gpus_per_node = num_gpus / num_nodes;

  // In each channel, consider one GPU next to the Ethernet link. Below is the
  // sum of 3 time costs for each piece of data of size
  // `per_gpu_msg_size_bytes`:
  //
  // 1. transfer duration defined by the NIC bandwidth,
  // 2. chunk preparation latency, and
  // 3. RTT
  //
  // then followed by two factors:
  //
  // 1. Multiply by `num_gpus - 1`, as `num_gpus - 1` pieces of data will be
  //    sent over the link in AllGather.
  // 2. Divide by `num_gpus_per_node` as there are `num_gpus_per_node` NICs
  //    and GPUs in each node for parallelism.
  //
  // Better estimates of terms like this will come in future versions
  // of the SoL model.
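  //
  // Putting the pieces together, the estimate computed below is effectively:
  //   (transfer + chunk_prep + rtt) * (num_gpus - 1) / num_gpus_per_node
  //     * num_rounds + nccl_op_launch_time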
  absl::Duration ret = TransferDuration(per_gpu_msg_size_bytes) +
                       ChunkPrepLatency(per_gpu_msg_size_bytes) +
                       xla_flag_config_.rtt;
  ret *= (num_gpus - 1.0) / static_cast<long double>(num_gpus_per_node);
  // Multiply by the number of rounds, which is different for AllReduce.
  ret = ret * NumRounds(coll_type);

  // Time to initiate the collective.
  return ret + xla_flag_config_.nccl_op_launch_time;
}

// Helper functions
int SolGPUCostModel::NumGpusPerComm(int num_nodes,
                                    const CollectiveType& coll_type,
                                    const absl::string_view mask) const {
  if (coll_type == CollectiveType::kSendRecv) {
    return 2;
  }
  int num_comms = NumCommunicators(mask);
  CHECK_EQ(xla_flag_config_.gpus_per_node % num_comms, 0)
      << "gpus_per_node must be divisible by the number of communicators. "
         "gpus_per_node: "
      << xla_flag_config_.gpus_per_node
      << " Number of communicators: " << num_comms
      << ". Adjust the number of GPUs per node with the flag "
         "gpus_per_node in xla_gpu_analytical_latency_estimator_options.";
  return num_nodes * xla_flag_config_.gpus_per_node / num_comms;
}

}  // namespace gpu
}  // namespace xla
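
A minimal sketch of how the new model could be exercised outside the compiler, assuming only the constructor, Config fields, and RingLatency signature visible in the diff above; the numeric values are placeholders rather than recommended settings, and this driver is not part of the commit:

#include "absl/log/log.h"
#include "absl/time/time.h"
#include "xla/service/gpu/model/sol_gpu_cost_model.h"

// Hypothetical driver: configure the model with system parameters and
// estimate the latency of one ring AllReduce.
void EstimateAllReduceLatency() {
  xla::gpu::SolGPUCostModel::Config config;
  config.nccl_op_launch_time = absl::Microseconds(10);  // placeholder value
  config.nic_speed_gbps = 50.0;                         // placeholder value
  config.chunk_prep_time = absl::Microseconds(1);       // placeholder value
  config.rtt = absl::Microseconds(20);                  // placeholder value
  config.gpus_per_node = 8;                             // placeholder value
  config.chunk_size_bytes = 4 * 1024 * 1024;            // placeholder value

  xla::gpu::SolGPUCostModel model(config);
  // 64 MiB AllReduce across 4 nodes, single communicator (mask "0x0").
  const absl::Duration latency = model.RingLatency(
      /*buff_size_bytes=*/64 * 1024 * 1024, /*num_nodes=*/4,
      xla::gpu::SolGPUCostModel::CollectiveType::kAllReduce, /*mask=*/"0x0");
  LOG(INFO) << "Estimated AllReduce latency: " << latency;
}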