Skip to content

Commit

Permalink
feat: add nvidia MIG Settings
Browse files Browse the repository at this point in the history
  • Loading branch information
piyush-jena committed Jan 29, 2025
1 parent 0cf8573 commit cb83a60
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 10 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ bottlerocket-template-helper = { path = "./bottlerocket-template-helper", versio

# Settings Models
bottlerocket-model-derive = { path = "./bottlerocket-settings-models/model-derive", version = "0.1" }
bottlerocket-modeled-types = { path = "./bottlerocket-settings-models/modeled-types", version = "0.7" }
bottlerocket-modeled-types = { path = "./bottlerocket-settings-models/modeled-types", version = "0.8" }
bottlerocket-scalar = { path = "./bottlerocket-settings-models/scalar", version = "0.1" }
bottlerocket-scalar-derive = { path = "./bottlerocket-settings-models/scalar-derive", version = "0.1" }
bottlerocket-string-impls-for = { path = "./bottlerocket-settings-models/string-impls-for", version = "0.1" }
Expand Down
2 changes: 1 addition & 1 deletion bottlerocket-settings-models/modeled-types/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "bottlerocket-modeled-types"
version = "0.7.0"
version = "0.8.0"
authors = []
license = "Apache-2.0 OR MIT"
edition = "2021"
Expand Down
168 changes: 164 additions & 4 deletions bottlerocket-settings-models/modeled-types/src/kubernetes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1461,14 +1461,16 @@ mod test_hostname_override_source {

// =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^=

/// NvidiaRuntimeSettings contains the container runtime settings for Nvidia gpu.
/// NvidiaDevicePluginSettings contains the device-sharing and device-partitioning settings for NVIDIA GPUs.
// NOTE(review): the tests in this file build this struct with `Some(..)`/`None` for every
// field and (de)serialize it with kebab-case keys (e.g. "pass-device-specs"), so the `model`
// macro presumably wraps each field in `Option` and applies the serde renaming. Confirm
// against the macro's definition.
#[model(impl_default = true)]
pub struct NvidiaDevicePluginSettings {
/// Boolean setting; appears as "pass-device-specs" in the JSON used by the tests.
pass_device_specs: bool,
/// Device ID strategy; the tests use the values "uuid" and "index".
device_id_strategy: NvidiaDeviceIdStrategy,
/// Device list strategy; the tests use the values "envvar" and "volume-mounts".
device_list_strategy: NvidiaDeviceListStrategy,
/// Device sharing strategy; the tests use the value "time-slicing".
device_sharing_strategy: NvidiaDeviceSharingStrategy,
/// Nested time-slicing settings, see `NvidiaTimeSlicingSettings`.
time_slicing: NvidiaTimeSlicingSettings,
/// Device partitioning strategy, see `NvidiaDevicePartitioningStrategy`.
device_partitioning_strategy: NvidiaDevicePartitioningStrategy,
/// Nested MIG settings, see `NvidiaMIGSettings`.
mig: NvidiaMIGSettings,
}

#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
Expand Down Expand Up @@ -1499,10 +1501,115 @@ pub struct NvidiaTimeSlicingSettings {
fail_requests_greater_than_one: bool,
}

/// NvidiaDevicePartitioningStrategy selects whether, and how, an NVIDIA GPU is partitioned.
///
/// Variants are (de)serialized in lowercase, i.e. as `"none"` and `"mig"`.
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum NvidiaDevicePartitioningStrategy {
    /// No partitioning. This is the default.
    #[default]
    None,
    /// Partition the GPU using MIG; profiles are configured via `NvidiaMIGSettings`.
    MIG,
}

/// NvidiaMIGSettings contains the MIG profile selection for NVIDIA GPUs.
#[model(impl_default = true)]
pub struct NvidiaMIGSettings {
/// Maps a GPU model (e.g. "a100.40gb") to the MIG profile to use for it (e.g. "1g.5gb").
profile: HashMap<NvidiaGpuModel, MigProfile>,
}

/// NvidiaGpuModel is a validated NVIDIA GPU model name such as "a100.40gb".
///
/// Values are built through `TryFrom<&str>`, which only accepts input matching
/// `NVIDIAGPU_NAME`.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct NvidiaGpuModel {
// The validated model name, stored exactly as it was supplied.
inner: String,
}

lazy_static! {
    /// Pattern for NVIDIA GPU model names: one lowercase ASCII letter, a model number, a dot,
    /// and a memory size suffixed with "gb" (e.g. "a100.40gb", "h100.141gb").
    ///
    /// `[0-9]` is used instead of `\d` because `\d` is Unicode-aware in the `regex` crate and
    /// would also accept non-ASCII decimal digits.
    pub(crate) static ref NVIDIAGPU_NAME: Regex =
        Regex::new(r"^([a-z])([0-9]+)\.([0-9]+)gb$").unwrap();
}

impl TryFrom<&str> for NvidiaGpuModel {
type Error = error::Error;

fn try_from(input: &str) -> Result<Self, Self::Error> {
ensure!(
NVIDIAGPU_NAME.is_match(input),
error::PatternSnafu {
thing: "NVIDIA GPU Model",
pattern: NVIDIAGPU_NAME.clone(),
input
}
);

Ok(NvidiaGpuModel {
inner: input.to_string(),
})
}
}

// NOTE(review): `string_impls_for!` presumably generates the string-based trait impls
// (serde, Display, ...) on top of the `TryFrom<&str>` impl; the tests in this file do
// round-trip this type as a JSON map key. Confirm against the macro's definition.
string_impls_for!(NvidiaGpuModel, "NvidiaGpuModel");

/// MigProfile is a validated MIG profile: either a full profile name such as "1g.5gb" or one
/// of the bare values "1", "2", "3", "4" or "7".
///
/// Values are built through `TryFrom<&str>`, which performs the validation.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct MigProfile {
// The validated profile, stored exactly as it was supplied.
inner: String,
}

lazy_static! {
    /// Pattern for full MIG profile names: a single digit followed by "g.", then a memory
    /// size suffixed with "gb" (e.g. "1g.5gb", "7g.141gb").
    ///
    /// `[0-9]` is used instead of `\d` because `\d` is Unicode-aware in the `regex` crate and
    /// would also accept non-ASCII decimal digits.
    pub(crate) static ref MIGPROFILE_NAME: Regex = Regex::new(r"^[0-9]g\.[0-9]+gb$").unwrap();
}

impl TryFrom<&str> for MigProfile {
    type Error = error::Error;

    /// Builds a `MigProfile` from `input`.
    ///
    /// Two forms are accepted: one of the bare values "1", "2", "3", "4" or "7", or a full
    /// profile name matching `MIGPROFILE_NAME` (e.g. "1g.5gb").
    ///
    /// # Errors
    ///
    /// Returns a pattern error when `input` matches neither form. The error reports only the
    /// `MIGPROFILE_NAME` pattern, not the bare values.
    fn try_from(input: &str) -> Result<Self, Self::Error> {
        let slice_format = matches!(input, "1" | "2" | "3" | "4" | "7");

        ensure!(
            // Logical OR short-circuits, so the regex is only consulted when the shorthand
            // form did not already match.
            slice_format || MIGPROFILE_NAME.is_match(input),
            error::PatternSnafu {
                thing: "MIG Profile",
                pattern: MIGPROFILE_NAME.clone(),
                input
            }
        );

        Ok(MigProfile {
            inner: input.to_string(),
        })
    }
}

// NOTE(review): `string_impls_for!` presumably generates the string-based trait impls
// (serde, Display, ...) on top of the `TryFrom<&str>` impl; the tests in this file do
// round-trip this type as a JSON string value. Confirm against the macro's definition.
string_impls_for!(MigProfile, "MigProfile");

#[cfg(test)]
mod tests {
mod test_nvidia_device_plugins {
use super::*;

#[test]
fn valid_gpu_model() {
    // Every name follows `<letter><model>.<memory>gb` and must be accepted.
    let accepted = ["a100.40gb", "a100.80gb", "h100.80gb", "h100.141gb"];
    for model in accepted.iter() {
        let parsed = NvidiaGpuModel::try_from(*model);
        assert!(parsed.is_ok());
    }
}

#[test]
fn invalid_gpu_model() {
    // Inputs with no resemblance to a model name.
    assert!(NvidiaGpuModel::try_from("invalid").is_err());
    assert!(NvidiaGpuModel::try_from("1000").is_err());

    // Near misses: each breaks exactly one part of `<letter><model>.<memory>gb`.
    for bad in &[
        "",           // empty input
        "A100.40gb",  // uppercase model letter
        "a100.40",    // missing "gb" suffix
        "a100gb",     // missing ".<memory>" part
        "100.40gb",   // missing leading letter
        " a100.40gb", // leading whitespace
    ] {
        assert!(NvidiaGpuModel::try_from(*bad).is_err());
    }
}

#[test]
fn valid_mig_profile() {
    // Full `<slices>g.<memory>gb` profile names.
    let profile_names = [
        "1g.5gb", "2g.10gb", "3g.20gb", "7g.40gb", "1g.10gb", "1g.20gb", "2g.20gb", "3g.40gb",
        "7g.80gb", "1g.18gb", "1g.35gb", "2g.35gb", "3g.71gb", "7g.141gb",
    ];
    // Bare values accepted as an alternative form.
    let bare_values = ["1", "2", "3", "4", "7"];

    for profile in profile_names.iter().chain(bare_values.iter()) {
        let parsed = MigProfile::try_from(*profile);
        assert!(parsed.is_ok());
    }
}

#[test]
fn invalid_mig_profile() {
    // Inputs with no resemblance to a MIG profile.
    assert!(MigProfile::try_from("invalid").is_err());
    assert!(MigProfile::try_from("1000").is_err());

    // Near misses: bare values outside the accepted set and malformed profile names.
    for bad in &[
        "",         // empty input
        "0",        // bare value not in {1, 2, 3, 4, 7}
        "5",        // bare value not in {1, 2, 3, 4, 7}
        "8",        // bare value not in {1, 2, 3, 4, 7}
        "1g.5",     // missing "gb" suffix
        "1g.gb",    // missing memory size
        "1G.5GB",   // uppercase units
        "10g.5gb",  // more than one digit before "g"
        " 1g.5gb",  // leading whitespace
    ] {
        assert!(MigProfile::try_from(*bad).is_err());
    }
}

#[test]
fn test_serde_nvidia_device_plugins() {
let test_json = r#"{"pass-device-specs":false,"device-id-strategy":"uuid","device-list-strategy":"envvar"}"#;
Expand All @@ -1515,7 +1622,9 @@ mod tests {
device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid),
device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar),
device_sharing_strategy: None,
time_slicing: None
time_slicing: None,
device_partitioning_strategy: None,
mig: None
}
);
let results = serde_json::to_string(&nvidia_device_plugins).unwrap();
Expand All @@ -1534,7 +1643,9 @@ mod tests {
device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid),
device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar),
device_sharing_strategy: Some(NvidiaDeviceSharingStrategy::TimeSlicing),
time_slicing: None
time_slicing: None,
device_partitioning_strategy: None,
mig: None
}
);

Expand All @@ -1548,4 +1659,53 @@ mod tests {
let result: Result<NvidiaDevicePluginSettings, _> = serde_json::from_str(test_json);
assert!(result.is_err(), "The JSON should not be parsed successfully as it contains an invalid value for 'replicas'.");
}

#[test]
fn test_serde_nvidia_device_plugins_with_mig() {
    // Round-trip: the partitioning strategy is set to "mig" but no `mig` table is present.
    let input = r#"{"pass-device-specs":false,"device-id-strategy":"uuid","device-list-strategy":"envvar","device-partitioning-strategy":"mig"}"#;

    let parsed: NvidiaDevicePluginSettings = serde_json::from_str(input).unwrap();
    let expected = NvidiaDevicePluginSettings {
        pass_device_specs: Some(false),
        device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid),
        device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar),
        device_sharing_strategy: None,
        time_slicing: None,
        device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
        mig: None,
    };
    assert_eq!(parsed, expected);

    // Serializing the parsed value must reproduce the original JSON exactly.
    let serialized = serde_json::to_string(&parsed).unwrap();
    assert_eq!(serialized, input);
}

#[test]
fn test_serde_nvidia_device_plugins_with_mig_profile() {
    // Round-trip: "mig" partitioning together with a single GPU-model-to-profile mapping.
    let input = r#"{"pass-device-specs":false,"device-id-strategy":"uuid","device-list-strategy":"envvar","device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}"#;

    let parsed: NvidiaDevicePluginSettings = serde_json::from_str(input).unwrap();

    let gpu_model = NvidiaGpuModel::try_from("a100.40gb").unwrap();
    let mig_profile = MigProfile::try_from("1g.5gb").unwrap();
    let expected = NvidiaDevicePluginSettings {
        pass_device_specs: Some(false),
        device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid),
        device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar),
        device_sharing_strategy: None,
        time_slicing: None,
        device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
        mig: Some(NvidiaMIGSettings {
            profile: Some(HashMap::from([(gpu_model, mig_profile)])),
        }),
    };
    assert_eq!(parsed, expected);

    // Serializing the parsed value must reproduce the original JSON exactly.
    let serialized = serde_json::to_string(&parsed).unwrap();
    assert_eq!(serialized, input);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ impl SettingsModel for KubeletDevicePluginsV1 {
mod test {
use super::*;
use bottlerocket_modeled_types::{
NvidiaDeviceIdStrategy, NvidiaDeviceListStrategy, NvidiaDeviceSharingStrategy,
NvidiaTimeSlicingSettings,
MigProfile, NvidiaDeviceIdStrategy, NvidiaDeviceListStrategy,
NvidiaDevicePartitioningStrategy, NvidiaDeviceSharingStrategy, NvidiaGpuModel,
NvidiaMIGSettings, NvidiaTimeSlicingSettings,
};
use bounded_integer::BoundedI32;
use std::collections::HashMap;

#[test]
fn test_generate_kubelet_device_plugins() {
Expand All @@ -59,7 +61,7 @@ mod test {

#[test]
fn test_serde_kubelet_device_plugins() {
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true}}}"#;
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;

let device_plugins: KubeletDevicePluginsV1 = serde_json::from_str(test_json).unwrap();
assert_eq!(
Expand All @@ -75,6 +77,13 @@ mod test {
rename_by_default: Some(true),
fail_requests_greater_than_one: Some(true),
}),
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
mig: Some(NvidiaMIGSettings {
profile: Some(HashMap::from([(
NvidiaGpuModel::try_from("a100.40gb").unwrap(),
MigProfile::try_from("1g.5gb").unwrap()
)]))
}),
}),
}
);
Expand Down
2 changes: 1 addition & 1 deletion bottlerocket-settings-models/settings-models/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "bottlerocket-settings-models"
version = "0.7.0"
version = "0.8.0"
authors = ["Tom Kirchner <tjk@amazon.com>"]
license = "Apache-2.0 OR MIT"
edition = "2021"
Expand Down

0 comments on commit cb83a60

Please sign in to comment.