Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add nvidia MIG Settings #63

Merged
merged 1 commit into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ bottlerocket-template-helper = { path = "./bottlerocket-template-helper", versio

# Settings Models
bottlerocket-model-derive = { path = "./bottlerocket-settings-models/model-derive", version = "0.1" }
bottlerocket-modeled-types = { path = "./bottlerocket-settings-models/modeled-types", version = "0.7" }
bottlerocket-modeled-types = { path = "./bottlerocket-settings-models/modeled-types", version = "0.8" }
bottlerocket-scalar = { path = "./bottlerocket-settings-models/scalar", version = "0.1" }
bottlerocket-scalar-derive = { path = "./bottlerocket-settings-models/scalar-derive", version = "0.1" }
bottlerocket-string-impls-for = { path = "./bottlerocket-settings-models/string-impls-for", version = "0.1" }
Expand All @@ -75,7 +75,7 @@ settings-extension-ecs = { path = "./bottlerocket-settings-models/settings-exten
settings-extension-host-containers = { path = "./bottlerocket-settings-models/settings-extensions/host-containers", version = "0.1" }
settings-extension-kernel = { path = "./bottlerocket-settings-models/settings-extensions/kernel", version = "0.1" }
settings-extension-kubernetes = { path = "./bottlerocket-settings-models/settings-extensions/kubernetes", version = "0.2" }
settings-extension-kubelet-device-plugins = { path = "./bottlerocket-settings-models/settings-extensions/kubelet-device-plugins", version = "0.1" }
settings-extension-kubelet-device-plugins = { path = "./bottlerocket-settings-models/settings-extensions/kubelet-device-plugins", version = "0.2" }
settings-extension-metrics = { path = "./bottlerocket-settings-models/settings-extensions/metrics", version = "0.1" }
settings-extension-motd = { path = "./bottlerocket-settings-models/settings-extensions/motd", version = "0.1" }
settings-extension-network = { path = "./bottlerocket-settings-models/settings-extensions/network", version = "0.1" }
Expand Down
2 changes: 1 addition & 1 deletion bottlerocket-settings-models/modeled-types/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "bottlerocket-modeled-types"
version = "0.7.0"
version = "0.8.0"
authors = []
license = "Apache-2.0 OR MIT"
edition = "2021"
Expand Down
176 changes: 172 additions & 4 deletions bottlerocket-settings-models/modeled-types/src/kubernetes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1461,14 +1461,16 @@ mod test_hostname_override_source {

// =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^=

/// NvidiaRuntimeSettings contains the container runtime settings for Nvidia gpu.
/// NvidiaDevicePluginSettings contains the device sharing and partitioning related settings for Nvidia gpu.
///
/// NOTE(review): the tests in this file populate every field as `Some(..)`/`None` and use
/// kebab-case JSON keys (e.g. `pass-device-specs`), so the `model` attribute presumably wraps
/// each field in `Option` and applies kebab-case renaming -- confirm against the model-derive
/// crate.
#[model(impl_default = true)]
pub struct NvidiaDevicePluginSettings {
// JSON key `pass-device-specs`.
pass_device_specs: bool,
// JSON key `device-id-strategy`.
device_id_strategy: NvidiaDeviceIdStrategy,
// JSON key `device-list-strategy`.
device_list_strategy: NvidiaDeviceListStrategy,
// JSON key `device-sharing-strategy`.
device_sharing_strategy: NvidiaDeviceSharingStrategy,
// JSON key `time-slicing`; nested settings object.
time_slicing: NvidiaTimeSlicingSettings,
// JSON key `device-partitioning-strategy`; the accepted values are the lowercase variant
// names of `NvidiaDevicePartitioningStrategy`.
device_partitioning_strategy: NvidiaDevicePartitioningStrategy,
// JSON key `mig`; nested object holding the GPU-model -> MIG-profile map.
mig: NvidiaMigSettings,
}

#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
Expand Down Expand Up @@ -1499,10 +1501,123 @@ pub struct NvidiaTimeSlicingSettings {
fail_requests_greater_than_one: bool,
}

/// How an NVIDIA GPU is partitioned by the device plugin.
///
/// Variants are (de)serialized using their lowercase names, i.e. `"none"` and `"mig"`.
/// `Eq` and `Hash` are derived alongside `PartialEq` so the type matches the derive set used by
/// the sibling strategy enums in this file and can be used as a map/set key.
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum NvidiaDevicePartitioningStrategy {
    /// No partitioning. This is the value produced by `Default`.
    #[default]
    None,
    /// Partition the GPU using MIG (NVIDIA Multi-Instance GPU).
    MIG,
}

/// MIG-related settings for the NVIDIA device plugin.
///
/// Currently this only carries the mapping from a GPU model to the MIG profile to use on it.
#[model(impl_default = true)]
pub struct NvidiaMigSettings {
// Map of validated GPU model name (e.g. `a100.40gb`) to validated MIG profile
// (e.g. `1g.5gb`); appears under the JSON key `profile` in this file's tests.
profile: HashMap<NvidiaGpuModel, MigProfile>,
}

/// A validated NVIDIA GPU model identifier such as `a100.40gb`.
///
/// Instances are created through `TryFrom<&str>`, which only accepts strings matching
/// `NVIDIAGPU_NAME`. `Eq` and `Hash` are derived so the type can serve as a `HashMap` key.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct NvidiaGpuModel {
// The original, already-validated input string.
inner: String,
}

// Pattern for GPU model names: exactly one lowercase ASCII letter, one or more digits, a literal
// dot, one or more digits, and the literal suffix `gb` (e.g. `h100.141gb`). The capture groups
// are not read by the validation code in this file, which only calls `is_match`.
lazy_static! {
pub(crate) static ref NVIDIAGPU_NAME: Regex = Regex::new(r"^([a-z])(\d+)\.(\d+)gb$").unwrap();
}

impl TryFrom<&str> for NvidiaGpuModel {
type Error = error::Error;

fn try_from(input: &str) -> Result<Self, Self::Error> {
ensure!(
NVIDIAGPU_NAME.is_match(input),
error::PatternSnafu {
thing: "NVIDIA GPU Model",
pattern: NVIDIAGPU_NAME.clone(),
input
}
);

Ok(NvidiaGpuModel {
inner: input.to_string(),
})
}
}

string_impls_for!(NvidiaGpuModel, "NvidiaGpuModel");

/// A validated MIG profile, either a full profile name such as `1g.5gb` or one of the bare
/// slice counts accepted by the `TryFrom<&str>` implementation (`1`, `2`, `3`, `4`, `7`).
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct MigProfile {
// The original, already-validated input string.
inner: String,
}

// Pattern for full MIG profile names: a single digit, the letter `g`, a literal dot, one or
// more digits, and the literal suffix `gb` (e.g. `3g.20gb`). Bare slice counts are not covered
// by this pattern; `MigProfile::try_from` checks those separately.
lazy_static! {
pub(crate) static ref MIGPROFILE_NAME: Regex = Regex::new(r"^[0-9]g\.\d+gb$").unwrap();
}

impl TryFrom<&str> for MigProfile {
    type Error = error::Error;

    /// Builds a `MigProfile` from `input`.
    ///
    /// Two spellings are accepted:
    /// * a bare slice count, exactly one of `"1"`, `"2"`, `"3"`, `"4"` or `"7"`;
    /// * a full profile name matching `MIGPROFILE_NAME`, e.g. `"1g.5gb"`.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `error::PatternSnafu` when `input` is in neither form.
    /// Note that the reported pattern is only the `MIGPROFILE_NAME` regex; the bare slice
    /// counts are not mentioned in it.
    fn try_from(input: &str) -> Result<Self, Self::Error> {
        let slice_format = matches!(input, "1" | "2" | "3" | "4" | "7");

        ensure!(
            // Logical (short-circuit) OR: the regex is only consulted when the input is not
            // already one of the bare slice counts.
            slice_format || MIGPROFILE_NAME.is_match(input),
            error::PatternSnafu {
                thing: "MIG Profile",
                pattern: MIGPROFILE_NAME.clone(),
                input
            }
        );

        Ok(MigProfile {
            inner: input.to_string(),
        })
    }
}

// NOTE(review): presumably generates the string-based conversions and serde impls for the
// type (the tests deserialize it straight from JSON strings) -- confirm in string-impls-for.
string_impls_for!(MigProfile, "MigProfile");

#[cfg(test)]
mod tests {
mod test_nvidia_device_plugins {
use super::*;

#[test]
fn valid_gpu_model() {
    // Every one of these must be accepted by the GPU model validator.
    let accepted = ["a100.40gb", "a100.80gb", "h100.80gb", "h100.141gb"];
    assert!(accepted
        .iter()
        .all(|model| NvidiaGpuModel::try_from(*model).is_ok()));
}

#[test]
fn invalid_gpu_model() {
    // Free text, digits only, uppercase letters, and a missing `gb` suffix are all rejected.
    let rejected = ["invalid", "1000", "A100.40GB", "a100.40"];
    for candidate in rejected {
        assert!(NvidiaGpuModel::try_from(candidate).is_err());
    }
}

#[test]
fn valid_mig_profile() {
    // Full `<slices>g.<memory>gb` profile names.
    let named = [
        "1g.5gb", "2g.10gb", "3g.20gb", "7g.40gb", "1g.10gb", "1g.20gb", "2g.20gb", "3g.40gb",
        "7g.80gb", "1g.18gb", "1g.35gb", "2g.35gb", "3g.71gb", "7g.141gb",
    ];
    // Bare slice counts accepted as a shorthand.
    let slice_counts = ["1", "2", "3", "4", "7"];

    assert!(named
        .iter()
        .chain(slice_counts.iter())
        .all(|profile| MigProfile::try_from(*profile).is_ok()));
}

#[test]
fn invalid_mig_profile() {
    // Covers free text, unsupported slice counts, uppercase suffixes, a missing dot, a missing
    // slice count, and a missing memory size.
    let rejected = [
        "invalid",
        "1000",
        "5",
        "10g.100GB",
        "1g.10GB",
        "1g10gb",
        "g.10gb",
        "1g.gb",
    ];
    for candidate in rejected {
        assert!(MigProfile::try_from(candidate).is_err());
    }
}

#[test]
fn test_serde_nvidia_device_plugins() {
let test_json = r#"{"pass-device-specs":false,"device-id-strategy":"uuid","device-list-strategy":"envvar"}"#;
Expand All @@ -1515,7 +1630,9 @@ mod tests {
device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid),
device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar),
device_sharing_strategy: None,
time_slicing: None
time_slicing: None,
device_partitioning_strategy: None,
mig: None
}
);
let results = serde_json::to_string(&nvidia_device_plugins).unwrap();
Expand All @@ -1534,7 +1651,9 @@ mod tests {
device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid),
device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar),
device_sharing_strategy: Some(NvidiaDeviceSharingStrategy::TimeSlicing),
time_slicing: None
time_slicing: None,
device_partitioning_strategy: None,
mig: None
}
);

Expand All @@ -1548,4 +1667,53 @@ mod tests {
let result: Result<NvidiaDevicePluginSettings, _> = serde_json::from_str(test_json);
assert!(result.is_err(), "The JSON should not be parsed successfully as it contains an invalid value for 'replicas'.");
}

#[test]
fn test_serde_nvidia_device_plugins_with_mig() {
    // Round trip: JSON -> settings -> JSON, with the partitioning strategy set and no profile map.
    let input = r#"{"pass-device-specs":false,"device-id-strategy":"uuid","device-list-strategy":"envvar","device-partitioning-strategy":"mig"}"#;

    let parsed: NvidiaDevicePluginSettings = serde_json::from_str(input).unwrap();
    let expected = NvidiaDevicePluginSettings {
        pass_device_specs: Some(false),
        device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid),
        device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar),
        device_sharing_strategy: None,
        time_slicing: None,
        device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
        mig: None,
    };
    assert_eq!(parsed, expected);

    // Serializing again must reproduce the input byte for byte.
    let reserialized = serde_json::to_string(&parsed).unwrap();
    assert_eq!(reserialized, input);
}

#[test]
fn test_serde_nvidia_device_plugins_with_mig_profile() {
    // Round trip including a single GPU-model -> MIG-profile entry.
    let input = r#"{"pass-device-specs":false,"device-id-strategy":"uuid","device-list-strategy":"envvar","device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}"#;

    let mut profiles = HashMap::new();
    profiles.insert(
        NvidiaGpuModel::try_from("a100.40gb").unwrap(),
        MigProfile::try_from("1g.5gb").unwrap(),
    );
    let expected = NvidiaDevicePluginSettings {
        pass_device_specs: Some(false),
        device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid),
        device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar),
        device_sharing_strategy: None,
        time_slicing: None,
        device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
        mig: Some(NvidiaMigSettings {
            profile: Some(profiles),
        }),
    };

    let parsed: NvidiaDevicePluginSettings = serde_json::from_str(input).unwrap();
    assert_eq!(parsed, expected);

    // Serializing again must reproduce the input byte for byte.
    let reserialized = serde_json::to_string(&parsed).unwrap();
    assert_eq!(reserialized, input);
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "settings-extension-kubelet-device-plugins"
version = "0.1.0"
version = "0.2.0"
authors = ["Arnaldo Garcia Rincon <agarrcia@amazon.com>"]
license = "Apache-2.0 OR MIT"
edition = "2021"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ impl SettingsModel for KubeletDevicePluginsV1 {
mod test {
use super::*;
use bottlerocket_modeled_types::{
NvidiaDeviceIdStrategy, NvidiaDeviceListStrategy, NvidiaDeviceSharingStrategy,
NvidiaTimeSlicingSettings,
MigProfile, NvidiaDeviceIdStrategy, NvidiaDeviceListStrategy,
NvidiaDevicePartitioningStrategy, NvidiaDeviceSharingStrategy, NvidiaGpuModel,
NvidiaMigSettings, NvidiaTimeSlicingSettings,
};
use bounded_integer::BoundedI32;
use std::collections::HashMap;

#[test]
fn test_generate_kubelet_device_plugins() {
Expand All @@ -59,7 +61,7 @@ mod test {

#[test]
fn test_serde_kubelet_device_plugins() {
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true}}}"#;
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;

let device_plugins: KubeletDevicePluginsV1 = serde_json::from_str(test_json).unwrap();
assert_eq!(
Expand All @@ -75,6 +77,13 @@ mod test {
rename_by_default: Some(true),
fail_requests_greater_than_one: Some(true),
}),
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
mig: Some(NvidiaMigSettings {
profile: Some(HashMap::from([(
NvidiaGpuModel::try_from("a100.40gb").unwrap(),
MigProfile::try_from("1g.5gb").unwrap()
)]))
}),
}),
}
);
Expand Down
2 changes: 1 addition & 1 deletion bottlerocket-settings-models/settings-models/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "bottlerocket-settings-models"
version = "0.7.0"
version = "0.8.0"
authors = ["Tom Kirchner <tjk@amazon.com>"]
license = "Apache-2.0 OR MIT"
edition = "2021"
Expand Down