From cb83a607ec1934acb1a5e63f4bead7b0cccfd5fc Mon Sep 17 00:00:00 2001 From: Piyush Jena Date: Fri, 10 Jan 2025 22:26:21 +0000 Subject: [PATCH] feat: add nvidia MIG Settings --- Cargo.toml | 2 +- .../modeled-types/Cargo.toml | 2 +- .../modeled-types/src/kubernetes.rs | 168 +++++++++++++++++- .../kubelet-device-plugins/src/lib.rs | 15 +- .../settings-models/Cargo.toml | 2 +- 5 files changed, 179 insertions(+), 10 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1bc93bf..367ebf6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ bottlerocket-template-helper = { path = "./bottlerocket-template-helper", versio # Settings Models bottlerocket-model-derive = { path = "./bottlerocket-settings-models/model-derive", version = "0.1" } -bottlerocket-modeled-types = { path = "./bottlerocket-settings-models/modeled-types", version = "0.7" } +bottlerocket-modeled-types = { path = "./bottlerocket-settings-models/modeled-types", version = "0.8" } bottlerocket-scalar = { path = "./bottlerocket-settings-models/scalar", version = "0.1" } bottlerocket-scalar-derive = { path = "./bottlerocket-settings-models/scalar-derive", version = "0.1" } bottlerocket-string-impls-for = { path = "./bottlerocket-settings-models/string-impls-for", version = "0.1" } diff --git a/bottlerocket-settings-models/modeled-types/Cargo.toml b/bottlerocket-settings-models/modeled-types/Cargo.toml index 771bcf7..96b506b 100644 --- a/bottlerocket-settings-models/modeled-types/Cargo.toml +++ b/bottlerocket-settings-models/modeled-types/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "bottlerocket-modeled-types" -version = "0.7.0" +version = "0.8.0" authors = [] license = "Apache-2.0 OR MIT" edition = "2021" diff --git a/bottlerocket-settings-models/modeled-types/src/kubernetes.rs b/bottlerocket-settings-models/modeled-types/src/kubernetes.rs index 338309b..1c03f72 100644 --- a/bottlerocket-settings-models/modeled-types/src/kubernetes.rs +++ b/bottlerocket-settings-models/modeled-types/src/kubernetes.rs @@ -1461,7 +1461,7 @@ mod test_hostname_override_source { // =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= -/// NvidiaRuntimeSettings contains the container runtime settings for Nvidia gpu. +/// NvidiaDevicePluginSettings contains the device sharing and partitioning related settings for Nvidia gpu. #[model(impl_default = true)] pub struct NvidiaDevicePluginSettings { pass_device_specs: bool, @@ -1469,6 +1469,8 @@ pub struct NvidiaDevicePluginSettings { device_list_strategy: NvidiaDeviceListStrategy, device_sharing_strategy: NvidiaDeviceSharingStrategy, time_slicing: NvidiaTimeSlicingSettings, + device_partitioning_strategy: NvidiaDevicePartitioningStrategy, + mig: NvidiaMIGSettings, } #[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] @@ -1499,10 +1501,115 @@ pub struct NvidiaTimeSlicingSettings { fail_requests_greater_than_one: bool, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)] +#[serde(rename_all = "lowercase")] +pub enum NvidiaDevicePartitioningStrategy { + #[default] + None, + MIG, +} + +#[model(impl_default = true)] +pub struct NvidiaMIGSettings { + profile: HashMap, +} + +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct NvidiaGpuModel { + inner: String, +} + +lazy_static! { + pub(crate) static ref NVIDIAGPU_NAME: Regex = Regex::new(r"^([a-z])(\d+)\.(\d+)gb$").unwrap(); +} + +impl TryFrom<&str> for NvidiaGpuModel { + type Error = error::Error; + + fn try_from(input: &str) -> Result { + ensure!( + NVIDIAGPU_NAME.is_match(input), + error::PatternSnafu { + thing: "NVIDIA GPU Model", + pattern: NVIDIAGPU_NAME.clone(), + input + } + ); + + Ok(NvidiaGpuModel { + inner: input.to_string(), + }) + } +} + +string_impls_for!(NvidiaGpuModel, "NvidiaGpuModel"); + +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct MigProfile { + inner: String, +} + +lazy_static! { + pub(crate) static ref MIGPROFILE_NAME: Regex = Regex::new(r"^[0-9]g\.\d+gb$").unwrap(); +} + +impl TryFrom<&str> for MigProfile { + type Error = error::Error; + + fn try_from(input: &str) -> Result { + let slice_format = matches!(input, "1" | "2" | "3" | "4" | "7"); + + ensure!( + slice_format | MIGPROFILE_NAME.is_match(input), + error::PatternSnafu { + thing: "MIG Profile", + pattern: MIGPROFILE_NAME.clone(), + input + } + ); + + Ok(MigProfile { + inner: input.to_string(), + }) + } +} + +string_impls_for!(MigProfile, "MigProfile"); + #[cfg(test)] -mod tests { +mod test_nvidia_device_plugins { use super::*; + #[test] + fn valid_gpu_model() { + for ok in &["a100.40gb", "a100.80gb", "h100.80gb", "h100.141gb"] { + assert!(NvidiaGpuModel::try_from(*ok).is_ok()); + } + } + + #[test] + fn invalid_gpu_model() { + assert!(NvidiaGpuModel::try_from("invalid").is_err()); + assert!(NvidiaGpuModel::try_from("1000").is_err()); + } + + #[test] + fn valid_mig_profile() { + for ok in &[ + "1g.5gb", "2g.10gb", "3g.20gb", "7g.40gb", "1g.10gb", "1g.20gb", "2g.20gb", "3g.40gb", + "7g.80gb", "1g.18gb", "1g.35gb", "2g.35gb", "3g.71gb", "7g.141gb", "1", "2", "3", "4", + "7", + ] { + assert!(MigProfile::try_from(*ok).is_ok()); + } + } + + #[test] + fn invalid_mig_profile() { + assert!(MigProfile::try_from("invalid").is_err()); + assert!(MigProfile::try_from("1000").is_err()); + } + #[test] fn test_serde_nvidia_device_plugins() { let test_json = r#"{"pass-device-specs":false,"device-id-strategy":"uuid","device-list-strategy":"envvar"}"#; @@ -1515,7 +1622,9 @@ mod tests { device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid), device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar), device_sharing_strategy: None, - time_slicing: None + time_slicing: None, + device_partitioning_strategy: None, + mig: None } ); let results = serde_json::to_string(&nvidia_device_plugins).unwrap(); @@ -1534,7 +1643,9 @@ mod tests { device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid), device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar), device_sharing_strategy: Some(NvidiaDeviceSharingStrategy::TimeSlicing), - time_slicing: None + time_slicing: None, + device_partitioning_strategy: None, + mig: None } ); @@ -1548,4 +1659,53 @@ mod tests { let result: Result = serde_json::from_str(test_json); assert!(result.is_err(), "The JSON should not be parsed successfully as it contains an invalid value for 'replicas'."); } + + #[test] + fn test_serde_nvidia_device_plugins_with_mig() { + let test_json = r#"{"pass-device-specs":false,"device-id-strategy":"uuid","device-list-strategy":"envvar","device-partitioning-strategy":"mig"}"#; + let nvidia_device_plugins: NvidiaDevicePluginSettings = + serde_json::from_str(test_json).unwrap(); + assert_eq!( + nvidia_device_plugins, + NvidiaDevicePluginSettings { + pass_device_specs: Some(false), + device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid), + device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar), + device_sharing_strategy: None, + time_slicing: None, + device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG), + mig: None + } + ); + + let results = serde_json::to_string(&nvidia_device_plugins).unwrap(); + assert_eq!(results, test_json); + } + + #[test] + fn test_serde_nvidia_device_plugins_with_mig_profile() { + let test_json = r#"{"pass-device-specs":false,"device-id-strategy":"uuid","device-list-strategy":"envvar","device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}"#; + let nvidia_device_plugins: NvidiaDevicePluginSettings = + serde_json::from_str(test_json).unwrap(); + assert_eq!( + nvidia_device_plugins, + NvidiaDevicePluginSettings { + pass_device_specs: Some(false), + device_id_strategy: Some(NvidiaDeviceIdStrategy::Uuid), + device_list_strategy: Some(NvidiaDeviceListStrategy::Envvar), + device_sharing_strategy: None, + time_slicing: None, + device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG), + mig: Some(NvidiaMIGSettings { + profile: Some(HashMap::from([( + NvidiaGpuModel::try_from("a100.40gb").unwrap(), + MigProfile::try_from("1g.5gb").unwrap() + )])) + }), + } + ); + + let results = serde_json::to_string(&nvidia_device_plugins).unwrap(); + assert_eq!(results, test_json); + } } diff --git a/bottlerocket-settings-models/settings-extensions/kubelet-device-plugins/src/lib.rs b/bottlerocket-settings-models/settings-extensions/kubelet-device-plugins/src/lib.rs index 9607e29..21a2d16 100644 --- a/bottlerocket-settings-models/settings-extensions/kubelet-device-plugins/src/lib.rs +++ b/bottlerocket-settings-models/settings-extensions/kubelet-device-plugins/src/lib.rs @@ -43,10 +43,12 @@ impl SettingsModel for KubeletDevicePluginsV1 { mod test { use super::*; use bottlerocket_modeled_types::{ - NvidiaDeviceIdStrategy, NvidiaDeviceListStrategy, NvidiaDeviceSharingStrategy, - NvidiaTimeSlicingSettings, + MigProfile, NvidiaDeviceIdStrategy, NvidiaDeviceListStrategy, + NvidiaDevicePartitioningStrategy, NvidiaDeviceSharingStrategy, NvidiaGpuModel, + NvidiaMIGSettings, NvidiaTimeSlicingSettings, }; use bounded_integer::BoundedI32; + use std::collections::HashMap; #[test] fn test_generate_kubelet_device_plugins() { @@ -59,7 +61,7 @@ mod test { #[test] fn test_serde_kubelet_device_plugins() { - let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true}}}"#; + let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#; let device_plugins: KubeletDevicePluginsV1 = serde_json::from_str(test_json).unwrap(); assert_eq!( @@ -75,6 +77,13 @@ mod test { rename_by_default: Some(true), fail_requests_greater_than_one: Some(true), }), + device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG), + mig: Some(NvidiaMIGSettings { + profile: Some(HashMap::from([( + NvidiaGpuModel::try_from("a100.40gb").unwrap(), + MigProfile::try_from("1g.5gb").unwrap() + )])) + }), }), } ); diff --git a/bottlerocket-settings-models/settings-models/Cargo.toml b/bottlerocket-settings-models/settings-models/Cargo.toml index 2063581..599af1f 100644 --- a/bottlerocket-settings-models/settings-models/Cargo.toml +++ b/bottlerocket-settings-models/settings-models/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "bottlerocket-settings-models" -version = "0.7.0" +version = "0.8.0" authors = ["Tom Kirchner "] license = "Apache-2.0 OR MIT" edition = "2021"