Skip to content

Commit

Permalink
nvidia-mig: nvidia-mig implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
piyush-jena committed Feb 4, 2025
1 parent 5902dc0 commit cf9e015
Show file tree
Hide file tree
Showing 16 changed files with 1,010 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ std = { version = "v1", helpers = ["default"] }
+++
version: v1
flags:
{{#if (eq settings.kubelet-device-plugins.nvidia.device-partitioning-strategy "mig")}}
migStrategy: "single"
{{else}}
migStrategy: "none"
{{/if}}
failOnInitError: true
plugin:
passDeviceSpecs: {{default true settings.kubelet-device-plugins.nvidia.pass-device-specs}}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[required-extensions]
kubelet-device-plugins = "v1"
std = { version = "v1", helpers = ["if_not_null", "toml_encode"]}
+++
{{#if_not_null settings.kubelet-device-plugins.nvidia.device-partitioning-strategy}}
device-partitioning-strategy = "{{{settings.kubelet-device-plugins.nvidia.device-partitioning-strategy}}}"
{{/if_not_null}}
{{#if_not_null settings.kubelet-device-plugins.nvidia.mig.profile}}
profile = {{ toml_encode settings.kubelet-device-plugins.nvidia.mig.profile }}
{{/if_not_null}}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Source0: https://%{goimport}/archive/v%{gover}/v%{gover}.tar.gz#/k8s-device-plug
Source1: nvidia-k8s-device-plugin.service
Source2: nvidia-k8s-device-plugin-conf
Source3: nvidia-k8s-device-plugin-exec-start-conf
Source4: nvidia-k8s-device-plugin-mig-conf


BuildRequires: %{_cross_os}glibc-devel
Expand Down Expand Up @@ -69,6 +70,7 @@ install -p -m 0644 %{S:1} %{buildroot}%{_cross_unitdir}
install -d %{buildroot}%{_cross_unitdir}/nvidia-k8s-device-plugin.service.d
install -D -m 0644 %{S:2} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-plugin-conf
install -D -m 0644 %{S:3} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-plugin-exec-start-conf
install -D -m 0644 %{S:4} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-plugin-mig-conf


%files
Expand All @@ -78,6 +80,7 @@ install -D -m 0644 %{S:3} %{buildroot}%{_cross_templatedir}/nvidia-k8s-device-pl
%dir %{_cross_unitdir}/nvidia-k8s-device-plugin.service.d
%{_cross_templatedir}/nvidia-k8s-device-plugin-conf
%{_cross_templatedir}/nvidia-k8s-device-plugin-exec-start-conf
%{_cross_templatedir}/nvidia-k8s-device-plugin-mig-conf

%files bin
%{_cross_bindir}/nvidia-device-plugin
Expand Down
2 changes: 2 additions & 0 deletions packages/os/mig-nvidia-fabricmanager.service.drop-in.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Drop-in that gates a service on a MIG/fabric-manager compatibility check.
# (The os.spec in this commit installs it as
# nvidia-fabricmanager.service.d/mig.conf.)
[Service]
# ExecCondition= runs before the unit's start commands. Per systemd.service(5),
# an exit status of 1-254 makes systemd skip the rest of the unit without
# marking it failed; 255 or an abnormal exit marks the unit as failed.
ExecCondition=/usr/bin/nvidia-migmanager is-fabric-manager-compatible
2 changes: 2 additions & 0 deletions packages/os/mig-reboot-if-required.service.drop-in.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Drop-in that adds an nvidia-migmanager step to a service.
# (The os.spec in this commit installs it as
# reboot-if-required.service.d/mig-gpu-reset.conf.)
[Service]
# The leading "-" tells systemd to record a failing exit status but otherwise
# treat it as success, so a failure of this command does not fail the unit.
# NOTE(review): no empty "ExecStart=" reset precedes this line, so it is added
# to the unit's existing ExecStart= commands rather than replacing them;
# multiple ExecStart= lines are only valid for Type=oneshot units - confirm
# that reboot-if-required.service is a oneshot.
ExecStart=-/usr/bin/nvidia-migmanager reboot-if-required
2 changes: 2 additions & 0 deletions packages/os/nvidia-migmanager-tmpfiles.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Directories created by systemd-tmpfiles; presumably the config and runtime
# state locations of nvidia-migmanager (inferred from the paths - confirm).
# Fields: type  path  mode  user  group  age
#   "d" creates the directory if it does not exist yet;
#   "-" in the age field disables age-based cleanup of its contents.
d /etc/nvidia-migmanager 0750 root root -
d /run/nvidia-migmanager 0755 root root -
15 changes: 15 additions & 0 deletions packages/os/nvidia-migmanager.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# One-shot unit that runs `nvidia-migmanager apply-mig`.
[Unit]
Description=NVIDIA MIG manager service
# Ordering only: if these units are started in the same transaction, this unit
# waits for them. After= does not pull them in by itself.
After=nvidia-fabricmanager.service nvidia-persistenced.service
# Refuse explicit start/stop requests from a user; the unit can only be
# started or stopped as a dependency of another unit.
RefuseManualStart=true
RefuseManualStop=true

[Service]
# oneshot: systemd waits for the ExecStart= command to exit before it
# considers the unit started and proceeds with units ordered after it.
Type=oneshot
ExecStart=/usr/bin/nvidia-migmanager apply-mig
# Keep the unit in the "active" state after the command has exited.
RemainAfterExit=true
# Send the command's stderr to both the journal and the system console.
StandardError=journal+console
# Tag used for this service's log lines.
SyslogIdentifier=nvidia-migmanager

[Install]
# When enabled, the unit is wanted (pulled in) by configured.target.
WantedBy=configured.target
28 changes: 27 additions & 1 deletion packages/os/os.spec
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Source121: warm-pool-wait.service
Source122: has-boot-ever-succeeded.service
Source123: pluto.service
Source124: bootstrap-commands.service
Source125: nvidia-migmanager.service

# 2xx sources: tmpfilesd configs
Source200: migration-tmpfiles.conf
Expand All @@ -61,6 +62,9 @@ Source202: thar-be-updates-tmpfiles.conf
Source203: bootstrap-containers-tmpfiles.conf
Source204: storewolf-tmpfiles.conf
Source205: bootstrap-commands-tmpfiles.conf
Source206: nvidia-migmanager-tmpfiles.conf
Source207: mig-nvidia-fabricmanager.service.drop-in.conf
Source208: mig-reboot-if-required.service.drop-in.conf

# 3xx sources: udev rules
Source300: ephemeral-storage.rules
Expand Down Expand Up @@ -107,6 +111,7 @@ Requires: (%{_cross_os}cfsignal if %{_cross_os}variant-platform(aws))
Requires: (%{_cross_os}warm-pool-wait if %{_cross_os}variant-family(aws-k8s))

Requires: (%{_cross_os}driverdog if %{_cross_os}variant-flavor(nvidia))
Requires: (%{_cross_os}nvidia-migmanager if %{_cross_os}variant-flavor(nvidia))

%description
%{summary}.
Expand Down Expand Up @@ -425,6 +430,11 @@ Summary: XFS progs cli
%description -n %{_cross_os}xfscli
%{summary}.

%package -n %{_cross_os}nvidia-migmanager
Summary: Manages NVIDIA MIG configuration
%description -n %{_cross_os}nvidia-migmanager
%{summary}.

%prep
%setup -T -c
%cargo_prep
Expand Down Expand Up @@ -545,6 +555,7 @@ echo "** Output from non-static builds:"
-p xfscli \
-p shibaken \
-p driverdog \
-p nvidia-migmanager \
%{nil}

# Wait for fips builds from the background, if they're not already done.
Expand Down Expand Up @@ -608,6 +619,7 @@ for p in \
kubernetes-cis-checks \
shibaken \
driverdog \
nvidia-migmanager \
; do
install -p -m 0755 %{__cargo_outdir}/${p} %{buildroot}%{_cross_bindir}
done
Expand Down Expand Up @@ -705,7 +717,7 @@ install -d %{buildroot}%{_cross_unitdir}
install -p -m 0644 \
%{S:100} %{S:102} %{S:103} %{S:105} \
%{S:106} %{S:107} %{S:110} %{S:111} %{S:112} \
%{S:113} %{S:114} %{S:120} %{S:122} %{S:123} %{S:124} \
%{S:113} %{S:114} %{S:120} %{S:122} %{S:123} %{S:124} %{S:125} \
%{buildroot}%{_cross_unitdir}

install -p -m 0644 %{S:10} %{buildroot}%{_cross_templatedir}
Expand All @@ -721,6 +733,13 @@ install -p -m 0644 %{S:202} %{buildroot}%{_cross_tmpfilesdir}/thar-be-updates.co
install -p -m 0644 %{S:203} %{buildroot}%{_cross_tmpfilesdir}/bootstrap-containers.conf
install -p -m 0644 %{S:204} %{buildroot}%{_cross_tmpfilesdir}/storewolf.conf
install -p -m 0644 %{S:205} %{buildroot}%{_cross_tmpfilesdir}/bootstrap-commands.conf
install -p -m 0644 %{S:206} %{buildroot}%{_cross_tmpfilesdir}/nvidia-migmanager.conf

install -d %{buildroot}%{_cross_unitdir}/nvidia-fabricmanager.service.d
install -p -m 0644 %{S:207} %{buildroot}%{_cross_unitdir}/nvidia-fabricmanager.service.d/mig.conf

install -d %{buildroot}%{_cross_unitdir}/reboot-if-required.service.d
install -p -m 0644 %{S:208} %{buildroot}%{_cross_unitdir}/reboot-if-required.service.d/mig-gpu-reset.conf

install -d %{buildroot}%{_cross_udevrulesdir}
install -p -m 0644 %{S:300} %{buildroot}%{_cross_udevrulesdir}/80-ephemeral-storage.rules
Expand Down Expand Up @@ -927,4 +946,11 @@ install -p -m 0644 %{S:400} %{S:401} %{S:402} %{buildroot}%{_cross_licensedir}
%{_cross_sbindir}/xfs_info
%{_cross_sbindir}/fsck.xfs

%files -n %{_cross_os}nvidia-migmanager
%{_cross_bindir}/nvidia-migmanager
%{_cross_unitdir}/nvidia-migmanager.service
%{_cross_tmpfilesdir}/nvidia-migmanager.conf
%{_cross_unitdir}/nvidia-fabricmanager.service.d/mig.conf
%{_cross_unitdir}/reboot-if-required.service.d/mig-gpu-reset.conf

%changelog
18 changes: 18 additions & 0 deletions sources/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions sources/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ members = [

"netdog",

"nvidia-migmanager",

"cfsignal",

"logdog",
Expand Down
28 changes: 28 additions & 0 deletions sources/nvidia-migmanager/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[package]
name = "nvidia-migmanager"
version = "0.1.0"
authors = ["Piyush Jena <jepiyush@amazon.com>"]
license = "Apache-2.0 OR MIT"
edition = "2021"
publish = false
build = "build.rs"
# Don't rebuild crate just because of changes to README.
exclude = ["README.md"]

[dependencies]
argh.workspace = true
base64.workspace = true
constants.workspace = true
log.workspace = true
regex.workspace = true
serde = { workspace = true, features = ["derive"] }
serde_plain.workspace = true
simplelog.workspace = true
snafu.workspace = true
toml.workspace = true

[dev-dependencies]
tempfile.workspace = true

[build-dependencies]
generate-readme.workspace = true
30 changes: 30 additions & 0 deletions sources/nvidia-migmanager/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# nvidia-migmanager

Current version: 0.1.0

## NVIDIA MIG Manager
`nvidia-migmanager` ensures that MIG settings are applied to an instance that supports
it. It is called by `nvidia-migmanager.service`.

The binary first checks if MIG is activated by checking the value of
`settings.kubelet-device-plugins.nvidia.device-partitioning-strategy`. Then, it activates
MIG and applies the profile according to the type of GPU present in the instance.

NVIDIA MIG is currently supported only on A30, A100, H100, and H200 GPUs.

### Example:
```toml
[settings.kubelet-device-plugins.nvidia]
device-partitioning-strategy="mig"

[settings.kubelet-device-plugins.nvidia.mig.profile]
"a100.40gb"="2"
"h100.80gb"="4"
"h200.141gb"="3"
```
This would partition the GPUs on an instance with A100 GPUs into 2 parts, on an instance
with H100 GPUs into 4 parts, and on an instance with H200 GPUs into 3 parts.

## Colophon

This text was generated using [cargo-readme](https://crates.io/crates/cargo-readme), and includes the rustdoc from `src/main.rs`.
9 changes: 9 additions & 0 deletions sources/nvidia-migmanager/README.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# {{crate}}

Current version: {{version}}

{{readme}}

## Colophon

This text was generated using [cargo-readme](https://crates.io/crates/cargo-readme), and includes the rustdoc from `src/main.rs`.
3 changes: 3 additions & 0 deletions sources/nvidia-migmanager/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/// Build-script entry point.
///
/// Delegates to `generate_readme::from_main()`; per this crate's `README.tpl`,
/// the README is generated from the rustdoc in `src/main.rs`.
///
/// # Panics
///
/// Panics (failing the build) if `generate_readme::from_main()` returns an error.
fn main() {
    let outcome = generate_readme::from_main();
    // A README generation failure is fatal for the build, hence `unwrap`.
    outcome.unwrap();
}
Loading

0 comments on commit cf9e015

Please sign in to comment.