-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5235e6a
commit 95944c4
Showing
10 changed files
with
311 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
name: Building and Pushing to MCR | ||
on: [workflow_dispatch] | ||
# on: | ||
# push: | ||
# branches: | ||
# - main | ||
# workflow_dispatch: {} | ||
permissions: | ||
id-token: write | ||
contents: read | ||
|
||
jobs: | ||
publish: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v2 | ||
- uses: paulhatch/semantic-version@v5.0.0-alpha2 | ||
with: | ||
bump_each_commit: true | ||
id: semver | ||
# Print the computed version. Step results are exposed under `outputs` (plural); `output` evaluates to an empty string. | ||
- name: 'Check version' | ||
run: | | ||
echo "version is ${{ steps.semver.outputs.version }}" | ||
- name: 'Az CLI login' | ||
uses: azure/login@v1 | ||
with: | ||
client-id: ${{ secrets.AZURE_CLIENT_ID }} | ||
tenant-id: ${{ secrets.AZURE_TENANT_ID }} | ||
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} | ||
- name: 'Run Azure CLI commands' | ||
run: | | ||
docker build -f Dockerfile -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} . | ||
az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }} | ||
# docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu:${{ steps.semver.outputs.version }} | ||
# echo "acr push done" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Build stage: runs download.sh so the driver and package cache end up under /opt/gpu. | ||
FROM ubuntu:18.04 AS gpu | ||
# apt-get is the stable scripting interface; plain `apt` warns that its CLI may change. | ||
RUN apt-get update && apt-get install -y --no-install-recommends curl xz-utils gnupg2 ca-certificates | ||
|
||
WORKDIR /opt/gpu | ||
COPY blacklist-nouveau.conf blacklist-nouveau.conf | ||
COPY config.sh config.sh | ||
COPY download.sh download.sh | ||
RUN bash download.sh | ||
|
||
FROM ubuntu:18.04 | ||
|
||
COPY --from=gpu /opt/gpu /opt/gpu | ||
COPY entrypoint.sh /entrypoint.sh | ||
COPY install.sh /opt/actions/install.sh | ||
|
||
RUN mkdir -p /mnt | ||
|
||
ENTRYPOINT ["/entrypoint.sh"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
blacklist nouveau | ||
options nouveau modeset=0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Shared settings sourced by download.sh and install.sh. | ||
# NVIDIA driver version; selects the .run installer and the fabricmanager archive to download. | ||
DRIVER_VERSION="470.57.02" | ||
# Version prefix used when downloading the nvidia-container-runtime package. | ||
NVIDIA_CONTAINER_RUNTIME_VERSION="3.6.0" | ||
# Version prefix used when downloading each package listed in NVIDIA_PACKAGES. | ||
NVIDIA_CONTAINER_TOOLKIT_VER="1.6.0" | ||
# Space-separated package names: downloaded by download.sh, installed with dpkg by install.sh. | ||
NVIDIA_PACKAGES="libnvidia-container1 libnvidia-container-tools nvidia-container-toolkit" | ||
# Prefix passed to nvidia-installer (--utility-prefix / --opengl-prefix); its lib64 subdirectory is registered with ldconfig. | ||
GPU_DEST="/usr/local/nvidia" | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/usr/bin/env bash | ||
set -euo pipefail | ||
|
||
# /etc/os-release provides VERSION_ID, used below to select the NVIDIA apt repository list. | ||
source /etc/os-release | ||
# config.sh is the single source of truth for DRIVER_VERSION, NVIDIA_CONTAINER_RUNTIME_VERSION, | ||
# NVIDIA_CONTAINER_TOOLKIT_VER, NVIDIA_PACKAGES and GPU_DEST. They are intentionally not | ||
# redefined here, so a change in config.sh cannot be silently overridden by this script. | ||
source /opt/gpu/config.sh | ||
|
||
workdir="$(mktemp -d)" | ||
pushd "$workdir" || exit | ||
|
||
# download nvidia drivers, move to permanent cache | ||
curl -fsSLO https://us.download.nvidia.com/tesla/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run | ||
mv NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run /opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run | ||
|
||
# download fabricmanager for nvlink based systems, e.g. multi instance gpu vms. | ||
curl -fsSLO https://developer.download.nvidia.com/compute/cuda/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz | ||
tar -xvf fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive.tar.xz | ||
mv fabricmanager-linux-x86_64-${DRIVER_VERSION}-archive /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION} | ||
|
||
# configure nvidia apt repo to cache packages | ||
curl -fsSLO https://nvidia.github.io/nvidia-docker/gpgkey | ||
gpg --dearmor -o aptnvidia.gpg gpgkey | ||
mv aptnvidia.gpg /etc/apt/trusted.gpg.d/aptnvidia.gpg | ||
curl -fsSL https://nvidia.github.io/nvidia-docker/ubuntu${VERSION_ID}/nvidia-docker.list -o /etc/apt/sources.list.d/nvidia-docker.list | ||
|
||
apt update | ||
|
||
# download nvidia debian packages for nvidia-container-runtime compat | ||
for apt_package in $NVIDIA_PACKAGES; do | ||
apt-get download ${apt_package}=${NVIDIA_CONTAINER_TOOLKIT_VER}* | ||
mv ${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}* /opt/gpu | ||
done | ||
apt-get download nvidia-container-runtime=${NVIDIA_CONTAINER_RUNTIME_VERSION}* | ||
|
||
# move debs to permanent cache | ||
mv nvidia-container-runtime_${NVIDIA_CONTAINER_RUNTIME_VERSION}* /opt/gpu | ||
|
||
popd || exit | ||
rm -r "$workdir" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#!/usr/bin/env bash | ||
set -o errexit | ||
set -o pipefail | ||
set -o nounset | ||
|
||
set -x | ||
|
||
# "${1:-}" keeps this check usable under `set -o nounset`: a missing argument expands to an | ||
# empty string instead of aborting with "unbound variable" before the message is printed. | ||
if [[ -z "${1:-}" ]]; then | ||
    echo "Must provide a non-empty action as first argument" | ||
    exit 1 | ||
fi | ||
|
||
if [[ "${1}" == "copy" ]]; then | ||
echo "copying gpu cache files and exiting" | ||
cp -a /opt/gpu/. /mnt/gpu/ | ||
echo "Completed successfully!" | ||
exit 0 | ||
fi | ||
|
||
ACTION_FILE="/opt/actions/${1}" | ||
|
||
if [[ ! -f "$ACTION_FILE" ]]; then | ||
echo "Expected to find action file '$ACTION_FILE', but did not exist" | ||
exit 1 | ||
fi | ||
|
||
echo "Cleaning up stale actions" | ||
|
||
rm -rf /mnt/actions/* | ||
|
||
echo "Copying fresh actions" | ||
|
||
cp -R /opt/actions/. /mnt/actions | ||
|
||
echo "Executing nsenter" | ||
|
||
# Copy the GPU cache files to /mnt/gpu before running the action. | ||
cp -a /opt/gpu/. /mnt/gpu/ | ||
|
||
# Run nsenter as the `if` condition: with errexit enabled, a bare failing command would end | ||
# the script immediately, so a status check placed after it could never reach the failure branch. | ||
if nsenter -t 1 -m bash "${ACTION_FILE}"; then | ||
    # Success. | ||
    rm -rf /mnt/actions/* | ||
    echo "Completed successfully!" | ||
else | ||
    echo "Failed during nsenter command execution" | ||
    exit 1 | ||
fi | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#!/usr/bin/env bash | ||
set -euxo pipefail | ||
|
||
source /opt/gpu/config.sh | ||
|
||
KERNEL_NAME=$(uname -r) | ||
LOG_FILE_NAME="/var/log/nvidia-installer-$(date +%s).log" | ||
|
||
# host needs these tools to build and load kernel module, can remove ca-certificates, was only for testing | ||
apt update && apt install -y kmod gcc make dkms initramfs-tools linux-headers-$(uname -r) ca-certificates --no-install-recommends | ||
|
||
# install cached nvidia debian packages for container runtime compatibility | ||
for apt_package in $NVIDIA_PACKAGES; do | ||
dpkg -i /opt/gpu/${apt_package}* | ||
done | ||
dpkg -i /opt/gpu/nvidia-container-runtime* | ||
|
||
# blacklist nouveau driver, nvidia driver dependency | ||
cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf | ||
update-initramfs -u | ||
|
||
# clean up lingering files from previous install | ||
set +e | ||
umount -l /usr/lib/x86_64-linux-gnu || true | ||
umount -l /tmp/overlay || true | ||
rm -r /tmp/overlay | ||
rm -r /opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION} | ||
set -e | ||
|
||
# set up overlayfs to change install location of nvidia libs from /usr/lib/x86_64-linux-gnu to /usr/local/nvidia | ||
# add an extra layer of indirection via tmpfs because it's not possible to have an overlayfs on an overlayfs (i.e., inside a container) | ||
mkdir /tmp/overlay | ||
mount -t tmpfs tmpfs /tmp/overlay | ||
mkdir /tmp/overlay/{workdir,lib64} | ||
mkdir -p ${GPU_DEST}/lib64 | ||
mount -t overlay overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=/tmp/overlay/lib64,workdir=/tmp/overlay/workdir /usr/lib/x86_64-linux-gnu | ||
|
||
# clean up previously uncompressed driver, if it exists | ||
# causes driver installer to fail if it exists | ||
pushd /opt/gpu | ||
# extract runfile, takes some time, so do ahead of time | ||
sh /opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run -x | ||
popd | ||
|
||
# install nvidia drivers | ||
/opt/gpu/NVIDIA-Linux-x86_64-${DRIVER_VERSION}/nvidia-installer -s -k=$KERNEL_NAME --log-file-name=${LOG_FILE_NAME} -a --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}" | ||
|
||
# move nvidia libs to correct location from temporary overlayfs | ||
# NOTE(review): ${GPU_DEST}/lib64 is already created by the earlier `mkdir -p`, so this cp | ||
# presumably places the files under ${GPU_DEST}/lib64/lib64 - confirm that is intended, since | ||
# only ${GPU_DEST}/lib64 is written to /etc/ld.so.conf.d/nvidia.conf below. | ||
cp -a /tmp/overlay/lib64 ${GPU_DEST}/lib64 | ||
|
||
# move nvidia binaries to /usr/bin...because we like that? | ||
cp -rvT ${GPU_DEST}/bin /usr/bin | ||
|
||
# configure system to know about nvidia lib paths | ||
echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf | ||
ldconfig | ||
|
||
# unmount, cleanup | ||
set +e | ||
umount -l /usr/lib/x86_64-linux-gnu | ||
umount /tmp/overlay | ||
rm -r /tmp/overlay | ||
set -e | ||
|
||
# validate that nvidia driver is working | ||
dkms status | ||
nvidia-modprobe -u -c0 | ||
nvidia-smi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
default: push | ||
|
||
push: (containerize) | ||
docker push docker.io/alexeldeib/aks-gpu:latest | ||
|
||
containerize: | ||
docker build -f Dockerfile -t docker.io/alexeldeib/aks-gpu:latest . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
apiVersion: apps/v1 | ||
kind: DaemonSet | ||
metadata: | ||
name: &name nsenter | ||
labels: | ||
app: *name | ||
spec: | ||
selector: | ||
matchLabels: | ||
app: *name | ||
template: | ||
metadata: | ||
labels: | ||
app: *name | ||
spec: | ||
# affinity: | ||
# nodeAffinity: | ||
# requiredDuringSchedulingIgnoredDuringExecution: | ||
# nodeSelectorTerms: | ||
# - matchExpressions: | ||
# - key: node.kubernetes.io/instance-type | ||
# operator: In | ||
# values: | ||
# - Standard_NP10s | ||
# - Standard_NP20s | ||
# - Standard_NP40s | ||
hostNetwork: true | ||
hostPID: true | ||
containers: | ||
- image: docker.io/alexeldeib/aks-gpu:latest # requires an image with bash, curl, sleep, and nsenter (vanilla ubuntu works) | ||
imagePullPolicy: Always | ||
name: *name | ||
command: ["/entrypoint.sh"] | ||
args: ["install.sh"] # if you don't use my image or build one from Dockerfile, set this to "downloadandinstall" | ||
resources: | ||
requests: | ||
{} | ||
limits: | ||
{} | ||
securityContext: | ||
privileged: true | ||
volumeMounts: | ||
- name: actions | ||
mountPath: "/opt/actions" | ||
- name: hostmount | ||
mountPath: "/mnt/actions" | ||
- name: gpu | ||
mountPath: "/mnt/gpu" | ||
volumes: | ||
- name: gpu | ||
hostPath: | ||
path: /opt/gpu | ||
type: DirectoryOrCreate | ||
- name: hostmount | ||
hostPath: | ||
path: /opt/actions | ||
type: DirectoryOrCreate | ||
- name: actions | ||
configMap: | ||
name: nsenter-actions | ||
--- |