From 7ed137fde19553c711ec12b846793d334dd4db45 Mon Sep 17 00:00:00 2001 From: Bianco95 Date: Mon, 15 Apr 2024 10:54:44 +0200 Subject: [PATCH] first commit of docker sidecar plugin --- Makefile | 5 + cmd/main.go | 73 +++ docker/Dockerfile.sidecar-docker | 37 ++ go.mod | 71 +++ go.sum | 181 +++++++ pkg/common/func.go | 166 +++++++ pkg/common/types.go | 84 ++++ pkg/docker/Create.go | 248 ++++++++++ pkg/docker/Delete.go | 98 ++++ pkg/docker/GetLogs.go | 125 +++++ pkg/docker/Status.go | 98 ++++ pkg/docker/aux.go | 267 ++++++++++ pkg/docker/gpustrategies/AmdHandler.go | 1 + pkg/docker/gpustrategies/NvidiaHandler.go | 270 ++++++++++ tests/sidecars/docker/PodGpu_test.go | 468 ++++++++++++++++++ tests/sidecars/docker/export_vars.sh | 8 + .../templates/nvidia_gpu_pod_template.go | 38 ++ 17 files changed, 2238 insertions(+) create mode 100644 Makefile create mode 100644 cmd/main.go create mode 100644 docker/Dockerfile.sidecar-docker create mode 100644 go.mod create mode 100644 go.sum create mode 100644 pkg/common/func.go create mode 100644 pkg/common/types.go create mode 100644 pkg/docker/Create.go create mode 100644 pkg/docker/Delete.go create mode 100644 pkg/docker/GetLogs.go create mode 100644 pkg/docker/Status.go create mode 100644 pkg/docker/aux.go create mode 100644 pkg/docker/gpustrategies/AmdHandler.go create mode 100644 pkg/docker/gpustrategies/NvidiaHandler.go create mode 100644 tests/sidecars/docker/PodGpu_test.go create mode 100755 tests/sidecars/docker/export_vars.sh create mode 100644 tests/sidecars/docker/templates/nvidia_gpu_pod_template.go diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8ee2e34 --- /dev/null +++ b/Makefile @@ -0,0 +1,5 @@ +all: sidecars + +sidecars: + CGO_ENABLED=1 GOOS=linux go build -o bin/docker-sd cmd/main.go + diff --git a/cmd/main.go b/cmd/main.go new file mode 100644 index 0000000..4075d86 --- /dev/null +++ b/cmd/main.go @@ -0,0 +1,73 @@ +package main + +import ( + "context" + "net/http" + "strconv" + + "github.com/sirupsen/logrus" + "github.com/virtual-kubelet/virtual-kubelet/log" + + commonIL "github.com/intertwin-eu/interlink/pkg/common" + docker "github.com/intertwin-eu/interlink-docker-plugin/pkg/docker" + "github.com/intertwin-eu/interlink-docker-plugin/pkg/docker/gpustrategies" +) + +func main() { + logger := logrus.StandardLogger() + + interLinkConfig, err := commonIL.NewInterLinkConfig() + if err != nil { + log.L.Fatal(err) + } + + if interLinkConfig.VerboseLogging { + logger.SetLevel(logrus.DebugLevel) + } else if interLinkConfig.ErrorsOnlyLogging { + logger.SetLevel(logrus.ErrorLevel) + } else { + logger.SetLevel(logrus.InfoLevel) + } + + Ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + log.G(Ctx).Debug("Debug level: " + strconv.FormatBool(interLinkConfig.VerboseLogging)) + + var gpuManager gpustrategies.GPUManagerInterface + gpuManager = &gpustrategies.GPUManager{ + GPUSpecsList: []gpustrategies.GPUSpecs{}, + Ctx: Ctx, + } + + err = gpuManager.Init() + if err != nil { + log.G(Ctx).Fatal(err) + } + + err = gpuManager.Discover() + if err != nil { + log.G(Ctx).Fatal(err) + } + + err = gpuManager.Check() + if err != nil { + log.G(Ctx).Fatal(err) + } + + SidecarAPIs := docker.SidecarHandler{ + Config: interLinkConfig, + Ctx: Ctx, + GpuManager: gpuManager, + } + + mutex := http.NewServeMux() + mutex.HandleFunc("/status", SidecarAPIs.StatusHandler) + mutex.HandleFunc("/create", SidecarAPIs.CreateHandler) + mutex.HandleFunc("/delete", SidecarAPIs.DeleteHandler) + mutex.HandleFunc("/getLogs", SidecarAPIs.GetLogsHandler) + err = http.ListenAndServe(":"+interLinkConfig.Sidecarport, mutex) + + if err != nil { + log.G(Ctx).Fatal(err) + } +} diff --git a/docker/Dockerfile.sidecar-docker b/docker/Dockerfile.sidecar-docker new file mode 100644 index 0000000..4ccafeb --- /dev/null +++ b/docker/Dockerfile.sidecar-docker @@ -0,0 +1,37 @@ +FROM golang:1.21 as build-stage + +WORKDIR /app +COPY .. . +RUN CGO_ENABLED=1 GOOS=linux go build -o bin/docker-sidecar cmd/main.go + +FROM bash:latest as bash-stage + +# Deploy the application binary into a lean image +FROM docker:24.0-dind-rootless AS build-release-stage + +WORKDIR / + +COPY --from=build-stage /app/bin/docker-sidecar /sidecar/docker-sidecar + +# adding bash binary to be able to perform commands within the sidecar binary +COPY --from=bash-stage /usr/local/bin/bash /bin + +USER root:root + +ENV PATH "$PATH:/bin" + +#creating a simple startup script to start both docker rootless and the sidecar +RUN echo -e '#!/bin/bash\ndockerd-entrypoint.sh & /sidecar/docker-sidecar' > /sidecar/startup-docker.sh +RUN chmod +x /sidecar/startup-docker.sh +RUN chmod -R 777 /sidecar + +ENV INTERLINKCONFIGPATH=/InterLinkConfig.yaml + +USER 1000:1000 + +#setting up the path for the docker daemon +ENV DOCKER_HOST=unix:///run/user/1000/docker.sock + +WORKDIR /sidecar + +ENTRYPOINT ["/sidecar/startup-docker.sh"] diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..5ca96fb --- /dev/null +++ b/go.mod @@ -0,0 +1,71 @@ +module github.com/intertwin-eu/interlink + +go 1.21 + +toolchain go1.21.3 + +require ( + github.com/containerd/containerd v1.7.15 + gopkg.in/yaml.v2 v2.4.0 + k8s.io/api v0.29.1 + k8s.io/client-go v0.29.1 +) + +require ( + github.com/Microsoft/go-winio v0.6.1 // indirect + github.com/NVIDIA/go-nvml v0.12.0-4 // indirect + github.com/alexellis/go-execute v0.6.0 // indirect + github.com/containerd/log v0.1.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/docker/docker v26.0.1+incompatible // indirect + github.com/docker/go-connections v0.5.0 // indirect + github.com/docker/go-units v0.5.0 // indirect + github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/felixge/httpsnoop v1.0.3 // indirect + github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-openapi/jsonpointer v0.19.6 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.22.3 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/google/gnostic-models v0.6.8 // indirect + github.com/google/gofuzz v1.2.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/opencontainers/go-digest v1.0.0 // indirect + github.com/opencontainers/image-spec v1.1.0-rc2.0.20221005185240-3a7f492d3f1b // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/sirupsen/logrus v1.9.3 // indirect + github.com/virtual-kubelet/virtual-kubelet v1.11.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0 // indirect + go.opentelemetry.io/otel v1.22.0 // indirect + go.opentelemetry.io/otel/metric v1.22.0 // indirect + go.opentelemetry.io/otel/trace v1.22.0 // indirect + golang.org/x/mod v0.14.0 // indirect + golang.org/x/net v0.19.0 // indirect + golang.org/x/oauth2 v0.11.0 // indirect + golang.org/x/sys v0.16.0 // indirect + golang.org/x/term v0.15.0 // indirect + golang.org/x/text v0.14.0 // indirect + golang.org/x/time v0.3.0 // indirect + golang.org/x/tools v0.16.1 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.33.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/apimachinery v0.29.1 // indirect + k8s.io/klog/v2 v2.110.1 // indirect + k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect + k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect + sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect + sigs.k8s.io/yaml v1.3.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..3685357 --- /dev/null +++ b/go.sum @@ -0,0 +1,181 @@ +github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= +github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= +github.com/NVIDIA/go-nvml v0.12.0-4 h1:BvPjnjJr6qje0zov57Md7TwEA8i/12kZeUQIpyWzTEE= +github.com/NVIDIA/go-nvml v0.12.0-4/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= +github.com/alexellis/go-execute v0.6.0 h1:FVGoudJnWSObwf9qmehbvVuvhK6g1UpKOCBjS+OUXEA= +github.com/alexellis/go-execute v0.6.0/go.mod h1:nlg2F6XdYydUm1xXQMMiuibQCV1mveybBkNWfdNznjk= +github.com/containerd/containerd v1.7.15 h1:afEHXdil9iAm03BmhjzKyXnnEBtjaLJefdU7DV0IFes= +github.com/containerd/containerd v1.7.15/go.mod h1:ISzRRTMF8EXNpJlTzyr2XMhN+j9K302C21/+cr3kUnY= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/docker v26.0.1+incompatible h1:t39Hm6lpXuXtgkF0dm1t9a5HkbUfdGy6XbWexmGr+hA= +github.com/docker/docker v26.0.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= +github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= +github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk= +github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= +github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.1.0-rc2.0.20221005185240-3a7f492d3f1b h1:YWuSjZCQAPM8UUBLkYUk1e+rZcvWHJmFb6i6rM44Xs8= +github.com/opencontainers/image-spec v1.1.0-rc2.0.20221005185240-3a7f492d3f1b/go.mod h1:3OVijpioIKYWTqjiG0zfF6wvoJ4fAXGbjdZuI2NgsRQ= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/virtual-kubelet/virtual-kubelet v1.11.0 h1:LOMcZQfP083xmYH9mYtyHAR+ybFbK1uMaRA+EtDcd1I= +github.com/virtual-kubelet/virtual-kubelet v1.11.0/go.mod h1:WQfPHbIlzfhMNYkh6hFXF1ctGfNM8UJCYLYpLa/trxc= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0 h1:x8Z78aZx8cOF0+Kkazoc7lwUNMGy0LrzEMxTm4BbTxg= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0/go.mod h1:62CPTSry9QZtOaSsE3tOzhx6LzDhHnXJ6xHeMNNiM6Q= +go.opentelemetry.io/otel v1.22.0 h1:xS7Ku+7yTFvDfDraDIJVpw7XPyuHlB9MCiqqX5mcJ6Y= +go.opentelemetry.io/otel v1.22.0/go.mod h1:eoV4iAi3Ea8LkAEI9+GFT44O6T/D0GWAVFyZVCC6pMI= +go.opentelemetry.io/otel/metric v1.22.0 h1:lypMQnGyJYeuYPhOM/bgjbFM6WE44W1/T45er4d8Hhg= +go.opentelemetry.io/otel/metric v1.22.0/go.mod h1:evJGjVpZv0mQ5QBRJoBF64yMuOf4xCWdXjK8pzFvliY= +go.opentelemetry.io/otel/trace v1.22.0 h1:Hg6pPujv0XG9QaVbGOBVHunyuLcCC3jN7WEhPx83XD0= +go.opentelemetry.io/otel/trace v1.22.0/go.mod h1:RbbHXVqKES9QhzZq/fE5UnOSILqRt40a21sPw2He1xo= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.11.0 h1:bUO06HqtnRcc/7l71XBe4WcqTZ+3AH1J59zWDDwLKgU= +golang.org/x/mod v0.11.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= +golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= +golang.org/x/oauth2 v0.11.0 h1:vPL4xzxBM4niKCW6g9whtaWVXTJf1U5e4aZxxFx/gbU= +golang.org/x/oauth2 v0.11.0/go.mod h1:LdF7O/8bLR/qWK9DrpXmbHLTouvRHK0SgJl0GmDBchk= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= +golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.10.0 h1:tvDr/iQoUqNdohiYm0LmmKcBk+q86lb9EprIUFhHHGg= +golang.org/x/tools v0.10.0/go.mod h1:UJwyiVBsOA2uwvK/e5OY3GTpDUJriEd+/YlqAwLPmyM= +golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.29.1 h1:DAjwWX/9YT7NQD4INu49ROJuZAAAP/Ijki48GUPzxqw= +k8s.io/api v0.29.1/go.mod h1:7Kl10vBRUXhnQQI8YR/R327zXC8eJ7887/+Ybta+RoQ= +k8s.io/apimachinery v0.29.1 h1:KY4/E6km/wLBguvCZv8cKTeOwwOBqFNjwJIdMkMbbRc= +k8s.io/apimachinery v0.29.1/go.mod h1:6HVkd1FwxIagpYrHSwJlQqZI3G9LfYWRPAkUvLnXTKU= +k8s.io/client-go v0.29.1 h1:19B/+2NGEwnFLzt0uB5kNJnfTsbV8w6TgQRz9l7ti7A= +k8s.io/client-go v0.29.1/go.mod h1:TDG/psL9hdet0TI9mGyHJSgRkW3H9JZk2dNEUS7bRks= +k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0= +k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/pkg/common/func.go b/pkg/common/func.go new file mode 100644 index 0000000..3ba8d84 --- /dev/null +++ b/pkg/common/func.go @@ -0,0 +1,166 @@ +package common + +import ( + "context" + "flag" + "fmt" + "io" + "net/http" + "os" + "strconv" + + "k8s.io/client-go/kubernetes" + + "github.com/containerd/containerd/log" + "gopkg.in/yaml.v2" +) + +var InterLinkConfigInst InterLinkConfig +var Clientset *kubernetes.Clientset + +// TODO: implement factory design + +// NewInterLinkConfig returns a variable of type InterLinkConfig, used in many other functions and the first encountered error. +func NewInterLinkConfig() (InterLinkConfig, error) { + if !InterLinkConfigInst.set { + var path string + verbose := flag.Bool("verbose", false, "Enable or disable Debug level logging") + errorsOnly := flag.Bool("errorsonly", false, "Prints only errors if enabled") + InterLinkConfigPath := flag.String("interlinkconfigpath", "", "Path to InterLink config") + flag.Parse() + + if *verbose { + InterLinkConfigInst.VerboseLogging = true + InterLinkConfigInst.ErrorsOnlyLogging = false + } else if *errorsOnly { + InterLinkConfigInst.VerboseLogging = false + InterLinkConfigInst.ErrorsOnlyLogging = true + } + + if *InterLinkConfigPath != "" { + path = *InterLinkConfigPath + } else if os.Getenv("INTERLINKCONFIGPATH") != "" { + path = os.Getenv("INTERLINKCONFIGPATH") + } else { + path = "/etc/interlink/InterLinkConfig.yaml" + } + + if _, err := os.Stat(path); err != nil { + log.G(context.Background()).Error("File " + path + " doesn't exist. You can set a custom path by exporting INTERLINKCONFIGPATH. Exiting...") + return InterLinkConfig{}, err + } + + log.G(context.Background()).Info("Loading InterLink config from " + path) + yfile, err := os.ReadFile(path) + if err != nil { + log.G(context.Background()).Error("Error opening config file, exiting...") + return InterLinkConfig{}, err + } + yaml.Unmarshal(yfile, &InterLinkConfigInst) + + if os.Getenv("INTERLINKURL") != "" { + InterLinkConfigInst.Interlinkurl = os.Getenv("INTERLINKURL") + } + + if os.Getenv("SIDECARURL") != "" { + InterLinkConfigInst.Sidecarurl = os.Getenv("SIDECARURL") + } + + if os.Getenv("INTERLINKPORT") != "" { + InterLinkConfigInst.Interlinkport = os.Getenv("INTERLINKPORT") + } + + if os.Getenv("SIDECARPORT") != "" { + InterLinkConfigInst.Sidecarport = os.Getenv("SIDECARPORT") + } + + if os.Getenv("SBATCHPATH") != "" { + InterLinkConfigInst.Sbatchpath = os.Getenv("SBATCHPATH") + } + + if os.Getenv("SCANCELPATH") != "" { + InterLinkConfigInst.Scancelpath = os.Getenv("SCANCELPATH") + } + + if os.Getenv("POD_IP") != "" { + InterLinkConfigInst.PodIP = os.Getenv("POD_IP") + } + + if os.Getenv("TSOCKS") != "" { + if os.Getenv("TSOCKS") != "true" && os.Getenv("TSOCKS") != "false" { + fmt.Println("export TSOCKS as true or false") + return InterLinkConfig{}, err + } + if os.Getenv("TSOCKS") == "true" { + InterLinkConfigInst.Tsocks = true + } else { + InterLinkConfigInst.Tsocks = false + } + } + + if os.Getenv("TSOCKSPATH") != "" { + path = os.Getenv("TSOCKSPATH") + if _, err := os.Stat(path); err != nil { + log.G(context.Background()).Error("File " + path + " doesn't exist. You can set a custom path by exporting TSOCKSPATH. Exiting...") + return InterLinkConfig{}, err + } + + InterLinkConfigInst.Tsockspath = path + } + + if os.Getenv("VKTOKENFILE") != "" { + path = os.Getenv("VKTOKENFILE") + if _, err := os.Stat(path); err != nil { + log.G(context.Background()).Error("File " + path + " doesn't exist. You can set a custom path by exporting VKTOKENFILE. Exiting...") + return InterLinkConfig{}, err + } + + InterLinkConfigInst.VKTokenFile = path + } else { + path = InterLinkConfigInst.DataRootFolder + "token" + InterLinkConfigInst.VKTokenFile = path + } + + InterLinkConfigInst.set = true + } + return InterLinkConfigInst, nil +} + +// PingInterLink pings the InterLink API and returns true if there's an answer. The second return value is given by the answer provided by the API. +func PingInterLink(ctx context.Context) (bool, int, error) { + log.G(ctx).Info("Pinging: " + InterLinkConfigInst.Interlinkurl + ":" + InterLinkConfigInst.Interlinkport + "/pinglink") + retVal := -1 + req, err := http.NewRequest(http.MethodPost, InterLinkConfigInst.Interlinkurl+":"+InterLinkConfigInst.Interlinkport+"/pinglink", nil) + + if err != nil { + log.G(ctx).Error(err) + } + + token, err := os.ReadFile(InterLinkConfigInst.VKTokenFile) // just pass the file name + if err != nil { + log.G(ctx).Error(err) + return false, retVal, err + } + req.Header.Add("Authorization", "Bearer "+string(token)) + resp, err := http.DefaultClient.Do(req) + if err != nil { + return false, retVal, err + } + + if resp.StatusCode == http.StatusOK { + retBytes, err := io.ReadAll(resp.Body) + if err != nil { + log.G(ctx).Error(err) + return false, retVal, err + } + retVal, err = strconv.Atoi(string(retBytes)) + if err != nil { + log.G(ctx).Error(err) + return false, retVal, err + } + return true, retVal, nil + } else { + log.G(ctx).Error("server error: " + fmt.Sprint(resp.StatusCode)) + return false, retVal, nil + } +} diff --git a/pkg/common/types.go b/pkg/common/types.go new file mode 100644 index 0000000..ef2862a --- /dev/null +++ b/pkg/common/types.go @@ -0,0 +1,84 @@ +package common + +import ( + "time" + + v1 "k8s.io/api/core/v1" +) + +// PodCreateRequests is a struct holding data for a create request. Retrieved ConfigMaps and Secrets are held along the Pod description itself. +type PodCreateRequests struct { + Pod v1.Pod `json:"pod"` + ConfigMaps []v1.ConfigMap `json:"configmaps"` + Secrets []v1.Secret `json:"secrets"` +} + +// PodStatus is a simplified v1.Pod struct, holding only necessary variables to uniquely identify a job/service in the sidecar. It is used to request +type PodStatus struct { + PodName string `json:"name"` + PodUID string `json:"UID"` + PodNamespace string `json:"namespace"` + Containers []v1.ContainerStatus `json:"containers"` +} + +// RetrievedContainer is used in InterLink to rearrange data structure in a suitable way for the sidecar +type RetrievedContainer struct { + Name string `json:"name"` + ConfigMaps []v1.ConfigMap `json:"configMaps"` + Secrets []v1.Secret `json:"secrets"` + EmptyDirs []string `json:"emptyDirs"` +} + +// RetrievedPoData is used in InterLink to rearrange data structure in a suitable way for the sidecar +type RetrievedPodData struct { + Pod v1.Pod `json:"pod"` + Containers []RetrievedContainer `json:"container"` +} + +// InterLinkConfig holds the whole configuration +type InterLinkConfig struct { + VKConfigPath string `yaml:"VKConfigPath"` + VKTokenFile string `yaml:"VKTokenFile"` + Interlinkurl string `yaml:"InterlinkURL"` + Sidecarurl string `yaml:"SidecarURL"` + Sbatchpath string `yaml:"SbatchPath"` + Scancelpath string `yaml:"ScancelPath"` + Squeuepath string `yaml:"SqueuePath"` + Interlinkport string `yaml:"InterlinkPort"` + Sidecarport string `yaml:"SidecarPort"` + Commandprefix string `yaml:"CommandPrefix"` + ExportPodData bool `yaml:"ExportPodData"` + DataRootFolder string `yaml:"DataRootFolder"` + ServiceAccount string `yaml:"ServiceAccount"` + Namespace string `yaml:"Namespace"` + Tsocks bool `yaml:"Tsocks"` + Tsockspath string `yaml:"TsocksPath"` + Tsocksconfig string `yaml:"TsocksConfig"` + Tsockslogin string `yaml:"TsocksLoginNode"` + BashPath string `yaml:"BashPath"` + VerboseLogging bool `yaml:"VerboseLogging"` + ErrorsOnlyLogging bool `yaml:"ErrorsOnlyLogging"` + PodIP string `yaml:"PodIP"` + SingularityPrefix string `yaml:"SingularityPrefix"` + set bool +} + +// ContainerLogOpts is a struct in which it is possible to specify options to retrieve logs from the sidecar +type ContainerLogOpts struct { + Tail int `json:"Tail"` + LimitBytes int `json:"Bytes"` + Timestamps bool `json:"Timestamps"` + Follow bool `json:"Follow"` + Previous bool `json:"Previous"` + SinceSeconds int `json:"SinceSeconds"` + SinceTime time.Time `json:"SinceTime"` +} + +// LogStruct is needed to identify the job/container running on the sidecar to retrieve the logs from. Using ContainerLogOpts struct allows to specify more options on how to collect logs +type LogStruct struct { + Namespace string `json:"Namespace"` + PodUID string `json:"PodUID"` + PodName string `json:"PodName"` + ContainerName string `json:"ContainerName"` + Opts ContainerLogOpts `json:"Opts"` +} diff --git a/pkg/docker/Create.go b/pkg/docker/Create.go new file mode 100644 index 0000000..512e31d --- /dev/null +++ b/pkg/docker/Create.go @@ -0,0 +1,248 @@ +package docker + +import ( + "encoding/json" + "io" + "net/http" + "os" + "strconv" + "strings" + + exec "github.com/alexellis/go-execute/pkg/v1" + "github.com/containerd/containerd/log" + v1 "k8s.io/api/core/v1" + + commonIL "github.com/intertwin-eu/interlink/pkg/common" +) + +// CreateHandler creates a Docker Container based on data provided by the InterLink API. +func (h *SidecarHandler) CreateHandler(w http.ResponseWriter, r *http.Request) { + log.G(h.Ctx).Info("Docker Sidecar: received Create call") + var execReturn exec.ExecResult + statusCode := http.StatusOK + bodyBytes, err := io.ReadAll(r.Body) + if err != nil { + HandleErrorAndRemoveData(h, w, statusCode, "Some errors occurred while creating container. Check Docker Sidecar's logs", err, nil) + return + } + + var req []commonIL.RetrievedPodData + err = json.Unmarshal(bodyBytes, &req) + + if err != nil { + HandleErrorAndRemoveData(h, w, statusCode, "Some errors occurred while creating container. Check Docker Sidecar's logs", err, nil) + return + } + + for _, data := range req { + + pathsOfVolumes := make(map[string]string) + + for _, volume := range data.Pod.Spec.Volumes { + if volume.HostPath != nil { + if *volume.HostPath.Type == v1.HostPathDirectoryOrCreate || *volume.HostPath.Type == v1.HostPathDirectory { + _, err := os.Stat(volume.HostPath.Path + "/" + volume.Name) + if os.IsNotExist(err) { + log.G(h.Ctx).Info("-- Creating directory " + volume.HostPath.Path + "/" + volume.Name) + err = os.MkdirAll(volume.HostPath.Path+"/"+volume.Name, os.ModePerm) + if err != nil { + HandleErrorAndRemoveData(h, w, statusCode, "Some errors occurred while creating container. Check Docker Sidecar's logs", err, &data) + } else { + log.G(h.Ctx).Info("-- Created directory " + volume.HostPath.Path) + pathsOfVolumes[volume.Name] = volume.HostPath.Path + "/" + volume.Name + } + } else { + log.G(h.Ctx).Info("-- Directory " + volume.HostPath.Path + "/" + volume.Name + " already exists") + pathsOfVolumes[volume.Name] = volume.HostPath.Path + "/" + volume.Name + } + } + } + } + + for _, container := range data.Pod.Spec.Containers { + + var isGpuRequested bool = false + var additionalGpuArgs []string + + if val, ok := container.Resources.Limits["nvidia.com/gpu"]; ok { + + numGpusRequested := val.Value() + + log.G(h.Ctx).Infof("Number of GPU requested: %d", numGpusRequested) + + isGpuRequested = true + + log.G(h.Ctx).Info("Container " + container.Name + " is requesting " + val.String() + " GPU") + + numGpusRequestedInt := int(numGpusRequested) + _, err := h.GpuManager.GetAvailableGPUs(numGpusRequestedInt) + + if err != nil { + HandleErrorAndRemoveData(h, w, statusCode, "Some errors occurred while creating container. Check Docker Sidecar's logs", err, &data) + return + } + + gpuSpecs, err := h.GpuManager.GetAndAssignAvailableGPUs(numGpusRequestedInt, container.Name) + if err != nil { + HandleErrorAndRemoveData(h, w, statusCode, "Some errors occurred while creating container. Check Docker Sidecar's logs", err, &data) + return + } + + var gpuUUIDs string = "" + for _, gpuSpec := range gpuSpecs { + if gpuSpec.UUID == gpuSpecs[len(gpuSpecs)-1].UUID { + gpuUUIDs += strconv.Itoa(gpuSpec.Index) + } else { + gpuUUIDs += strconv.Itoa(gpuSpec.Index) + "," + } + } + + additionalGpuArgs = append(additionalGpuArgs, "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES="+gpuUUIDs) + + } else { + log.G(h.Ctx).Info("Container " + container.Name + " is not requesting a GPU") + } + + log.G(h.Ctx).Info("- Creating container " + container.Name) + + var envVars string = "" + // add environment variables to the docker command + for _, envVar := range container.Env { + if envVar.Value != "" { + // check if the env variable is an array, in this case the value needs to be between '' + if strings.Contains(envVar.Value, "[") { + envVars += " -e " + envVar.Name + "='" + envVar.Value + "'" + } else { + envVars += " -e " + envVar.Name + "=" + envVar.Value + } + } else { + envVars += " -e " + envVar.Name + } + } + + // iterate over the container volumes and mount them in the docker command line; get the volume path in the host from pathsOfVolumes + for _, volumeMount := range container.VolumeMounts { + if volumeMount.MountPath != "" { + + // check if volumeMount.name is inside pathsOfVolumes, if it is add the volume to the docker command + if _, ok := pathsOfVolumes[volumeMount.Name]; !ok { + log.G(h.Ctx).Error("Volume " + volumeMount.Name + " not found in pathsOfVolumes") + continue + } + + if volumeMount.ReadOnly { + envVars += " -v " + pathsOfVolumes[volumeMount.Name] + ":" + volumeMount.MountPath + ":ro" + } else { + envVars += " -v " + pathsOfVolumes[volumeMount.Name] + ":" + volumeMount.MountPath + } + } + } + + log.G(h.Ctx).Info("- Creating container " + container.Name) + + cmd := []string{"run", "-d", "--name", container.Name} + + cmd = append(cmd, envVars) + + if isGpuRequested { + cmd = append(cmd, additionalGpuArgs...) + } + + var additionalPortArgs []string + for _, port := range container.Ports { + if port.HostPort != 0 { + additionalPortArgs = append(additionalPortArgs, "-p", strconv.Itoa(int(port.HostPort))+":"+strconv.Itoa(int(port.ContainerPort))) + } + } + + cmd = append(cmd, additionalPortArgs...) + + if h.Config.ExportPodData { + mounts, err := prepareMounts(h.Ctx, h.Config, req, container) + if err != nil { + HandleErrorAndRemoveData(h, w, statusCode, "Some errors occurred while creating container. Check Docker Sidecar's logs", err, &data) + return + } + cmd = append(cmd, mounts) + } + + cmd = append(cmd, container.Image) + cmd = append(cmd, container.Command...) + cmd = append(cmd, container.Args...) + + dockerOptions := "" + + if dockerFlags, ok := data.Pod.ObjectMeta.Annotations["docker-options.vk.io/flags"]; ok { + parsedDockerOptions := strings.Split(dockerFlags, " ") + for _, option := range parsedDockerOptions { + dockerOptions += " " + option + } + } + + // print the docker command + log.G(h.Ctx).Info("Docker command: " + "docker" + dockerOptions + " " + strings.Join(cmd, " ")) + + shell := exec.ExecTask{ + Command: "docker" + dockerOptions, + Args: cmd, + Shell: true, + } + + execReturn, err = shell.Execute() + if err != nil { + HandleErrorAndRemoveData(h, w, statusCode, "Some errors occurred while creating container. Check Docker Sidecar's logs", err, &data) + return + } + + if execReturn.Stdout == "" { + eval := "Conflict. The container name \"/" + container.Name + "\" is already in use" + if strings.Contains(execReturn.Stderr, eval) { + log.G(h.Ctx).Warning("Container named " + container.Name + " already exists. Skipping its creation.") + } else { + log.G(h.Ctx).Error("Unable to create container " + container.Name + " : " + execReturn.Stderr) + HandleErrorAndRemoveData(h, w, statusCode, "Some errors occurred while creating container. Check Docker Sidecar's logs", err, &data) + return + } + } else { + log.G(h.Ctx).Info("-- Created container " + container.Name) + } + + shell = exec.ExecTask{ + Command: "docker", + Args: []string{"ps", "-aqf", "name=^" + container.Name + "$"}, + Shell: true, + } + + execReturn, err = shell.Execute() + execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "") + if execReturn.Stderr != "" { + log.G(h.Ctx).Error("Failed to retrieve " + container.Name + " ID : " + execReturn.Stderr) + HandleErrorAndRemoveData(h, w, statusCode, "Some errors occurred while creating container. Check Docker Sidecar's logs", err, &data) + return + } else if execReturn.Stdout == "" { + log.G(h.Ctx).Error("Container name not found. Maybe creation failed?") + } else { + log.G(h.Ctx).Debug("-- Retrieved " + container.Name + " ID: " + execReturn.Stdout) + } + } + } + + w.WriteHeader(statusCode) + + if statusCode != http.StatusOK { + w.Write([]byte("Some errors occurred while creating containers. Check Docker Sidecar's logs")) + } else { + w.Write([]byte("Containers created")) + } +} + +func HandleErrorAndRemoveData(h *SidecarHandler, w http.ResponseWriter, statusCode int, s string, err error, data *commonIL.RetrievedPodData) { + statusCode = http.StatusInternalServerError + log.G(h.Ctx).Error(err) + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while creating container. Check Docker Sidecar's logs")) + + if data != nil { + os.RemoveAll(h.Config.DataRootFolder + data.Pod.Namespace + "-" + string(data.Pod.UID)) + } +} diff --git a/pkg/docker/Delete.go b/pkg/docker/Delete.go new file mode 100644 index 0000000..232e018 --- /dev/null +++ b/pkg/docker/Delete.go @@ -0,0 +1,98 @@ +package docker + +import ( + "encoding/json" + "io" + "net/http" + "os" + "strings" + + exec "github.com/alexellis/go-execute/pkg/v1" + "github.com/containerd/containerd/log" + v1 "k8s.io/api/core/v1" +) + +// DeleteHandler stops and deletes Docker containers from provided data +func (h *SidecarHandler) DeleteHandler(w http.ResponseWriter, r *http.Request) { + log.G(h.Ctx).Info("Docker Sidecar: received Delete call") + var execReturn exec.ExecResult + statusCode := http.StatusOK + bodyBytes, err := io.ReadAll(r.Body) + + if err != nil { + statusCode = http.StatusInternalServerError + log.G(h.Ctx).Error(err) + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while deleting container. Check Docker Sidecar's logs")) + return + } + + var pod v1.Pod + err = json.Unmarshal(bodyBytes, &pod) + if err != nil { + statusCode = http.StatusInternalServerError + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while creating container. Check Docker Sidecar's logs")) + log.G(h.Ctx).Error(err) + return + } + + for _, container := range pod.Spec.Containers { + log.G(h.Ctx).Debug("- Deleting container " + container.Name) + + // added a timeout to the stop container command + cmd := []string{"stop", "-t", "10", container.Name} + shell := exec.ExecTask{ + Command: "docker", + Args: cmd, + Shell: true, + } + execReturn, _ = shell.Execute() + + if execReturn.Stderr != "" { + if strings.Contains(execReturn.Stderr, "No such container") { + log.G(h.Ctx).Debug("-- Unable to find container " + container.Name + ". Probably already removed? Skipping its removal") + } else { + log.G(h.Ctx).Error("-- Error stopping container " + container.Name + ". Skipping its removal") + statusCode = http.StatusInternalServerError + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while deleting container. Check Docker Sidecar's logs")) + return + } + continue + } + + if execReturn.Stdout != "" { + cmd = []string{"rm", execReturn.Stdout} + shell = exec.ExecTask{ + Command: "docker", + Args: cmd, + Shell: true, + } + execReturn, _ = shell.Execute() + execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "") + + if execReturn.Stderr != "" { + log.G(h.Ctx).Error("-- Error deleting container " + container.Name) + statusCode = http.StatusInternalServerError + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while deleting container. Check Docker Sidecar's logs")) + return + } else { + log.G(h.Ctx).Info("- Deleted container " + container.Name) + } + } + + // check if the container has GPU devices attacched using the GpuManager and release them + h.GpuManager.Release(container.Name) + + os.RemoveAll(h.Config.DataRootFolder + pod.Namespace + "-" + string(pod.UID)) + } + + w.WriteHeader(statusCode) + if statusCode != http.StatusOK { + w.Write([]byte("Some errors occurred deleting containers. Check Docker Sidecar's logs")) + } else { + w.Write([]byte("All containers for submitted Pods have been deleted")) + } +} diff --git a/pkg/docker/GetLogs.go b/pkg/docker/GetLogs.go new file mode 100644 index 0000000..9afef6a --- /dev/null +++ b/pkg/docker/GetLogs.go @@ -0,0 +1,125 @@ +package docker + +import ( + "encoding/json" + "io" + "net/http" + "strings" + "time" + + OSexec "os/exec" + + "github.com/containerd/containerd/log" + + commonIL "github.com/intertwin-eu/interlink/pkg/common" +) + +// GetLogsHandler performs a Docker logs command and returns its manipulated output +func (h *SidecarHandler) GetLogsHandler(w http.ResponseWriter, r *http.Request) { + log.G(h.Ctx).Info("Docker Sidecar: received GetLogs call") + var req commonIL.LogStruct + statusCode := http.StatusOK + currentTime := time.Now() + + //orario, _ := time.Parse("2006-01-02T15:04:05.999999999Z", "2023-09-14T10:35:44.665672258Z") + //test := commonIL.LogStruct{PodName: "test-pod", ContainerName: "busyecho", Opts: commonIL.ContainerLogOpts{Tail: 0, LimitBytes: 350, SinceTime: orario, Timestamps: true}} + //testBytes, _ := json.Marshal(test) + //log.G(h.Ctx).Debug(string(testBytes)) + + bodyBytes, err := io.ReadAll(r.Body) + if err != nil { + statusCode = http.StatusInternalServerError + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while checking container status. Check Docker Sidecar's logs")) + log.G(h.Ctx).Error(err) + return + } + + err = json.Unmarshal(bodyBytes, &req) + if err != nil { + statusCode = http.StatusInternalServerError + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while checking container status. Check Docker Sidecar's logs")) + log.G(h.Ctx).Error(err) + return + } + + //req = test + + var cmd *OSexec.Cmd + if req.Opts.Timestamps { + cmd = OSexec.Command("docker", "logs", "-t", req.ContainerName) + } else { + cmd = OSexec.Command("docker", "logs", req.ContainerName) + } + + output, err := cmd.CombinedOutput() + + if err != nil { + log.G(h.Ctx).Error(err) + statusCode = http.StatusInternalServerError + w.WriteHeader(statusCode) + return + } + + var returnedLogs string + + if req.Opts.Tail != 0 { + var lastLines []string + + splittedLines := strings.Split(string(output), "\n") + + if req.Opts.Tail > len(splittedLines) { + lastLines = splittedLines + } else { + lastLines = splittedLines[len(splittedLines)-req.Opts.Tail-1:] + } + + for _, line := range lastLines { + returnedLogs += line + "\n" + } + } else if req.Opts.LimitBytes != 0 { + var lastBytes []byte + if req.Opts.LimitBytes > len(output) { + lastBytes = output + } else { + lastBytes = output[len(output)-req.Opts.LimitBytes-1:] + } + + returnedLogs = string(lastBytes) + } else { + returnedLogs = string(output) + } + + if req.Opts.Timestamps && (req.Opts.SinceSeconds != 0 || !req.Opts.SinceTime.IsZero()) { + temp := returnedLogs + returnedLogs = "" + splittedLogs := strings.Split(temp, "\n") + timestampFormat := "2006-01-02T15:04:05.999999999Z" + + for _, Log := range splittedLogs { + part := strings.SplitN(Log, " ", 2) + timestampString := part[0] + timestamp, err := time.Parse(timestampFormat, timestampString) + if err != nil { + continue + } + if req.Opts.SinceSeconds != 0 { + if currentTime.Sub(timestamp).Seconds() > float64(req.Opts.SinceSeconds) { + returnedLogs += Log + "\n" + } + } else { + if timestamp.Sub(req.Opts.SinceTime).Seconds() >= 0 { + returnedLogs += Log + "\n" + } + } + } + } + + if statusCode != http.StatusOK { + w.Write([]byte("Some errors occurred while checking container status. Check Docker Sidecar's logs")) + } else { + w.WriteHeader(statusCode) + w.Write([]byte(returnedLogs)) + } +} diff --git a/pkg/docker/Status.go b/pkg/docker/Status.go new file mode 100644 index 0000000..f4f9ed4 --- /dev/null +++ b/pkg/docker/Status.go @@ -0,0 +1,98 @@ +package docker + +import ( + "encoding/json" + "io" + "net/http" + "strings" + + exec "github.com/alexellis/go-execute/pkg/v1" + "github.com/containerd/containerd/log" + v1 "k8s.io/api/core/v1" + + commonIL "github.com/intertwin-eu/interlink/pkg/common" +) + +// StatusHandler checks Docker Container's status by running docker ps -af command and returns that status +func (h *SidecarHandler) StatusHandler(w http.ResponseWriter, r *http.Request) { + log.G(h.Ctx).Info("Docker Sidecar: received GetStatus call") + var resp []commonIL.PodStatus + var req []*v1.Pod + statusCode := http.StatusOK + + bodyBytes, err := io.ReadAll(r.Body) + if err != nil { + statusCode = http.StatusInternalServerError + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while checking container status. Check Docker Sidecar's logs")) + log.G(h.Ctx).Error(err) + return + } + + err = json.Unmarshal(bodyBytes, &req) + if err != nil { + statusCode = http.StatusInternalServerError + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while checking container status. Check Docker Sidecar's logs")) + log.G(h.Ctx).Error(err) + return + } + + for i, pod := range req { + resp = append(resp, commonIL.PodStatus{PodName: pod.Name, PodUID: string(pod.UID), PodNamespace: pod.Namespace}) + for _, container := range pod.Spec.Containers { + log.G(h.Ctx).Debug("- Getting status for container " + container.Name) + cmd := []string{"ps -af name=^" + container.Name + "$ --format \"{{.Status}}\""} + + shell := exec.ExecTask{ + Command: "docker", + Args: cmd, + Shell: true, + } + execReturn, err := shell.Execute() + execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "") + + if err != nil { + log.G(h.Ctx).Error(err) + statusCode = http.StatusInternalServerError + break + } + + containerstatus := strings.Split(execReturn.Stdout, " ") + + // TODO: why first container? + if execReturn.Stdout != "" { + if containerstatus[0] == "Created" { + log.G(h.Ctx).Info("-- Container " + container.Name + " is going ready...") + resp[i].Containers = append(resp[i].Containers, v1.ContainerStatus{Name: container.Name, State: v1.ContainerState{Waiting: &v1.ContainerStateWaiting{}}, Ready: false}) + } else if containerstatus[0] == "Up" { + log.G(h.Ctx).Info("-- Container " + container.Name + " is running") + resp[i].Containers = append(resp[i].Containers, v1.ContainerStatus{Name: container.Name, State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}, Ready: true}) + } else if containerstatus[0] == "Exited" { + log.G(h.Ctx).Info("-- Container " + container.Name + " has been stopped") + resp[i].Containers = append(resp[i].Containers, v1.ContainerStatus{Name: container.Name, State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{}}, Ready: false}) + // release all the GPUs from the container + h.GpuManager.Release(container.Name) + } + } else { + log.G(h.Ctx).Info("-- Container " + container.Name + " doesn't exist") + resp[i].Containers = append(resp[i].Containers, v1.ContainerStatus{Name: container.Name, State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{}}, Ready: false}) + } + } + } + + w.WriteHeader(statusCode) + + if statusCode != http.StatusOK { + w.Write([]byte("Some errors occurred while checking container status. Check Docker Sidecar's logs")) + } else { + bodyBytes, err = json.Marshal(resp) + if err != nil { + log.G(h.Ctx).Error(err) + statusCode = http.StatusInternalServerError + w.WriteHeader(statusCode) + w.Write([]byte("Some errors occurred while checking container status. Check Docker Sidecar's logs")) + } + w.Write(bodyBytes) + } +} diff --git a/pkg/docker/aux.go b/pkg/docker/aux.go new file mode 100644 index 0000000..844595f --- /dev/null +++ b/pkg/docker/aux.go @@ -0,0 +1,267 @@ +package docker + +import ( + "context" + "errors" + "os" + "path/filepath" + "strings" + + exec2 "github.com/alexellis/go-execute/pkg/v1" + "github.com/containerd/containerd/log" + v1 "k8s.io/api/core/v1" + + "fmt" + + commonIL "github.com/intertwin-eu/interlink/pkg/common" + "github.com/intertwin-eu/interlink-docker-plugin/pkg/sidecars/docker/gpustrategies" +) + +type SidecarHandler struct { + Config commonIL.InterLinkConfig + Ctx context.Context + GpuManager gpustrategies.GPUManagerInterface +} + +// prepareMounts iterates along the struct provided in the data parameter and checks for ConfigMaps, Secrets and EmptyDirs to be mounted. +// For each element found, the mountData function is called. +// It returns a string composed as the docker -v command to bind mount directories and files and the first encountered error. +func prepareMounts(Ctx context.Context, config commonIL.InterLinkConfig, data []commonIL.RetrievedPodData, container v1.Container) (string, error) { + log.G(Ctx).Info("- Preparing mountpoints for " + container.Name) + mountedData := "" + + for _, podData := range data { + err := os.MkdirAll(config.DataRootFolder+podData.Pod.Namespace+"-"+string(podData.Pod.UID), os.ModePerm) + if err != nil { + log.G(Ctx).Error(err) + return "", err + } else { + log.G(Ctx).Info("-- Created directory " + config.DataRootFolder + podData.Pod.Namespace + "-" + string(podData.Pod.UID)) + } + + log.G(Ctx).Info("pod data values: " + fmt.Sprintf("%+v", podData)) + + for _, cont := range podData.Containers { + + log.G(Ctx).Info("cont values: " + fmt.Sprintf("%+v", cont)) + + log.G(Ctx).Info("-- Inside Preparing mountpoints for " + cont.Name) + for _, cfgMap := range cont.ConfigMaps { + if container.Name == cont.Name { + log.G(Ctx).Info("-- Mounting ConfigMap " + cfgMap.Name) + paths, err := mountData(Ctx, config, podData.Pod, cfgMap, container) + if err != nil { + log.G(Ctx).Error("Error mounting ConfigMap " + cfgMap.Name) + return "", errors.New("Error mounting ConfigMap " + cfgMap.Name) + } + for _, path := range paths { + mountedData += "-v " + path + " " + } + } + } + + for _, secret := range cont.Secrets { + if container.Name == cont.Name { + paths, err := mountData(Ctx, config, podData.Pod, secret, container) + if err != nil { + log.G(Ctx).Error("Error mounting Secret " + secret.Name) + return "", errors.New("Error mounting Secret " + secret.Name) + } + for _, path := range paths { + mountedData += "-v " + path + " " + } + } + } + + for _, emptyDir := range cont.EmptyDirs { + if container.Name == cont.Name { + paths, err := mountData(Ctx, config, podData.Pod, emptyDir, container) + if err != nil { + log.G(Ctx).Error("Error mounting EmptyDir " + emptyDir) + return "", errors.New("Error mounting EmptyDir " + emptyDir) + } + for _, path := range paths { + mountedData += "-v " + path + " " + } + } + } + } + } + + if last := len(mountedData) - 1; last >= 0 && mountedData[last] == ',' { + mountedData = mountedData[:last] + } + return mountedData, nil +} + +// mountData is called by prepareMounts and creates files and directory according to their definition in the pod structure. +// The data parameter is an interface and it can be of type v1.ConfigMap, v1.Secret and string (for the empty dir). +// Returns a string which is a bind mount of the file/directory. Example: path/to/file/on/host:path/to/file/in/container. +// It also returns the first encountered error. +func mountData(Ctx context.Context, config commonIL.InterLinkConfig, pod v1.Pod, data interface{}, container v1.Container) ([]string, error) { + wd, err := os.Getwd() + if err != nil { + log.G(Ctx).Error(err) + return nil, err + } + + log.G(Ctx).Info("Inside mountData ") + + if config.ExportPodData { + + log.G(Ctx).Info("Mounting data for " + container.Name) + + for _, mountSpec := range container.VolumeMounts { + + log.G(Ctx).Info("Mounting " + mountSpec.Name + " at " + mountSpec.MountPath) + + var podVolumeSpec *v1.VolumeSource + + for _, vol := range pod.Spec.Volumes { + if vol.Name == mountSpec.Name { + podVolumeSpec = &vol.VolumeSource + } + + switch mount := data.(type) { + case v1.ConfigMap: + var configMapNamePaths []string + err := os.RemoveAll(config.DataRootFolder + string(pod.UID) + "/" + "configMaps/" + vol.Name) + + if err != nil { + log.G(Ctx).Error("Unable to delete root folder") + return nil, err + } + if podVolumeSpec != nil && podVolumeSpec.ConfigMap != nil { + podConfigMapDir := filepath.Join(wd+"/"+config.DataRootFolder, string(pod.UID)+"/", "configMaps/", vol.Name) + mode := os.FileMode(*podVolumeSpec.ConfigMap.DefaultMode) + + if mount.Data != nil { + for key := range mount.Data { + path := filepath.Join(wd+podConfigMapDir, key) + path += (":" + mountSpec.MountPath + "/" + key + " ") + configMapNamePaths = append(configMapNamePaths, path) + } + } + + cmd := []string{"-p " + podConfigMapDir} + shell := exec2.ExecTask{ + Command: "mkdir", + Args: cmd, + Shell: true, + } + + execReturn, _ := shell.Execute() + if execReturn.Stderr != "" { + log.G(Ctx).Error(err) + return nil, err + } else { + log.G(Ctx).Debug("-- Created directory " + podConfigMapDir) + } + + log.G(Ctx).Info("-- Writing ConfigMaps files") + for k, v := range mount.Data { + // TODO: Ensure that these files are deleted in failure cases + fullPath := filepath.Join(podConfigMapDir, k) + os.WriteFile(fullPath, []byte(v), mode) + if err != nil { + log.G(Ctx).Errorf("Could not write ConfigMap file %s", fullPath) + err = os.RemoveAll(fullPath) + if err != nil { + log.G(Ctx).Error("Unable to remove file " + fullPath) + } + return nil, err + } else { + log.G(Ctx).Debug("--- Written ConfigMap file " + fullPath) + } + } + return configMapNamePaths, nil + } + + case v1.Secret: + var secretNamePaths []string + err := os.RemoveAll(config.DataRootFolder + string(pod.UID) + "/" + "secrets/" + vol.Name) + + if err != nil { + log.G(Ctx).Error("Unable to delete root folder") + return nil, err + } + if podVolumeSpec != nil && podVolumeSpec.Secret != nil { + mode := os.FileMode(*podVolumeSpec.Secret.DefaultMode) + podSecretDir := filepath.Join(wd+"/"+config.DataRootFolder, string(pod.UID)+"/", "secrets/", vol.Name) + + if mount.Data != nil { + for key := range mount.Data { + path := filepath.Join(podSecretDir, key) + path += (":" + mountSpec.MountPath + "/" + key + " ") + secretNamePaths = append(secretNamePaths, path) + } + } + + cmd := []string{"-p " + podSecretDir} + shell := exec2.ExecTask{ + Command: "mkdir", + Args: cmd, + Shell: true, + } + + execReturn, _ := shell.Execute() + if strings.Compare(execReturn.Stdout, "") != 0 { + log.G(Ctx).Error(err) + return nil, err + } + if execReturn.Stderr != "" { + log.G(Ctx).Error(execReturn.Stderr) + return nil, errors.New(execReturn.Stderr) + } else { + log.G(Ctx).Debug("-- Created directory " + podSecretDir) + } + + log.G(Ctx).Info("-- Writing Secret files") + for k, v := range mount.Data { + // TODO: Ensure that these files are deleted in failure cases + fullPath := filepath.Join(podSecretDir, k) + os.WriteFile(fullPath, v, mode) + if err != nil { + log.G(Ctx).Errorf("Could not write Secret file %s", fullPath) + err = os.RemoveAll(fullPath) + if err != nil { + log.G(Ctx).Error("Unable to remove file " + fullPath) + } + return nil, err + } else { + log.G(Ctx).Debug("--- Written Secret file " + fullPath) + } + } + return secretNamePaths, nil + } + + case string: + if podVolumeSpec != nil && podVolumeSpec.EmptyDir != nil { + var edPath string + + edPath = filepath.Join(wd+"/"+config.DataRootFolder, string(pod.UID)+"/"+"emptyDirs/"+vol.Name) + log.G(Ctx).Info("-- Creating EmptyDir in " + edPath) + cmd := []string{"-p " + edPath} + shell := exec2.ExecTask{ + Command: "mkdir", + Args: cmd, + Shell: true, + } + + _, err := shell.Execute() + if err != nil { + log.G(Ctx).Error(err) + return []string{""}, nil + } else { + log.G(Ctx).Debug("-- Created EmptyDir in " + edPath) + } + + edPath += (":" + mountSpec.MountPath + "/" + mountSpec.Name + " ") + return []string{edPath}, nil + } + } + } + } + } + return nil, err +} diff --git a/pkg/docker/gpustrategies/AmdHandler.go b/pkg/docker/gpustrategies/AmdHandler.go new file mode 100644 index 0000000..45e09bd --- /dev/null +++ b/pkg/docker/gpustrategies/AmdHandler.go @@ -0,0 +1 @@ +package gpustrategies diff --git a/pkg/docker/gpustrategies/NvidiaHandler.go b/pkg/docker/gpustrategies/NvidiaHandler.go new file mode 100644 index 0000000..5c454f0 --- /dev/null +++ b/pkg/docker/gpustrategies/NvidiaHandler.go @@ -0,0 +1,270 @@ +package gpustrategies + +import ( + "context" + "encoding/json" + "fmt" + "io/ioutil" + "strconv" + "strings" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/containerd/containerd/log" + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/client" + + "sync" +) + +type GPUSpecs struct { + Name string + UUID string + Type string + ContainerID string + Available bool + Index int +} + +type GPUManager struct { + GPUSpecsList []GPUSpecs + GPUSpecsMutex sync.Mutex // Mutex to make GPUSpecsList access atomic + Vendor string + Ctx context.Context +} + +type GPUManagerInterface interface { + Init() error + Shutdown() error + GetGPUSpecsList() []GPUSpecs + Dump() error + Discover() error + Check() error + GetAvailableGPUs(numGPUs int) ([]GPUSpecs, error) + Assign(UUID string, containerID string) error + Release(UUID string) error + GetAndAssignAvailableGPUs(numGPUs int, containerID string) ([]GPUSpecs, error) +} + +func (a *GPUManager) Init() error { + + ret := nvml.Init() + if ret != nvml.SUCCESS { + return fmt.Errorf("Unable to initialize NVML") + } + + return nil +} + +// Discover implements the Discover function of the GPUManager interface +func (a *GPUManager) Discover() error { + + log.G(a.Ctx).Info("Discovering GPUs...") + + count, ret := nvml.DeviceGetCount() + if ret != nvml.SUCCESS { + return fmt.Errorf("Unable to get device count: %v", nvml.ErrorString(ret)) + } + + for i := 0; i < count; i++ { + device, ret := nvml.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + return fmt.Errorf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret)) + } + + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + return fmt.Errorf("Unable to get uuid of device at index %d: %v", i, nvml.ErrorString(ret)) + } + + name, ret := device.GetName() + if ret != nvml.SUCCESS { + return fmt.Errorf("Unable to get name of device at index %d: %v", i, nvml.ErrorString(ret)) + } + + index, ret := device.GetIndex() + if ret != nvml.SUCCESS { + return fmt.Errorf("Unable to get index of device at index %d: %v", i, nvml.ErrorString(ret)) + } + + // Add the GPU to the GPUSpecsList + a.GPUSpecsList = append(a.GPUSpecsList, GPUSpecs{Name: name, UUID: uuid, Type: "NVIDIA", ContainerID: "", Available: true, Index: index}) + } + + // print the GPUSpecsList if the length is greater than 0 + if len(a.GPUSpecsList) > 0 { + log.G(a.Ctx).Info("Discovered GPUs:") + for _, gpuSpec := range a.GPUSpecsList { + log.G(a.Ctx).Info(fmt.Sprintf("Name: %s, UUID: %s, Type: %s, Available: %t, Index: %d", gpuSpec.Name, gpuSpec.UUID, gpuSpec.Type, gpuSpec.Available, gpuSpec.Index)) + } + } else { + log.G(a.Ctx).Info("No GPUs discovered") + } + + return nil +} + +func (a *GPUManager) Check() error { + + log.G(a.Ctx).Info("Checking the availability of GPUs...") + + cli, err := client.NewEnvClient() + if err != nil { + return fmt.Errorf("unable to create a new Docker client: %v", err) + } + + containers, err := cli.ContainerList(context.Background(), container.ListOptions{All: false}) // With All set to false I get only the running containers, if I set All to true I get all the containers (running and stopped) + if err != nil { + return fmt.Errorf("unable to list containers: %v", err) + } + + for _, container := range containers { + containerInfo, err := cli.ContainerInspect(context.Background(), container.ID) + if err != nil { + return fmt.Errorf("unable to inspect container: %v", err) + } + + for _, env := range containerInfo.Config.Env { + if strings.Contains(env, "NVIDIA_VISIBLE_DEVICES=") { + indexOfEqualSign := strings.Index(env, "=") + gpuIDs := env[indexOfEqualSign+1:] + gpuIDsSplitted := strings.Split(gpuIDs, ",") + + for _, gpuID := range gpuIDsSplitted { + gpuIndex, err := strconv.Atoi(gpuID) + if err != nil { + return fmt.Errorf("unable to convert GPU ID to int: %v", err) + } + for i := range a.GPUSpecsList { + if a.GPUSpecsList[i].Index == gpuIndex { + a.GPUSpecsList[i].ContainerID = containerInfo.ID + a.GPUSpecsList[i].Available = false + } + } + } + } + } + } + + // print the GPUSpecsList that are not available + for _, gpuSpec := range a.GPUSpecsList { + if !gpuSpec.Available { + log.G(a.Ctx).Info(fmt.Sprintf("GPU with UUID %s is not available. It is in use by container %s", gpuSpec.UUID, gpuSpec.ContainerID)) + } else { + log.G(a.Ctx).Info(fmt.Sprintf("GPU with UUID %s is available", gpuSpec.UUID)) + } + } + + return nil +} + +func (a *GPUManager) Shutdown() error { + + log.G(a.Ctx).Info("Shutting down NVML...") + + ret := nvml.Shutdown() + if ret != nvml.SUCCESS { + return fmt.Errorf("Unable to shutdown NVML: %v", nvml.ErrorString(ret)) + } + + return nil +} + +func (a *GPUManager) GetGPUSpecsList() []GPUSpecs { + return a.GPUSpecsList +} + +func (a *GPUManager) Assign(UUID string, containerID string) error { + + for i := range a.GPUSpecsList { + if a.GPUSpecsList[i].UUID == UUID { + + if a.GPUSpecsList[i].Available == false { + return fmt.Errorf("GPU with UUID %s is already in use by container %s", UUID, a.GPUSpecsList[i].ContainerID) + } + + a.GPUSpecsList[i].ContainerID = containerID + a.GPUSpecsList[i].Available = false + break + } + } + return nil + +} + +func (a *GPUManager) Release(containerID string) error { + + log.G(a.Ctx).Info("Releasing GPU from container " + containerID) + + a.GPUSpecsMutex.Lock() + defer a.GPUSpecsMutex.Unlock() + + for i := range a.GPUSpecsList { + if a.GPUSpecsList[i].ContainerID == containerID { + + if a.GPUSpecsList[i].Available == true { + continue + } + + a.GPUSpecsList[i].ContainerID = "" + a.GPUSpecsList[i].Available = true + } + } + + log.G(a.Ctx).Info("Correctly released GPU from container " + containerID) + + return nil +} + +func (a *GPUManager) GetAvailableGPUs(numGPUs int) ([]GPUSpecs, error) { + + var availableGPUs []GPUSpecs + for _, gpuSpec := range a.GPUSpecsList { + if gpuSpec.Available == true { + availableGPUs = append(availableGPUs, gpuSpec) + if len(availableGPUs) == numGPUs { + return availableGPUs, nil + } + } + } + return nil, fmt.Errorf("Not enough available GPUs. Requested: %d, Available: %d", numGPUs, len(availableGPUs)) +} + +func (a *GPUManager) GetAndAssignAvailableGPUs(numGPUs int, containerID string) ([]GPUSpecs, error) { + + a.GPUSpecsMutex.Lock() + defer a.GPUSpecsMutex.Unlock() + + gpuSpecs, err := a.GetAvailableGPUs(numGPUs) + if err != nil { + return nil, err + } + + for _, gpuSpec := range gpuSpecs { + err = a.Assign(gpuSpec.UUID, containerID) + if err != nil { + return nil, err + } + } + + return gpuSpecs, nil +} + +// dump the GPUSpecsList into a JSON file +func (a *GPUManager) Dump() error { + + log.G(a.Ctx).Info("Dumping the GPUSpecsList into a JSON file...") + + // Convert the array to JSON format + jsonData, err := json.MarshalIndent(a.GPUSpecsList, "", " ") + if err != nil { + return fmt.Errorf("Error marshalling JSON: %v", err) + } + + // Write JSON data to a file + err = ioutil.WriteFile("gpu_specs.json", jsonData, 0644) + if err != nil { + return fmt.Errorf("Error writing to file: %v", err) + } + + return nil +} diff --git a/tests/sidecars/docker/PodGpu_test.go b/tests/sidecars/docker/PodGpu_test.go new file mode 100644 index 0000000..43806ba --- /dev/null +++ b/tests/sidecars/docker/PodGpu_test.go @@ -0,0 +1,468 @@ +package main + +import ( + "bytes" + "context" + "fmt" + "io" + "os" + "strings" + "sync" + "testing" + "text/template" + "time" + + "k8s.io/client-go/tools/clientcmd" + + "io/ioutil" + + "github.com/intertwin-eu/interlink/tests/sidecars/docker/templates" // replace with the actual module path + v1core "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/yaml" +) + +type PodTemplateData struct { + Name string + Namespace string + Image string + ContainerName string + NodeSelector string + GpuRequested string + GpuLimits string +} + +// create a function that take as parameter the clientset and configure it +func CreateClientSet(kubeconfig string) (*kubernetes.Clientset, error) { + config, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + if err != nil { + return nil, fmt.Errorf("Error building kubeconfig: %v", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("Error creating Kubernetes client: %v", err) + } + + return clientset, nil +} + +func CreatePodDependencies(namespace, name, image, containerName, nodeSelector, gpuRequested, gpuLimits string) (string, string, error) { + + podData := new(PodTemplateData) + podData.ContainerName = containerName + podData.Image = image + podData.Name = name + podData.Namespace = namespace + podData.NodeSelector = nodeSelector + podData.GpuRequested = gpuRequested + podData.GpuLimits = gpuLimits + + startingPodTemplate, err := template.New("output_" + name + ".yaml").Parse(templates.NvidiaGpuPod) + if err != nil { + fmt.Printf("Error parsing template: %v\n", err) + return "", "", err + } + + folder, err := os.MkdirTemp(".", "output") + if err != nil { + fmt.Printf("Error creating temp folder: %v\n", err) + return "", "", err + } + + f, err := os.Create(folder + "/output_" + name + ".yaml") + if err != nil { + fmt.Printf("Error creating file: %v\n", err) + return "", "", err + } + err = startingPodTemplate.Execute(f, podData) + if err != nil { + fmt.Printf("Error executing template: %v\n", err) + return "", "", err + } + f.Close() + + return folder, folder + "/output_" + name + ".yaml", nil +} + +func watchPodStatus(clientset *kubernetes.Clientset, namespace string, createdPod *v1core.Pod, t *testing.T) (error, v1core.PodPhase, string) { + timeout := time.After(1 * time.Minute) + + watch, err := clientset.CoreV1().Pods(namespace).Watch(context.TODO(), metav1.ListOptions{ + FieldSelector: "metadata.name=" + createdPod.GetName(), + }) + if err != nil { + return fmt.Errorf("Error watching pod: %v", err), "", "" + } + + for { + select { + case event, ok := <-watch.ResultChan(): + if !ok { + return fmt.Errorf("pod watch channel closed before timeout"), "", "" + } + + pod, ok := event.Object.(*v1core.Pod) + if !ok { + fmt.Printf("Unexpected type\n") + continue + } + + t.Logf("Pod %s status: %s\n", pod.Name, pod.Status.Phase) + + if pod.Status.Phase == v1core.PodFailed || pod.Status.Phase == v1core.PodSucceeded { + watch.Stop() + + if pod.Status.Phase == v1core.PodFailed { + return nil, pod.Status.Phase, "" + } + // Get the logs of the pod + podLogOpts := v1core.PodLogOptions{} + req := clientset.CoreV1().Pods(namespace).GetLogs(pod.Name, &podLogOpts) + podLogs, err := req.Stream(context.TODO()) + if err != nil { + return fmt.Errorf("error in opening stream: %v", err), "", "" + } + defer podLogs.Close() + + buf := new(bytes.Buffer) + _, err = io.Copy(buf, podLogs) + if err != nil { + return fmt.Errorf("error in copy information from podLogs to buf: %v", err), "", "" + } + str := buf.String() + return nil, pod.Status.Phase, str + } + case <-timeout: + watch.Stop() + return fmt.Errorf("pod did not reach 'Succeeded' or 'Failed' status within 1 minute"), "", "" + } + } +} + +func CreatePod(yamlFilePath string, clientset *kubernetes.Clientset, namespace string) (*v1core.Pod, error) { + // Read the YAML file + yamlFile, err := ioutil.ReadFile(yamlFilePath) + if err != nil { + return nil, fmt.Errorf("Error reading YAML file: %v", err) + } + + // Unmarshal the YAML into a Pod object + var podTemplate v1core.Pod + err = yaml.Unmarshal(yamlFile, &podTemplate) + if err != nil { + return nil, fmt.Errorf("Error unmarshalling YAML: %v", err) + } + + // Create the Pod using the template + createdPod, err := clientset.CoreV1().Pods(namespace).Create(context.TODO(), &podTemplate, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("Error creating pod: %v", err) + } + + return createdPod, nil +} + +func AvoidTestPodFailure(t *testing.T) { + + name := "cuda-sample-fail" + containerName := "cuda-sample-container-fail" + namespace := "vk" + image := "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2" + node := "vkgpu" + kubeconfig := "/home/ubuntu/kubeconfig/kubeconfig.yaml" + gpuLimits := "3" // requesting 3 GPUs should fail because the VK node has only 2 GPU + gpuRequested := "3" // requesting 3 GPUs should fail because the VK node has only 2 GPU + + // call createClientSet function to create the clientset + clientset, err := CreateClientSet(kubeconfig) + if err != nil { + t.Fatalf("Error creating Kubernetes client: %v\n", err) + return + } + + // Create the Pod dependencies + folder, yamlFilePath, err := CreatePodDependencies(namespace, name, image, containerName, node, gpuRequested, gpuLimits) + if err != nil { + t.Fatalf("Error creating pod dependencies: %v\n", err) + return + } + + // Ensure that the temporary folder is removed + defer func() { + err := os.RemoveAll(folder) + if err != nil { + fmt.Printf("Error removing temp folder: %v\n", err) + } + }() + + createdPod, err := CreatePod(yamlFilePath, clientset, namespace) + if err != nil { + t.Fatalf("Error creating pod: %v\n", err) + return + } + + defer func() { + err := clientset.CoreV1().Pods(namespace).Delete(context.TODO(), createdPod.GetName(), metav1.DeleteOptions{}) + if err != nil { + fmt.Printf("Error deleting pod: %v\n", err) + } + }() + + var podPhase v1core.PodPhase + var expectedFinalPodPhase v1core.PodPhase = v1core.PodFailed + + err, podPhase, _ = watchPodStatus(clientset, namespace, createdPod, t) + if err != nil { + t.Fatalf("Error watching pod: %v\n", err) + return + } + + if podPhase != expectedFinalPodPhase { + t.Fatalf("Pod %s did not reach the expected phase: %s\n", createdPod.GetName(), podPhase) + return + } + + t.Logf("Pod %s reached the expected phase: %s\n", createdPod.GetName(), podPhase) + +} + +/* +* The following test creates a new client and create n cuda test pods in parallel. This test is to check +* if the Virtual Kubelet node can handle multiple pods requesting one gpu at the same time. + */ +func TestParallelMultiplePodOneGpuConcurrent(t *testing.T) { + name := "cuda-sample-" + containerName := "cuda-sample-container-" + namespace := "vk" + image := "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2" + node := "vkgpu" + kubeconfig := "/home/ubuntu/kubeconfig/kubeconfig.yaml" + gpuLimits := "1" + gpuRequested := "1" + + var nPodToRun int = 1 // number of pods to run + + // call createClientSet function to create the clientset + clientset, err := CreateClientSet(kubeconfig) + if err != nil { + t.Fatalf("Error creating Kubernetes client: %v\n", err) + return + } + + // initialize a list of created pods + createdPods := make([]*v1core.Pod, nPodToRun) + + for i := 0; i < nPodToRun; i++ { + + name := name + fmt.Sprint(i) + containerName := containerName + fmt.Sprint(i) + // Create the Pod dependencies + folder, yamlFilePath, err := CreatePodDependencies(namespace, name, image, containerName, node, gpuRequested, gpuLimits) + if err != nil { + t.Fatalf("Error creating pod dependencies: %v\n", err) + return + } + + // Ensure that the temporary folder is removed + defer func() { + err := os.RemoveAll(folder) + if err != nil { + fmt.Printf("Error removing temp folder: %v\n", err) + } + }() + + createdPod, err := CreatePod(yamlFilePath, clientset, namespace) + createdPods[i] = createdPod + + if err != nil { + t.Fatalf("Error creating pod: %v\n", err) + return + } + + } + + if err != nil { + t.Fatalf("Error converting gpuRequested to int: %v\n", err) + return + } + + // initialize a list of podPhases + podPhases := make([]v1core.PodPhase, nPodToRun) + + // check the status of the pods + var wg sync.WaitGroup + for i := 0; i < nPodToRun; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + + // Watch for the pod to reach 'Succeeded' status + watch, err := clientset.CoreV1().Pods(namespace).Watch(context.TODO(), metav1.ListOptions{ + FieldSelector: "metadata.name=" + createdPods[i].GetName(), + }) + if err != nil { + t.Errorf("Error watching pod: %v\n", err) + return + } + + timeout := time.After(time.Duration((i+1)*60) * time.Second) + done := make(chan bool) + + go func() { + for event := range watch.ResultChan() { + pod, ok := event.Object.(*v1core.Pod) + if !ok { + t.Errorf("Unexpected type\n") + return + } + + if pod.Status.Phase == v1core.PodSucceeded || pod.Status.Phase == v1core.PodFailed { + podPhases[i] = pod.Status.Phase + done <- true + return + } + } + }() + + // Wait for done or timeout + select { + case <-timeout: + t.Errorf("Timeout waiting for pod %s to reach 'Succeeded' or 'Failed' status\n", createdPods[i].GetName()) + podPhases[i] = v1core.PodFailed + case <-done: + } + }(i) + } + + wg.Wait() + + // delete the pods + for i := 0; i < nPodToRun; i++ { + err := clientset.CoreV1().Pods(namespace).Delete(context.TODO(), createdPods[i].GetName(), metav1.DeleteOptions{}) + if err != nil { + fmt.Printf("Error deleting pod: %v\n", err) + } + } + + // expect all pods to be succeded + for i := 0; i < nPodToRun; i++ { + t.Logf("Pod %s status: %s\n", createdPods[i].GetName(), podPhases[i]) + } + + // expect all pods to be succeded + for i := 0; i < nPodToRun; i++ { + if podPhases[i] != v1core.PodSucceeded { + t.Fatalf("Pod %s did not reach the expected phase: %s\n", createdPods[i].GetName(), podPhases[i]) + return + } + } +} + +/* +* The following test is a sequential version of the previous test. +* It creates a new client and create a cuda test pod sequentially and waits for the pod to reach the Succeeded status (or Failed status). +* Then it checks the logs of the pod to see if the test passed (it should contain the string "Test PASSED"). +* At the end of the test, the pod is deleted. +* This tests checks the log of the container while the other test checks only the status of the pod. The reason is that +* the pod can reach the Succeeded status even if the test failed. In this way we can check if the test is truly passed or not. + */ + +func TestSequentialMultiplePodOneGpu(t *testing.T) { + + name := "cuda-sample-" + containerName := "cuda-sample-container-" + namespace := "vk" + image := "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2" + node := "vkgpu" + kubeconfig := "/home/ubuntu/kubeconfig/kubeconfig.yaml" + gpuRequested := "1" + gpuLimit := "1" + + var nPodToRun int = 1 + + // call createClientSet function to create the clientset + clientset, err := CreateClientSet(kubeconfig) + if err != nil { + t.Fatalf("Error creating Kubernetes client: %v\n", err) + return + } + + for i := 0; i < nPodToRun; i++ { + + name := name + fmt.Sprint(i) + containerName := containerName + fmt.Sprint(i) + // Create the Pod dependencies + folder, yamlFilePath, err := CreatePodDependencies(namespace, name, image, containerName, node, gpuRequested, gpuLimit) + if err != nil { + t.Fatalf("Error creating pod dependencies: %v\n", err) + return + } + // Ensure that the temporary folder is removed + defer func() { + err := os.RemoveAll(folder) + if err != nil { + fmt.Printf("Error removing temp folder: %v\n", err) + } + }() + + createdPod, err := CreatePod(yamlFilePath, clientset, namespace) + + if err != nil { + t.Fatalf("Error creating pod: %v\n", err) + return + } + + // Ensure to delete the pod at the end of the test + defer func() { + err := clientset.CoreV1().Pods(namespace).Delete(context.TODO(), createdPod.GetName(), metav1.DeleteOptions{}) + if err != nil { + fmt.Printf("Error deleting pod: %v\n", err) + } + }() + + var podPhase v1core.PodPhase + var expectedFinalPodPhase v1core.PodPhase = v1core.PodSucceeded + var podLogs string + + err, podPhase, podLogs = watchPodStatus(clientset, namespace, createdPod, t) + if err != nil { + t.Fatalf("Error watching pod: %v\n", err) + return + } + + if podPhase != expectedFinalPodPhase { + t.Fatalf("Pod %s did not reach the expected phase: %s\n", createdPod.GetName(), podPhase) + return + } + + // check if podLogs contains the string "Test PASSED" + if !strings.Contains(podLogs, "Test PASSED") { + t.Fatalf("Pod %s did not pass the test\n", createdPod.GetName()) + return + } + + t.Log("Pod " + createdPod.GetName() + " passed the test") + } + +} + +func TestDeletePossibleOutputFolders(t *testing.T) { + + folders, err := os.ReadDir(".") + if err != nil { + t.Fatalf("Error reading directories: %v\n", err) + return + } + + for _, folder := range folders { + if folder.IsDir() && strings.HasPrefix(folder.Name(), "output") { + err := os.RemoveAll(folder.Name()) + if err != nil { + t.Fatalf("Error removing folder: %v\n", err) + return + } + } + } +} diff --git a/tests/sidecars/docker/export_vars.sh b/tests/sidecars/docker/export_vars.sh new file mode 100755 index 0000000..4af710c --- /dev/null +++ b/tests/sidecars/docker/export_vars.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +export TEST_POD_NAMESPACE="vk" +export TEST_POD_NAME="test-pod-vector-add" +export TEST_POD_IMAGE="nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2" +export TEST_POD_CONTAINER_NAME="gpu-vect-add" +export TEST_POD_NODE_SELECTOR="vkgpu" +export TEST_KUBECONFIG_FILEPATH="/home/ubuntu/kubeconfig/kubeconfig.yaml" \ No newline at end of file diff --git a/tests/sidecars/docker/templates/nvidia_gpu_pod_template.go b/tests/sidecars/docker/templates/nvidia_gpu_pod_template.go new file mode 100644 index 0000000..6336f79 --- /dev/null +++ b/tests/sidecars/docker/templates/nvidia_gpu_pod_template.go @@ -0,0 +1,38 @@ +package templates + +const ( + NvidiaGpuPod = `apiVersion: v1 +kind: Pod +metadata: + name: {{.Name}} + namespace: {{.Namespace}} +spec: + restartPolicy: Never + containers: + - image: {{.Image}} + imagePullPolicy: Always + name: {{.ContainerName}} + resources: + requests: + nvidia.com/gpu: {{.GpuRequested }} # requesting 1 GPU + limits: + nvidia.com/gpu: {{.GpuLimits }} # requesting 1 GPU + dnsPolicy: ClusterFirst + nodeSelector: + kubernetes.io/hostname: {{.NodeSelector}} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu + operator: Gte + values: + - {{.GpuLimits }} + tolerations: + - key: virtual-node.interlink/no-schedule + operator: Exists + - key: node.kubernetes.io/not-ready + operator: Exists +` +)