Skip to content

Commit

Permalink
updated number of retries for DIND container
Browse files Browse the repository at this point in the history
  • Loading branch information
Bianco95 committed Aug 6, 2024
1 parent 86a3279 commit 48d1009
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 95 deletions.
15 changes: 11 additions & 4 deletions pkg/docker/Create.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,12 +333,15 @@ func (h *SidecarHandler) CreateHandler(w http.ResponseWriter, r *http.Request) {
Args: []string{"network", "create", "--driver", "bridge", string(data.Pod.UID) + "_dind_network"},
Shell: true,
}
execReturn, err = shell.Execute()
execReturnNetworkCommand, err := shell.Execute()
if err != nil {
HandleErrorAndRemoveData(h, w, "An error occurred during the creation of the network for the DIND container", err, "", "")
return
}

// log the docker network creation command
log.G(h.Ctx).Info("\u2705 [POD FLOW] Docker network created successfully with command: " + "docker " + strings.Join(shell.Args, " "))

dindContainerArgs := []string{"run"}
dindContainerArgs = append(dindContainerArgs, gpuArgsAsArray...)
if _, err := os.Stat("/cvmfs"); err == nil {
Expand All @@ -363,21 +366,25 @@ func (h *SidecarHandler) CreateHandler(w http.ResponseWriter, r *http.Request) {
}
dindContainerID = execReturn.Stdout

// log also the command executed to create the DIND container
log.G(h.Ctx).Info("\u2705 [POD FLOW] DIND container command executed successfully: " + "docker " + strings.Join(shell.Args, " "))

log.G(h.Ctx).Info("\u2705 [POD FLOW] DIND container created successfully with ID: " + dindContainerID)

// maximum number of attempts to check that the DIND container is running
maxRetries := 10
maxRetries := 20
output := []byte{}

// wait until the dind container is up and running by checking that the command docker ps inside of it does not return an error
for {

if maxRetries == 0 {
HandleErrorAndRemoveData(h, w, "The number of attempts to check if the DIND container is running is 0. This means that an error occurred during the creation of the DIND container", err, "", "")
HandleErrorAndRemoveData(h, w, "The number of attempts to check if the DIND container is running is 0. This means that an error occurred during the creation of the DIND container UID. "+dindContainerID+" output: "+string(output)+" Network creation output "+string(execReturnNetworkCommand.Stdout), err, "", "")
return
}

cmd := OSexec.Command("docker", "logs", string(data.Pod.UID)+"_dind")
output, err := cmd.CombinedOutput()
output, err = cmd.CombinedOutput()

if err != nil {
time.Sleep(1 * time.Second)
Expand Down
129 changes: 39 additions & 90 deletions pkg/docker/Delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,103 +43,52 @@ func (h *SidecarHandler) DeleteHandler(w http.ResponseWriter, r *http.Request) {
podNamespace := string(pod.Namespace)

for _, container := range pod.Spec.Containers {

containerName := podNamespace + "-" + podUID + "-" + container.Name

log.G(h.Ctx).Debug("\u2705 [DELETE CALL] Deleting container " + containerName)

// added a timeout to the stop container command
cmd := []string{"exec", podUID + "_dind", "docker", "stop", "-t", "10", containerName}
shell := exec.ExecTask{
Command: "docker",
Args: cmd,
Shell: true,
}
execReturn, _ = shell.Execute()

if execReturn.Stderr != "" {
if strings.Contains(execReturn.Stderr, "No such container") {
log.G(h.Ctx).Debug("\u26A0 [DELETE CALL] Unable to find container " + containerName + ". Probably already removed? Skipping its removal")
} else {
log.G(h.Ctx).Error("\u274C [DELETE CALL] Error stopping container " + containerName + ". Skipping its removal")
statusCode = http.StatusInternalServerError
w.WriteHeader(statusCode)
w.Write([]byte("Some errors occurred while deleting container. Check Docker Sidecar's logs"))
return
}
continue
}

if execReturn.Stdout != "" {
cmd = []string{"exec", podUID + "_dind", "docker", "rm", execReturn.Stdout}
shell = exec.ExecTask{
Command: "docker",
Args: cmd,
Shell: true,
}
execReturn, _ = shell.Execute()
execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "")

if execReturn.Stderr != "" {
log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting container " + containerName)
statusCode = http.StatusInternalServerError
w.WriteHeader(statusCode)
w.Write([]byte("Some errors occurred while deleting container. Check Docker Sidecar's logs"))
return
} else {
log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted container " + containerName)
}
}

cmd = []string{"rm", "-f", podUID + "_dind"}
shell = exec.ExecTask{
Command: "docker",
Args: cmd,
Shell: true,
}
execReturn, _ = shell.Execute()
execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "")

if execReturn.Stderr != "" {
log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting container " + podUID + "_dind")
statusCode = http.StatusInternalServerError
w.WriteHeader(statusCode)
w.Write([]byte("Some errors occurred while deleting container. Check Docker Sidecar's logs"))
return
} else {
log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted container " + podUID + "_dind")
}

// check if the container has GPU devices attached using the GpuManager and release them
h.GpuManager.Release(containerName)
}

// delete also the network of the docker dind container that is called string(data.Pod.UID) + "_dind_network"
cmd = []string{"network", "rm", podUID + "_dind_network"}
shell = exec.ExecTask{
Command: "docker",
Args: cmd,
Shell: true,
}
execReturn, _ = shell.Execute()
execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "")
if execReturn.Stderr != "" {
log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting network " + podUID + "_dind_network")
} else {
log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted network " + podUID + "_dind_network")
}
log.G(h.Ctx).Debug("\u2705 [DELETE CALL] Deleting POD " + podUID + "_dind")

wd, err := os.Getwd()
if err != nil {
HandleErrorAndRemoveData(h, w, "Unable to get current working directory", err, "", "")
return
}
podDirectoryPathToDelete := filepath.Join(wd, h.Config.DataRootFolder+"/"+podNamespace+"-"+podUID)
log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleting directory " + podDirectoryPathToDelete)
cmd := []string{"rm", "-f", podUID + "_dind"}
shell := exec.ExecTask{
Command: "docker",
Args: cmd,
Shell: true,
}
execReturn, _ = shell.Execute()
execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "")

err = os.RemoveAll(podDirectoryPathToDelete)
if execReturn.Stderr != "" {
log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting container " + podUID + "_dind")
statusCode = http.StatusInternalServerError
} else {
log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted container " + podUID + "_dind")
}

// delete also the network of the docker dind container that is called string(data.Pod.UID) + "_dind_network"
cmd = []string{"network", "rm", podUID + "_dind_network"}
shell = exec.ExecTask{
Command: "docker",
Args: cmd,
Shell: true,
}
execReturn, _ = shell.Execute()
execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "")
if execReturn.Stderr != "" {
log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting network " + podUID + "_dind_network")
} else {
log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted network " + podUID + "_dind_network")
}

//os.RemoveAll(h.Config.DataRootFolder + pod.Namespace + "-" + string(pod.UID))
wd, err := os.Getwd()
if err != nil {
HandleErrorAndRemoveData(h, w, "Unable to get current working directory", err, "", "")
return
}
podDirectoryPathToDelete := filepath.Join(wd, h.Config.DataRootFolder+"/"+podNamespace+"-"+podUID)
log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleting directory " + podDirectoryPathToDelete)

err = os.RemoveAll(podDirectoryPathToDelete)

w.WriteHeader(statusCode)
if statusCode != http.StatusOK {
Expand Down
12 changes: 11 additions & 1 deletion pkg/docker/Status.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,19 @@ func (h *SidecarHandler) StatusHandler(w http.ResponseWriter, r *http.Request) {
break
}

dindUUID := strings.ReplaceAll(execReturn.Stdout, "\n", "")
//dindUUID := strings.ReplaceAll(execReturn.Stdout, "\n", "")
dindUUID := strings.Join(strings.Fields(execReturn.Stdout), "")
log.G(h.Ctx).Info("\u2705 [STATUS CALL] UUID of the dind container retrieved successfully: ", dindUUID)

// if the string is empty (length 0), return an error and a 404 status code
if len(dindUUID) == 0 || dindUUID == "" {
log.G(h.Ctx).Error("\u274C [STATUS CALL] Error retrieving UUID of the dind container")
statusCode = http.StatusNotFound
w.WriteHeader(statusCode)
w.Write([]byte("DIND container with UUID " + dindUUID + " not found. Maybe it was deleted or never existed."))
return
}

resp = append(resp, commonIL.PodStatus{PodName: pod.Name, PodUID: podUID, PodNamespace: podNamespace, JobID: dindUUID})
for _, container := range pod.Spec.Containers {

Expand Down

0 comments on commit 48d1009

Please sign in to comment.