diff --git a/pkg/docker/Create.go b/pkg/docker/Create.go index 107e047..fa62ab9 100644 --- a/pkg/docker/Create.go +++ b/pkg/docker/Create.go @@ -333,12 +333,15 @@ func (h *SidecarHandler) CreateHandler(w http.ResponseWriter, r *http.Request) { Args: []string{"network", "create", "--driver", "bridge", string(data.Pod.UID) + "_dind_network"}, Shell: true, } - execReturn, err = shell.Execute() + execReturnNetworkCommand, err := shell.Execute() if err != nil { HandleErrorAndRemoveData(h, w, "An error occurred during the creation of the network for the DIND container", err, "", "") return } + // log the docker network creation command + log.G(h.Ctx).Info("\u2705 [POD FLOW] Docker network created successfully with command: " + "docker " + strings.Join(shell.Args, " ")) + dindContainerArgs := []string{"run"} dindContainerArgs = append(dindContainerArgs, gpuArgsAsArray...) if _, err := os.Stat("/cvmfs"); err == nil { @@ -363,21 +366,25 @@ func (h *SidecarHandler) CreateHandler(w http.ResponseWriter, r *http.Request) { } dindContainerID = execReturn.Stdout + // log also the command executed to create the DIND container + log.G(h.Ctx).Info("\u2705 [POD FLOW] DIND container command executed successfully: " + "docker " + strings.Join(shell.Args, " ")) + log.G(h.Ctx).Info("\u2705 [POD FLOW] DIND container created successfully with ID: " + dindContainerID) // create a variable of maximum number of retries - maxRetries := 10 + maxRetries := 20 + output := []byte{} // wait until the dind container is up and running by check that the command docker ps inside of it does not return an error for { if maxRetries == 0 { - HandleErrorAndRemoveData(h, w, "The number of attempts to check if the DIND container is running is 0. This means that an error occurred during the creation of the DIND container", err, "", "") + HandleErrorAndRemoveData(h, w, "The number of attempts to check if the DIND container is running is 0. This means that an error occurred during the creation of the DIND container UID. "+dindContainerID+" output: "+string(output)+" Network creation output "+string(execReturnNetworkCommand.Stdout), err, "", "") return } cmd := OSexec.Command("docker", "logs", string(data.Pod.UID)+"_dind") - output, err := cmd.CombinedOutput() + output, err = cmd.CombinedOutput() if err != nil { time.Sleep(1 * time.Second) diff --git a/pkg/docker/Delete.go b/pkg/docker/Delete.go index 3dccee8..e0fb496 100644 --- a/pkg/docker/Delete.go +++ b/pkg/docker/Delete.go @@ -43,103 +43,52 @@ func (h *SidecarHandler) DeleteHandler(w http.ResponseWriter, r *http.Request) { podNamespace := string(pod.Namespace) for _, container := range pod.Spec.Containers { - containerName := podNamespace + "-" + podUID + "-" + container.Name - - log.G(h.Ctx).Debug("\u2705 [DELETE CALL] Deleting container " + containerName) - - // added a timeout to the stop container command - cmd := []string{"exec", podUID + "_dind", "docker", "stop", "-t", "10", containerName} - shell := exec.ExecTask{ - Command: "docker", - Args: cmd, - Shell: true, - } - execReturn, _ = shell.Execute() - - if execReturn.Stderr != "" { - if strings.Contains(execReturn.Stderr, "No such container") { - log.G(h.Ctx).Debug("\u26A0 [DELETE CALL] Unable to find container " + containerName + ". Probably already removed? Skipping its removal") - } else { - log.G(h.Ctx).Error("\u274C [DELETE CALL] Error stopping container " + containerName + ". Skipping its removal") - statusCode = http.StatusInternalServerError - w.WriteHeader(statusCode) - w.Write([]byte("Some errors occurred while deleting container. Check Docker Sidecar's logs")) - return - } - continue - } - - if execReturn.Stdout != "" { - cmd = []string{"exec", podUID + "_dind", "docker", "rm", execReturn.Stdout} - shell = exec.ExecTask{ - Command: "docker", - Args: cmd, - Shell: true, - } - execReturn, _ = shell.Execute() - execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "") - - if execReturn.Stderr != "" { - log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting container " + containerName) - statusCode = http.StatusInternalServerError - w.WriteHeader(statusCode) - w.Write([]byte("Some errors occurred while deleting container. Check Docker Sidecar's logs")) - return - } else { - log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted container " + containerName) - } - } - - cmd = []string{"rm", "-f", podUID + "_dind"} - shell = exec.ExecTask{ - Command: "docker", - Args: cmd, - Shell: true, - } - execReturn, _ = shell.Execute() - execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "") - - if execReturn.Stderr != "" { - log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting container " + podUID + "_dind") - statusCode = http.StatusInternalServerError - w.WriteHeader(statusCode) - w.Write([]byte("Some errors occurred while deleting container. Check Docker Sidecar's logs")) - return - } else { - log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted container " + podUID + "_dind") - } - - // check if the container has GPU devices attacched using the GpuManager and release them h.GpuManager.Release(containerName) + } - // delete also the network of the docker dind container that is called string(data.Pod.UID) + "_dind_network" - cmd = []string{"network", "rm", podUID + "_dind_network"} - shell = exec.ExecTask{ - Command: "docker", - Args: cmd, - Shell: true, - } - execReturn, _ = shell.Execute() - execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "") - if execReturn.Stderr != "" { - log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting network " + podUID + "_dind_network") - } else { - log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted network " + podUID + "_dind_network") - } + log.G(h.Ctx).Debug("\u2705 [DELETE CALL] Deleting POD " + podUID + "_dind") - wd, err := os.Getwd() - if err != nil { - HandleErrorAndRemoveData(h, w, "Unable to get current working directory", err, "", "") - return - } - podDirectoryPathToDelete := filepath.Join(wd, h.Config.DataRootFolder+"/"+podNamespace+"-"+podUID) - log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleting directory " + podDirectoryPathToDelete) + cmd := []string{"rm", "-f", podUID + "_dind"} + shell := exec.ExecTask{ + Command: "docker", + Args: cmd, + Shell: true, + } + execReturn, _ = shell.Execute() + execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "") - err = os.RemoveAll(podDirectoryPathToDelete) + if execReturn.Stderr != "" { + log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting container " + podUID + "_dind") + statusCode = http.StatusInternalServerError + } else { + log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted container " + podUID + "_dind") + } + + // delete also the network of the docker dind container that is called string(data.Pod.UID) + "_dind_network" + cmd = []string{"network", "rm", podUID + "_dind_network"} + shell = exec.ExecTask{ + Command: "docker", + Args: cmd, + Shell: true, + } + execReturn, _ = shell.Execute() + execReturn.Stdout = strings.ReplaceAll(execReturn.Stdout, "\n", "") + if execReturn.Stderr != "" { + log.G(h.Ctx).Error("\u274C [DELETE CALL] Error deleting network " + podUID + "_dind_network") + } else { + log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleted network " + podUID + "_dind_network") + } - //os.RemoveAll(h.Config.DataRootFolder + pod.Namespace + "-" + string(pod.UID)) + wd, err := os.Getwd() + if err != nil { + HandleErrorAndRemoveData(h, w, "Unable to get current working directory", err, "", "") + return } + podDirectoryPathToDelete := filepath.Join(wd, h.Config.DataRootFolder+"/"+podNamespace+"-"+podUID) + log.G(h.Ctx).Info("\u2705 [DELETE CALL] Deleting directory " + podDirectoryPathToDelete) + + err = os.RemoveAll(podDirectoryPathToDelete) w.WriteHeader(statusCode) if statusCode != http.StatusOK { diff --git a/pkg/docker/Status.go b/pkg/docker/Status.go index b16ea9c..b418644 100644 --- a/pkg/docker/Status.go +++ b/pkg/docker/Status.go @@ -59,9 +59,19 @@ func (h *SidecarHandler) StatusHandler(w http.ResponseWriter, r *http.Request) { break } - dindUUID := strings.ReplaceAll(execReturn.Stdout, "\n", "") + //dindUUID := strings.ReplaceAll(execReturn.Stdout, "\n", "") + dindUUID := strings.Join(strings.Fields(execReturn.Stdout), "") log.G(h.Ctx).Info("\u2705 [STATUS CALL] UUID of the dind container retrieved successfully: ", dindUUID) + // if the string is empty or the length of the string is 0, return an error and 404 status code\ + if len(dindUUID) == 0 || dindUUID == "" { + log.G(h.Ctx).Error("\u274C [STATUS CALL] Error retrieving UUID of the dind container") + statusCode = http.StatusNotFound + w.WriteHeader(statusCode) + w.Write([]byte("DIND container with UUID " + dindUUID + " not found. Maybe it was deleted or never existed.")) + return + } + resp = append(resp, commonIL.PodStatus{PodName: pod.Name, PodUID: podUID, PodNamespace: podNamespace, JobID: dindUUID}) for _, container := range pod.Spec.Containers {