Skip to content
This repository has been archived by the owner on Apr 24, 2023. It is now read-only.

Commit

Permalink
add support for modifying pod image when checkpointing (#1729)
Browse files Browse the repository at this point in the history
  • Loading branch information
nsinkov authored Nov 11, 2020
1 parent 6f1ed4b commit 4e185df
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 9 deletions.
17 changes: 17 additions & 0 deletions scheduler/src/cook/kubernetes/api.clj
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,21 @@
checkpoint))
checkpoint)))))

(defn calculate-effective-image
  "Returns the image that should be used for the job's pod.

  The image is only transformed when both of the following are truthy:
    - :mode in the fourth argument (callers pass the effective checkpointing config)
    - :calculate-effective-image-fn in kubernetes-config
  In that case the configured function is resolved through util/lazy-load-var-memo
  and invoked with kubernetes-config, job-submit-time and image; its result is returned.

  If resolving or invoking that function throws an Exception, the error is logged
  (with the function name, image and task-id) and the original image is returned.
  In every other case the original image is returned untouched."
  [{:keys [calculate-effective-image-fn] :as kubernetes-config} job-submit-time image {:keys [mode]} task-id]
  (if-not (and mode calculate-effective-image-fn)
    ;; Either checkpointing has no mode or no transformation function is configured.
    image
    (try
      ;; Resolution happens inside the try so that a bad symbol also falls back to image.
      (let [transform-image (util/lazy-load-var-memo calculate-effective-image-fn)]
        (transform-image kubernetes-config job-submit-time image))
      (catch Exception e
        (log/error e "Error calculating effective image for checkpointing"
                   {:calculate-effective-image-fn calculate-effective-image-fn
                    :image image
                    :task-id task-id})
        ;; Best effort: keep launching with the unmodified image.
        image))))

(defn job->pod-labels
"Returns the dictionary of labels that should be
added to the job's pod based on the job's labels
Expand Down Expand Up @@ -779,6 +794,8 @@
{:keys [volumes volume-mounts sandbox-volume-mount-fn]} (make-volumes volumes sandbox-dir)
{:keys [custom-shell init-container set-container-cpu-limit? sidecar]} (config/kubernetes)
checkpoint (calculate-effective-checkpointing-config job task-id)
job-submit-time (tools/job->submit-time job)
image (calculate-effective-image (config/kubernetes) job-submit-time image checkpoint task-id)
checkpoint-memory-overhead (:memory-overhead checkpoint)
use-cook-init? (and init-container pod-supports-cook-init?)
use-cook-sidecar? (and sidecar pod-supports-cook-sidecar?)
Expand Down
9 changes: 2 additions & 7 deletions scheduler/src/cook/kubernetes/controller.clj
Original file line number Diff line number Diff line change
Expand Up @@ -311,26 +311,21 @@
; At this point, we don't care about the launch pod or the metric timers, so toss their dictionary away.
{:cook-expected-state :cook-expected-state/running})

;; Memoized lookup of the function used to turn a pod IP into a hostname.
;; Given a falsey argument it yields `identity`; otherwise it resolves the
;; argument with util/lazy-load-var. Results are cached per argument by memoize.
(def get-pod-ip->hostname-fn
  (memoize
    (fn [hostname-fn-sym]
      (if-not hostname-fn-sym
        identity
        (util/lazy-load-var hostname-fn-sym)))))

(defn record-sandbox-url
"Record the sandbox file server URL in datomic."
[pod-name {:keys [pod]}]
(when-not (api/synthetic-pod? pod-name)
(let [task-id (-> pod .getMetadata .getName)
pod-ip (-> pod .getStatus .getPodIP)
{:keys [default-workdir pod-ip->hostname-fn sidecar]} (config/kubernetes)
{:keys [default-workdir sidecar]} (config/kubernetes)
sandbox-fileserver-port (:port sidecar)
sandbox-health-check-endpoint (:health-check-endpoint sidecar)
sandbox-url (try
(when (and sandbox-fileserver-port
sandbox-health-check-endpoint
(not (str/blank? pod-ip)))
(str "http://"
((get-pod-ip->hostname-fn pod-ip->hostname-fn) pod-ip)
pod-ip
":" sandbox-fileserver-port
"/files/read.json?path="
(URLEncoder/encode default-workdir "UTF-8")))
Expand Down
3 changes: 1 addition & 2 deletions scheduler/src/cook/rest/api.clj
Original file line number Diff line number Diff line change
Expand Up @@ -1276,8 +1276,7 @@
(->> [attribute (str/upper-case (name operator)) pattern]
(map str)))))
instances (map #(fetch-instance-map db %1) (:job/instance job))
submit-time (when (:job/submit-time job) ; due to a bug, submit time may not exist for some jobs
(.getTime (:job/submit-time job)))
submit-time (util/job->submit-time job)
datasets (when (seq (:job/datasets job))
(dl/get-dataset-maps job))
attempts-consumed (util/job-ent->attempts-consumed db job)
Expand Down
5 changes: 5 additions & 0 deletions scheduler/src/cook/tools.clj
Original file line number Diff line number Diff line change
Expand Up @@ -1030,3 +1030,8 @@
(format "%.3f" %)
(str %))
resource-map))

(defn job->submit-time
  "Returns the result of calling .getTime on the job's :job/submit-time value
  (epoch milliseconds when that value is a java.util.Date), or nil when the job
  has no submit time. Due to a bug, submit time may not exist for some jobs."
  [job]
  (when-let [submitted-at (:job/submit-time job)]
    (.getTime submitted-at)))
3 changes: 3 additions & 0 deletions scheduler/src/cook/util.clj
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@
resolved
(throw (ex-info "Unable to resolve var, is it valid?" {:var-sym var-sym}))))))

;; Memoized variant of lazy-load-var: repeated calls with the same argument
;; return the cached result instead of resolving the var again.
;; clojure.core/memoize only stores values that were returned, so a call that
;; throws (e.g. "Unable to resolve var") is retried on the next invocation.
;; NOTE(review): the memoize cache is never evicted; this presumably only ever
;; sees a handful of config-supplied symbols -- confirm against callers.
(def lazy-load-var-memo
(memoize lazy-load-var))

;; Schema for a value that must satisfy both s/Int and the zero? predicate,
;; i.e. the integer zero. The 'zero? symbol is the predicate's name passed to s/pred.
;; NOTE(review): `s` is presumably the schema.core alias -- confirm in the ns form.
(def ZeroInt
(s/both s/Int (s/pred zero? 'zero?)))

Expand Down

0 comments on commit 4e185df

Please sign in to comment.