From 1d84675ae2d81e6dd4822068fb72cccab6bdc980 Mon Sep 17 00:00:00 2001 From: Daniel Posada Date: Tue, 25 Aug 2020 16:25:07 -0500 Subject: [PATCH] Adds support for EQUALS constraints in k8s (#1607) * Adds support for EQUALS constraints in k8s * Remove any default machine-type constraints if user specified any * Update config-k8s.edn --- scheduler/bin/help-make-cluster | 52 +++++++---------- scheduler/config-k8s.edn | 11 +++- scheduler/src/cook/config.clj | 12 ++++ scheduler/src/cook/kubernetes/api.clj | 23 +++++++- .../src/cook/kubernetes/compute_cluster.clj | 47 ++++++++++----- scheduler/src/cook/scheduler/constraints.clj | 58 ++++++++++++++++++- .../test/cook/test/scheduler/constraints.clj | 34 ++++++++++- 7 files changed, 185 insertions(+), 52 deletions(-) diff --git a/scheduler/bin/help-make-cluster b/scheduler/bin/help-make-cluster index b9d34e343e..44ed7c07b9 100755 --- a/scheduler/bin/help-make-cluster +++ b/scheduler/bin/help-make-cluster @@ -22,6 +22,22 @@ VERSION=1.15.11-gke.17 gcloud="gcloud --project $PROJECT" +function create_node_pool { + local cook_pool=$1 + local machine_type=$2 + echo "---- Creating $machine_type node pool for Cook pool $cook_pool" + $gcloud container node-pools create "cook-pool-$cook_pool-$machine_type" \ + --zone "$ZONE" \ + --cluster="$CLUSTERNAME" \ + --disk-size=20gb \ + --machine-type="$machine_type" \ + --node-taints="cook-pool=$cook_pool:NoSchedule" \ + --enable-autoscaling \ + --min-nodes=0 \ + --max-nodes=6 \ + --node-labels="cook-pool=$cook_pool,test-label=true,node-type=$machine_type" +} + echo "---- Building kubernetes cluster for project=$PROJECT zone=$ZONE clustername=$CLUSTERNAME kubeconfig=$COOK_KUBECONFIG" # This creates a cluster, with some nodes that are needed for kubernetes management. @@ -48,37 +64,11 @@ KUBECONFIG=${COOK_KUBECONFIG} kubectl create -f "${DIR}/priority-class-synthetic # Make some node-pools --- this takes a while, but it helps guarantee that we have nodes with the appropriate cook-pool taints. echo "---- Making extra k8s-alpha and k8s-gamma nodepools for cook pools (please wait 5-10 minutes)" -$gcloud container node-pools create cook-pool-k8s-gamma \ - --zone "$ZONE" \ - --cluster="$CLUSTERNAME" \ - --disk-size=20gb \ - --machine-type=g1-small \ - --node-taints=cook-pool=k8s-gamma:NoSchedule \ - --enable-autoscaling \ - --min-nodes=0 \ - --max-nodes=6 \ - --node-labels=cook-pool=k8s-gamma,test-label=true -$gcloud container node-pools create cook-pool-k8s-alpha \ - --zone "$ZONE" \ - --cluster="$CLUSTERNAME" \ - --disk-size=20gb \ - --machine-type=g1-small \ - --node-taints=cook-pool=k8s-alpha:NoSchedule \ - --enable-autoscaling \ - --min-nodes=0 \ - --max-nodes=6 \ - --node-labels=cook-pool=k8s-alpha,test-label=true -$gcloud container node-pools create cook-pool-k8s-quota \ - --zone "$ZONE" \ - --cluster="$CLUSTERNAME" \ - --disk-size=20gb \ - --machine-type=g1-small \ - --node-taints=cook-pool=k8s-quota:NoSchedule \ - --enable-autoscaling \ - --min-nodes=0 \ - --max-nodes=6 \ - --node-labels=cook-pool=k8s-quota,test-label=true - +create_node_pool k8s-alpha g1-small +create_node_pool k8s-gamma g1-small +create_node_pool k8s-quota g1-small +create_node_pool k8s-alpha e2-small +create_node_pool k8s-gamma e2-small echo "---- Setting up cook namespace in kubernetes" KUBECONFIG=${COOK_KUBECONFIG} kubectl create -f docs/make-kubernetes-namespace.json diff --git a/scheduler/config-k8s.edn b/scheduler/config-k8s.edn index 760e64932d..069af684d4 100644 --- a/scheduler/config-k8s.edn +++ b/scheduler/config-k8s.edn @@ -91,8 +91,15 @@ :network "HOST" :force-pull-image false :parameters []}}}] - :valid-gpu-models - [{:pool-regex "k8s-alpha" :valid-models #{"nvidia-tesla-k80" "nvidia-tesla-p100"} :default-model "nvidia-tesla-p100"}]} + :valid-gpu-models + [{:pool-regex "k8s-alpha" + :valid-models #{"nvidia-tesla-k80" "nvidia-tesla-p100"} + :default-model "nvidia-tesla-p100"}] + :default-job-constraints + [{:pool-regex "^k8s-.*" + :default-constraints [{:constraint/attribute "node-type" + :constraint/operator :constraint.operator/equals + :constraint/pattern "g1-small"}]}]} :port #config/env-int "COOK_PORT" :ssl {:port #config/env-int "COOK_SSL_PORT" :keystore-path #config/env "COOK_KEYSTORE_PATH" diff --git a/scheduler/src/cook/config.clj b/scheduler/src/cook/config.clj index ad332c908b..bc3af55cd5 100644 --- a/scheduler/src/cook/config.clj +++ b/scheduler/src/cook/config.clj @@ -557,6 +557,18 @@ [] (-> config :settings :pools :job-resource-adjustment)) +(defn default-job-constraints + "Returns the per-pool-regex default job constraints. + This function returns a list with the following shape: + [{:pool-regex ... + :default-constraints [{:constraint/attribute ... + :constraint/operator ... + :constraint/pattern ...} + ...]} + ...]" + [] + (-> config :settings :pools :default-job-constraints)) + (defn api-only-mode? "Returns true if api-only? mode is turned on" [] diff --git a/scheduler/src/cook/kubernetes/api.clj b/scheduler/src/cook/kubernetes/api.clj index c709fbbf64..187178e6e4 100644 --- a/scheduler/src/cook/kubernetes/api.clj +++ b/scheduler/src/cook/kubernetes/api.clj @@ -714,8 +714,8 @@ (defn ^V1Pod task-metadata->pod "Given a task-request and other data generate the kubernetes V1Pod to launch that task." [namespace compute-cluster-name - {:keys [task-id command container task-request hostname pod-annotations pod-labels pod-hostnames-to-avoid - pod-priority-class pod-supports-cook-init? pod-supports-cook-sidecar?] + {:keys [task-id command container task-request hostname pod-annotations pod-constraints pod-hostnames-to-avoid + pod-labels pod-priority-class pod-supports-cook-init? pod-supports-cook-sidecar?] :or {pod-priority-class cook-job-pod-priority-class pod-supports-cook-init? true pod-supports-cook-sidecar? true}}] @@ -929,6 +929,25 @@ ; synthetic pods (which don't specify a node name), but it ; doesn't hurt to add it for all pods we submit. (add-node-selector pod-spec cook-pool-label pool-name)) + + (when pod-constraints + (doseq [{:keys [constraint/attribute + constraint/operator + constraint/pattern]} + pod-constraints] + (condp = operator + :constraint.operator/equals + ; Add a node selector for any EQUALS constraints + ; either defined by the user on their job spec + ; or defaulted on the Cook pool via configuration + (add-node-selector pod-spec attribute pattern) + :else + ; A bad constraint operator should have + ; been blocked at job submission time + (throw (ex-info "Encountered unexpected constraint operator" + {:operator operator + :task-id task-id}))))) + (.setVolumes pod-spec (filterv some? (conj volumes init-container-workdir-volume scratch-space-volume diff --git a/scheduler/src/cook/kubernetes/compute_cluster.clj b/scheduler/src/cook/kubernetes/compute_cluster.clj index 0009425729..1647c3a1b2 100644 --- a/scheduler/src/cook/kubernetes/compute_cluster.clj +++ b/scheduler/src/cook/kubernetes/compute_cluster.clj @@ -108,19 +108,36 @@ (->> node-name->available (filter #(schedulable-node-filter node-name->node node-name->pods compute-cluster %)) (map (fn [[node-name available]] - {:id {:value (str (UUID/randomUUID))} - :framework-id compute-cluster-name - :slave-id {:value node-name} - :hostname node-name - :resources [{:name "mem" :type :value-scalar :scalar (max 0.0 (:mem available))} - {:name "cpus" :type :value-scalar :scalar (max 0.0 (:cpus available))} - {:name "disk" :type :value-scalar :scalar 0.0} - {:name "gpus" :type :value-text->scalar :text->scalar (:gpus available)}] - :attributes [{:name "compute-cluster-type" :type :value-text :text "kubernetes"}] - :executor-ids [] - :compute-cluster compute-cluster - :reject-after-match-attempt true - :offer-match-timer (timers/start (ccmetrics/timer "offer-match-timer" compute-cluster-name))}))))) + (let [node-label-attributes + ; Convert all node labels to offer + ; attributes so that job constraints + ; can use them in the matching process + (or + (some->> node-name + ^V1Node (get node-name->node) + .getMetadata + .getLabels + (map (fn [[key value]] + {:name key + :type :value-text + :text value}))) + [])] + {:id {:value (str (UUID/randomUUID))} + :framework-id compute-cluster-name + :slave-id {:value node-name} + :hostname node-name + :resources [{:name "mem" :type :value-scalar :scalar (max 0.0 (:mem available))} + {:name "cpus" :type :value-scalar :scalar (max 0.0 (:cpus available))} + {:name "disk" :type :value-scalar :scalar 0.0} + {:name "gpus" :type :value-text->scalar :text->scalar (:gpus available)}] + :attributes (conj node-label-attributes + {:name "compute-cluster-type" + :type :value-text + :text "kubernetes"}) + :executor-ids [] + :compute-cluster compute-cluster + :reject-after-match-attempt true + :offer-match-timer (timers/start (ccmetrics/timer "offer-match-timer" compute-cluster-name))})))))) (defn taskids-to-scan "Determine all task ids to scan by union-ing task id's from cook expected state and existing task id maps. " @@ -421,6 +438,10 @@ ; We need to *not* prevent the cluster autoscaler from ; removing a node just because it's running synthetic pods :pod-annotations {api/k8s-safe-to-evict-annotation "true"} + ; Job constraints need to be expressed on synthetic + ; pods so that we trigger the cluster autoscaler to + ; spin up nodes that will end up satisfying them + :pod-constraints (constraints/job->constraints job) ; Cook has a "novel host constraint", which disallows a job from ; running on the same host twice. So, we need to avoid running a ; synthetic pod on any of the hosts that the real job won't be able diff --git a/scheduler/src/cook/scheduler/constraints.clj b/scheduler/src/cook/scheduler/constraints.clj index 55b382b857..a8a856c43e 100644 --- a/scheduler/src/cook/scheduler/constraints.clj +++ b/scheduler/src/cook/scheduler/constraints.clj @@ -177,11 +177,62 @@ (when (and (not (empty? datasets)) (= fitness-calculator dl/data-local-fitness-calculator)) (->data-locality-constraint job launch-wait-seconds)))) -(defrecord user-defined-constraint [constraints] +(defn job->default-constraints + "Returns the list of default constraints configured for the job's pool" + [job] + (util/match-based-on-pool-name + (config/default-job-constraints) + (util/job->pool-name job) + :default-constraints)) + +(def machine-type-constraint-attributes + #{"cpu-architecture" "node-family" "node-type"}) + +(defn job->constraints + "Given a job, returns all job constraints that should be in effect, + either specified on the job submission or defaulted via configuration" + [{:keys [job/constraint] :as job}] + (let [user-specified-constraints constraint + + user-specified-constraint-attributes + (map :constraint/attribute + user-specified-constraints) + + user-specified-machine-type-constraint? + (some + machine-type-constraint-attributes + user-specified-constraint-attributes) + + default-constraints + (->> job + job->default-constraints + (remove + (fn [{:keys [constraint/attribute]}] + (or + ; Remove any default constraints for which the user + ; has specified a constraint on the same attribute + (some #{attribute} + user-specified-constraint-attributes) + + ; Remove any default machine-type constraints if + ; the user has specified any machine-type constraint + ; (otherwise, they could conflict with each other) + (and + user-specified-machine-type-constraint? + (some #{attribute} + machine-type-constraint-attributes))))))] + + (-> user-specified-constraints + (concat default-constraints) + ; De-lazy the output; Fenzo can call from multiple + ; threads and we want to avoid contention in LazySeq + vec))) + +(defrecord user-defined-constraint [job] JobConstraint (job-constraint-name [this] (get-class-name this)) (job-constraint-evaluate - [this _ vm-attributes] + [_ _ vm-attributes] (let [vm-passes-constraint? (fn vm-passes-constraint? [{attribute :constraint/attribute pattern :constraint/pattern @@ -193,6 +244,7 @@ (log/error (str "Unknown operator " operator " api.clj should have prevented this from happening.")) true)))) + constraints (job->constraints job) passes? (every? vm-passes-constraint? constraints)] [passes? (when-not passes? "Host doesn't pass at least one user supplied constraint.")])) @@ -204,7 +256,7 @@ "Constructs a user-defined-constraint. The constraint asserts that the vm passes the constraints the user supplied as host constraints" [job] - (->user-defined-constraint (:job/constraint job))) + (->user-defined-constraint job)) (defrecord estimated-completion-constraint [estimated-end-time host-lifetime-mins] JobConstraint diff --git a/scheduler/test/cook/test/scheduler/constraints.clj b/scheduler/test/cook/test/scheduler/constraints.clj index 346ba698a7..6a0d28e8f8 100644 --- a/scheduler/test/cook/test/scheduler/constraints.clj +++ b/scheduler/test/cook/test/scheduler/constraints.clj @@ -39,13 +39,15 @@ (constraints/group-constraint-name (constraints/->attribute-equals-host-placement-group-constraint nil))))) (deftest test-user-defined-constraint + (setup) (let [constraints [{:constraint/attribute "is_spot" :constraint/operator :constraint.operator/equals :constraint/pattern "true"} {:constraint/attribute "instance_type" :constraint/operator :constraint.operator/equals :constraint/pattern "mem.large"}] - user-defined-constraint (constraints/->user-defined-constraint constraints)] + job {:job/constraint constraints} + user-defined-constraint (constraints/->user-defined-constraint job)] (is (= true (first (constraints/job-constraint-evaluate user-defined-constraint nil {"is_spot" "true" "instance_type" "mem.large"})))) (is (= false (first (constraints/job-constraint-evaluate user-defined-constraint nil {"is_spot" "true" "instance_type" "cpu.large"})))) (is (= false (first (constraints/job-constraint-evaluate user-defined-constraint nil {"is_spot" "false" "instance_type" "mem.large"})))) @@ -436,3 +438,33 @@ {:instance/hostname "host-1"}]})] (is (= 3 (count hostnames))) (is (= (set ["host-1" "host-2" "host-3"]) (set hostnames)))))) + +(deftest test-job->constraints + (let [default-constraints [{:constraint/attribute "cpu-architecture" + :constraint/operator :constraint.operator/equals + :constraint/pattern "intel-haswell"} + {:constraint/attribute "node-family" + :constraint/operator :constraint.operator/equals + :constraint/pattern "n1"}]] + (with-redefs [config/default-job-constraints + (constantly [{:pool-regex ".*" + :default-constraints default-constraints}])] + + (testing "default constraints" + (is (= default-constraints (constraints/job->constraints {})))) + + (testing "default constraints plus user constraint" + (let [user-constraints [{:constraint/attribute "foo" + :constraint/operator :constraint.operator/equals + :constraint/pattern "bar"}]] + (is (= (concat user-constraints default-constraints) + (constraints/job->constraints + {:job/constraint user-constraints}))))) + + (testing "removes default machine-type constraints if user specified any" + (let [user-constraints [{:constraint/attribute "node-type" + :constraint/operator :constraint.operator/equals + :constraint/pattern "c2-standard-30"}]] + (is (= user-constraints + (constraints/job->constraints + {:job/constraint user-constraints}))))))))