From c7fca9edef9a63b5844f694a0310599baa41cd9f Mon Sep 17 00:00:00 2001 From: Daniel Posada Date: Tue, 1 Jun 2021 12:48:45 -0400 Subject: [PATCH] Allows cluster features to be dynamic (#1877) * Allows cluster features to be dynamic * Correctly updates features on existing clusters * Removes features from keys-to-keep-synced between db and in-memory --- scheduler/src/cook/compute_cluster.clj | 27 ++++--- scheduler/src/cook/schema.clj | 35 ++++++++- scheduler/test/cook/test/compute_cluster.clj | 80 +++++++++++++++++++- 3 files changed, 125 insertions(+), 17 deletions(-) diff --git a/scheduler/src/cook/compute_cluster.clj b/scheduler/src/cook/compute_cluster.clj index dc1b906567..37e69bc2f6 100644 --- a/scheduler/src/cook/compute_cluster.clj +++ b/scheduler/src/cook/compute_cluster.clj @@ -303,11 +303,10 @@ (get current-in-mem-configs key) keys-to-keep-synced (cond-> [:base-path :ca-cert :state] - ; The location and features fields are treated specially when comparing db and in-mem configs -- - ; since these are new fields, they can be nil in-memory and non-nil in the db for a + ; The location field is treated specially when comparing db and in-mem configs -- + ; since this is a new field, it can be nil in-memory and non-nil in the db for a ; limited time after the db has been populated and before the leader is restarted. - current-in-mem-location (conj :location) - current-in-mem-features (conj :features))] + current-in-mem-location (conj :location))] (when (not= (select-keys current-db-config keys-to-keep-synced) (select-keys current-in-mem-config keys-to-keep-synced)) (log/error keys-to-keep-synced "differ between in-memory and database cluster configurations." @@ -356,16 +355,12 @@ The location and state fields are special in this regard -- location can only transition from nil to non-nil, and state is validated in cluster-state-change-valid?." - [{current-location :location current-features :features :as current-config} - {new-location :location new-features :features :as new-config}] + [{current-location :location :as current-config} + {new-location :location :as new-config}] (cond (and (some? current-location) (not= current-location new-location)) false - (and (seq current-features) - (not= (set current-features) (set new-features))) - false - :else (let [dissoc-non-comparable-fields #(-> % @@ -504,8 +499,16 @@ (try (when differs? ; :db.unique/identity gives upsert behavior (update on insert), so we don't need to specify an entity ID when updating - @(d/transact conn [(assoc (compute-cluster-config->compute-cluster-config-ent goal-config) - :db/id (d/tempid :db.part/user))])) + @(d/transact + conn + [; We need to retract existing features; otherwise, + ; features simply get added to the existing list + [:db.fn/resetAttribute + [:compute-cluster-config/name name] + :compute-cluster-config/features + nil] + (assoc (compute-cluster-config->compute-cluster-config-ent goal-config) + :db/id (d/tempid :db.part/user))])) (let [{:keys [state-atom state-locked?-atom] :as cluster} (@cluster-name->compute-cluster-atom name)] (if cluster (do diff --git a/scheduler/src/cook/schema.clj b/scheduler/src/cook/schema.clj index e62a9587da..29078d025e 100644 --- a/scheduler/src/cook/schema.clj +++ b/scheduler/src/cook/schema.clj @@ -1358,7 +1358,40 @@ for a job. E.g. {:resources {:cpus 4 :mem 3} :constraints {\"unique_host_constra (< attempts-consumed retries)) [[:db/add job-entity-id :job/state :job.state/waiting] [:db/add job-entity-id :job/last-waiting-start-time (Date.)]] - []))}}]) + []))}} + ; https://stackoverflow.com/a/42119449 + {:db/id (d/tempid :db.part/user) + :db/ident :db.fn/resetAttribute + :db/doc + "Unconditionally set an entity's attribute's values to those provided, + retracting all other existing values. + + Values must be a collection (list, seq, vector), even for cardinality-one + attributes. An empty collection (or nil) will retract all values. The values + themselves must be primitive, i.e. no map forms are permitted for refs, use + tempids directly. If the attribute is-component, removed values will be + :db.fn/retractEntity-ed." + :db/fn + #db/fn {:lang "clojure" + :params [db ent attr values] + :code (let [eid (datomic.api/entid db ent) + aid (datomic.api/entid db attr) + {:keys [value-type is-component]} (datomic.api/attribute db aid) + newvalues (if (= value-type :db.type/ref) + (into #{} (map #(if (string? %) % (d/entid db %))) values) + (into #{} values)) + oldvalues (into #{} (map :v) (datomic.api/datoms db :eavt eid aid))] + (-> [] + (into (comp + (remove newvalues) + (map (if is-component + #(do [:db.fn/retractEntity %]) + #(do [:db/retract eid aid %])))) + oldvalues) + (into (comp + (remove oldvalues) + (map #(do [:db/add eid aid %]))) + newvalues)))}}]) (def reason-entities [{:db/id (d/tempid :db.part/user) diff --git a/scheduler/test/cook/test/compute_cluster.clj b/scheduler/test/cook/test/compute_cluster.clj index c93ccdff54..2c400deb2c 100644 --- a/scheduler/test/cook/test/compute_cluster.clj +++ b/scheduler/test/cook/test/compute_cluster.clj @@ -604,6 +604,7 @@ (is (= {} (get-in-mem-configs)))))) (deftest test-execute-update! + (setup) (with-redefs [config/compute-cluster-templates (constantly {"template1" {:config {:a :bb :c :dd :dynamic-cluster-config? true} @@ -694,7 +695,66 @@ :location "us-east1" :features [{:key "NVIDIA_DRIVER_VERSION" :value "450-80-02"}]}} - (get-in-mem-configs))))) + (get-in-mem-configs)))) + + (testing "features are appropriately updated" + (is (= {:update-succeeded true} + (execute-update! conn + {:goal-config + {:a :a + :name "name" + :template "template1" + :base-path "base-path" + :ca-cert "ca-cert" + :state :running + :state-locked? true + :location "us-east1" + :features [{:key "foo" :value "bar"} + {:key "baz" :value "qux"}]} + :valid? true + :differs? true + :active? true}))) + (is (= {:base-path "base-path" + :ca-cert "ca-cert" + :name "name" + :state :running + :state-locked? true + :template "template1" + :location "us-east1" + :features [{:key "foo" :value "bar"} + {:key "baz" :value "qux"}]} + (-> (get-db-config-ents (d/db conn)) + (get "name") + compute-cluster-config-ent->compute-cluster-config))) + (is (= {:update-succeeded true} + (execute-update! conn + {:goal-config + {:a :a + :name "name" + :template "template1" + :base-path "base-path" + :ca-cert "ca-cert" + :state :running + :state-locked? true + :location "us-east1" + :features [{:key "one" :value "two"} + {:key "three" :value "four"}]} + :valid? true + :differs? true + :active? true}))) + (is (= {:base-path "base-path" + :ca-cert "ca-cert" + :name "name" + :state :running + :state-locked? true + :template "template1" + :location "us-east1" + :features [{:key "three" :value "four"} + {:key "one" :value "two"}]} + (-> (get-db-config-ents (d/db conn)) + (get "name") + compute-cluster-config-ent->compute-cluster-config))))) + (let [conn (restore-fresh-database! uri)] (testing "normal update - insert then update" (testing "insert" @@ -759,8 +819,7 @@ :state-locked? true :template "template1" :location "us-east1" - :features [{:key "NVIDIA_DRIVER_VERSION" - :value "450-80-02"}]} + :features []} (-> (get-db-config-ents (d/db conn)) (get "name") compute-cluster-config-ent->compute-cluster-config))) (is (= {"name" {:base-path "base-path" :ca-cert "ca-cert" @@ -772,6 +831,7 @@ :features [{:key "NVIDIA_DRIVER_VERSION" :value "450-80-02"}]}} (get-in-mem-configs)))))) + (let [conn (restore-fresh-database! uri)] (testing "exceptions" (reset! cluster-name->compute-cluster-atom {}) @@ -1058,4 +1118,16 @@ (deftest test-config=? (testing "features can be in different orders" - (is (true? (config=? {:features [:a :b]} {:features [:b :a]}))))) + (is (true? (config=? {:features [:a :b]} {:features [:b :a]})))) + + (testing "features can be completely dynamic" + (is (true? (config=? {:features [:a :b]} {}))) + (is (true? (config=? {:features [:a :b]} {:features nil}))) + (is (true? (config=? {:features [:a :b]} {:features []}))) + (is (true? (config=? {:features [:a :b]} {:features [:a]}))) + (is (true? (config=? {:features [:a :b]} {:features [:c]}))) + (is (true? (config=? {} {:features [:a :b]}))) + (is (true? (config=? {:features nil} {:features [:a :b]}))) + (is (true? (config=? {:features []} {:features [:a :b]}))) + (is (true? (config=? {:features [:a]} {:features [:a :b]}))) + (is (true? (config=? {:features [:c]} {:features [:a :b]})))))