Skip to content
This repository has been archived by the owner on Apr 24, 2023. It is now read-only.

Commit

Permalink
Allows cluster features to be dynamic (#1877)
Browse files Browse the repository at this point in the history
* Allows cluster features to be dynamic

* Correctly updates features on existing clusters

* Removes features from keys-to-keep-synced between db and in-memory
  • Loading branch information
dposada authored Jun 1, 2021
1 parent a7325e6 commit c7fca9e
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 17 deletions.
27 changes: 15 additions & 12 deletions scheduler/src/cook/compute_cluster.clj
Original file line number Diff line number Diff line change
Expand Up @@ -303,11 +303,10 @@
(get current-in-mem-configs key)
keys-to-keep-synced
(cond-> [:base-path :ca-cert :state]
; The location and features fields are treated specially when comparing db and in-mem configs --
; since these are new fields, they can be nil in-memory and non-nil in the db for a
; The location field is treated specially when comparing db and in-mem configs --
; since this is a new field, it can be nil in-memory and non-nil in the db for a
; limited time after the db has been populated and before the leader is restarted.
current-in-mem-location (conj :location)
current-in-mem-features (conj :features))]
current-in-mem-location (conj :location))]
(when (not= (select-keys current-db-config keys-to-keep-synced)
(select-keys current-in-mem-config keys-to-keep-synced))
(log/error keys-to-keep-synced "differ between in-memory and database cluster configurations."
Expand Down Expand Up @@ -356,16 +355,12 @@
The location and state fields are special in this regard -- location can
only transition from nil to non-nil, and state is validated in
cluster-state-change-valid?."
[{current-location :location current-features :features :as current-config}
{new-location :location new-features :features :as new-config}]
[{current-location :location :as current-config}
{new-location :location :as new-config}]
(cond (and (some? current-location)
(not= current-location new-location))
false

(and (seq current-features)
(not= (set current-features) (set new-features)))
false

:else
(let [dissoc-non-comparable-fields
#(-> %
Expand Down Expand Up @@ -504,8 +499,16 @@
(try
(when differs?
; :db.unique/identity gives upsert behavior (update on insert), so we don't need to specify an entity ID when updating
@(d/transact conn [(assoc (compute-cluster-config->compute-cluster-config-ent goal-config)
:db/id (d/tempid :db.part/user))]))
@(d/transact
conn
[; We need to retract existing features; otherwise,
; features simply get added to the existing list
[:db.fn/resetAttribute
[:compute-cluster-config/name name]
:compute-cluster-config/features
nil]
(assoc (compute-cluster-config->compute-cluster-config-ent goal-config)
:db/id (d/tempid :db.part/user))]))
(let [{:keys [state-atom state-locked?-atom] :as cluster} (@cluster-name->compute-cluster-atom name)]
(if cluster
(do
Expand Down
35 changes: 34 additions & 1 deletion scheduler/src/cook/schema.clj
Original file line number Diff line number Diff line change
Expand Up @@ -1358,7 +1358,40 @@ for a job. E.g. {:resources {:cpus 4 :mem 3} :constraints {\"unique_host_constra
(< attempts-consumed retries))
[[:db/add job-entity-id :job/state :job.state/waiting]
[:db/add job-entity-id :job/last-waiting-start-time (Date.)]]
[]))}}])
[]))}}
; https://stackoverflow.com/a/42119449
{:db/id (d/tempid :db.part/user)
:db/ident :db.fn/resetAttribute
:db/doc
"Unconditionally set an entity's attribute's values to those provided,
retracting all other existing values.
Values must be a collection (list, seq, vector), even for cardinality-one
attributes. An empty collection (or nil) will retract all values. The values
themselves must be primitive, i.e. no map forms are permitted for refs, use
tempids directly. If the attribute is-component, removed values will be
:db.fn/retractEntity-ed."
:db/fn
#db/fn {:lang "clojure"
:params [db ent attr values]
:code (let [eid (datomic.api/entid db ent)
aid (datomic.api/entid db attr)
{:keys [value-type is-component]} (datomic.api/attribute db aid)
newvalues (if (= value-type :db.type/ref)
(into #{} (map #(if (string? %) % (d/entid db %))) values)
(into #{} values))
oldvalues (into #{} (map :v) (datomic.api/datoms db :eavt eid aid))]
(-> []
(into (comp
(remove newvalues)
(map (if is-component
#(do [:db.fn/retractEntity %])
#(do [:db/retract eid aid %]))))
oldvalues)
(into (comp
(remove oldvalues)
(map #(do [:db/add eid aid %])))
newvalues)))}}])

(def reason-entities
[{:db/id (d/tempid :db.part/user)
Expand Down
80 changes: 76 additions & 4 deletions scheduler/test/cook/test/compute_cluster.clj
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@
(is (= {} (get-in-mem-configs))))))

(deftest test-execute-update!
(setup)
(with-redefs [config/compute-cluster-templates
(constantly {"template1" {:config {:a :bb :c :dd
:dynamic-cluster-config? true}
Expand Down Expand Up @@ -694,7 +695,66 @@
:location "us-east1"
:features [{:key "NVIDIA_DRIVER_VERSION"
:value "450-80-02"}]}}
(get-in-mem-configs)))))
(get-in-mem-configs))))

(testing "features are appropriately updated"
(is (= {:update-succeeded true}
(execute-update! conn
{:goal-config
{:a :a
:name "name"
:template "template1"
:base-path "base-path"
:ca-cert "ca-cert"
:state :running
:state-locked? true
:location "us-east1"
:features [{:key "foo" :value "bar"}
{:key "baz" :value "qux"}]}
:valid? true
:differs? true
:active? true})))
(is (= {:base-path "base-path"
:ca-cert "ca-cert"
:name "name"
:state :running
:state-locked? true
:template "template1"
:location "us-east1"
:features [{:key "foo" :value "bar"}
{:key "baz" :value "qux"}]}
(-> (get-db-config-ents (d/db conn))
(get "name")
compute-cluster-config-ent->compute-cluster-config)))
(is (= {:update-succeeded true}
(execute-update! conn
{:goal-config
{:a :a
:name "name"
:template "template1"
:base-path "base-path"
:ca-cert "ca-cert"
:state :running
:state-locked? true
:location "us-east1"
:features [{:key "one" :value "two"}
{:key "three" :value "four"}]}
:valid? true
:differs? true
:active? true})))
(is (= {:base-path "base-path"
:ca-cert "ca-cert"
:name "name"
:state :running
:state-locked? true
:template "template1"
:location "us-east1"
:features [{:key "three" :value "four"}
{:key "one" :value "two"}]}
(-> (get-db-config-ents (d/db conn))
(get "name")
compute-cluster-config-ent->compute-cluster-config)))))

(let [conn (restore-fresh-database! uri)]
(testing "normal update - insert then update"
(testing "insert"
Expand Down Expand Up @@ -759,8 +819,7 @@
:state-locked? true
:template "template1"
:location "us-east1"
:features [{:key "NVIDIA_DRIVER_VERSION"
:value "450-80-02"}]}
:features []}
(-> (get-db-config-ents (d/db conn)) (get "name") compute-cluster-config-ent->compute-cluster-config)))
(is (= {"name" {:base-path "base-path"
:ca-cert "ca-cert"
Expand All @@ -772,6 +831,7 @@
:features [{:key "NVIDIA_DRIVER_VERSION"
:value "450-80-02"}]}}
(get-in-mem-configs))))))

(let [conn (restore-fresh-database! uri)]
(testing "exceptions"
(reset! cluster-name->compute-cluster-atom {})
Expand Down Expand Up @@ -1058,4 +1118,16 @@

(deftest test-config=?
(testing "features can be in different orders"
(is (true? (config=? {:features [:a :b]} {:features [:b :a]})))))
(is (true? (config=? {:features [:a :b]} {:features [:b :a]}))))

(testing "features can be completely dynamic"
(is (true? (config=? {:features [:a :b]} {})))
(is (true? (config=? {:features [:a :b]} {:features nil})))
(is (true? (config=? {:features [:a :b]} {:features []})))
(is (true? (config=? {:features [:a :b]} {:features [:a]})))
(is (true? (config=? {:features [:a :b]} {:features [:c]})))
(is (true? (config=? {} {:features [:a :b]})))
(is (true? (config=? {:features nil} {:features [:a :b]})))
(is (true? (config=? {:features []} {:features [:a :b]})))
(is (true? (config=? {:features [:a]} {:features [:a :b]})))
(is (true? (config=? {:features [:c]} {:features [:a :b]})))))

0 comments on commit c7fca9e

Please sign in to comment.