-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathk8s_template.yaml
142 lines (142 loc) · 4.69 KB
/
k8s_template.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
---
# Kubernetes batch Job template for a protein-benchmark run.
# NOTE(review): no `metadata` stanza (name/labels) is present in this
# template; presumably the submitting tool injects it — confirm.
apiVersion: batch/v1
kind: Job
spec:
  template:
    spec:
      # A container that exits non-zero is restarted in the same pod.
      restartPolicy: OnFailure
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        # Only schedule onto nodes that report CUDA runtime 12.4. Both
        # expressions live in one term, so a node must satisfy major == "12"
        # AND minor == "4".
        # The same CUDA version also has to be named in the Dockerfile's
        # `FROM` and `RUN micromamba install` lines.
        # To see which versions the NRP nodes report:
        #   kubectl get nodes -L nvidia.com/gpu.product,nvidia.com/cuda.runtime.major,nvidia.com/cuda.runtime.minor -l nvidia.com/gpu.product
        - matchExpressions:
            - key: nvidia.com/cuda.runtime.major
              operator: In
              values: ["12"]
            - key: nvidia.com/cuda.runtime.minor
              operator: In
              values: ["4"]
initContainers:
  # init-git: clones the proteinbenchmark-nrp scripts into the shared
  # `repo-vol` volume (mounted at /opt/repo) and checks out the requested
  # commit, so the main container can run a script from that checkout.
  - name: init-git
    image: alpine/git
    resources:
      limits:
        memory: "100Mi"
        cpu: "100m"
      requests:
        memory: "100Mi"
        cpu: "100m"
    command:
      - "/bin/sh"
      - "-c"
      - |
        set -euf
        # Clone only when the volume does not already hold a checkout (this
        # container may be re-run against an already-populated volume).
        # Unlike `git clone ... || true`, a genuine clone failure (network,
        # DNS, auth) now aborts the script here with git's own error instead
        # of being swallowed.
        if [ ! -d /opt/repo/.git ]; then
          git clone https://github.com/openforcefield/proteinbenchmark-nrp.git /opt/repo
        fi
        cd /opt/repo
        # NOTE(review): PROTBENCH_SCRIPT_COMMIT is not declared anywhere in
        # this template; presumably it is injected into the container env at
        # submission time — confirm. With `set -u` an unset value aborts here.
        git checkout $PROTBENCH_SCRIPT_COMMIT
    volumeMounts:
      - mountPath: /opt/repo
        name: repo-vol
# init-rclone: seeds the shared /results volume before the main container
# starts. For every newline-separated entry in PROTBENCH_REQUIRED_FILES it
# copies results/<entry> from the `nrp-internal` bucket remote to
# /results/<entry>, then makes /results world-accessible (chmod -R 777).
# NOTE(review): PROTBENCH_REQUIRED_FILES is not declared in this template;
# presumably it is injected into the container env at submission — confirm.
- name: init-rclone
  image: rclone/rclone
  env:
    # Points rclone at the config file mounted from the secret below.
    - name: RCLONE_CONFIG
      value: /secrets/rclone.conf
  resources:
    requests:
      cpu: "200m"
      memory: "1Gi"
      ephemeral-storage: "20Gi"
    limits:
      cpu: "200m"
      memory: "1Gi"
      ephemeral-storage: "20Gi"
  command:
    - /bin/sh
    - -c
    # `set -f` disables globbing and IFS is a lone newline inside the loop,
    # so each line of the variable is taken as one file name.
    - |
      set -euf
      rclone --version
      echo "about to clone"
      IFS=$'\n'
      for FILE in ${PROTBENCH_REQUIRED_FILES}; do
        echo "cloning to /results/$FILE"
        rclone copyto --progress nrp-internal:proteinbenchmark-jm-bucket/results/$FILE /results/$FILE
      done
      unset IFS
      echo "done cloning"
      ls /results
      chmod -R 777 /results
      echo "done chmoding"
  volumeMounts:
    - name: results-vol
      mountPath: /results
    # Mount just the one file out of the secret, not the whole directory.
    - name: rclone-config
      mountPath: /secrets/rclone.conf
      subPath: rclone.conf
containers:
  # main: installs the `nagl` branch of proteinbenchmark, runs the selected
  # script from the /opt/repo checkout with output directed to /results, and
  # finally uploads /results to the bucket.
  - name: main
    image: ghcr.io/openforcefield/proteinbenchmark-nrp:latest
    resources:
      requests:
        cpu: "1"
        memory: "5Gi"
        nvidia.com/gpu: 1
        ephemeral-storage: "20Gi"
      limits:
        cpu: "1"
        memory: "5Gi"
        nvidia.com/gpu: 1
        ephemeral-storage: "20Gi"
    env:
      - name: OE_LICENSE
        value: /secrets/oe_license.txt
      - name: RCLONE_CONFIG
        value: /secrets/rclone.conf
      # Exposes the pod's own name to the process via the downward API.
      - name: THIS_POD_NAME
        valueFrom:
          fieldRef:
            fieldPath: metadata.name
    volumeMounts:
      # Single files projected out of their secrets.
      - name: rclone-config
        mountPath: /secrets/rclone.conf
        subPath: rclone.conf
      - name: openeye-license
        mountPath: /secrets/oe_license.txt
        subPath: oe_license.txt
      # Volumes populated by the init containers.
      - name: repo-vol
        mountPath: /opt/repo
      - name: results-vol
        mountPath: /results
    command:
      - /bin/sh
      - -c
      # NOTE(review): PROTBENCH_SCRIPT_PATH is not declared in this
      # container's env; presumably it is injected at submission so that
      # `$(PROTBENCH_SCRIPT_PATH)` is expanded before the shell sees it —
      # confirm.
      # NOTE(review): the script does not use `set -e`, so the container's
      # exit status is that of the final `rclone copy`; a failing pip or
      # python step does not by itself mark the container as failed —
      # confirm this is intended.
      - |
        micromamba run -n base pip install git+https://github.com/openforcefield/proteinbenchmark.git@nagl
        micromamba run -n base python /opt/repo/$(PROTBENCH_SCRIPT_PATH) -o/results
        rclone copy --update /results nrp-internal:proteinbenchmark-jm-bucket/results
lifecycle:
  # Before the container is stopped, push whatever is currently in /results
  # to the bucket so partial output is not lost.
  preStop:
    exec:
      command:
        - /bin/sh
        - -c
        # Redirections apply left to right: stdout must be pointed at PID 1's
        # stdout FIRST and stderr duplicated onto it afterwards. The previous
        # `2>&1 > /proc/1/fd/1` sent stderr to the hook's original stdout, so
        # anything rclone wrote to stderr never reached the container log.
        - rclone copy --update /results nrp-internal:proteinbenchmark-jm-bucket/results > /proc/1/fd/1 2>&1
volumes:
  # Secret-backed volumes; containers mount individual files from them via
  # `subPath`.
  - name: rclone-config
    secret:
      secretName: jm-rclone-config
  - name: openeye-license
    secret:
      secretName: oe-license-feb-2024
  # Scratch volumes shared between the init containers and `main`:
  # repo-vol is mounted at /opt/repo, results-vol at /results.
  - name: repo-vol
    emptyDir: {}
  - name: results-vol
    emptyDir: {}
# Job-level retry budget: the Job is marked failed after 10 retries.
backoffLimit: 10