-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprovision.sh
executable file
·313 lines (280 loc) · 10.8 KB
/
provision.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#!/bin/bash
# -*- coding: utf-8 -*-
# Copyright 2021 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
#
# This script will provision an OpenStack cluster against an OpenStack cloud
# (defaults to our CI account on VEXXHOST)
# Usage:
# ./provision.sh <config-name>
# A matching ./configs/<config-name>.yaml file has to exist.
#
# Stop at the first failing command and treat unset variables as errors.
set -o errexit -o nounset

# A non-empty DEBUG enables command tracing.
: "${DEBUG:=}"
if [[ -n "$DEBUG" ]]; then
    set -o xtrace
fi

##########################
# Github Actions Secrets #
##########################
# IS_CI must be set to True when Github Actions runs this script, so that
# secrets are taken from Github Actions secrets instead of the decrypted
# secret directory of this repository.
# Each variable below falls back to an empty string when it is not provided.
: "${IS_CI:=}"
: "${VEXXHOST_ENDPOINT:=}"
: "${VEXXHOST_SHIFTSTACK_BM_CI_PASSWORD:=}"
: "${VEXXHOST_SHIFTSTACK_BM_CI_SSH_PRIVATE_KEY:=}"
: "${OPENSHIFT_TENANT_PASSWORD:=}"
: "${SSL_CA_CERT:=}"
: "${SSL_CA_KEY:=}"
: "${REDHAT_REGISTRY_USERNAME:=}"
: "${REDHAT_REGISTRY_PASSWORD:=}"
: "${REDHAT_RHSM_ORG:=}"
: "${REDHAT_RHSM_ACTIVATION_KEY:=}"

######################
# PLATFORM VARIABLES #
######################
# The defaults target our VEXXHOST cloud; override any of them from the
# environment to deploy somewhere else (e.g. PSI).
: "${IMAGE_NAME:=rhel-9.2-x86_64}"
: "${FLAVOR_NAME:=b1-standard-96}"
: "${NETWORK_NAME:=public}"
: "${KEYPAIR_NAME:=shiftstack-ci}"
: "${SERVER_USER:=cloud-user}"
: "${OVERRIDE_OS_CLOUD:=}"

######################
# VEXXHOST VARIABLES #
######################
# OS_CLOUD is exported; it is OVERRIDE_OS_CLOUD when that is non-empty,
# shiftstack-bm otherwise.
OS_CLOUD=${OVERRIDE_OS_CLOUD:-shiftstack-bm}
export OS_CLOUD
: "${VEXXHOST_USERNAME:=shiftstack-bm-ci}"
: "${VEXXHOST_PROJECT_NAME:=shiftstack-bm}"
# Name of this script, used in the usage message.
SCRIPT_NAME=$(basename "$0")

# Exactly one argument (the config name) is required.
if [[ "$#" -ne 1 ]]; then
    echo "Missing argument, usage: ./$SCRIPT_NAME <config-name>" >&2
    exit 1
fi

# Directory the script is started from; ./configs and ./secrets are looked up
# relative to it.
ROOT_DIR=$PWD
# Working directory for generated files; a fresh temporary directory is
# created unless WORK_DIR is provided by the caller.
WORK_DIR=${WORK_DIR:-$(mktemp -d -t shiftstack-ci-XXXXXXXXXX)}
# The config name doubles as the name of the server.
CLUSTER_NAME=$1
# Command lines kept as strings; they are executed later through eval.
SERVER_CREATE_CMD="openstack server create --wait --key-name $KEYPAIR_NAME --network $NETWORK_NAME --flavor $FLAVOR_NAME --image $IMAGE_NAME $CLUSTER_NAME"
SERVER_DELETE_CMD="openstack server delete --wait $CLUSTER_NAME"
# Run a command, retrying it until it succeeds or the attempts are exhausted.
# Arguments:
#   $1  - maximum number of attempts (the command always runs at least once)
#   $2  - sleep time (in seconds) between two attempts
#   $3… - the command to run, with its arguments
# Returns:
#   0 as soon as the command succeeds, otherwise the exit status of the last
#   failed attempt.
function retry {
    local retries=$1
    local time=$2
    shift 2
    local count=0
    # Kept local so that no global variable (previously named "exit") leaks
    # into the caller.
    local rc
    until "$@"; do
        rc=$?
        count=$((count + 1))
        if [ "$count" -lt "$retries" ]; then
            sleep "$time"
        else
            return "$rc"
        fi
    done
    return 0
}
# Create the server named $CLUSTER_NAME, deleting any existing server of the
# same name first.
# Globals:
#   CLUSTER_NAME, SERVER_CREATE_CMD, SERVER_DELETE_CMD (read)
# Outputs:
#   Progress messages on stdout; the output of a failed create command on
#   stderr.
# Returns:
#   The exit status of the create command, forced to 1 when the server ends up
#   in ERROR status. Exits the shell with status 1 when the create command
#   reports "Quota exceeded", since retrying cannot help in that case.
function create_server {
    local create_output server_status rc

    if openstack server show "$CLUSTER_NAME" &>/dev/null; then
        echo "DEBUG: Cluster $CLUSTER_NAME was already running, removing it..."
        eval "$SERVER_DELETE_CMD"
    fi

    echo "DEBUG: Creating server for $CLUSTER_NAME..."
    # Failures are handled by hand below, so errexit is suspended meanwhile.
    set +e
    # stderr is captured as well: that is where the client is expected to
    # report errors, and the quota check below needs to see them.
    create_output=$(eval "$SERVER_CREATE_CMD" 2>&1)
    rc=$?
    if [[ $create_output == *"Quota exceeded"* ]]; then
        echo "ERROR: Not enough quotas available to create $CLUSTER_NAME"
        exit 1
    fi
    if [[ $rc -ne 0 ]]; then
        # Keep the captured error visible in the logs.
        printf '%s\n' "$create_output" >&2
    fi
    # If the server goes into ERROR status immediately, openstackclient returns 0
    # so we need to force the status to 1 so we can retry later.
    server_status=$(openstack server show -c status -f shell "$CLUSTER_NAME")
    if [[ $server_status == *"ERROR"* ]]; then
        rc=1
    fi
    set -e
    return "$rc"
}
# Fail early when the requested cluster has no config file.
if [ ! -f "$ROOT_DIR/configs/$CLUSTER_NAME.yaml" ]; then
    echo "ERROR: No config was found for $CLUSTER_NAME in $ROOT_DIR/configs"
    exit 1
fi

# Sanity check for CI jobs and locally
if [ -n "$IS_CI" ]; then
    # In CI every secret listed here has to come from the environment.
    for i in REDHAT_REGISTRY_USERNAME REDHAT_REGISTRY_PASSWORD SSL_CA_CERT SSL_CA_KEY VEXXHOST_SHIFTSTACK_BM_CI_PASSWORD VEXXHOST_SHIFTSTACK_BM_CI_SSH_PRIVATE_KEY VEXXHOST_ENDPOINT OPENSHIFT_TENANT_PASSWORD; do
        # ${!i} expands to the value of the variable whose name is held in i;
        # testing "$i" itself would only test the (never empty) name.
        if [ -z "${!i:-}" ]; then
            echo "ERROR: $i is not set and is required when this script runs in CI"
            exit 1
        fi
    done
else
    # Locally the secrets come from ./secrets; clouds.yaml only contains the
    # word "clouds" once it has been decrypted.
    if ! grep -q clouds "$ROOT_DIR/secrets/clouds.yaml"; then
        echo "ERROR: ./secrets directory is not decrypted, please check the README"
        exit 1
    fi
fi
# Initialize secrets
# Each secret comes from its environment variable (Github Actions secrets)
# when that variable is non-empty, and from the ./secrets directory otherwise.
if [ -n "$VEXXHOST_SHIFTSTACK_BM_CI_PASSWORD" ]; then
    # Generate a clouds.yaml describing $OS_CLOUD from the given credentials.
    cat << EOF > "$WORK_DIR/clouds.yaml"
clouds:
  $OS_CLOUD:
    auth:
      auth_url: "$VEXXHOST_ENDPOINT"
      username: "$VEXXHOST_USERNAME"
      password: "$VEXXHOST_SHIFTSTACK_BM_CI_PASSWORD"
      project_name: "$VEXXHOST_PROJECT_NAME"
    identity_api_version: 3
EOF
else
    # When OVERRIDE_OS_CLOUD is set, we want to use our own clouds.yaml
    if [ -z "$OVERRIDE_OS_CLOUD" ]; then
        cp "$ROOT_DIR/secrets/clouds.yaml" "$WORK_DIR/clouds.yaml"
    fi
fi

# The key and CA files are made read-only below. Remove leftovers from a
# previous run so that writing them again works when WORK_DIR is reused.
rm -f "$WORK_DIR/ssh-private.key" \
    "$WORK_DIR/shiftstack-ci-ca.crt" \
    "$WORK_DIR/shiftstack-ci-ca.key"

if [ -n "$VEXXHOST_SHIFTSTACK_BM_CI_SSH_PRIVATE_KEY" ]; then
    # The variable holds the key body only; the PEM armor is added here.
    cat << EOF > "$WORK_DIR/ssh-private.key"
-----BEGIN RSA PRIVATE KEY-----
${VEXXHOST_SHIFTSTACK_BM_CI_SSH_PRIVATE_KEY}
-----END RSA PRIVATE KEY-----
EOF
else
    cp "$ROOT_DIR/secrets/vexxhost-ssh-private.key" "$WORK_DIR/ssh-private.key"
fi
chmod 400 "$WORK_DIR/ssh-private.key"

if [ -n "$SSL_CA_CERT" ]; then
    cat << EOF > "$WORK_DIR/shiftstack-ci-ca.crt"
${SSL_CA_CERT}
EOF
else
    cp "$ROOT_DIR/secrets/ssl/shiftstack-ci-ca.crt" "$WORK_DIR/shiftstack-ci-ca.crt"
fi

if [ -n "$SSL_CA_KEY" ]; then
    cat << EOF > "$WORK_DIR/shiftstack-ci-ca.key"
${SSL_CA_KEY}
EOF
else
    cp "$ROOT_DIR/secrets/ssl/shiftstack-ci-ca.key" "$WORK_DIR/shiftstack-ci-ca.key"
fi
chmod 400 "$WORK_DIR/shiftstack-ci-ca.crt" "$WORK_DIR/shiftstack-ci-ca.key"

# OPENSHIFT_TENANT_PASSWORD is defined in Github Actions secrets
if [ -z "$OPENSHIFT_TENANT_PASSWORD" ]; then
    source "$ROOT_DIR/secrets/passwords.rc"
fi

# REDHAT_REGISTRY_USERNAME is defined in Github Actions secrets
if [ -z "$REDHAT_REGISTRY_USERNAME" ]; then
    source "$ROOT_DIR/secrets/redhat-credentials.rc"
fi
# Fetch dev-install unless WORK_DIR already contains a checkout.
if ! [[ -d "$WORK_DIR/dev-install" ]]; then
    git clone -q https://github.com/shiftstack/dev-install "$WORK_DIR/dev-install"
fi

# The rest of the script runs from WORK_DIR, which holds the generated files.
pushd "$WORK_DIR" &>/dev/null

if ! openstack keypair show "$KEYPAIR_NAME" &>/dev/null; then
    echo "ERROR: No keypair was found with $KEYPAIR_NAME name"
    exit 1
fi

# Here we'll have to deal with the fact that a node being removed from Nova
# will have its disk erased by Ironic, and that takes a bit of time before it
# can be reprovisioned. Which means that if we didn't have a node available
# before the cluster removal, there is a big chance that the redeploy will fail
# since Nova Resources periodic tracker needs a bit of time to catch up on the
# node status.
# Try to create the server 30 times, sleeping 30 seconds between attempts
# (about 15 min of waiting at most):
if ! retry 30 30 create_server; then
    echo "ERROR: Server for $CLUSTER_NAME was not able to be created"
    exit 1
fi

# First IPv4-looking token found in the server addresses.
PUBLIC_IP=$(openstack server show "$CLUSTER_NAME" -c addresses -f json | grep -Pom 1 '[0-9.]{7,15}')
# These are kept as plain strings and rely on word splitting where they are
# used, so they must stay unquoted at their call sites.
SSH_ARGS="-o ConnectTimeout=10 -o StrictHostKeyChecking=no -i $WORK_DIR/ssh-private.key"
SSH_CMD="ssh $SSH_ARGS $SERVER_USER@$PUBLIC_IP"
SCP_CMD="scp -r $SSH_ARGS"

# When a node is deployed with Ironic, OpenSSH is open for a few seconds then the node becomes unreachable until the
# OS starts properly, so let's run SSH with a timeout of 10 seconds, sleep 10 seconds between retries and repeat
# 90 times, which should give a total timeout of 30 min.
echo "DEBUG: Trying to SSH $CLUSTER_NAME via $SERVER_USER@$PUBLIC_IP"
# shellcheck disable=SC2086 # SSH_CMD is split into words on purpose
if ! retry 90 10 $SSH_CMD uname -a; then
    echo "ERROR: Server for $CLUSTER_NAME ($PUBLIC_IP) was not reachable..."
    exit 1
fi
echo "DEBUG: Server was successfuly deployed and its IP is $PUBLIC_IP"
cd dev-install
# Exported for the make targets below.
# NOTE(review): presumably read by the dev-install Makefile — confirm.
export ansible_args="--private-key=$WORK_DIR/ssh-private.key"

echo "DEBUG: Configure dev-install to deploy OpenStack on $CLUSTER_NAME"
make config host="$PUBLIC_IP" user="$SERVER_USER" &>/dev/null

# local-overrides.yaml = the cluster config, followed by the settings
# generated below.
cat "$ROOT_DIR/configs/$CLUSTER_NAME.yaml" >> local-overrides.yaml

# The CA files are embedded as YAML block scalars, so every line of them
# needs a leading indent.
INDENTED_SSL_CA_CERT=$(sed 's/^/ /' "$WORK_DIR/shiftstack-ci-ca.crt")
INDENTED_SSL_CA_KEY=$(sed 's/^/ /' "$WORK_DIR/shiftstack-ci-ca.key")
cat << EOF >> local-overrides.yaml
openshift_password: "${OPENSHIFT_TENANT_PASSWORD}"
ssl_enabled: true
ssl_ca_cert: |
$INDENTED_SSL_CA_CERT
ssl_ca_key: |
$INDENTED_SSL_CA_KEY
authorized_keys:
- https://github.com/dulek.keys
- https://github.com/EmilienM.keys
- https://github.com/gryf.keys
- https://github.com/MaysaMacedo.keys
- https://github.com/mandre.keys
- https://github.com/mdbooth.keys
- https://github.com/pierreprinetti.keys
- https://github.com/stephenfin.keys
- https://github.com/MiguelCarpio.keys
create_rhcos_image: false
hostname: $CLUSTER_NAME
local_cloudname: $CLUSTER_NAME
EOF

# If the host is RHEL we'll need credentials to pull images
# from Red Hat Container Image Registry
# The backslash in "Red\ Hat" is kept on purpose: ssh hands the arguments to
# the remote shell as one command line, where it keeps the pattern one word.
# shellcheck disable=SC2086 # SSH_CMD is split into words on purpose
if $SSH_CMD grep -q "Red\ Hat" /etc/redhat-release; then
    cat << EOF >> local-overrides.yaml
rhsm_ephemeral: false
redhat_registry_credentials:
  username: "${REDHAT_REGISTRY_USERNAME}"
  password: "${REDHAT_REGISTRY_PASSWORD}"
rhsm_org_id: "${REDHAT_RHSM_ORG}"
rhsm_activation_key: "${REDHAT_RHSM_ACTIVATION_KEY}"
EOF
fi
# Workaround, it doesn't seem to work fine for now when running
# the Ansible task that does it in dev-install from Github CI
echo "DEBUG: Upgrading the server to CentOS Stream..."
# Only acts on hosts that have /etc/centos-release and do not already have
# the centos-stream-release package.
# shellcheck disable=SC2086 # SSH_CMD is split into words on purpose
$SSH_CMD "if test -f /etc/centos-release; then rpm --query centos-stream-release || bash -c 'sudo dnf -y swap centos-linux-repos centos-stream-repos && sudo dnf -y distro-sync'; fi"

echo "DEBUG: Run dev-install to deploy OpenStack on $CLUSTER_NAME..."
# Prepare the host for the deployment
make local_requirements prepare_host
# Regenerate the dev-install config so that the next targets connect as the
# stack user instead of $SERVER_USER.
rm -f inventory.yaml
make config host="$PUBLIC_IP" user=stack &>/dev/null
MAKE_TARGETS="network install_stack prepare_stack local_os_client"
# shellcheck disable=SC2086 # MAKE_TARGETS is a space-separated target list
make $MAKE_TARGETS

if grep -q "podman create .*squid" "$ROOT_DIR/configs/$CLUSTER_NAME.yaml"; then
    echo "DEBUG: proxy node detected, copying squid config"
    # shellcheck disable=SC2086 # SCP_CMD is split into words on purpose
    $SCP_CMD "$ROOT_DIR/secrets/squid" "stack@$PUBLIC_IP:" &>/dev/null
fi

# OSASINFRA-3269 - remove that workaround once OVS is fixed in next zstream
if [[ $CLUSTER_NAME == *"nfv"* ]]; then
    echo "DEBUG: NFV node detected, installing custom OVS for DPDK fixes"
    # -p keeps this step working when the directory is already there.
    # NOTE(review): the directory is created as $SERVER_USER while the files
    # are copied as stack — confirm both end up in the same home directory.
    # shellcheck disable=SC2086 # SSH_CMD / SCP_CMD are split on purpose
    $SSH_CMD "mkdir -p ~/ovs-dpdk"
    # shellcheck disable=SC2086
    $SCP_CMD "$ROOT_DIR"/secrets/ovs-dpdk/* "stack@$PUBLIC_IP:ovs-dpdk" &>/dev/null
fi

make post_install
echo "DEBUG: Cluster $CLUSTER_NAME was successfuly deployed !"
cd ..

# In CI the server is only needed to prove that the deployment works, so it
# is deleted once the deployment has succeeded.
if [ -n "$IS_CI" ]; then
    echo "DEBUG: Destruction of $CLUSTER_NAME..."
    eval "$SERVER_DELETE_CMD"
fi