#!/bin/bash
#SBATCH --job-name=spark-cluster
#SBATCH --account=support # change to your account
#SBATCH --partition=standard
#SBATCH --nodes=2 # node count, change as needed
#SBATCH --ntasks-per-node=1 # do not change, leave as 1 task per node
#SBATCH --cpus-per-task=36 # cpu-cores per task, change as needed
#SBATCH --mem=180g # memory per node, change as needed
#SBATCH --time=01:00:00             # walltime, change as needed
#SBATCH --mail-type=NONE
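# Submit this script from a login node with: sbatch test-job-all.sh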
# Spark and Python modules are required. Select the module versions
# available on the cluster you are using.
module load spark/3.5 python/3.12
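# To list the versions installed on your cluster, run "module avail spark"
# and "module avail python" on a login node.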
# Start the Spark instance.
spark-start
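# spark-start (provided by the spark module) is assumed to launch a standalone
# Spark master and workers across the allocated nodes, writing its generated
# configuration under ${HOME}/.spark-local/${SLURM_JOB_ID} (sourced below).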
# Source spark-env.sh to get useful env variables.
source ${HOME}/.spark-local/${SLURM_JOB_ID}/spark/conf/spark-env.sh
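# Based on how they are used below, spark-env.sh is expected to define at
# least SPARK_MASTER_URL, SPARK_CLUSTER_CORES, and SPARK_CLUSTER_MEMORY (in GB).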
# Set the per-executor resource configuration.
export SPARK_EXECUTOR_CORES=1
export SPARK_EXECUTOR_MEMORY=5    # in GB; the G suffix is appended at spark-submit time
# Calculate the maximum number of executors the cluster will support.
SPARK_MAX_NUM_EXECUTOR_BY_CORES=$(( SPARK_CLUSTER_CORES / SPARK_EXECUTOR_CORES ))
SPARK_MAX_NUM_EXECUTOR_BY_MEMORY=$(( SPARK_CLUSTER_MEMORY / SPARK_EXECUTOR_MEMORY ))
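# A worked example, assuming spark-env.sh reports the full allocation above
# (2 nodes x 36 cores = 72 cores, 2 x 180G = 360G; the actual values may be
# lower after daemon and driver overhead):
#   by cores:  72 / 1 = 72 executors
#   by memory: 360 / 5 = 72 executors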
if [ ${SPARK_MAX_NUM_EXECUTOR_BY_CORES} -ne ${SPARK_MAX_NUM_EXECUTOR_BY_MEMORY} ]; then
    echo "Warning: There is a resource mismatch."
    echo "Executor configuration:"
    echo " - ${SPARK_EXECUTOR_CORES} cores"
    echo " - ${SPARK_EXECUTOR_MEMORY}G memory"
    echo "Spark cluster total capacity for executors:"
    echo " - ${SPARK_CLUSTER_CORES} cores"
    echo " - ${SPARK_CLUSTER_MEMORY}G memory"
    echo "The Spark cluster can run at most the lesser of ${SPARK_MAX_NUM_EXECUTOR_BY_CORES} or ${SPARK_MAX_NUM_EXECUTOR_BY_MEMORY} executors."
    echo "Consider adjusting the Spark cluster or executor configuration to avoid wasting resources."
    if [ ${SPARK_MAX_NUM_EXECUTOR_BY_MEMORY} -lt ${SPARK_MAX_NUM_EXECUTOR_BY_CORES} ]; then
        SPARK_MAX_NUM_EXECUTOR=${SPARK_MAX_NUM_EXECUTOR_BY_MEMORY}
    else
        SPARK_MAX_NUM_EXECUTOR=${SPARK_MAX_NUM_EXECUTOR_BY_CORES}
    fi
else
    SPARK_MAX_NUM_EXECUTOR=${SPARK_MAX_NUM_EXECUTOR_BY_CORES}
fi
SPARK_TOTAL_EXECUTOR_CORES=$(( SPARK_MAX_NUM_EXECUTOR * SPARK_EXECUTOR_CORES ))
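# Continuing the worked example above: 72 executors x 1 core each gives
# 72 total executor cores requested from the cluster.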
# The spark-submit flags below pass along the executor sizing computed above,
# which should match the resources requested in the SBATCH header with an
# allowance for Spark driver overhead. Change the paths to point at your own
# Spark jobs.
EXAMPLES_DIR="/sw/examples/spark/spark-on-hpc/examples"
spark-submit --master ${SPARK_MASTER_URL} \
    --executor-cores ${SPARK_EXECUTOR_CORES} \
    --executor-memory ${SPARK_EXECUTOR_MEMORY}G \
    --total-executor-cores ${SPARK_TOTAL_EXECUTOR_CORES} \
    ${EXAMPLES_DIR}/logistic_regression_with_lbfgs_example.py
spark-submit --master ${SPARK_MASTER_URL} \
    --executor-cores ${SPARK_EXECUTOR_CORES} \
    --executor-memory ${SPARK_EXECUTOR_MEMORY}G \
    --total-executor-cores ${SPARK_TOTAL_EXECUTOR_CORES} \
    ${EXAMPLES_DIR}/nltk-test.py
spark-submit --master ${SPARK_MASTER_URL} \
    --executor-cores ${SPARK_EXECUTOR_CORES} \
    --executor-memory ${SPARK_EXECUTOR_MEMORY}G \
    --total-executor-cores ${SPARK_TOTAL_EXECUTOR_CORES} \
    ${EXAMPLES_DIR}/numeric-integration.py
spark-submit --master ${SPARK_MASTER_URL} \
    --executor-cores ${SPARK_EXECUTOR_CORES} \
    --executor-memory ${SPARK_EXECUTOR_MEMORY}G \
    --total-executor-cores ${SPARK_TOTAL_EXECUTOR_CORES} \
    ${EXAMPLES_DIR}/numpy-test.py
spark-submit --master ${SPARK_MASTER_URL} \
    --executor-cores ${SPARK_EXECUTOR_CORES} \
    --executor-memory ${SPARK_EXECUTOR_MEMORY}G \
    --total-executor-cores ${SPARK_TOTAL_EXECUTOR_CORES} \
    ${EXAMPLES_DIR}/pi.py
# This example only works if you have read permission to the Twitter Decahose
# data on Great Lakes.
#spark-submit --master ${SPARK_MASTER_URL} \
#    --executor-cores ${SPARK_EXECUTOR_CORES} \
#    --executor-memory ${SPARK_EXECUTOR_MEMORY}G \
#    --total-executor-cores ${SPARK_TOTAL_EXECUTOR_CORES} \
#    ${EXAMPLES_DIR}/twitter-decahose-test.py /nfs/turbo/twitter-decahose/decahose/raw/decahose.2020-05-25.p1.bz2
spark-submit --master ${SPARK_MASTER_URL} \
    --executor-cores ${SPARK_EXECUTOR_CORES} \
    --executor-memory ${SPARK_EXECUTOR_MEMORY}G \
    --total-executor-cores ${SPARK_TOTAL_EXECUTOR_CORES} \
    ${EXAMPLES_DIR}/word-count.py /nfs/turbo/arcts-data-hadoop-stage/data/Gutenberg.txt
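# To run your own application with the same sizing, point spark-submit at your
# script instead (the path below is a hypothetical placeholder):
#spark-submit --master ${SPARK_MASTER_URL} \
#    --executor-cores ${SPARK_EXECUTOR_CORES} \
#    --executor-memory ${SPARK_EXECUTOR_MEMORY}G \
#    --total-executor-cores ${SPARK_TOTAL_EXECUTOR_CORES} \
#    /path/to/your-spark-job.py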