Skip to content

Commit

Permalink
Improve specs
Browse files — browse the repository at this point in the history
  • Loading branch information
ilias1111 committed Nov 4, 2024
1 parent 0528a28 commit 9d40544
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 30 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/spark_deployment/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ services:
deploy:
resources:
limits:
memory: 2G
memory: 4G
reservations:
memory: 1G
memory: 2G
entrypoint: ["/bin/bash", "/setup.sh"]
command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
networks:
Expand Down Expand Up @@ -57,9 +57,9 @@ services:
deploy:
resources:
limits:
memory: 4G
memory: 8G
reservations:
memory: 2G
memory: 4G
entrypoint: ["/bin/bash", "/setup.sh"]
command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
networks:
Expand Down Expand Up @@ -87,9 +87,9 @@ services:
deploy:
resources:
limits:
memory: 3G
memory: 4G
reservations:
memory: 1G
memory: 2G
entrypoint: ["/bin/bash", "/setup.sh"]
command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --conf spark.driver.memory=2g --conf spark.executor.memory=2g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
networks:
Expand Down
48 changes: 24 additions & 24 deletions .github/workflows/spark_deployment/spark-defaults.conf
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
# Basic Spark Configuration
spark.master spark://spark-master:7077

# Memory Configuration for GitHub Actions Runner
spark.driver.memory 2g
spark.executor.memory 2g
# Memory Configuration for GitHub Actions Runner (16GB RAM)
spark.driver.memory 4g
spark.executor.memory 6g
spark.memory.offHeap.enabled true
spark.memory.offHeap.size 512m
spark.memory.fraction 0.6
spark.memory.offHeap.size 2g
spark.memory.fraction 0.8
spark.memory.storageFraction 0.3

# JVM Options (without memory settings)
# JVM Options
spark.driver.extraJavaOptions -XX:+UseG1GC -XX:+UseCompressedOops
spark.executor.extraJavaOptions -XX:+UseG1GC -XX:+UseCompressedOops

# Executor Configuration
spark.executor.cores 2
# Executor Configuration (4 cores)
spark.executor.cores 4
spark.executor.instances 1
spark.default.parallelism 4
spark.sql.shuffle.partitions 4
spark.default.parallelism 8
spark.sql.shuffle.partitions 8

# Performance Optimization
spark.sql.adaptive.enabled true
spark.sql.adaptive.coalescePartitions.enabled true
spark.sql.adaptive.localShuffleReader.enabled true
spark.sql.adaptive.skewJoin.enabled true
spark.sql.adaptive.advisoryPartitionSizeInBytes 32m
spark.sql.adaptive.advisoryPartitionSizeInBytes 64m

# Storage Optimization
spark.local.dir /tmp/spark-temp
spark.disk.spillSize 256m
spark.sql.files.maxPartitionBytes 33554432
spark.disk.spillSize 512m
spark.sql.files.maxPartitionBytes 67108864
spark.sql.inMemoryColumnarStorage.compressed true

# Network and Shuffle Settings
Expand All @@ -53,21 +53,21 @@ spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.region ${AWS_REGION}

# S3 Performance Optimization
spark.hadoop.fs.s3a.connection.maximum 25
spark.hadoop.fs.s3a.connection.maximum 50
spark.hadoop.fs.s3a.connection.timeout 30000
spark.hadoop.fs.s3a.attempts.maximum 5
spark.hadoop.fs.s3a.attempts.maximum 10
spark.hadoop.fs.s3a.connection.establish.timeout 30000
spark.hadoop.fs.s3a.readahead.range 64K
spark.hadoop.fs.s3a.readahead.range 128K
spark.hadoop.fs.s3a.impl.disable.cache false
spark.hadoop.fs.s3a.buffer.dir /tmp/s3a

# Development Optimizations
spark.sql.execution.arrow.pyspark.enabled true
spark.sql.execution.arrow.maxRecordsPerBatch 5000
spark.sql.execution.arrow.maxRecordsPerBatch 10000
spark.ui.port 4040
spark.ui.retainedJobs 25
spark.ui.retainedStages 25
spark.ui.retainedTasks 25
spark.ui.retainedJobs 50
spark.ui.retainedStages 50
spark.ui.retainedTasks 50

# Warehouse Configuration
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
Expand All @@ -77,12 +77,12 @@ spark.sql.catalog.glue.database unified
# Thrift Server Settings
spark.sql.hive.thriftServer.singleSession true
spark.sql.hive.thriftServer.async true
spark.sql.hive.thriftServer.maxWorkerThreads 2
spark.sql.hive.thriftServer.workerQueue.size 50
spark.sql.hive.thriftServer.maxWorkerThreads 4
spark.sql.hive.thriftServer.workerQueue.size 100

# Window Operations
spark.sql.window.exec.buffer.in.memory.threshold 25000
spark.sql.window.exec.buffer.spill.threshold 50000
spark.sql.window.exec.buffer.in.memory.threshold 50000
spark.sql.window.exec.buffer.spill.threshold 100000

# Join Optimizations
spark.sql.adaptive.skewJoin.enabled true

0 comments on commit 9d40544

Please sign in to comment.