Skip to content

Commit

Permalink
Improve specs
Browse files — browse the repository at this point in the history
  • Loading branch information
ilias1111 committed Nov 4, 2024
1 parent 0528a28 commit 9d40544
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 30 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/spark_deployment/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ services:
deploy:
resources:
limits:
memory: 2G
memory: 4G
reservations:
memory: 1G
memory: 2G
entrypoint: ["/bin/bash", "/setup.sh"]
command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
networks:
Expand Down Expand Up @@ -57,9 +57,9 @@ services:
deploy:
resources:
limits:
memory: 4G
memory: 8G
reservations:
memory: 2G
memory: 4G
entrypoint: ["/bin/bash", "/setup.sh"]
command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
networks:
Expand Down Expand Up @@ -87,9 +87,9 @@ services:
deploy:
resources:
limits:
memory: 3G
memory: 4G
reservations:
memory: 1G
memory: 2G
entrypoint: ["/bin/bash", "/setup.sh"]
command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --conf spark.driver.memory=2g --conf spark.executor.memory=2g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
networks:
Expand Down
48 changes: 24 additions & 24 deletions .github/workflows/spark_deployment/spark-defaults.conf
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
# Basic Spark Configuration
spark.master spark://spark-master:7077

# Memory Configuration for GitHub Actions Runner
spark.driver.memory 2g
spark.executor.memory 2g
# Memory Configuration for GitHub Actions Runner (16GB RAM)
spark.driver.memory 4g
spark.executor.memory 6g
spark.memory.offHeap.enabled true
spark.memory.offHeap.size 512m
spark.memory.fraction 0.6
spark.memory.offHeap.size 2g
spark.memory.fraction 0.8
spark.memory.storageFraction 0.3

# JVM Options (without memory settings)
# JVM Options
spark.driver.extraJavaOptions -XX:+UseG1GC -XX:+UseCompressedOops
spark.executor.extraJavaOptions -XX:+UseG1GC -XX:+UseCompressedOops

# Executor Configuration
spark.executor.cores 2
# Executor Configuration (4 cores)
spark.executor.cores 4
spark.executor.instances 1
spark.default.parallelism 4
spark.sql.shuffle.partitions 4
spark.default.parallelism 8
spark.sql.shuffle.partitions 8

# Performance Optimization
spark.sql.adaptive.enabled true
spark.sql.adaptive.coalescePartitions.enabled true
spark.sql.adaptive.localShuffleReader.enabled true
spark.sql.adaptive.skewJoin.enabled true
spark.sql.adaptive.advisoryPartitionSizeInBytes 32m
spark.sql.adaptive.advisoryPartitionSizeInBytes 64m

# Storage Optimization
spark.local.dir /tmp/spark-temp
spark.disk.spillSize 256m
spark.sql.files.maxPartitionBytes 33554432
spark.disk.spillSize 512m
spark.sql.files.maxPartitionBytes 67108864
spark.sql.inMemoryColumnarStorage.compressed true

# Network and Shuffle Settings
Expand All @@ -53,21 +53,21 @@ spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.region ${AWS_REGION}

# S3 Performance Optimization
spark.hadoop.fs.s3a.connection.maximum 25
spark.hadoop.fs.s3a.connection.maximum 50
spark.hadoop.fs.s3a.connection.timeout 30000
spark.hadoop.fs.s3a.attempts.maximum 5
spark.hadoop.fs.s3a.attempts.maximum 10
spark.hadoop.fs.s3a.connection.establish.timeout 30000
spark.hadoop.fs.s3a.readahead.range 64K
spark.hadoop.fs.s3a.readahead.range 128K
spark.hadoop.fs.s3a.impl.disable.cache false
spark.hadoop.fs.s3a.buffer.dir /tmp/s3a

# Development Optimizations
spark.sql.execution.arrow.pyspark.enabled true
spark.sql.execution.arrow.maxRecordsPerBatch 5000
spark.sql.execution.arrow.maxRecordsPerBatch 10000
spark.ui.port 4040
spark.ui.retainedJobs 25
spark.ui.retainedStages 25
spark.ui.retainedTasks 25
spark.ui.retainedJobs 50
spark.ui.retainedStages 50
spark.ui.retainedTasks 50

# Warehouse Configuration
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
Expand All @@ -77,12 +77,12 @@ spark.sql.catalog.glue.database unified
# Thrift Server Settings
spark.sql.hive.thriftServer.singleSession true
spark.sql.hive.thriftServer.async true
spark.sql.hive.thriftServer.maxWorkerThreads 2
spark.sql.hive.thriftServer.workerQueue.size 50
spark.sql.hive.thriftServer.maxWorkerThreads 4
spark.sql.hive.thriftServer.workerQueue.size 100

# Window Operations
spark.sql.window.exec.buffer.in.memory.threshold 25000
spark.sql.window.exec.buffer.spill.threshold 50000
spark.sql.window.exec.buffer.in.memory.threshold 50000
spark.sql.window.exec.buffer.spill.threshold 100000

# Join Optimizations
spark.sql.adaptive.skewJoin.enabled true

0 comments on commit 9d40544

Please sign in to comment.