Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 41 security #43

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions conf/conf_example_baskerville.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,13 @@ spark:
kryoserializer_buffer: '1024k' # It is suggested that you omit setting kryoserializer_buffer_max and kryoserializer_buffer and only set them if you get serialization errors.
driver_java_options: '-verbose:gc' # Optional. When on a local machine with less than 36GB of ram -XX:+UseCompressedOops
executor_extra_java_options: '-verbose:gc' # Optional. When on a local machine with less than 36GB of ram -XX:+UseCompressedOops
auth_secret: 'TEST_SECRET' # Optional. For RPC auth in cluster set up
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needs to be tested on the cluster.

ssl_enabled: True # Optional. Sets SSL for the spark ui - all following configuration must be provided -- to generate cert and import use ssl_for_sparkui.sh under data/scripts
ssl_truststore: '/path/to/truststore'
ssl_truststore_password: 'examplestorepass'
ssl_keystore: '/path/to/keystore'
ssl_keystore_password: 'examplestorepass'
ssl_keypassword: 'examplekeypass'
# to connect to the jvm for memory profiling and debugging (remove the -Dcom.sun.management.jmxremote.port=1098 if more than one executor because it will cause the other executors to fail):
# -XX:+PrintFlagsFinal -XX:+PrintReferenceGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UnlockDiagnosticVMOptions -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=1098
# depending on your configuration and resources:
Expand Down
Binary file added data/jars/baskervilleSecurityFilter.jar
Binary file not shown.
24 changes: 24 additions & 0 deletions data/scripts/ssl_for_sparkui.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Generate a self-signed certificate for the Spark UI keystore and import it
# into a truststore. Run once per cluster, then copy/import the cert on all
# nodes (see note below).
# source: https://www.ibm.com/support/knowledgecenter/en/SS3H8V_1.1.0/com.ibm.izoda.v1r1.azka100/topics/azkic_t_securingwebUIs.htm
set -euo pipefail

echo ">>> Setting up SSL for Spark UI..."

# Paths and certificate DN fields are not secrets — read them with echo left
# on so the operator can see what they type; only the passwords use -s.
read -r -p "Keystore path : " KEYSTORE_PATH
read -r -p "Truststore path : " TRUSTSTORE_PATH
read -s -r -p "Store password : " STORE_PASS; echo
read -s -r -p "Key password : " KEY_PASS; echo
read -r -p "L (locality) : " L
read -r -p "S (state/province) : " S
read -r -p "C (country code) : " C

# FIX: RDNs in a keytool -dname must be comma-separated (otherwise the whole
# string is parsed as a single CN value), and the state keyword is ST, not S.
keytool -genkeypair -keystore "$KEYSTORE_PATH/keystore" -keyalg RSA \
    -alias selfsigned -dname "CN=sparkcert, L=$L, ST=$S, C=$C" \
    -storepass "$STORE_PASS" -keypass "$KEY_PASS"

# FIX: quote the password here as well (it was quoted only in -genkeypair),
# so passwords containing spaces or shell metacharacters survive intact.
keytool -exportcert -keystore "$KEYSTORE_PATH/keystore" -alias selfsigned \
    -storepass "$STORE_PASS" -file spark.cer

# note: do not forget to import cert in all nodes
keytool -importcert -keystore "$TRUSTSTORE_PATH/truststore" -alias selfsigned \
    -storepass "$STORE_PASS" -file spark.cer -noprompt
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have checked this only locally. We'll need to test this on the cluster.

32 changes: 31 additions & 1 deletion src/baskerville/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,9 +770,26 @@ class SparkConfig(Config):
# Class-level defaults; overridden by values from the YAML config.
off_heap_size = None
redis_host = 'localhost'
redis_port = 6379
# NOTE(review): shipping a hardcoded default RPC secret means every
# deployment that forgets to override auth_secret shares 'TEST_SECRET'.
# Consider defaulting to None and requiring an explicit value — confirm.
auth_secret = 'TEST_SECRET'
admin_acls = 'admin'
driver_port = 18050
block_manager_port = 18060
# SSL settings for the Spark UI. The example config states that when
# ssl_enabled is True all of the following values must be provided.
ssl_enabled = False
ssl_truststore = None
ssl_truststore_password = None
ssl_keystore = None
ssl_keystore_password = None
ssl_keypassword = None

def __init__(self, config):
    """Initialize the Spark configuration and snapshot the SSL settings.

    :param config: raw configuration mapping handed to the base Config.
    """
    super(SparkConfig, self).__init__(config)
    # Collect the SSL-related options into one dict so validate() can
    # check them as a group when ssl_enabled is set.
    # NOTE(review): values are captured at construction time — assumes the
    # base Config.__init__ has already applied user settings; confirm.
    ssl_keys = (
        'ssl_truststore',
        'ssl_truststore_password',
        'ssl_keystore',
        'ssl_keystore_password',
        'ssl_keypassword',
    )
    self._ssl_properties = {key: getattr(self, key) for key in ssl_keys}

def validate(self):
logger.debug('Validating SparkConfig...')
Expand Down Expand Up @@ -824,7 +841,20 @@ def validate(self):
self.event_log = 'false'
else:
self.event_log = 'true'

# Normalize ssl_enabled to the lowercase string Spark expects and, when
# SSL is on, require every SSL property to be present.
if not self.ssl_enabled:
    self.ssl_enabled = 'false'
else:
    self.ssl_enabled = 'true'
    # BUG FIX: enumerate() over a dict yields (index, key) pairs, so the
    # original loop bound `name` to an int and `prop` to the (always
    # truthy) key string — missing SSL settings were never reported.
    # Iterate items() so `prop` is the configured value being checked.
    for name, prop in self._ssl_properties.items():
        if not prop:
            self.add_error(ConfigError(
                f'No {name} while ssl_enabled is set to "true" ',
                [name],
            ))
    warnings.warn(
        'SSL is enabled, so spark ui will redirect to '
        'https://localhost:4442'
    )
if self.metrics_conf and not self.jar_packages:
warnings.warn('Spark metrics configuration has been set but '
'jar packages is empty, '
Expand Down
39 changes: 38 additions & 1 deletion src/baskerville/spark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@
from pyspark import SparkConf, StorageLevel
from pyspark.sql import SparkSession

from baskerville.models.config import SparkConfig

def get_or_create_spark_session(spark_conf):

def get_or_create_spark_session(spark_conf: SparkConfig):
"""
Returns a configured spark session
:param SparkConfig spark_conf: the spark configuration
Expand Down Expand Up @@ -145,6 +147,41 @@ def get_or_create_spark_session(spark_conf):
conf.set('spark.sql.shuffle.partitions', spark_conf.shuffle_partitions)
conf.set('spark.sql.autoBroadcastJoinThreshold', 1024*1024*100) # 100MB

# security
# https://spark.apache.org/docs/latest/security.html
# note that: The same secret is shared by all Spark applications and
# daemons in that case, which limits the security of these deployments,
# especially on multi-tenant clusters.
if spark_conf.auth_secret:
conf.set('spark.authenticate', 'true')
conf.set('spark.authenticate.secret', spark_conf.auth_secret)

# encryption
conf.set('spark.network.crypto.enabled', 'true')
conf.set('spark.io.encryption.enabled', 'true')
# https://www.fortytools.com/blog/servlet-filter-for-http-basic-auth
# NOTE(review): this filter class must be on the driver classpath —
# presumably provided by data/jars/baskervilleSecurityFilter.jar added in
# this change; basic-auth filters typically also need credential
# parameters configured — confirm against the filter implementation.
conf.set('spark.ui.filters', 'baskerville.security.BasicAuthFilter')
# conf.set('spark.acls.enable', 'true')
# conf.set('spark.admin.acls', spark_conf.admin_acls)

# SSL https://spark.apache.org/docs/latest/security.html#ssl-configuration
# validate() normalizes ssl_enabled to the strings 'true'/'false',
# hence the string comparison here rather than a boolean check.
if spark_conf.ssl_enabled == 'true':
conf.set('spark.ssl.enabled', spark_conf.ssl_enabled)
conf.set('spark.ssl.trustStore', spark_conf.ssl_truststore)
conf.set('spark.ssl.trustStorePassword', spark_conf.ssl_truststore_password)
conf.set('spark.ssl.keyStore', spark_conf.ssl_keystore)
conf.set('spark.ssl.keyStorePassword', spark_conf.ssl_keystore_password)
conf.set('spark.ssl.keyPassword', spark_conf.ssl_keypassword)
# NOTE(review): protocol pinned to TLSv1.2 — the author's own comment
# says the version still needs to be double-checked; confirm.
conf.set('spark.ssl.protocol', 'TLSv1.2')
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure about the tls protocol version, will double check.


# conf.set('spark.driver.port', spark_conf.driver_port)
# conf.set('spark.blockManager.port', spark_conf.block_manager_port)

# The REST Submission Server and the MesosClusterDispatcher do not support
# authentication. You should ensure that all network access to the REST API
# & MesosClusterDispatcher (port 6066 and 7077 respectively by default) are
# restricted to hosts that are trusted to submit jobs.

spark = SparkSession.builder \
.config(conf=conf) \
.appName(spark_conf.app_name) \
Expand Down