diff --git a/conf/conf_example_baskerville.yaml b/conf/conf_example_baskerville.yaml
index b54a848a..3275b0d5 100644
--- a/conf/conf_example_baskerville.yaml
+++ b/conf/conf_example_baskerville.yaml
@@ -134,6 +134,13 @@ spark:
   kryoserializer_buffer: '1024k' # It is suggested that you omit setting kryoserializer_buffer_max and kryoserializer_buffer and only set them if you get serialization errors.
   driver_java_options: '-verbose:gc' # Optional. When on a local machine with less than 36GB of ram -XX:+UseCompressedOops
   executor_extra_java_options: '-verbose:gc' # Optional. When on a local machine with less than 36GB of ram -XX:+UseCompressedOops
+  auth_secret: 'TEST_SECRET' # Optional. For RPC auth in cluster setup
+  ssl_enabled: True # Optional. Sets SSL for the spark ui - all following configuration must be provided -- to generate cert and import use ssl_for_sparkui.sh under data/scripts
+  ssl_truststore: '/path/to/truststore'
+  ssl_truststore_password: 'examplestorepass'
+  ssl_keystore: '/path/to/keystore'
+  ssl_keystore_password: 'examplestorepass'
+  ssl_keypassword: 'examplekeypass'
   # to connect to the jvm for memory profiling and deugging (remove the -Dcom.sun.management.jmxremote.port=1098 if more than one executors because it will cause the other executors to fail):
   # -XX:+PrintFlagsFinal -XX:+PrintReferenceGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UnlockDiagnosticVMOptions -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=1098
   # depending on your configuration and resources:
diff --git a/data/jars/baskervilleSecurityFilter.jar b/data/jars/baskervilleSecurityFilter.jar
new file mode 100644
index 00000000..08efb2cd
Binary files /dev/null and b/data/jars/baskervilleSecurityFilter.jar differ
diff --git a/data/scripts/ssl_for_sparkui.sh b/data/scripts/ssl_for_sparkui.sh
new file mode 100644
index 00000000..2699cdd2
--- /dev/null
+++ b/data/scripts/ssl_for_sparkui.sh
@@ -0,0 +1,24 @@
+# source: https://www.ibm.com/support/knowledgecenter/en/SS3H8V_1.1.0/com.ibm.izoda.v1r1.azka100/topics/azkic_t_securingwebUIs.htm
+
+echo ">>> Setting up SSL for Spark UI..."
+echo "Keystore path : "
+read -s -r KEYSTORE_PATH
+echo "Truststore path : "
+read -s -r TRUSTSTORE_PATH
+echo "Store password : "
+read -s -r STORE_PASS
+echo "Key password : "
+read -s -r KEY_PASS
+echo "L : "
+read -s -r L
+echo "S : "
+read -s -r S
+echo "C : "
+read -s -r C
+
+keytool -genkeypair -keystore "$KEYSTORE_PATH/keystore" -keyalg RSA -alias selfsigned -dname "CN=sparkcert L=$L S=$S C=$C" -storepass "$STORE_PASS" -keypass "$KEY_PASS"
+
+keytool -exportcert -keystore "$KEYSTORE_PATH/keystore" -alias selfsigned -storepass "$STORE_PASS" -file spark.cer
+
+# note: do not forget to import cert in all nodes
+keytool -importcert -keystore "$TRUSTSTORE_PATH/truststore" -alias selfsigned -storepass "$STORE_PASS" -file spark.cer -noprompt
\ No newline at end of file
diff --git a/src/baskerville/models/config.py b/src/baskerville/models/config.py
index 86a1b615..104b4a29 100644
--- a/src/baskerville/models/config.py
+++ b/src/baskerville/models/config.py
@@ -770,9 +770,26 @@ class SparkConfig(Config):
     off_heap_size = None
     redis_host = 'localhost'
     redis_port = 6379
+    auth_secret = 'TEST_SECRET'
+    admin_acls = 'admin'
+    driver_port = 18050
+    block_manager_port = 18060
+    ssl_enabled = False
+    ssl_truststore = None
+    ssl_truststore_password = None
+    ssl_keystore = None
+    ssl_keystore_password = None
+    ssl_keypassword = None
 
     def __init__(self, config):
         super(SparkConfig, self).__init__(config)
+        self._ssl_properties = {
+            'ssl_truststore': self.ssl_truststore,
+            'ssl_truststore_password': self.ssl_truststore_password,
+            'ssl_keystore': self.ssl_keystore,
+            'ssl_keystore_password': self.ssl_keystore_password,
+            'ssl_keypassword': self.ssl_keypassword
+        }
 
     def validate(self):
         logger.debug('Validating SparkConfig...')
@@ -824,7 +841,20 @@ def validate(self):
             self.event_log = 'false'
         else:
             self.event_log = 'true'
-
+        if not self.ssl_enabled:
+            self.ssl_enabled = 'false'
+        else:
+            self.ssl_enabled = 'true'
+            for name, prop in self._ssl_properties.items():
+                if not prop:
+                    self.add_error(ConfigError(
+                        f'No {name} while ssl_enabled is set to "true" ',
+                        [name],
+                    ))
+            warnings.warn(
+                'SSL is enabled, so spark ui will redirect to '
+                'https://localhost:4442'
+            )
         if self.metrics_conf and not self.jar_packages:
             warnings.warn('Spark metrics configuration has been set but '
                           'jar packages is empty, '
diff --git a/src/baskerville/spark/__init__.py b/src/baskerville/spark/__init__.py
index 0396cdae..3d52a690 100644
--- a/src/baskerville/spark/__init__.py
+++ b/src/baskerville/spark/__init__.py
@@ -10,8 +10,10 @@
 from pyspark import SparkConf, StorageLevel
 from pyspark.sql import SparkSession
+from baskerville.models.config import SparkConfig
 
 
-def get_or_create_spark_session(spark_conf):
+
+def get_or_create_spark_session(spark_conf: SparkConfig):
     """
     Returns a configured spark session
     :param SparkConfig spark_conf: the spark configuration
@@ -145,6 +147,41 @@ def get_or_create_spark_session(spark_conf):
     conf.set('spark.sql.shuffle.partitions', spark_conf.shuffle_partitions)
     conf.set('spark.sql.autoBroadcastJoinThreshold', 1024*1024*100)  # 100MB
 
+    # security
+    # https://spark.apache.org/docs/latest/security.html
+    # note that: The same secret is shared by all Spark applications and
+    # daemons in that case, which limits the security of these deployments,
+    # especially on multi-tenant clusters.
+    if spark_conf.auth_secret:
+        conf.set('spark.authenticate', 'true')
+        conf.set('spark.authenticate.secret', spark_conf.auth_secret)
+
+        # encryption
+        conf.set('spark.network.crypto.enabled', 'true')
+        conf.set('spark.io.encryption.enabled', 'true')
+    # https://www.fortytools.com/blog/servlet-filter-for-http-basic-auth
+    conf.set('spark.ui.filters', 'baskerville.security.BasicAuthFilter')
+    # conf.set('spark.acls.enable', 'true')
+    # conf.set('spark.admin.acls', spark_conf.admin_acls)
+
+    # SSL https://spark.apache.org/docs/latest/security.html#ssl-configuration
+    if spark_conf.ssl_enabled == 'true':
+        conf.set('spark.ssl.enabled', spark_conf.ssl_enabled)
+        conf.set('spark.ssl.trustStore', spark_conf.ssl_truststore)
+        conf.set('spark.ssl.trustStorePassword', spark_conf.ssl_truststore_password)
+        conf.set('spark.ssl.keyStore', spark_conf.ssl_keystore)
+        conf.set('spark.ssl.keyStorePassword', spark_conf.ssl_keystore_password)
+        conf.set('spark.ssl.keyPassword', spark_conf.ssl_keypassword)
+        conf.set('spark.ssl.protocol', 'TLSv1.2')
+
+    # conf.set('spark.driver.port', spark_conf.driver_port)
+    # conf.set('spark.blockManager.port', spark_conf.block_manager_port)
+
+    # The REST Submission Server and the MesosClusterDispatcher do not support
+    # authentication. You should ensure that all network access to the REST API
+    # & MesosClusterDispatcher (port 6066 and 7077 respectively by default) are
+    # restricted to hosts that are trusted to submit jobs.
+
     spark = SparkSession.builder \
         .config(conf=conf) \
         .appName(spark_conf.app_name) \