
Commit 2924f5a
making pyspark an optional dependency
brifordwylie committed May 12, 2021
1 parent 6271847 commit 2924f5a
Showing 6 changed files with 37 additions and 19 deletions.
9 changes: 8 additions & 1 deletion README.md
@@ -8,7 +8,9 @@ with Pandas, scikit-learn, Kafka, and Spark
 
 ### Install
 ```
-$ pip install zat
+pip install zat
+pip install zat[pyspark] (includes pyspark library)
+pip install zat[all] (include pyarrow, yara-python, and tldextract)
 ```
 
 ### Getting Started
@@ -59,6 +61,11 @@ from here to there.
 ### Documentation
 <https://supercowpowers.github.io/zat/>
 
+#### Running the Tests
+```
+pip install pytest coverage pytest-cov
+pytest zat
+```
 
 ### About SuperCowPowers
 The company was formed so that its developers could follow their passion for Python, streaming data pipelines, and having fun with data analysis. We also think cows are cool and should be superheroes, or at least carry around rayguns and burner phones. <a href="https://www.supercowpowers.com" target="_blank">Visit SuperCowPowers</a>
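Since the new test instructions install pytest-cov, a coverage run is presumably the intended workflow as well; `pytest --cov=zat zat` (the `--cov` option comes from the pytest-cov plugin) adds a coverage summary to the test output.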
15 changes: 7 additions & 8 deletions examples/kafka_spark.py
@@ -1,18 +1,17 @@
 """Read Kafka Streams into Spark, perform simple filtering/aggregation"""
 
 import sys
-import pyspark
-from pyspark.sql import SparkSession
-from pyspark.sql.types import StructType, StringType, BooleanType, IntegerType
-from pyspark.sql.functions import from_json, to_json, col, struct, udf
-
 import argparse
 from time import sleep
 
 # Local imports
 from zat.utils import signal_utils
+try:
+    from pyspark.sql import SparkSession
+    from pyspark.sql.types import StructType, StringType, BooleanType, IntegerType
+    from pyspark.sql.functions import from_json, to_json, col, struct, udf
+except ImportError:
+    print('\npip install pyspark')
+    sys.exit(1)
 
 # Third Party Imports
 try:
     import tldextract
 except ImportError:
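The guard above exits as soon as the pyspark import fails. A nearby alternative, sketched below assuming only the standard library at check time, probes for the package first so that an ImportError raised inside pyspark itself is not mistaken for a missing install:

```python
# Sketch: probe for pyspark before importing it (standard library only).
# importlib.util.find_spec returns None when a package is not installed,
# so failures *inside* pyspark's own import are not silently swallowed.
import importlib.util
import sys

if importlib.util.find_spec('pyspark') is None:
    print('\npip install pyspark')
    sys.exit(1)

from pyspark.sql import SparkSession  # safe: pyspark is known to be present
```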
6 changes: 5 additions & 1 deletion examples/zeek_to_parquet_with_spark.py
@@ -3,7 +3,11 @@
 import os
 import sys
 import argparse
-from pyspark.sql import SparkSession
+try:
+    from pyspark.sql import SparkSession
+except ImportError:
+    print('pip install pyspark')
+    sys.exit(1)
 
 # Local imports
 from zat import log_to_sparkdf
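For reference, once pyspark is installed this example reduces to a few lines. The sketch below follows zat's documented LogToSparkDF usage; the input and output paths are placeholders, not from the commit:

```python
# Sketch: Zeek log -> Spark DataFrame -> Parquet (paths are placeholders).
from pyspark.sql import SparkSession
from zat import log_to_sparkdf

spark = SparkSession.builder.appName('zeek_to_parquet').getOrCreate()
spark_it = log_to_sparkdf.LogToSparkDF(spark)
spark_df = spark_it.create_dataframe('conn.log')  # parse the Zeek log
spark_df.write.parquet('conn.parquet')            # write it out as Parquet
```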
8 changes: 3 additions & 5 deletions setup.py
@@ -36,14 +36,12 @@ def get_files(dir_name):
     install_requires=[
         'requests',
         'watchdog',
-        'numpy',
-        'scipy',
         'pandas',
-        'scikit-learn',
-        'pyspark'
+        'scikit-learn'
     ],
     extras_require={
-        'all': ['pyarrow', 'yara-python', 'tldextract']
+        'pyspark': ['pyspark'],
+        'all': ['pyspark', 'pyarrow', 'yara-python', 'tldextract']
     },
     license='Apache',
     keywords='Zeek, Bro, Python, Networking, Security, Scikit-Learn, Spark, Kafka, Parquet',
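Note that 'pyspark' is now listed in two extras, which can drift apart over time. One way to keep them in sync, sketched with an illustrative variable name that is not in the commit:

```python
# Sketch: derive the 'all' extra from the smaller ones so they cannot drift.
pyspark_extra = ['pyspark']
extras_require = {
    'pyspark': pyspark_extra,
    'all': pyspark_extra + ['pyarrow', 'yara-python', 'tldextract'],
}
```

Also worth knowing: some shells (zsh in particular) treat the square brackets as a glob, so the extras form may need quoting, e.g. `pip install 'zat[pyspark]'`.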
2 changes: 1 addition & 1 deletion zat/__init__.py
@@ -1,3 +1,3 @@
 __author__ = 'Brian Wylie'
 __email__ = 'briford@supercowpowers.com'
-__version__ = '0.4.2'
+__version__ = '0.4.3'
16 changes: 13 additions & 3 deletions zat/log_to_sparkdf.py
@@ -1,9 +1,14 @@
 """LogToSparkDF: Converts a Zeek log to a Spark DataFrame"""
 
 import sys
 
 # Third Party
-from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, LongType
-from pyspark.sql.functions import col, when
+try:
+    from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, LongType
+    from pyspark.sql.functions import col, when
+except ImportError:
+    print('\npip install pyspark')
+
 
 # Local
 from zat import zeek_log_reader
@@ -115,8 +120,13 @@ def test():
 def test():
     """Test for LogToSparkDF Class"""
     import os
+    import pytest
     from zat.utils import file_utils
-    from pyspark.sql import SparkSession
+
+    try:
+        from pyspark.sql import SparkSession
+    except ImportError:
+        pytest.skip('pip install pyspark')
 
     # Spin up a local Spark Session (with 4 executors)
     spark = SparkSession.builder.master('local[4]').appName('my_awesome').getOrCreate()
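The try/except plus pytest.skip in the test works; pytest also ships a one-call helper for exactly this case. A hedged alternative sketch:

```python
# Sketch: pytest.importorskip imports a module or skips the calling test.
import pytest

pyspark_sql = pytest.importorskip('pyspark.sql')  # skips when pyspark is absent
SparkSession = pyspark_sql.SparkSession
```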
