Skip to content

Commit

Permalink
fsspark -> fslite
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Sep 22, 2024
1 parent c2312c8 commit 10ee2e8
Show file tree
Hide file tree
Showing 31 changed files with 67 additions and 146 deletions.
22 changes: 12 additions & 10 deletions docs/README.data.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,21 @@ The following is an example of a TSV file with a binary response variable:
- `import_table` - Import data from a TSV file into a Spark Data Frame (sdf).

```python
from fsspark.utils.io import import_table
sdf = import_table('data.tsv.bgz',
sep='\t',
n_partitions=5)
from fslite.utils.io import import_table

sdf = import_table('data.tsv.bgz',
sep='\t',
n_partitions=5)
```

- `import_table_as_psdf` - Import data from a TSV file into a Spark Data Frame (sdf) and
convert it into a Pandas on Spark Data Frame (psdf).

```python
from fsspark.utils.io import import_table_as_psdf
psdf = import_table_as_psdf('data.tsv.bgz',
sep='\t',
from fslite.utils.io import import_table_as_psdf

psdf = import_table_as_psdf('data.tsv.bgz',
sep='\t',
n_partitions=5)
```

Expand All @@ -73,9 +75,9 @@ contains the response variable.
#### How to create a Feature Selection Spark Data Frame (FSDF)

```python
from fsspark.config.context import init_spark, stop_spark_session
from fsspark.fs.core import FSDataFrame
from fsspark.utils.io import import_table_as_psdf
from fslite.config.context import init_spark, stop_spark_session
from fslite.fs.core import FSDataFrame
from fslite.utils.io import import_table_as_psdf

# Init spark
init_spark()
Expand Down
2 changes: 1 addition & 1 deletion docs/README.methods.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ A typical workflow written using `fsspark` can be divided roughly in four major

### 5. Feature selection pipeline example

[FS pipeline example](../fsspark/pipeline/fs_pipeline_example.py)
[FS pipeline example](../fslite/pipeline/fs_pipeline_example.py)
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: fsspark-venv
name: fslite-venv
channels:
- defaults
- conda-forge
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
12 changes: 6 additions & 6 deletions fsspark/fs/methods.py → fslite/fs/methods.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from abc import ABC, abstractmethod
from typing import List, Type, Union, Tuple, Optional, Dict, Any

from fsspark.fs.constants import (ML_METHODS, UNIVARIATE_METHODS,
MULTIVARIATE_METHODS)
from fsspark.fs.core import FSDataFrame
from fsspark.fs.ml import MLCVModel
from fsspark.fs.multivariate import multivariate_filter
from fsspark.fs.univariate import univariate_filter
from fslite.fs.constants import (ML_METHODS, UNIVARIATE_METHODS,
MULTIVARIATE_METHODS)
from fslite.fs.core import FSDataFrame
from fslite.fs.ml import MLCVModel
from fslite.fs.multivariate import multivariate_filter
from fslite.fs.univariate import univariate_filter


class FSMethod(ABC):
Expand Down
18 changes: 9 additions & 9 deletions fsspark/fs/ml.py → fslite/fs/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@
from pyspark.ml.regression import RandomForestRegressionModel, RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel, Param

from fsspark.fs.constants import (RF_BINARY,
LSVC_BINARY,
FM_BINARY,
RF_MULTILABEL,
LR_MULTILABEL,
RF_REGRESSION,
FM_REGRESSION,
ML_METHODS)
from fsspark.fs.core import FSDataFrame
from fslite.fs.constants import (RF_BINARY,
LSVC_BINARY,
FM_BINARY,
RF_MULTILABEL,
LR_MULTILABEL,
RF_REGRESSION,
FM_REGRESSION,
ML_METHODS)
from fslite.fs.core import FSDataFrame

ESTIMATORS_CLASSES = [RandomForestClassifier, RandomForestRegressionModel, LinearSVC, LogisticRegression]
EVALUATORS_CLASSES = [BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator]
Expand Down
8 changes: 4 additions & 4 deletions fsspark/fs/multivariate.py → fslite/fs/multivariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
from pyspark.ml.feature import (VarianceThresholdSelector)
from pyspark.ml.stat import Correlation

from fsspark.fs.constants import MULTIVARIATE_METHODS, MULTIVARIATE_CORRELATION, MULTIVARIATE_VARIANCE
from fslite.fs.constants import MULTIVARIATE_METHODS, MULTIVARIATE_CORRELATION, MULTIVARIATE_VARIANCE

from fsspark.fs.core import FSDataFrame
from fsspark.fs.utils import find_maximal_independent_set
from fsspark.utils.generic import tag
from fslite.fs.core import FSDataFrame
from fslite.fs.utils import find_maximal_independent_set
from fslite.utils.generic import tag

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger("FSSPARK:MULTIVARIATE")
Expand Down
2 changes: 1 addition & 1 deletion fsspark/fs/univariate.py → fslite/fs/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

from fsspark.fs.fdataframe import FSDataFrame
from fslite.fs.fdataframe import FSDataFrame

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger("FS:UNIVARIATE")
Expand Down
4 changes: 2 additions & 2 deletions fsspark/fs/utils.py → fslite/fs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from networkx.algorithms.mis import maximal_independent_set
from pyspark.ml.feature import Imputer

from fsspark.fs.core import FSDataFrame
from fsspark.utils.generic import tag
from fslite.fs.core import FSDataFrame
from fslite.utils.generic import tag

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger("FSSPARK:UTILS")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
"""
Example of a feature selection pipeline implemented in fsspark.
Example of a feature selection pipeline implemented in fslite.
After data import and pre-processing, the pipeline applies univariate correlation filter,
multivariate correlation filter and Random Forest classification.
"""

from fsspark.config.context import init_spark, stop_spark_session
from fsspark.fs.core import FSDataFrame
from fsspark.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod
from fsspark.utils.datasets import get_tnbc_data_path
from fsspark.utils.io import import_table_as_psdf
from fslite.config.context import init_spark, stop_spark_session
from fslite.fs.core import FSDataFrame
from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod
from fslite.utils.datasets import get_tnbc_data_path
from fslite.utils.io import import_table_as_psdf

# Init spark
init_spark(apply_pyarrow_settings=True,
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

import numpy as np

from fsspark.config.context import init_spark, stop_spark_session
from fsspark.fs.core import FSDataFrame
from fsspark.fs.utils import compute_missingness_rate, remove_features_by_missingness_rate, impute_missing
from fsspark.utils.datasets import get_tnbc_data_missing_values_path
from fsspark.utils.io import import_table_as_psdf
from fslite.config.context import init_spark, stop_spark_session
from fslite.fs.core import FSDataFrame
from fslite.fs.utils import compute_missingness_rate, remove_features_by_missingness_rate, impute_missing
from fslite.utils.datasets import get_tnbc_data_missing_values_path
from fslite.utils.io import import_table_as_psdf


class TestDataPreprocessing(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import unittest

from fsspark.config.context import init_spark, stop_spark_session
from fsspark.fs.core import FSDataFrame
from fsspark.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod
from fsspark.utils.datasets import get_tnbc_data_path
from fsspark.utils.io import import_table_as_psdf
from fslite.config.context import init_spark, stop_spark_session
from fslite.fs.core import FSDataFrame
from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod
from fslite.utils.datasets import get_tnbc_data_path
from fslite.utils.io import import_table_as_psdf


class FeatureSelectionPipelineTest(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from memory_profiler import memory_usage
import gc

from fsspark.fs.fdataframe import FSDataFrame
from fslite.fs.fdataframe import FSDataFrame

def test_initializes_fsdataframe():

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import pyspark
import pyspark.pandas as ps

from fsspark.config.context import init_spark, stop_spark_session
from fsspark.utils.datasets import get_tnbc_data_path
from fsspark.utils.io import import_table, import_table_as_psdf
from fslite.config.context import init_spark, stop_spark_session
from fslite.utils.datasets import get_tnbc_data_path
from fslite.utils.io import import_table, import_table_as_psdf


class TestImportExport(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
MulticlassClassificationEvaluator)

from fsspark.config.context import init_spark, stop_spark_session
from fsspark.fs.core import FSDataFrame
from fsspark.fs.ml import MLCVModel
from fsspark.utils.datasets import get_tnbc_data_path
from fsspark.utils.io import import_table_as_psdf
from fslite.config.context import init_spark, stop_spark_session
from fslite.fs.core import FSDataFrame
from fslite.fs.ml import MLCVModel
from fslite.utils.datasets import get_tnbc_data_path
from fslite.utils.io import import_table_as_psdf


class MLMethodTest(unittest.TestCase):
Expand Down Expand Up @@ -145,7 +145,7 @@ def test_multilabel_lr_model(self):
assert testing_acc > 0.7

def test_FSMLMethod(self):
from fsspark.fs.methods import FSMLMethod
from fslite.fs.methods import FSMLMethod

fsdf = self.import_FSDataFrame()
training_data, testing_data = fsdf.split_df(split_training_factor=0.7)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pandas as pd
from fsspark.utils.datasets import get_tnbc_data_path
from fsspark.fs.fdataframe import FSDataFrame
from fslite.utils.datasets import get_tnbc_data_path
from fslite.fs.fdataframe import FSDataFrame

from fsspark.fs.univariate import univariate_filter
from fslite.fs.univariate import univariate_filter

def test_univariate_filter_corr():
"""
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion fsspark/utils/io.py → fslite/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pyspark.pandas
import pyspark.sql

from fsspark.config.context import PANDAS_ON_SPARK_API_SETTINGS
from fslite.config.context import PANDAS_ON_SPARK_API_SETTINGS

warnings.filterwarnings("ignore")

Expand Down
59 changes: 0 additions & 59 deletions fsspark/config/context.py

This file was deleted.

22 changes: 0 additions & 22 deletions fsspark/config/global_settings.py

This file was deleted.

Empty file removed fsspark/utils/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
long_description = fh.read()

setup(
name='fsspark',
name='fslite',
version='0.0.1',
url='https://github.com/bigbio/fsspark',
license='Apache-2.0',
Expand Down

0 comments on commit 10ee2e8

Please sign in to comment.