diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index c45dd65..da81329 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -101,6 +101,7 @@ def is_valid_univariate_method(method_name: str) -> bool: return True return False + def is_valid_multivariate_method(method_name: str) -> bool: """ This method check if the given method name is a supported multivariate method @@ -112,6 +113,7 @@ def is_valid_multivariate_method(method_name: str) -> bool: return True return False + def is_valid_ml_method(method_name: str) -> bool: """ This method check if the given method name is a supported machine learning method diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py index 9ed62f3..adf67fe 100644 --- a/fslite/fs/methods.py +++ b/fslite/fs/methods.py @@ -1,11 +1,10 @@ from abc import ABC, abstractmethod -from typing import List, Type, Union, Optional, Dict, Any +from typing import List from fslite.fs.constants import get_fs_method_details from fslite.fs.fdataframe import FSDataFrame - class FSMethod(ABC): """ Feature selection abtract class, this class defines the basic structure of a feature selection method. @@ -60,6 +59,7 @@ def set_params(self, **kwargs): """ self.kwargs.update(kwargs) + # class FSPipeline: # """ # The FSPipeline class creates a pipeline of feature selection methods. It provides a way to diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index 77c9f69..30bd5a7 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -5,10 +5,6 @@ """ -import warnings -from typing import List, Any, Dict, Optional, Union - -import pandas as pd from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method from fslite.fs.fdataframe import FSDataFrame from fslite.fs.methods import FSMethod, InvalidMethodError @@ -243,8 +239,6 @@ def __repr__(self): return self.__str__() - - # # # # Define an abstract class that allow to create a factory of models diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index d59ca5e..444a5b5 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -1,38 +1,16 @@ -# import logging -# from typing import List -# -# import numpy as np -# import pyspark -# from pyspark.ml.feature import VarianceThresholdSelector -# from pyspark.ml.stat import Correlation -# -# from fslite.fs.constants import ( -# MULTIVARIATE_METHODS, -# MULTIVARIATE_CORRELATION, -# MULTIVARIATE_VARIANCE, -# ) -# -# from fslite.fs.core import FSDataFrame -# from fslite.fs.utils import find_maximal_independent_set -# from fslite.utils.generic import tag -# -# logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") -# logger = logging.getLogger("FSSPARK:MULTIVARIATE") -# logger.setLevel(logging.INFO) -# import logging from typing import List import numpy as np from scipy.stats import spearmanr -from fslite.fs.constants import get_fs_multivariate_methods +from fslite.fs.constants import get_fs_multivariate_methods, is_valid_multivariate_method from fslite.fs.fdataframe import FSDataFrame from fslite.fs.methods import FSMethod, InvalidMethodError from fslite.fs.utils import find_maximal_independent_set logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") -logger = logging.getLogger("FS:UNIVARIATE") +logger = logging.getLogger("FS:MULTIVARIATE") logger.setLevel(logging.INFO) class FSMultivariate(FSMethod): @@ -102,7 +80,8 @@ def multivariate_filter( if multivariate_method == "m_corr": selected_features = multivariate_correlation_selector(fsdf, **kwargs) elif multivariate_method == "variance": - selected_features = multivariate_variance_selector(fsdf, **kwargs) + # selected_features = multivariate_variance_selector(fsdf, **kwargs) + logging.info("Variance method not implemented yet.") else: raise ValueError( f"Invalid multivariate method: {multivariate_method}. " diff --git a/fslite/pipeline/fs_pipeline_example.py b/fslite/pipeline/fs_pipeline_example.py index 32159e9..1d3c539 100644 --- a/fslite/pipeline/fs_pipeline_example.py +++ b/fslite/pipeline/fs_pipeline_example.py @@ -8,6 +8,7 @@ from fslite.config.context import init_spark, stop_spark_session from fslite.fs.core import FSDataFrame + from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod from fslite.utils.datasets import get_tnbc_data_path from fslite.utils.io import import_table_as_psdf diff --git a/fslite/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py index ccc6f19..94d9c5f 100644 --- a/fslite/tests/generate_big_tests.py +++ b/fslite/tests/generate_big_tests.py @@ -1,7 +1,7 @@ import logging -import pandas as pd import numpy as np +import pandas as pd import pyarrow as pa import pyarrow.parquet as pq diff --git a/fslite/tests/test_fsdataframe.py b/fslite/tests/test_fsdataframe.py index 21879c7..039637c 100644 --- a/fslite/tests/test_fsdataframe.py +++ b/fslite/tests/test_fsdataframe.py @@ -1,8 +1,9 @@ +import gc + +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt from memory_profiler import memory_usage -import gc from fslite.fs.fdataframe import FSDataFrame diff --git a/fslite/tests/test_univariate_methods.py b/fslite/tests/test_univariate_methods.py index 4d16e0f..1da779e 100644 --- a/fslite/tests/test_univariate_methods.py +++ b/fslite/tests/test_univariate_methods.py @@ -1,8 +1,8 @@ import pandas as pd -from fslite.utils.datasets import get_tnbc_data_path -from fslite.fs.fdataframe import FSDataFrame +from fslite.fs.fdataframe import FSDataFrame from fslite.fs.univariate import FSUnivariate +from fslite.utils.datasets import get_tnbc_data_path def test_univariate_filter_corr(): diff --git a/fslite/utils/io.py b/fslite/utils/io.py index 74c202c..02dd07f 100644 --- a/fslite/utils/io.py +++ b/fslite/utils/io.py @@ -2,7 +2,6 @@ import pyspark.pandas import pyspark.sql - from fslite.config.context import PANDAS_ON_SPARK_API_SETTINGS warnings.filterwarnings("ignore")