Skip to content

Commit

Permalink
Update dependencies
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Sep 22, 2024
1 parent a0181aa commit 4a93621
Show file tree
Hide file tree
Showing 4 changed files with 379 additions and 379 deletions.
170 changes: 85 additions & 85 deletions fslite/tests/test_data_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,85 +1,85 @@
import unittest

import numpy as np

from fslite.config.context import init_spark, stop_spark_session
from fslite.fs.core import FSDataFrame
from fslite.fs.utils import (
compute_missingness_rate,
remove_features_by_missingness_rate,
impute_missing,
)
from fslite.utils.datasets import get_tnbc_data_missing_values_path
from fslite.utils.io import import_table_as_psdf


class TestDataPreprocessing(unittest.TestCase):
    """
    Testing methods for data preprocessing (e.g., scaling, imputation, etc.)
    """

    def setUp(self) -> None:
        # Start a fresh Spark session for every test so state never leaks
        # between test cases.
        init_spark(
            apply_pyarrow_settings=True,
            apply_extra_spark_settings=True,
            apply_pandas_settings=True,
        )

    def tearDown(self) -> None:
        # Stop the session started in setUp.
        stop_spark_session()

    @staticmethod
    def import_FSDataFrame() -> FSDataFrame:
        """
        Import FSDataFrame object with missing values.

        Number of samples: 44
        Number of features: 10 (5 with missing values)

        :return: FSDataFrame built from the TNBC missing-values test dataset.
        """
        df = import_table_as_psdf(get_tnbc_data_missing_values_path(), n_partitions=5)
        fsdf = FSDataFrame(df, sample_col="Sample", label_col="label")
        return fsdf

    def test_compute_missingness_rate(self):
        """
        Test compute_missingness_rate method.

        :return: None
        """
        fsdf = self.import_FSDataFrame()
        features_missing_rates = compute_missingness_rate(fsdf)
        # A feature with no missing values must report a rate of exactly 0.0.
        self.assertEqual(features_missing_rates.get("tr|E9PBJ4"), 0.0)
        # A feature with known missing values; ~29.5% for this fixture.
        self.assertAlmostEqual(features_missing_rates.get("sp|P07437"), 0.295, places=2)

    def test_filter_by_missingness_rate(self):
        """
        Test filter_missingness_rate method.

        :return: None
        """
        fsdf = self.import_FSDataFrame()
        fsdf = remove_features_by_missingness_rate(fsdf, threshold=0.15)
        # print number of features
        print(f"Number of remaining features: {fsdf.count_features()}")

        # With a 15% threshold, 4 of the 10 features are expected to be dropped.
        self.assertEqual(fsdf.count_features(), 6)

    def test_impute_missing(self):
        """
        Test impute_missing method. Impute missing values using the mean across columns.

        :return: None
        """
        fsdf = self.import_FSDataFrame()
        fsdf = impute_missing(fsdf, strategy="mean")

        # Collect features as array
        array = fsdf._collect_features_as_array()

        # Check if there are no missing (NaNs) or null values
        self.assertFalse(np.isnan(array).any())


if __name__ == "__main__":
    unittest.main()
144 changes: 72 additions & 72 deletions fslite/tests/test_fs_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,72 @@
import unittest

from fslite.config.context import init_spark, stop_spark_session
from fslite.fs.core import FSDataFrame
from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod
from fslite.utils.datasets import get_tnbc_data_path
from fslite.utils.io import import_table_as_psdf


class FeatureSelectionPipelineTest(unittest.TestCase):
    """
    Test an end-to-end feature-selection pipeline combining univariate,
    multivariate and ML-based selection stages.
    """

    def setUp(self) -> None:
        # Start a fresh Spark session for every test so state never leaks
        # between test cases.
        init_spark(
            apply_pyarrow_settings=True,
            apply_extra_spark_settings=True,
            apply_pandas_settings=True,
        )

    def tearDown(self) -> None:
        # Stop the session started in setUp.
        stop_spark_session()

    @staticmethod
    def import_FSDataFrame():
        """
        Import the TNBC test dataset as an FSDataFrame.

        :return: FSDataFrame with 'Sample' as sample column and 'label' as label column.
        """
        df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5)
        fsdf = FSDataFrame(df, sample_col="Sample", label_col="label")
        return fsdf

    def test_feature_selection_pipeline(self):
        """
        Run a three-stage pipeline (ANOVA percentile filter, correlation filter,
        RF classifier with RFE) and check that the trained model reaches a
        training accuracy above 0.9.

        :return: None
        """
        fsdf = self.import_FSDataFrame()

        training_data, testing_data = fsdf.split_df(split_training_factor=0.6)

        # create a Univariate object
        univariate = FSUnivariate(
            fs_method="anova", selection_mode="percentile", selection_threshold=0.8
        )

        # create a Multivariate object
        multivariate = FSMultivariate(
            fs_method="m_corr", corr_threshold=0.75, corr_method="pearson"
        )

        # create a MLMethod object
        rf_classifier = FSMLMethod(
            fs_method="rf_multilabel",
            rfe=True,
            rfe_iterations=2,
            percent_to_keep=0.9,
            estimator_params={"labelCol": "label"},
            evaluator_params={"metricName": "accuracy"},
            grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]},
            cv_params={"parallelism": 2, "numFolds": 5},
        )

        # create a pipeline object
        fs_pipeline = FSPipeline(
            df_training=training_data,
            df_testing=testing_data,
            fs_stages=[univariate, multivariate, rf_classifier],
        )

        # run the pipeline
        results = fs_pipeline.run()

        # print results
        print(results)

        assert results.get("training_metric") > 0.9


if __name__ == "__main__":
    unittest.main()
Loading

0 comments on commit 4a93621

Please sign in to comment.