From 82a1a86ef2e55bf04871097a94a5ca04bcffb1ae Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:14:15 +0100 Subject: [PATCH] delete ML methods --- examples/loom2parquetchunks.py | 4 +- fslite/tests/test_ml_methods.py | 177 -------------------------------- 2 files changed, 2 insertions(+), 179 deletions(-) delete mode 100644 fslite/tests/test_ml_methods.py diff --git a/examples/loom2parquetchunks.py b/examples/loom2parquetchunks.py index 5667518..a4cc52d 100644 --- a/examples/loom2parquetchunks.py +++ b/examples/loom2parquetchunks.py @@ -66,8 +66,8 @@ # transpose dataset and convert to parquet. # process the data per chunks. -chunk_size = 2000 -number_chunks = 1000 # Number of chunks to process, if None, all chunks are processed +chunk_size = 50000 +number_chunks = 50 # Number of chunks to process, if None, all chunks are processed count = 0 for (ix, selection, view) in ds.scan(axis=1, batch_size=chunk_size): # retrieve the chunk diff --git a/fslite/tests/test_ml_methods.py b/fslite/tests/test_ml_methods.py deleted file mode 100644 index 20920e6..0000000 --- a/fslite/tests/test_ml_methods.py +++ /dev/null @@ -1,177 +0,0 @@ -# import unittest -# -# from pyspark.ml.classification import RandomForestClassifier, LogisticRegression -# from pyspark.ml.evaluation import ( -# BinaryClassificationEvaluator, -# MulticlassClassificationEvaluator, -# ) -# -# from fslite.config.context import init_spark, stop_spark_session -# from fslite.fs.core import FSDataFrame -# from fslite.fs.ml import MLCVModel -# from fslite.utils.datasets import get_tnbc_data_path -# from fslite.utils.io import import_table_as_psdf -# -# -# class MLMethodTest(unittest.TestCase): -# -# def setUp(self) -> None: -# init_spark( -# apply_pyarrow_settings=True, -# apply_extra_spark_settings=True, -# apply_pandas_settings=True, -# ) -# -# def tearDown(self) -> None: -# stop_spark_session() -# -# @staticmethod -# def import_FSDataFrame(): -# df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) -# fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") -# return fsdf -# -# def test_build_model_using_cross_validator(self): -# fsdf = self.import_FSDataFrame() -# estimator = RandomForestClassifier() -# evaluator = BinaryClassificationEvaluator() -# grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} -# ml_method = MLCVModel( -# estimator=estimator, -# evaluator=evaluator, -# estimator_params=None, -# grid_params=None, -# cv_params=None, -# ) -# -# print(ml_method._cross_validator.__str__()) -# assert ml_method._cross_validator is not None -# -# def test_get_feature_scores_random_forest_classifier(self): -# # Create a sample FSDataFrame -# fsdf = self.import_FSDataFrame() -# -# # Create a RandomForestClassifier model -# estimator = RandomForestClassifier() -# evaluator = MulticlassClassificationEvaluator() -# estimator_params = {"labelCol": "label"} -# grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} -# cv_params = {"parallelism": 2, "numFolds": 5, "collectSubModels": False} -# -# ml_method = MLCVModel( -# estimator=estimator, -# evaluator=evaluator, -# estimator_params=estimator_params, -# grid_params=grid_params, -# cv_params=cv_params, -# ) -# -# (ml_method.fit(fsdf)) -# -# # Get the feature scores -# feature_scores = ml_method.get_feature_scores() -# -# # Assert that the feature scores DataFrame is not empty -# assert not feature_scores.empty -# -# # Assert that the feature scores DataFrame has the expected columns -# expected_columns = ["features", "feature_index", "scores", "percentile_rank"] -# assert list(feature_scores.columns) == expected_columns -# -# # check if dataframe is sorted by scores (descending) -# assert feature_scores["scores"].is_monotonic_decreasing -# -# print(feature_scores) -# -# def test_multilabel_rf_model(self): -# fsdf = self.import_FSDataFrame() -# training_data, testing_data = fsdf.split_df(split_training_factor=0.8) -# -# estimator = RandomForestClassifier() -# evaluator = MulticlassClassificationEvaluator(metricName="accuracy") -# estimator_params = {"labelCol": "label"} -# grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} -# cv_params = {"parallelism": 2, "numFolds": 3} -# -# ml_method = MLCVModel( -# estimator=estimator, -# evaluator=evaluator, -# estimator_params=estimator_params, -# grid_params=grid_params, -# cv_params=cv_params, -# ) -# -# (ml_method.fit(training_data)) -# -# # get the accuracy on training -# eval_training = ml_method.get_eval_metric_on_training() -# print(f"Accuracy on training data: {eval_training}") -# -# # get the accuracy on testing -# testing_acc = ml_method.get_eval_metric_on_testing(testing_data) -# print(f"Accuracy on test data: {testing_acc}") -assert 0.65 < testing_acc < 0.95, f"Testing accuracy {testing_acc} is out of expected range" -# -# def test_multilabel_lr_model(self): -# fsdf = self.import_FSDataFrame() -# training_data, testing_data = fsdf.split_df(split_training_factor=0.6) -# -# estimator = LogisticRegression() -# evaluator = MulticlassClassificationEvaluator(metricName="accuracy") -# estimator_params = {"labelCol": "label"} -# grid_params = {"regParam": [0.1, 0.01]} -# cv_params = {"parallelism": 2, "numFolds": 3} -# -# ml_method = MLCVModel( -# estimator=estimator, -# evaluator=evaluator, -# estimator_params=estimator_params, -# grid_params=grid_params, -# cv_params=cv_params, -# ) -# -# (ml_method.fit(training_data)) -# -# # get the accuracy on training -# eval_training = ml_method.get_eval_metric_on_training() -# print(f"Accuracy on training data: {eval_training}") -# -# # get the accuracy on testing -# testing_acc = ml_method.get_eval_metric_on_testing(testing_data) -# print(f"Accuracy on test data: {testing_acc}") -# assert testing_acc > 0.7 -# -# def test_FSMLMethod(self): -# from fslite.fs.methods import FSMLMethod -# -# fsdf = self.import_FSDataFrame() -# training_data, testing_data = fsdf.split_df(split_training_factor=0.7) -# -# estimator_params = {"labelCol": "label"} -# grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} -# cv_params = {"parallelism": 2, "numFolds": 3} -# -# ml_method = FSMLMethod( -# fs_method="rf_multilabel", -# rfe=True, -# rfe_iterations=2, -# percent_to_keep=0.9, -# estimator_params=estimator_params, -# evaluator_params={"metricName": "accuracy"}, -# grid_params=grid_params, -# cv_params=cv_params, -# ) -# -# filtered_fsdf = ml_method.select_features(training_data) -# -# training_acc = ml_method.get_eval_metric_on_training() -# print(f"Training accuracy: {training_acc}") -# assert training_acc > 0.8 -# -# testing_acc = ml_method.get_eval_metric_on_testing(testing_data) -# print(f"Testing accuracy: {testing_acc}") -# assert testing_acc > 0.7 -# -# -# if __name__ == "__main__": -# unittest.main()