delete ML methods

bigbio · Sep 25, 2024 · 7ee27c8 · 7ee27c8
1 parent d5cc974
commit 7ee27c8
Show file tree

Hide file tree

Showing 11 changed files with 163 additions and 237 deletions.
diff --git a/examples/loom2parquetchunks.py b/examples/loom2parquetchunks.py
@@ -32,10 +32,14 @@
 development_day = ds.ca["Development_day"]
 
 # make a dataframe with the sample metadata, define the columns types
-sample_df = pd.DataFrame({"sample_id": sample_id,
-                          "cell_cluster": cell_cluster,
-                          "assay": assay,
-                          "development_day": development_day})
+sample_df = pd.DataFrame(
+    {
+        "sample_id": sample_id,
+        "cell_cluster": cell_cluster,
+        "assay": assay,
+        "development_day": development_day,
+    }
+)
 
 # print the first 5 rows
 sample_df.head()
@@ -55,43 +59,41 @@
 sample_df.head()
 
 # Save the sample metadata to parquet
-(sample_df
- .reset_index()
- .to_parquet("sample_metadata.parquet",
-             index=False,
-             engine="auto",
-             compression="gzip")
- )
+(
+    sample_df.reset_index().to_parquet(
+        "sample_metadata.parquet", index=False, engine="auto", compression="gzip"
+    )
+)
 
 
 # transpose dataset and convert to parquet.
 # process the data per chunks.
 chunk_size = 50000
-number_chunks = 50 # Number of chunks to process, if None, all chunks are processed
+number_chunks = 50  # Number of chunks to process, if None, all chunks are processed
 count = 0
-for (ix, selection, view) in ds.scan(axis=1, batch_size=chunk_size):
+for ix, selection, view in ds.scan(axis=1, batch_size=chunk_size):
     # retrieve the chunk
     matrix_chunk = view[:, :]
 
     # transpose the data
     matrix_chunk_t = matrix_chunk.T
 
     # convert to pandas dataframe
-    df_chunk = pd.DataFrame(matrix_chunk_t,
-                            index=sample_id[selection.tolist()],
-                            columns=gene_ids)
+    df_chunk = pd.DataFrame(
+        matrix_chunk_t, index=sample_id[selection.tolist()], columns=gene_ids
+    )
 
     # merge chunk with sample metadata
     df_chunk = pd.merge(
-        left=sample_df[['cell_cluster_id', 'development_day', 'assay_id']],
+        left=sample_df[["cell_cluster_id", "development_day", "assay_id"]],
         right=df_chunk,
         how="inner",
         left_index=True,
         right_index=True,
         sort=False,
         copy=True,
         indicator=False,
-        validate="one_to_one"
+        validate="one_to_one",
     )
 
     # reset the index
@@ -101,10 +103,12 @@
     df_chunk = df_chunk.rename(columns={"index": "sample_id"})
 
     # save the chunk to parquet
-    df_chunk.to_parquet(f"gene_count_chunk_{ix}.parquet",
-                        index=False,
-                        engine="pyarrow",
-                        compression="gzip")
+    df_chunk.to_parquet(
+        f"gene_count_chunk_{ix}.parquet",
+        index=False,
+        engine="pyarrow",
+        compression="gzip",
+    )
 
     print(f"Chunk {ix} saved")
     count = count + 1

diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py
@@ -8,25 +8,34 @@
     "univariate": {
         "title": "Univariate Feature Selection",
         "description": "Univariate feature selection refers to the process of selecting the most relevant features for "
-                       "a machine learning model by evaluating each feature individually with respect to the target "
-                       "variable using univariate statistical tests. It simplifies the feature selection process by "
-                       "treating each feature independently and assessing its contribution to the predictive "
-                       "performance of the model.",
+        "a machine learning model by evaluating each feature individually with respect to the target "
+        "variable using univariate statistical tests. It simplifies the feature selection process by "
+        "treating each feature independently and assessing its contribution to the predictive "
+        "performance of the model.",
         "methods": [
-            {"name": "anova", "description": "Univariate ANOVA feature selection (f-classification)"},
+            {
+                "name": "anova",
+                "description": "Univariate ANOVA feature selection (f-classification)",
+            },
             {"name": "u_corr", "description": "Univariate Pearson's correlation"},
             {"name": "f_regression", "description": "Univariate f-regression"},
-            {"name": "mutual_info_regression", "description": "Univariate mutual information regression"},
-            {"name": "mutual_info_classification", "description": "Univariate mutual information classification"},
+            {
+                "name": "mutual_info_regression",
+                "description": "Univariate mutual information regression",
+            },
+            {
+                "name": "mutual_info_classification",
+                "description": "Univariate mutual information classification",
+            },
         ],
     },
     "multivariate": {
         "title": "Multivariate Feature Selection",
         "description": "Multivariate feature selection is a method of selecting features by evaluating them in "
-                       "combination rather than individually. Unlike univariate feature selection, which treats each "
-                       "feature separately, multivariate feature selection considers the relationships and interactions "
-                       "between multiple features and the target variable. This method aims to identify a subset of "
-                       "features that work well together to improve the performance of a machine learning model.",
+        "combination rather than individually. Unlike univariate feature selection, which treats each "
+        "feature separately, multivariate feature selection considers the relationships and interactions "
+        "between multiple features and the target variable. This method aims to identify a subset of "
+        "features that work well together to improve the performance of a machine learning model.",
         "methods": [
             {"name": "m_corr", "description": "Multivariate Correlation"},
             {"name": "variance", "description": "Multivariate Variance"},
@@ -38,15 +47,28 @@
         "methods": [
             {"name": "rf_binary", "description": "Random Forest Binary Classifier"},
             {"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"},
-            {"name": "fm_binary", "description": "Factorization Machine Binary Classifier"},
-            {"name": "rf_multilabel", "description": "Random Forest Multi-label Classifier"},
-            {"name": "lg_multilabel","description": "Logistic Regression Multi-label Classifier"},
+            {
+                "name": "fm_binary",
+                "description": "Factorization Machine Binary Classifier",
+            },
+            {
+                "name": "rf_multilabel",
+                "description": "Random Forest Multi-label Classifier",
+            },
+            {
+                "name": "lg_multilabel",
+                "description": "Logistic Regression Multi-label Classifier",
+            },
             {"name": "rf_regression", "description": "Random Forest Regression"},
-            {"name": "fm_regression","description": "Factorization Machine Regression"},
+            {
+                "name": "fm_regression",
+                "description": "Factorization Machine Regression",
+            },
         ],
     },
 }
 
+
 def get_fs_methods():
     """
     Get the list of feature selection methods

diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py
@@ -60,134 +60,19 @@ def set_params(self, **kwargs):
         self.kwargs.update(kwargs)
 
 
-# class FSPipeline:
-#     """
-#     The FSPipeline class creates a pipeline of feature selection methods. It provides a way to
-#     chain multiple feature selection methods together to create a pipeline of feature selection methods.
-#
-#     Example Usage
-#     -------------
-#     # Create an instance of FSPipeline with the specified feature selection methods
-#     fs_pipeline = FSPipeline(fs_methods=[FSUnivariate('anova'), FSMultivariate('m_corr')])
-#
-#     # Select features using the pipeline
-#     selected_features = fs_pipeline.select_features(fsdf)
-#     """
-#
-#     _valid_methods: List[Type[Union[FSUnivariate, FSMultivariate, FSMLMethod]]] = [
-#         FSUnivariate,
-#         FSMultivariate,
-#         FSMLMethod,
-#     ]
-#
-#     def __init__(
-#         self,
-#         df_training: FSDataFrame,
-#         df_testing: Optional[FSDataFrame],
-#         fs_stages: List[Union[FSUnivariate, FSMultivariate, FSMLMethod]],
-#     ):
-#         """
-#         Initialize the feature selection pipeline with the specified feature selection methods.
-#
-#         Parameters:
-#             df_training: The training data frame on which the feature selection pipeline is to be run.
-#             df_testing: The testing data frame on which the ML wrapper method (if any) is to be evaluated.
-#             fs_stages: A list of feature selection methods to be used in the pipeline.
-#         """
-#
-#         self.df_training = df_training
-#         self.df_testing = df_testing
-#         self.fs_stages = fs_stages
-#         self.validate_methods()
-#
-#         self.pipeline_results = {}
-#
-#     def validate_methods(self):
-#         """
-#         Validate the feature selection methods in the pipeline.
-#         """
-#         # check if the pipeline contains at least one feature selection method
-#         if len(self.fs_stages) == 0:
-#             raise ValueError(
-#                 "The pipeline must contain at least one feature selection method."
-#             )
-#
-#         # check if the feature selection methods are valid
-#         if not all(
-#             isinstance(method, tuple(self._valid_methods)) for method in self.fs_stages
-#         ):
-#             raise InvalidMethodError(
-#                 f"Invalid feature selection method. "
-#                 f"Accepted methods are {', '.join([str(m) for m in self._valid_methods])}"
-#             )
-#
-#         # check if only one ML method is used in the pipeline
-#         ml_methods = [
-#             method for method in self.fs_stages if isinstance(method, FSMLMethod)
-#         ]
-#         if len(ml_methods) > 1:
-#             raise ValueError("Only one ML method is allowed in the pipeline.")
-#
-#     def run(self) -> Dict[str, Any]:
-#         """
-#         Run the feature selection pipeline.
-#
-#         Returns:
-#            A dictionary with the results of the feature selection pipeline.
-#         """
-#
-#         # apply each feature selection method in the pipeline sequentially
-#         n_stages = len(self.fs_stages)
-#         fsdf_tmp = self.df_training
-#
-#         self.pipeline_results.update(n_stages=n_stages)
-#
-#         for i, method in enumerate(self.fs_stages):
-#             print(
-#                 f"Running stage {i + 1} of {n_stages} of the feature selection pipeline: {method}"
-#             )
-#             if isinstance(method, FSMLMethod):
-#
-#                 fsdf_tmp = method.select_features(fsdf_tmp)
-#
-#                 # collect the results during the feature selection process (rfe iterations, feature scores, etc.)
-#                 self.pipeline_results.update(rfe_iterations=method.rfe_iterations)
-#                 self.pipeline_results.update(feature_scores=method.get_feature_scores())
-#                 self.pipeline_results.update(eval_metric=method.get_eval_metric_name())
-#                 self.pipeline_results.update(
-#                     rfe_training_metric=method.get_eval_metric_on_training_rfe()
-#                 )
-#                 self.pipeline_results.update(
-#                     training_metric=method.get_eval_metric_on_training()
-#                 )
-#
-#                 if self.df_testing is not None:
-#
-#                     # evaluate the final model on the testing data (if available)
-#                     testing_metric = method.get_eval_metric_on_testing(self.df_testing)
-#                     self.pipeline_results.update(testing_metric=testing_metric)
-#
-#             else:
-#                 fsdf_tmp = method.select_features(fsdf_tmp)
-#
-#         self.pipeline_results.update(
-#             n_initial_features=self.df_training.count_features()
-#         )
-#         self.pipeline_results.update(n_selected_features=fsdf_tmp.count_features())
-#
-#         return self.pipeline_results
-#
-#     def __str__(self):
-#         return f"FSPipeline(fs_methods={self.fs_stages})"
-#
-#     def __repr__(self):
-#         return self.__str__()
-
-
 class InvalidMethodError(ValueError):
     """
     Error raised when an invalid feature selection method is used.
     """
 
     def __init__(self, message):
         super().__init__(f"Invalid feature selection method: {message}")
+
+
+class InvalidDataError(ValueError):
+    """
+    Error raised when an invalid data frame is used.
+    """
+
+    def __init__(self, message):
+        super().__init__(f"Invalid data frame: {message}")
diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py
@@ -9,7 +9,7 @@
 
 from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method
 from fslite.fs.fdataframe import FSDataFrame
-from fslite.fs.methods import FSMethod, InvalidMethodError
+from fslite.fs.methods import FSMethod, InvalidMethodError, InvalidDataError
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.svm import SVC, LinearSVC
 from sklearn.linear_model import LogisticRegression

diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py
@@ -70,11 +70,9 @@ def select_features(self, fsdf: FSDataFrame):
             fsdf, multivariate_method=self.fs_method, **self.kwargs
         )
 
-    def multivariate_filter(self,
-                            fsdf: FSDataFrame,
-                            multivariate_method: str = "m_corr",
-                            **kwargs
-                            ) -> FSDataFrame:
+    def multivariate_filter(
+        self, fsdf: FSDataFrame, multivariate_method: str = "m_corr", **kwargs
+    ) -> FSDataFrame:
         """
          Filter features after applying a multivariate feature selector method.
 
@@ -107,10 +105,10 @@ def __repr__(self):
 
 
 def multivariate_correlation_selector(
-        fsdf: FSDataFrame,
-        selection_mode: str = 'strict',
-        selection_threshold: float = 0.75,
-        corr_method: str = "pearson",
+    fsdf: FSDataFrame,
+    selection_mode: str = "strict",
+    selection_threshold: float = 0.75,
+    corr_method: str = "pearson",
 ) -> List[int]:
     """
     Compute the correlation matrix among input features and select those below a specified threshold.
@@ -173,9 +171,9 @@ def multivariate_correlation_selector(
     return selected_features
 
 
-def multivariate_variance_selector(fsdf: FSDataFrame,
-                                   selection_mode: str = "k_best",
-                                   selection_threshold: float = 0.0) -> List[int]:
+def multivariate_variance_selector(
+    fsdf: FSDataFrame, selection_mode: str = "k_best", selection_threshold: float = 0.0
+) -> List[int]:
     """
     Filter features based on variance threshold.
 
@@ -189,7 +187,7 @@ def multivariate_variance_selector(fsdf: FSDataFrame,
 
     :return: List of selected feature indices
     """
-    
+
     # Retrieve the feature matrix
     f_matrix = fsdf.get_feature_matrix()
 
@@ -212,8 +210,10 @@ def multivariate_variance_selector(fsdf: FSDataFrame,
         raise ValueError(
             f"Unsupported selection mode '{selection_mode}'. Use 'percentile' or 'k_best'."
         )
-
-    logger.info(f"Feature selection mode: {selection_mode}. \n"
-                f"Number of features selected: {len(selected_features)}")
+
+    logger.info(
+        f"Feature selection mode: {selection_mode}. \n"
+        f"Number of features selected: {len(selected_features)}"
+    )
 
     return list(selected_features)