added file for experiments

bigbio · Sep 23, 2024 · c46167c · c46167c
1 parent c657be9
commit c46167c
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 22 deletions.
diff --git a/docs/EXPERIMENTS.md b/docs/EXPERIMENTS.md
@@ -0,0 +1,4 @@
+## Experiments and Benchmarks
+
+This document describes the experiments and benchmarks conducted to evaluate the performance of fslite.
+The experiments were conducted on the following datasets:
diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py
@@ -7,6 +7,11 @@
 FS_METHODS = {
     "univariate": {
         "title": "Univariate Feature Selection",
+        "description": "Univariate feature selection refers to the process of selecting the most relevant features for "
+                       "a machine learning model by evaluating each feature individually with respect to the target "
+                       "variable using univariate statistical tests. It simplifies the feature selection process by "
+                       "treating each feature independently and assessing its contribution to the predictive "
+                       "performance of the model.",
         "methods": [
             {
                 "name": "anova",
@@ -18,13 +23,19 @@
     },
     "multivariate": {
         "title": "Multivariate Feature Selection",
+        "description": "Multivariate feature selection is a method of selecting features by evaluating them in "
+                       "combination rather than individually. Unlike univariate feature selection, which treats each "
+                       "feature separately, multivariate feature selection considers the relationships and interactions "
+                       "between multiple features and the target variable. This method aims to identify a subset of "
+                       "features that work well together to improve the performance of a machine learning model.",
         "methods": [
             {"name": "m_corr", "description": "Multivariate Correlation"},
             {"name": "variance", "description": "Multivariate Variance"},
         ],
     },
     "ml": {
         "title": "Machine Learning Wrapper",
+        "description": "Machine learning wrapper methods are feature selection techniques that use a machine learning model to score and rank features (e.g., by feature importance).",
         "methods": [
             {"name": "rf_binary", "description": "Random Forest Binary Classifier"},
             {"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"},

diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py
@@ -4,6 +4,7 @@
 for feature selection (e.g., rank by feature importance) and prediction.
 
 """
+
 from typing import Union, Optional, Dict, Any, List
 
 from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method
@@ -247,6 +248,7 @@ def __str__(self):
     def __repr__(self):
         return self.__str__()
 
+
 class MLCVModel:
     """
     A factory class for creating various machine learning models with scikit-learn.
@@ -264,7 +266,7 @@ def __init__(
             RandomForestRegressor,
             LinearSVC,
             LogisticRegression,
-            SVC
+            SVC,
         ],
         scoring: str,
         estimator_params: Optional[Dict[str, Any]] = None,
@@ -291,7 +293,7 @@ def _initialize_model(self):
                 estimator=self.estimator,
                 param_grid=self.grid_params,
                 scoring=self.scoring,
-                cv=self.cv
+                cv=self.cv,
             )
 
     def fit(self, fsdf: FSDataFrame) -> "MLCVModel":
@@ -319,15 +321,18 @@ def get_feature_scores(self) -> pd.DataFrame:
         """
         Get feature importance scores from the best model.
         """
-        if not isinstance(self._best_model, (RandomForestClassifier, RandomForestRegressor)):
-            raise ValueError("Feature importance is only available for tree-based models.")
+        if not isinstance(
+            self._best_model, (RandomForestClassifier, RandomForestRegressor)
+        ):
+            raise ValueError(
+                "Feature importance is only available for tree-based models."
+            )
 
         features = self._fsdf.get_feature_names()
         importances = self._best_model.feature_importances_
-        df = pd.DataFrame({
-            'feature': features,
-            'importance': importances
-        }).sort_values(by='importance', ascending=False)
+        df = pd.DataFrame({"feature": features, "importance": importances}).sort_values(
+            by="importance", ascending=False
+        )
 
         return df
 
@@ -338,11 +343,11 @@ def get_eval_metric_on_training(self) -> float:
         X_train, y_train = self._fsdf.get_features_and_labels()
         y_pred = self._best_model.predict(X_train)
 
-        if self.scoring == 'accuracy':
+        if self.scoring == "accuracy":
             return accuracy_score(y_train, y_pred)
-        elif self.scoring == 'f1':
+        elif self.scoring == "f1":
             return f1_score(y_train, y_pred)
-        elif self.scoring == 'roc_auc':
+        elif self.scoring == "roc_auc":
             return roc_auc_score(y_train, y_pred)
         else:
             raise ValueError("Unsupported scoring method.")
@@ -354,11 +359,11 @@ def get_eval_metric_on_testing(self, test_data: FSDataFrame) -> float:
         X_test, y_test = test_data.get_features_and_labels()
         y_pred = self._best_model.predict(X_test)
 
-        if self.scoring == 'accuracy':
+        if self.scoring == "accuracy":
             return accuracy_score(y_test, y_pred)
-        elif self.scoring == 'f1':
+        elif self.scoring == "f1":
             return f1_score(y_test, y_pred)
-        elif self.scoring == 'roc_auc':
+        elif self.scoring == "roc_auc":
             return roc_auc_score(y_test, y_pred)
         else:
             raise ValueError("Unsupported scoring method.")
@@ -368,8 +373,8 @@ def create_model(
         model_type: str,
         estimator_params: Dict[str, Any] = None,
         grid_params: Dict[str, List[Any]] = None,
-        scoring: str = 'accuracy',
-        cv: int = 5
+        scoring: str = "accuracy",
+        cv: int = 5,
     ) -> "MLCVModel":
         """
         Create an ML model based on the model type.

diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py
@@ -4,7 +4,10 @@
 import numpy as np
 from scipy.stats import spearmanr
 
-from fslite.fs.constants import get_fs_multivariate_methods, is_valid_multivariate_method
+from fslite.fs.constants import (
+    get_fs_multivariate_methods,
+    is_valid_multivariate_method,
+)
 from fslite.fs.fdataframe import FSDataFrame
 from fslite.fs.methods import FSMethod, InvalidMethodError
 from fslite.fs.utils import find_maximal_independent_set
@@ -13,6 +16,7 @@
 logger = logging.getLogger("FS:MULTIVARIATE")
 logger.setLevel(logging.INFO)
 
+
 class FSMultivariate(FSMethod):
     """
     The FSMultivariate class is a subclass of the FSMethod class and is used for multivariate
@@ -56,6 +60,7 @@ def validate_method(self, multivariate_method: str):
                 f"Invalid multivariate method: "
                 f"{multivariate_method}. Accepted methods are {', '.join(self.valid_methods)}"
             )
+
     def select_features(self, fsdf: FSDataFrame):
         """
         Select features using the specified multivariate method.
@@ -100,10 +105,10 @@ def __repr__(self):
 
 
 def multivariate_correlation_selector(
-        fsdf: FSDataFrame,
-        strict: bool = True,
-        corr_threshold: float = 0.75,
-        corr_method: str = "pearson",
+    fsdf: FSDataFrame,
+    strict: bool = True,
+    corr_threshold: float = 0.75,
+    corr_method: str = "pearson",
 ) -> List[str]:
     """
     Compute the correlation matrix among input features and select those below a specified threshold.
@@ -128,7 +133,9 @@ def multivariate_correlation_selector(
     elif corr_method == "spearman":
         corr_matrix, _ = spearmanr(matrix)
     else:
-        raise ValueError(f"Unsupported correlation method '{corr_method}'. Use 'pearson' or 'spearman'.")
+        raise ValueError(
+            f"Unsupported correlation method '{corr_method}'. Use 'pearson' or 'spearman'."
+        )
 
     # Get absolute values of correlations to check magnitude
     corr_matrix = np.abs(corr_matrix)