Skip to content

Commit

Permalink
added file for experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Sep 23, 2024
1 parent c657be9 commit c46167c
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 22 deletions.
4 changes: 4 additions & 0 deletions docs/EXPERIMENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
## Experiments and Benchmarks

This document describes the experiments and benchmarks conducted to evaluate the performance of fslite.
The experiments were run on the following datasets:
11 changes: 11 additions & 0 deletions fslite/fs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
FS_METHODS = {
"univariate": {
"title": "Univariate Feature Selection",
"description": "Univariate feature selection refers to the process of selecting the most relevant features for "
"a machine learning model by evaluating each feature individually with respect to the target "
"variable using univariate statistical tests. It simplifies the feature selection process by "
"treating each feature independently and assessing its contribution to the predictive "
"performance of the model.",
"methods": [
{
"name": "anova",
Expand All @@ -18,13 +23,19 @@
},
"multivariate": {
"title": "Multivariate Feature Selection",
"description": "Multivariate feature selection is a method of selecting features by evaluating them in "
"combination rather than individually. Unlike univariate feature selection, which treats each "
"feature separately, multivariate feature selection considers the relationships and interactions "
"between multiple features and the target variable. This method aims to identify a subset of "
"features that work well together to improve the performance of a machine learning model.",
"methods": [
{"name": "m_corr", "description": "Multivariate Correlation"},
{"name": "variance", "description": "Multivariate Variance"},
],
},
"ml": {
"title": "Machine Learning Wrapper",
"description": "Machine learning wrapper methods are feature selection techniques that use a machine learning "
"model to rank features (e.g., by feature importance) and to make predictions.",
"methods": [
{"name": "rf_binary", "description": "Random Forest Binary Classifier"},
{"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"},
Expand Down
37 changes: 21 additions & 16 deletions fslite/fs/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
for feature selection (e.g., rank by feature importance) and prediction.
"""

from typing import Union, Optional, Dict, Any, List

from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method
Expand Down Expand Up @@ -247,6 +248,7 @@ def __str__(self):
def __repr__(self):
return self.__str__()


class MLCVModel:
"""
A factory class for creating various machine learning models with scikit-learn.
Expand All @@ -264,7 +266,7 @@ def __init__(
RandomForestRegressor,
LinearSVC,
LogisticRegression,
SVC
SVC,
],
scoring: str,
estimator_params: Optional[Dict[str, Any]] = None,
Expand All @@ -291,7 +293,7 @@ def _initialize_model(self):
estimator=self.estimator,
param_grid=self.grid_params,
scoring=self.scoring,
cv=self.cv
cv=self.cv,
)

def fit(self, fsdf: FSDataFrame) -> "MLCVModel":
Expand Down Expand Up @@ -319,15 +321,18 @@ def get_feature_scores(self) -> pd.DataFrame:
"""
Get feature importance scores from the best model.
"""
if not isinstance(self._best_model, (RandomForestClassifier, RandomForestRegressor)):
raise ValueError("Feature importance is only available for tree-based models.")
if not isinstance(
self._best_model, (RandomForestClassifier, RandomForestRegressor)
):
raise ValueError(
"Feature importance is only available for tree-based models."
)

features = self._fsdf.get_feature_names()
importances = self._best_model.feature_importances_
df = pd.DataFrame({
'feature': features,
'importance': importances
}).sort_values(by='importance', ascending=False)
df = pd.DataFrame({"feature": features, "importance": importances}).sort_values(
by="importance", ascending=False
)

return df

Expand All @@ -338,11 +343,11 @@ def get_eval_metric_on_training(self) -> float:
X_train, y_train = self._fsdf.get_features_and_labels()
y_pred = self._best_model.predict(X_train)

if self.scoring == 'accuracy':
if self.scoring == "accuracy":
return accuracy_score(y_train, y_pred)
elif self.scoring == 'f1':
elif self.scoring == "f1":
return f1_score(y_train, y_pred)
elif self.scoring == 'roc_auc':
elif self.scoring == "roc_auc":
return roc_auc_score(y_train, y_pred)
else:
raise ValueError("Unsupported scoring method.")
Expand All @@ -354,11 +359,11 @@ def get_eval_metric_on_testing(self, test_data: FSDataFrame) -> float:
X_test, y_test = test_data.get_features_and_labels()
y_pred = self._best_model.predict(X_test)

if self.scoring == 'accuracy':
if self.scoring == "accuracy":
return accuracy_score(y_test, y_pred)
elif self.scoring == 'f1':
elif self.scoring == "f1":
return f1_score(y_test, y_pred)
elif self.scoring == 'roc_auc':
elif self.scoring == "roc_auc":
return roc_auc_score(y_test, y_pred)
else:
raise ValueError("Unsupported scoring method.")
Expand All @@ -368,8 +373,8 @@ def create_model(
model_type: str,
estimator_params: Dict[str, Any] = None,
grid_params: Dict[str, List[Any]] = None,
scoring: str = 'accuracy',
cv: int = 5
scoring: str = "accuracy",
cv: int = 5,
) -> "MLCVModel":
"""
Create an ML model based on the model type.
Expand Down
19 changes: 13 additions & 6 deletions fslite/fs/multivariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
import numpy as np
from scipy.stats import spearmanr

from fslite.fs.constants import get_fs_multivariate_methods, is_valid_multivariate_method
from fslite.fs.constants import (
get_fs_multivariate_methods,
is_valid_multivariate_method,
)
from fslite.fs.fdataframe import FSDataFrame
from fslite.fs.methods import FSMethod, InvalidMethodError
from fslite.fs.utils import find_maximal_independent_set
Expand All @@ -13,6 +16,7 @@
logger = logging.getLogger("FS:MULTIVARIATE")
logger.setLevel(logging.INFO)


class FSMultivariate(FSMethod):
"""
The FSMultivariate class is a subclass of the FSMethod class and is used for multivariate
Expand Down Expand Up @@ -56,6 +60,7 @@ def validate_method(self, multivariate_method: str):
f"Invalid multivariate method: "
f"{multivariate_method}. Accepted methods are {', '.join(self.valid_methods)}"
)

def select_features(self, fsdf: FSDataFrame):
"""
Select features using the specified multivariate method.
Expand Down Expand Up @@ -100,10 +105,10 @@ def __repr__(self):


def multivariate_correlation_selector(
fsdf: FSDataFrame,
strict: bool = True,
corr_threshold: float = 0.75,
corr_method: str = "pearson",
fsdf: FSDataFrame,
strict: bool = True,
corr_threshold: float = 0.75,
corr_method: str = "pearson",
) -> List[str]:
"""
Compute the correlation matrix among input features and select those below a specified threshold.
Expand All @@ -128,7 +133,9 @@ def multivariate_correlation_selector(
elif corr_method == "spearman":
corr_matrix, _ = spearmanr(matrix)
else:
raise ValueError(f"Unsupported correlation method '{corr_method}'. Use 'pearson' or 'spearman'.")
raise ValueError(
f"Unsupported correlation method '{corr_method}'. Use 'pearson' or 'spearman'."
)

# Get absolute values of correlations to check magnitude
corr_matrix = np.abs(corr_matrix)
Expand Down

0 comments on commit c46167c

Please sign in to comment.