Skip to content

Commit

Permalink
delete ML methods
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Sep 25, 2024
1 parent d5cc974 commit 7ee27c8
Show file tree
Hide file tree
Showing 11 changed files with 163 additions and 237 deletions.
48 changes: 26 additions & 22 deletions examples/loom2parquetchunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,14 @@
development_day = ds.ca["Development_day"]

# make a dataframe with the sample metadata, define the columns types
sample_df = pd.DataFrame({"sample_id": sample_id,
"cell_cluster": cell_cluster,
"assay": assay,
"development_day": development_day})
sample_df = pd.DataFrame(
{
"sample_id": sample_id,
"cell_cluster": cell_cluster,
"assay": assay,
"development_day": development_day,
}
)

# print the first 5 rows
sample_df.head()
Expand All @@ -55,43 +59,41 @@
sample_df.head()

# Save the sample metadata to parquet
(sample_df
.reset_index()
.to_parquet("sample_metadata.parquet",
index=False,
engine="auto",
compression="gzip")
)
(
sample_df.reset_index().to_parquet(
"sample_metadata.parquet", index=False, engine="auto", compression="gzip"
)
)


# transpose dataset and convert to parquet.
# process the data in chunks.
chunk_size = 50000
number_chunks = 50 # Number of chunks to process, if None, all chunks are processed
number_chunks = 50 # Number of chunks to process, if None, all chunks are processed
count = 0
for (ix, selection, view) in ds.scan(axis=1, batch_size=chunk_size):
for ix, selection, view in ds.scan(axis=1, batch_size=chunk_size):
# retrieve the chunk
matrix_chunk = view[:, :]

# transpose the data
matrix_chunk_t = matrix_chunk.T

# convert to pandas dataframe
df_chunk = pd.DataFrame(matrix_chunk_t,
index=sample_id[selection.tolist()],
columns=gene_ids)
df_chunk = pd.DataFrame(
matrix_chunk_t, index=sample_id[selection.tolist()], columns=gene_ids
)

# merge chunk with sample metadata
df_chunk = pd.merge(
left=sample_df[['cell_cluster_id', 'development_day', 'assay_id']],
left=sample_df[["cell_cluster_id", "development_day", "assay_id"]],
right=df_chunk,
how="inner",
left_index=True,
right_index=True,
sort=False,
copy=True,
indicator=False,
validate="one_to_one"
validate="one_to_one",
)

# reset the index
Expand All @@ -101,10 +103,12 @@
df_chunk = df_chunk.rename(columns={"index": "sample_id"})

# save the chunk to parquet
df_chunk.to_parquet(f"gene_count_chunk_{ix}.parquet",
index=False,
engine="pyarrow",
compression="gzip")
df_chunk.to_parquet(
f"gene_count_chunk_{ix}.parquet",
index=False,
engine="pyarrow",
compression="gzip",
)

print(f"Chunk {ix} saved")
count = count + 1
Expand Down
52 changes: 37 additions & 15 deletions fslite/fs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,34 @@
"univariate": {
"title": "Univariate Feature Selection",
"description": "Univariate feature selection refers to the process of selecting the most relevant features for "
"a machine learning model by evaluating each feature individually with respect to the target "
"variable using univariate statistical tests. It simplifies the feature selection process by "
"treating each feature independently and assessing its contribution to the predictive "
"performance of the model.",
"a machine learning model by evaluating each feature individually with respect to the target "
"variable using univariate statistical tests. It simplifies the feature selection process by "
"treating each feature independently and assessing its contribution to the predictive "
"performance of the model.",
"methods": [
{"name": "anova", "description": "Univariate ANOVA feature selection (f-classification)"},
{
"name": "anova",
"description": "Univariate ANOVA feature selection (f-classification)",
},
{"name": "u_corr", "description": "Univariate Pearson's correlation"},
{"name": "f_regression", "description": "Univariate f-regression"},
{"name": "mutual_info_regression", "description": "Univariate mutual information regression"},
{"name": "mutual_info_classification", "description": "Univariate mutual information classification"},
{
"name": "mutual_info_regression",
"description": "Univariate mutual information regression",
},
{
"name": "mutual_info_classification",
"description": "Univariate mutual information classification",
},
],
},
"multivariate": {
"title": "Multivariate Feature Selection",
"description": "Multivariate feature selection is a method of selecting features by evaluating them in "
"combination rather than individually. Unlike univariate feature selection, which treats each "
"feature separately, multivariate feature selection considers the relationships and interactions "
"between multiple features and the target variable. This method aims to identify a subset of "
"features that work well together to improve the performance of a machine learning model.",
"combination rather than individually. Unlike univariate feature selection, which treats each "
"feature separately, multivariate feature selection considers the relationships and interactions "
"between multiple features and the target variable. This method aims to identify a subset of "
"features that work well together to improve the performance of a machine learning model.",
"methods": [
{"name": "m_corr", "description": "Multivariate Correlation"},
{"name": "variance", "description": "Multivariate Variance"},
Expand All @@ -38,15 +47,28 @@
"methods": [
{"name": "rf_binary", "description": "Random Forest Binary Classifier"},
{"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"},
{"name": "fm_binary", "description": "Factorization Machine Binary Classifier"},
{"name": "rf_multilabel", "description": "Random Forest Multi-label Classifier"},
{"name": "lg_multilabel","description": "Logistic Regression Multi-label Classifier"},
{
"name": "fm_binary",
"description": "Factorization Machine Binary Classifier",
},
{
"name": "rf_multilabel",
"description": "Random Forest Multi-label Classifier",
},
{
"name": "lg_multilabel",
"description": "Logistic Regression Multi-label Classifier",
},
{"name": "rf_regression", "description": "Random Forest Regression"},
{"name": "fm_regression","description": "Factorization Machine Regression"},
{
"name": "fm_regression",
"description": "Factorization Machine Regression",
},
],
},
}


def get_fs_methods():
"""
Get the list of feature selection methods
Expand Down
133 changes: 9 additions & 124 deletions fslite/fs/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,134 +60,19 @@ def set_params(self, **kwargs):
self.kwargs.update(kwargs)


# class FSPipeline:
# """
# The FSPipeline class creates a pipeline of feature selection methods. It provides a way to
# chain multiple feature selection methods together to create a pipeline of feature selection methods.
#
# Example Usage
# -------------
# # Create an instance of FSPipeline with the specified feature selection methods
# fs_pipeline = FSPipeline(fs_methods=[FSUnivariate('anova'), FSMultivariate('m_corr')])
#
# # Select features using the pipeline
# selected_features = fs_pipeline.select_features(fsdf)
# """
#
# _valid_methods: List[Type[Union[FSUnivariate, FSMultivariate, FSMLMethod]]] = [
# FSUnivariate,
# FSMultivariate,
# FSMLMethod,
# ]
#
# def __init__(
# self,
# df_training: FSDataFrame,
# df_testing: Optional[FSDataFrame],
# fs_stages: List[Union[FSUnivariate, FSMultivariate, FSMLMethod]],
# ):
# """
# Initialize the feature selection pipeline with the specified feature selection methods.
#
# Parameters:
# df_training: The training data frame on which the feature selection pipeline is to be run.
# df_testing: The testing data frame on which the ML wrapper method (if any) is to be evaluated.
# fs_stages: A list of feature selection methods to be used in the pipeline.
# """
#
# self.df_training = df_training
# self.df_testing = df_testing
# self.fs_stages = fs_stages
# self.validate_methods()
#
# self.pipeline_results = {}
#
# def validate_methods(self):
# """
# Validate the feature selection methods in the pipeline.
# """
# # check if the pipeline contains at least one feature selection method
# if len(self.fs_stages) == 0:
# raise ValueError(
# "The pipeline must contain at least one feature selection method."
# )
#
# # check if the feature selection methods are valid
# if not all(
# isinstance(method, tuple(self._valid_methods)) for method in self.fs_stages
# ):
# raise InvalidMethodError(
# f"Invalid feature selection method. "
# f"Accepted methods are {', '.join([str(m) for m in self._valid_methods])}"
# )
#
# # check if only one ML method is used in the pipeline
# ml_methods = [
# method for method in self.fs_stages if isinstance(method, FSMLMethod)
# ]
# if len(ml_methods) > 1:
# raise ValueError("Only one ML method is allowed in the pipeline.")
#
# def run(self) -> Dict[str, Any]:
# """
# Run the feature selection pipeline.
#
# Returns:
# A dictionary with the results of the feature selection pipeline.
# """
#
# # apply each feature selection method in the pipeline sequentially
# n_stages = len(self.fs_stages)
# fsdf_tmp = self.df_training
#
# self.pipeline_results.update(n_stages=n_stages)
#
# for i, method in enumerate(self.fs_stages):
# print(
# f"Running stage {i + 1} of {n_stages} of the feature selection pipeline: {method}"
# )
# if isinstance(method, FSMLMethod):
#
# fsdf_tmp = method.select_features(fsdf_tmp)
#
# # collect the results during the feature selection process (rfe iterations, feature scores, etc.)
# self.pipeline_results.update(rfe_iterations=method.rfe_iterations)
# self.pipeline_results.update(feature_scores=method.get_feature_scores())
# self.pipeline_results.update(eval_metric=method.get_eval_metric_name())
# self.pipeline_results.update(
# rfe_training_metric=method.get_eval_metric_on_training_rfe()
# )
# self.pipeline_results.update(
# training_metric=method.get_eval_metric_on_training()
# )
#
# if self.df_testing is not None:
#
# # evaluate the final model on the testing data (if available)
# testing_metric = method.get_eval_metric_on_testing(self.df_testing)
# self.pipeline_results.update(testing_metric=testing_metric)
#
# else:
# fsdf_tmp = method.select_features(fsdf_tmp)
#
# self.pipeline_results.update(
# n_initial_features=self.df_training.count_features()
# )
# self.pipeline_results.update(n_selected_features=fsdf_tmp.count_features())
#
# return self.pipeline_results
#
# def __str__(self):
# return f"FSPipeline(fs_methods={self.fs_stages})"
#
# def __repr__(self):
# return self.__str__()


class InvalidMethodError(ValueError):
"""
Error raised when an invalid feature selection method is used.
"""

def __init__(self, message):
super().__init__(f"Invalid feature selection method: {message}")


class InvalidDataError(ValueError):
"""
Error raised when an invalid data frame is used.
"""

def __init__(self, message):
super().__init__(f"Invalid data frame: {message}")
2 changes: 1 addition & 1 deletion fslite/fs/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method
from fslite.fs.fdataframe import FSDataFrame
from fslite.fs.methods import FSMethod, InvalidMethodError
from fslite.fs.methods import FSMethod, InvalidMethodError, InvalidDataError
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
Expand Down
32 changes: 16 additions & 16 deletions fslite/fs/multivariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,9 @@ def select_features(self, fsdf: FSDataFrame):
fsdf, multivariate_method=self.fs_method, **self.kwargs
)

def multivariate_filter(self,
fsdf: FSDataFrame,
multivariate_method: str = "m_corr",
**kwargs
) -> FSDataFrame:
def multivariate_filter(
self, fsdf: FSDataFrame, multivariate_method: str = "m_corr", **kwargs
) -> FSDataFrame:
"""
Filter features after applying a multivariate feature selector method.
Expand Down Expand Up @@ -107,10 +105,10 @@ def __repr__(self):


def multivariate_correlation_selector(
fsdf: FSDataFrame,
selection_mode: str = 'strict',
selection_threshold: float = 0.75,
corr_method: str = "pearson",
fsdf: FSDataFrame,
selection_mode: str = "strict",
selection_threshold: float = 0.75,
corr_method: str = "pearson",
) -> List[int]:
"""
Compute the correlation matrix among input features and select those below a specified threshold.
Expand Down Expand Up @@ -173,9 +171,9 @@ def multivariate_correlation_selector(
return selected_features


def multivariate_variance_selector(fsdf: FSDataFrame,
selection_mode: str = "k_best",
selection_threshold: float = 0.0) -> List[int]:
def multivariate_variance_selector(
fsdf: FSDataFrame, selection_mode: str = "k_best", selection_threshold: float = 0.0
) -> List[int]:
"""
Filter features based on variance threshold.
Expand All @@ -189,7 +187,7 @@ def multivariate_variance_selector(fsdf: FSDataFrame,
:return: List of selected feature indices
"""

# Retrieve the feature matrix
f_matrix = fsdf.get_feature_matrix()

Expand All @@ -212,8 +210,10 @@ def multivariate_variance_selector(fsdf: FSDataFrame,
raise ValueError(
f"Unsupported selection mode '{selection_mode}'. Use 'percentile' or 'k_best'."
)

logger.info(f"Feature selection mode: {selection_mode}. \n"
f"Number of features selected: {len(selected_features)}")

logger.info(
f"Feature selection mode: {selection_mode}. \n"
f"Number of features selected: {len(selected_features)}"
)

return list(selected_features)
Loading

0 comments on commit 7ee27c8

Please sign in to comment.