Commit

black applied

ypriverol committed Sep 22, 2024
1 parent a69ac12 commit 3f56ded
Showing 18 changed files with 603 additions and 445 deletions.
96 changes: 43 additions & 53 deletions fslite/fs/constants.py
@@ -1,72 +1,52 @@
 """
 This file contains a list of constants used in the feature selection and machine learning methods.
 """
 
 from typing import Dict, List, Union
 
 FS_METHODS = {
-    'univariate': {
-        "title": 'Univariate Feature Selection',
+    "univariate": {
+        "title": "Univariate Feature Selection",
         "methods": [
             {
-                'name': 'anova',
-                'description': 'Univariate ANOVA feature selection (f-classification)'
-            },
-            {
-                'name': 'u_corr',
-                'description': 'Univariate correlation'
+                "name": "anova",
+                "description": "Univariate ANOVA feature selection (f-classification)",
             },
-            {
-                'name': 'f_regression',
-                'description': 'Univariate f-regression'
-            }
-        ]
+            {"name": "u_corr", "description": "Univariate correlation"},
+            {"name": "f_regression", "description": "Univariate f-regression"},
+        ],
     },
-    'multivariate': {
-        "title": 'Multivariate Feature Selection',
+    "multivariate": {
+        "title": "Multivariate Feature Selection",
         "methods": [
-            {
-                'name': 'm_corr',
-                'description': 'Multivariate Correlation'
-            },
-            {
-                'name': 'variance',
-                'description': 'Multivariate Variance'
-            }
-        ]
+            {"name": "m_corr", "description": "Multivariate Correlation"},
+            {"name": "variance", "description": "Multivariate Variance"},
+        ],
     },
-    'ml': {
-        "title": 'Machine Learning Wrapper',
+    "ml": {
+        "title": "Machine Learning Wrapper",
         "methods": [
+            {"name": "rf_binary", "description": "Random Forest Binary Classifier"},
+            {"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"},
             {
-                'name': 'rf_binary',
-                'description': 'Random Forest Binary Classifier'
-            },
-            {
-                'name': 'lsvc_binary',
-                'description': 'Linear SVC Binary Classifier'
+                "name": "fm_binary",
+                "description": "Factorization Machine Binary Classifier",
             },
             {
-                'name': 'fm_binary',
-                'description': 'Factorization Machine Binary Classifier'
+                "name": "rf_multilabel",
+                "description": "Random Forest Multi-label Classifier",
             },
             {
-                'name': 'rf_multilabel',
-                'description': 'Random Forest Multi-label Classifier'
+                "name": "lg_multilabel",
+                "description": "Logistic Regression Multi-label Classifier",
             },
+            {"name": "rf_regression", "description": "Random Forest Regression"},
             {
-                'name': 'lg_multilabel',
-                'description': 'Logistic Regression Multi-label Classifier'
+                "name": "fm_regression",
+                "description": "Factorization Machine Regression",
             },
-            {
-                'name': 'rf_regression',
-                'description': 'Random Forest Regression'
-            },
-            {
-                'name': 'fm_regression',
-                'description': 'Factorization Machine Regression'
-            }
-        ]
-    }
+        ],
+    },
 }


@@ -77,6 +57,7 @@ def get_fs_methods():
     """
     return FS_METHODS
 
+
 def get_fs_method_details(method_name: str) -> Union[Dict, None]:
     """
     Get the details of the feature selection method; this function searches across all method definitions
@@ -87,19 +68,19 @@ def get_fs_method_details(method_name: str) -> Union[Dict, None]:
     """
 
     for method_type in FS_METHODS:
-        for method in FS_METHODS[method_type]['methods']:
-            if method['name'].lower() == method_name.lower():
+        for method in FS_METHODS[method_type]["methods"]:
+            if method["name"].lower() == method_name.lower():
                 return method
     return None
 
 
 def get_fs_univariate_methods() -> List:
     """
     Get the list of univariate methods implemented in the library
     :return: list
     """
-    univariate_methods = FS_METHODS['univariate']
-    univariate_names = [method["name"] for method in univariate_methods["methods"]]
-    return univariate_names
+    return get_fs_method_by_class("univariate")
 
 
 def is_valid_univariate_method(method_name: str) -> bool:
     """
@@ -113,3 +94,12 @@ def is_valid_univariate_method(method_name: str) -> bool:
     return False
 
 
+def get_fs_method_by_class(fs_class: str) -> List:
+    """
+    Get the FS methods supported for a given FS class, for example, univariate
+    :param fs_class: FS class name
+    :return: list of FS method names
+    """
+    fs_methods = FS_METHODS[fs_class]
+    fs_names = [method["name"] for method in fs_methods["methods"]]
+    return fs_names
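
Taken together, these helpers form a small query API over FS_METHODS. A minimal doctest-style sketch of the expected behavior (the outputs follow directly from the dictionary defined above, and the import path matches the file header):

>>> from fslite.fs.constants import get_fs_method_details, get_fs_method_by_class
>>> get_fs_method_details("ANOVA")["description"]
'Univariate ANOVA feature selection (f-classification)'
>>> get_fs_method_by_class("univariate")
['anova', 'u_corr', 'f_regression']
>>> get_fs_method_details("no_such_method") is None
True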
74 changes: 48 additions & 26 deletions fslite/fs/fdataframe.py
@@ -7,7 +7,13 @@
 import psutil
 from pandas import DataFrame
 from scipy import sparse
-from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler, LabelEncoder
+from sklearn.preprocessing import (
+    MinMaxScaler,
+    MaxAbsScaler,
+    StandardScaler,
+    RobustScaler,
+    LabelEncoder,
+)
 
 logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
 logger = logging.getLogger("pickfeat")
@@ -30,13 +36,16 @@ class FSDataFrame:
     [...]
     """
 
     def __init__(
-            self,
-            df: pd.DataFrame,
-            sample_col: Optional[str] = None,
-            label_col: Optional[str] = None,
-            sparse_threshold: float = 0.7,  # Threshold for sparsity
-            memory_threshold: Optional[float] = 0.75  # Proportion of system memory to use for dense arrays
+        self,
+        df: pd.DataFrame,
+        sample_col: Optional[str] = None,
+        label_col: Optional[str] = None,
+        sparse_threshold: float = 0.7,  # Threshold for sparsity
+        memory_threshold: Optional[
+            float
+        ] = 0.75,  # Proportion of system memory to use for dense arrays
     ):
         """
         Create an instance of FSDataFrame.
@@ -60,7 +69,9 @@ def __init__(
         # Handle sample column
         if sample_col:
             if sample_col not in df.columns:
-                raise ValueError(f"Sample column '{sample_col}' not found in DataFrame.")
+                raise ValueError(
+                    f"Sample column '{sample_col}' not found in DataFrame."
+                )
             self.__sample_col = sample_col
             self.__samples = df[sample_col].tolist()
             columns_to_drop.append(sample_col)
@@ -105,19 +116,27 @@ def __init__(
         if sparsity > sparse_threshold:
             if dense_matrix_size < memory_threshold * available_memory:
                 # Use dense matrix if enough memory is available
-                logging.info(f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. "
-                             f"Using a dense matrix.")
+                logging.info(
+                    f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. "
+                    f"Using a dense matrix."
+                )
                 self.__matrix = numerical_df.to_numpy(dtype=np.float32)
                 self.__is_sparse = False
             else:
                 # Use sparse matrix due to memory constraints
-                logging.info(f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. "
-                             f"Using a sparse matrix representation.")
-                self.__matrix = sparse.csr_matrix(numerical_df.to_numpy(dtype=np.float32))
+                logging.info(
+                    f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. "
+                    f"Using a sparse matrix representation."
+                )
+                self.__matrix = sparse.csr_matrix(
+                    numerical_df.to_numpy(dtype=np.float32)
+                )
                 self.__is_sparse = True
         else:
             # Use dense matrix since it's not sparse
-            logging.info(f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix.")
+            logging.info(
+                f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix."
+            )
             self.__matrix = numerical_df.to_numpy(dtype=np.float32)
             self.__is_sparse = False
 
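This hunk carries the constructor's core storage decision: measure the fraction of zero entries, and fall back to a SciPy CSR matrix only when the data is mostly zeros and a dense float32 array would not fit in the allowed share of available memory. A condensed, standalone sketch of the same rule follows; the choose_matrix helper is hypothetical and simplified from the constructor logic above:

import numpy as np
import psutil
from scipy import sparse


def choose_matrix(numerical_df, sparse_threshold=0.7, memory_threshold=0.75):
    """Pick a dense or CSR representation via the density/memory heuristic."""
    values = numerical_df.to_numpy(dtype=np.float32)
    # Fraction of entries that are exactly zero.
    sparsity = 1.0 - np.count_nonzero(values) / values.size
    available_memory = psutil.virtual_memory().available
    if sparsity > sparse_threshold and values.nbytes >= memory_threshold * available_memory:
        # Mostly zeros and too large to hold densely: store as CSR.
        return sparse.csr_matrix(values), True
    # Otherwise a dense array is affordable (or the data is not sparse).
    return values, False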
@@ -159,24 +178,26 @@ def count_instances(self) -> int:
         """
         return self.__matrix.shape[0]
 
-    def scale_features(self, scaler_method: str = 'standard', **kwargs) -> bool:
+    def scale_features(self, scaler_method: str = "standard", **kwargs) -> bool:
         """
         Scales features in the FSDataFrame using the specified method.
         :param scaler_method: One of: min_max, max_abs, standard or robust.
         :return: True if the features were scaled.
         """
 
-        if scaler_method == 'min_max':
+        if scaler_method == "min_max":
             scaler = MinMaxScaler(**kwargs)
-        elif scaler_method == 'max_abs':
+        elif scaler_method == "max_abs":
             scaler = MaxAbsScaler(**kwargs)
-        elif scaler_method == 'standard':
+        elif scaler_method == "standard":
             scaler = StandardScaler(**kwargs)
-        elif scaler_method == 'robust':
+        elif scaler_method == "robust":
             scaler = RobustScaler(**kwargs)
         else:
-            raise ValueError("`scaler_method` must be one of: min_max, max_abs, standard or robust.")
+            raise ValueError(
+                "`scaler_method` must be one of: min_max, max_abs, standard or robust."
+            )
 
         # TODO: Scale only the features for now, we have to investigate if we scale categorical variables
         self.__matrix = scaler.fit_transform(self.__matrix)
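
A short usage sketch for the scaler dispatch above; the toy frame and column names are hypothetical, and the constructor arguments follow the __init__ signature earlier in this file:

import pandas as pd

from fslite.fs.fdataframe import FSDataFrame

# Hypothetical toy input: one sample column, one label column, two numeric features.
df = pd.DataFrame({
    "sample_id": ["s1", "s2", "s3"],
    "label": ["a", "b", "a"],
    "feat1": [1.0, 2.0, 3.0],
    "feat2": [10.0, 20.0, 30.0],
})

fsdf = FSDataFrame(df, sample_col="sample_id", label_col="label")
fsdf.scale_features(scaler_method="standard")  # also accepts: min_max, max_abs, robust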
@@ -192,7 +213,7 @@ def get_scaled_method(self):
     def is_sparse(self):
         return self.__is_sparse
 
-    def select_features_by_index(self, feature_indexes: List[int]) -> 'FSDataFrame':
+    def select_features_by_index(self, feature_indexes: List[int]) -> "FSDataFrame":
         """
         Keep only the specified features (by index) and return an updated instance of FSDataFrame.
@@ -216,7 +237,9 @@ def select_features_by_index(self, feature_indexes: List[int]) -> 'FSDataFrame':
         updated_df[self.__label_col] = self.__labels
 
         # Return a new instance of FSDataFrame with the updated data
-        return FSDataFrame(updated_df, sample_col=self.__sample_col, label_col=self.__label_col)
+        return FSDataFrame(
+            updated_df, sample_col=self.__sample_col, label_col=self.__label_col
+        )
 
     def to_pandas(self) -> DataFrame:
         """
@@ -241,9 +264,9 @@ def to_pandas(self) -> DataFrame:
 
         return df
 
-    def split_df(self,
-                 label_type_cat: bool = True,
-                 split_training_factor: float = 0.7) -> Tuple['FSDataFrame', 'FSDataFrame']:
+    def split_df(
+        self, label_type_cat: bool = True, split_training_factor: float = 0.7
+    ) -> Tuple["FSDataFrame", "FSDataFrame"]:
         """
         Split DataFrame into training and test dataset.
         It will generate a nearly class-balanced training
@@ -284,4 +307,3 @@ def split_df(self,
         #
         #     # Return the updated DataFrames
         #     return self.update(train_df), self.update(test_df)
-
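The visible tail of split_df is commented out, so the following call is only a sketch of the intended interface per the signature and docstring above, reusing the fsdf instance from the earlier sketch:

train_fsdf, test_fsdf = fsdf.split_df(label_type_cat=True, split_training_factor=0.7)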
[Diffs for the remaining 16 changed files are not shown.]
