Commit

black applied

ypriverol committed Sep 22, 2024
1 parent a69ac12 commit 3f56ded
Showing 18 changed files with 603 additions and 445 deletions.
96 changes: 43 additions & 53 deletions fslite/fs/constants.py
@@ -1,72 +1,52 @@
 """
 This file contains a list of constants used in the feature selection and machine learning methods.
 """
 
 from typing import Dict, List, Union
 
 FS_METHODS = {
-    'univariate': {
-        "title": 'Univariate Feature Selection',
+    "univariate": {
+        "title": "Univariate Feature Selection",
         "methods": [
             {
-                'name': 'anova',
-                'description': 'Univariate ANOVA feature selection (f-classification)'
-            },
-            {
-                'name': 'u_corr',
-                'description': 'Univariate correlation'
+                "name": "anova",
+                "description": "Univariate ANOVA feature selection (f-classification)",
             },
-            {
-                'name': 'f_regression',
-                'description': 'Univariate f-regression'
-            }
-        ]
+            {"name": "u_corr", "description": "Univariate correlation"},
+            {"name": "f_regression", "description": "Univariate f-regression"},
+        ],
     },
-    'multivariate': {
-        "title": 'Multivariate Feature Selection',
+    "multivariate": {
+        "title": "Multivariate Feature Selection",
         "methods": [
-            {
-                'name': 'm_corr',
-                'description': 'Multivariate Correlation'
-            },
-            {
-                'name': 'variance',
-                'description': 'Multivariate Variance'
-            }
-        ]
+            {"name": "m_corr", "description": "Multivariate Correlation"},
+            {"name": "variance", "description": "Multivariate Variance"},
+        ],
     },
-    'ml': {
-        "title": 'Machine Learning Wrapper',
+    "ml": {
+        "title": "Machine Learning Wrapper",
         "methods": [
+            {"name": "rf_binary", "description": "Random Forest Binary Classifier"},
+            {"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"},
             {
-                'name': 'rf_binary',
-                'description': 'Random Forest Binary Classifier'
-            },
-            {
-                'name': 'lsvc_binary',
-                'description': 'Linear SVC Binary Classifier'
+                "name": "fm_binary",
+                "description": "Factorization Machine Binary Classifier",
             },
             {
-                'name': 'fm_binary',
-                'description': 'Factorization Machine Binary Classifier'
+                "name": "rf_multilabel",
+                "description": "Random Forest Multi-label Classifier",
             },
             {
-                'name': 'rf_multilabel',
-                'description': 'Random Forest Multi-label Classifier'
+                "name": "lg_multilabel",
+                "description": "Logistic Regression Multi-label Classifier",
             },
+            {"name": "rf_regression", "description": "Random Forest Regression"},
             {
-                'name': 'lg_multilabel',
-                'description': 'Logistic Regression Multi-label Classifier'
+                "name": "fm_regression",
+                "description": "Factorization Machine Regression",
             },
-            {
-                'name': 'rf_regression',
-                'description': 'Random Forest Regression'
-            },
-            {
-                'name': 'fm_regression',
-                'description': 'Factorization Machine Regression'
-            }
-        ]
-    }
+        ],
+    },
 }


@@ -77,6 +57,7 @@ def get_fs_methods():
     """
     return FS_METHODS
 
+
 def get_fs_method_details(method_name: str) -> Union[Dict, None]:
     """
     Get the details of the feature selection method; this function searches across all method definitions
@@ -87,19 +68,19 @@ def get_fs_method_details(method_name: str) -> Union[Dict, None]:
     """
 
     for method_type in FS_METHODS:
-        for method in FS_METHODS[method_type]['methods']:
-            if method['name'].lower() == method_name.lower():
+        for method in FS_METHODS[method_type]["methods"]:
+            if method["name"].lower() == method_name.lower():
                 return method
     return None
 
 
 def get_fs_univariate_methods() -> List:
     """
     Get the list of univariate methods implemented in the library
     :return: list
     """
-    univariate_methods = FS_METHODS['univariate']
-    univariate_names = [method["name"] for method in univariate_methods["methods"]]
-    return univariate_names
+    return get_fs_method_by_class("univariate")
 
 
 def is_valid_univariate_method(method_name: str) -> bool:
     """
@@ -113,3 +94,12 @@ def is_valid_univariate_method(method_name: str) -> bool:
     return False
 
 
+def get_fs_method_by_class(fs_class: str) -> List:
+    """
+    Get the FS methods supported for a given FS class, for example, univariate
+    :param fs_class: FS class name
+    :return: list of FS method names
+    """
+    fs_methods = FS_METHODS[fs_class]
+    fs_names = [method["name"] for method in fs_methods["methods"]]
+    return fs_names
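
Taken together, these helpers form a small query API over FS_METHODS. A minimal doctest-style sketch of the expected behavior (the outputs follow directly from the dictionary defined above, and the import path matches the file header):

>>> from fslite.fs.constants import get_fs_method_details, get_fs_method_by_class
>>> get_fs_method_details("ANOVA")["description"]
'Univariate ANOVA feature selection (f-classification)'
>>> get_fs_method_by_class("univariate")
['anova', 'u_corr', 'f_regression']
>>> get_fs_method_details("no_such_method") is None
True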
74 changes: 48 additions & 26 deletions fslite/fs/fdataframe.py
@@ -7,7 +7,13 @@
 import psutil
 from pandas import DataFrame
 from scipy import sparse
-from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler, LabelEncoder
+from sklearn.preprocessing import (
+    MinMaxScaler,
+    MaxAbsScaler,
+    StandardScaler,
+    RobustScaler,
+    LabelEncoder,
+)
 
 logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
 logger = logging.getLogger("pickfeat")
@@ -30,13 +36,16 @@ class FSDataFrame:
     [...]
     """
 
     def __init__(
-            self,
-            df: pd.DataFrame,
-            sample_col: Optional[str] = None,
-            label_col: Optional[str] = None,
-            sparse_threshold: float = 0.7,  # Threshold for sparsity
-            memory_threshold: Optional[float] = 0.75  # Proportion of system memory to use for dense arrays
+        self,
+        df: pd.DataFrame,
+        sample_col: Optional[str] = None,
+        label_col: Optional[str] = None,
+        sparse_threshold: float = 0.7,  # Threshold for sparsity
+        memory_threshold: Optional[
+            float
+        ] = 0.75,  # Proportion of system memory to use for dense arrays
     ):
         """
         Create an instance of FSDataFrame.
@@ -60,7 +69,9 @@ def __init__(
         # Handle sample column
         if sample_col:
             if sample_col not in df.columns:
-                raise ValueError(f"Sample column '{sample_col}' not found in DataFrame.")
+                raise ValueError(
+                    f"Sample column '{sample_col}' not found in DataFrame."
+                )
             self.__sample_col = sample_col
             self.__samples = df[sample_col].tolist()
             columns_to_drop.append(sample_col)
@@ -105,19 +116,27 @@ def __init__(
         if sparsity > sparse_threshold:
             if dense_matrix_size < memory_threshold * available_memory:
                 # Use dense matrix if enough memory is available
-                logging.info(f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. "
-                             f"Using a dense matrix.")
+                logging.info(
+                    f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. "
+                    f"Using a dense matrix."
+                )
                 self.__matrix = numerical_df.to_numpy(dtype=np.float32)
                 self.__is_sparse = False
             else:
                 # Use sparse matrix due to memory constraints
-                logging.info(f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. "
-                             f"Using a sparse matrix representation.")
-                self.__matrix = sparse.csr_matrix(numerical_df.to_numpy(dtype=np.float32))
+                logging.info(
+                    f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. "
+                    f"Using a sparse matrix representation."
+                )
+                self.__matrix = sparse.csr_matrix(
+                    numerical_df.to_numpy(dtype=np.float32)
+                )
                 self.__is_sparse = True
         else:
             # Use dense matrix since it's not sparse
-            logging.info(f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix.")
+            logging.info(
+                f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix."
+            )
             self.__matrix = numerical_df.to_numpy(dtype=np.float32)
             self.__is_sparse = False
 
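This hunk carries the constructor's core storage decision: measure the fraction of zero entries, and fall back to a SciPy CSR matrix only when the data is mostly zeros and a dense float32 array would not fit in the allowed share of available memory. A condensed, standalone sketch of the same rule follows; the choose_matrix helper is hypothetical and simplified from the constructor logic above:

import numpy as np
import psutil
from scipy import sparse


def choose_matrix(numerical_df, sparse_threshold=0.7, memory_threshold=0.75):
    """Pick a dense or CSR representation via the density/memory heuristic."""
    values = numerical_df.to_numpy(dtype=np.float32)
    # Fraction of entries that are exactly zero.
    sparsity = 1.0 - np.count_nonzero(values) / values.size
    available_memory = psutil.virtual_memory().available
    if sparsity > sparse_threshold and values.nbytes >= memory_threshold * available_memory:
        # Mostly zeros and too large to hold densely: store as CSR.
        return sparse.csr_matrix(values), True
    # Otherwise a dense array is affordable (or the data is not sparse).
    return values, False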
@@ -159,24 +178,26 @@ def count_instances(self) -> int:
         """
         return self.__matrix.shape[0]
 
-    def scale_features(self, scaler_method: str = 'standard', **kwargs) -> bool:
+    def scale_features(self, scaler_method: str = "standard", **kwargs) -> bool:
         """
         Scales features in the FSDataFrame using the specified method.
         :param scaler_method: One of: min_max, max_abs, standard or robust.
         :return: True if the features were scaled.
         """
 
-        if scaler_method == 'min_max':
+        if scaler_method == "min_max":
             scaler = MinMaxScaler(**kwargs)
-        elif scaler_method == 'max_abs':
+        elif scaler_method == "max_abs":
             scaler = MaxAbsScaler(**kwargs)
-        elif scaler_method == 'standard':
+        elif scaler_method == "standard":
             scaler = StandardScaler(**kwargs)
-        elif scaler_method == 'robust':
+        elif scaler_method == "robust":
             scaler = RobustScaler(**kwargs)
         else:
-            raise ValueError("`scaler_method` must be one of: min_max, max_abs, standard or robust.")
+            raise ValueError(
+                "`scaler_method` must be one of: min_max, max_abs, standard or robust."
+            )
 
         # TODO: Scale only the features for now, we have to investigate if we scale categorical variables
         self.__matrix = scaler.fit_transform(self.__matrix)
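
A short usage sketch for the scaler dispatch above; the toy frame and column names are hypothetical, and the constructor arguments follow the __init__ signature earlier in this file:

import pandas as pd

from fslite.fs.fdataframe import FSDataFrame

# Hypothetical toy input: one sample column, one label column, two numeric features.
df = pd.DataFrame({
    "sample_id": ["s1", "s2", "s3"],
    "label": ["a", "b", "a"],
    "feat1": [1.0, 2.0, 3.0],
    "feat2": [10.0, 20.0, 30.0],
})

fsdf = FSDataFrame(df, sample_col="sample_id", label_col="label")
fsdf.scale_features(scaler_method="standard")  # also accepts: min_max, max_abs, robust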
@@ -192,7 +213,7 @@ def get_scaled_method(self):
     def is_sparse(self):
         return self.__is_sparse
 
-    def select_features_by_index(self, feature_indexes: List[int]) -> 'FSDataFrame':
+    def select_features_by_index(self, feature_indexes: List[int]) -> "FSDataFrame":
         """
         Keep only the specified features (by index) and return an updated instance of FSDataFrame.
@@ -216,7 +237,9 @@ def select_features_by_index(self, feature_indexes: List[int]) -> 'FSDataFrame':
         updated_df[self.__label_col] = self.__labels
 
         # Return a new instance of FSDataFrame with the updated data
-        return FSDataFrame(updated_df, sample_col=self.__sample_col, label_col=self.__label_col)
+        return FSDataFrame(
+            updated_df, sample_col=self.__sample_col, label_col=self.__label_col
+        )
 
     def to_pandas(self) -> DataFrame:
         """
@@ -241,9 +264,9 @@ def to_pandas(self) -> DataFrame:
 
         return df
 
-    def split_df(self,
-                 label_type_cat: bool = True,
-                 split_training_factor: float = 0.7) -> Tuple['FSDataFrame', 'FSDataFrame']:
+    def split_df(
+        self, label_type_cat: bool = True, split_training_factor: float = 0.7
+    ) -> Tuple["FSDataFrame", "FSDataFrame"]:
         """
         Split DataFrame into training and test dataset.
         It will generate a nearly class-balanced training
@@ -284,4 +307,3 @@ def split_df(self,
         #
         #     # Return the updated DataFrames
         #     return self.update(train_df), self.update(test_df)
-
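The visible tail of split_df is commented out, so the following call is only a sketch of the intended interface per the signature and docstring above, reusing the fsdf instance from the earlier sketch:

train_fsdf, test_fsdf = fsdf.split_df(label_type_cat=True, split_training_factor=0.7)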
[Diffs for the remaining 16 changed files are not shown.]
