Merge pull request #44 from f-aguzzi/pre/beta

Pre/beta into main: release 2.5.0
f-aguzzi · Jun 13, 2024 · 7ce6c00
2 parents 21fcb01 + 78d59e4
commit 7ce6c00
Show file tree

Hide file tree

Showing 99 changed files with 50,443 additions and 706 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,56 @@
+## [2.5.0-beta.6](https://github.com/f-aguzzi/tesi/compare/v2.5.0-beta.5...v2.5.0-beta.6) (2024-06-12)
+
+
+### Bug Fixes
+
+* **pca:** field and property issues ([91cefd3](https://github.com/f-aguzzi/tesi/commit/91cefd3d66475d13ed990ed2911014cc866e2f8d))
+
+## [2.5.0-beta.5](https://github.com/f-aguzzi/tesi/compare/v2.5.0-beta.4...v2.5.0-beta.5) (2024-06-12)
+
+
+### Bug Fixes
+
+* **BaseActionClass:** trailing commas creating unwanted tuple ([1b057ab](https://github.com/f-aguzzi/tesi/commit/1b057abb23710c0aef8f67875411d63e47a61036))
+
+## [2.5.0-beta.4](https://github.com/f-aguzzi/tesi/compare/v2.5.0-beta.3...v2.5.0-beta.4) (2024-06-12)
+
+
+### Bug Fixes
+
+* **base:** third round of inheritance fixes ([c3b1bbb](https://github.com/f-aguzzi/tesi/commit/c3b1bbb36c7c150525a6d9d0c87f863291ad397e))
+
+## [2.5.0-beta.3](https://github.com/f-aguzzi/tesi/compare/v2.5.0-beta.2...v2.5.0-beta.3) (2024-06-12)
+
+
+### Bug Fixes
+
+* **base:** inheritance settings (2) ([103040c](https://github.com/f-aguzzi/tesi/commit/103040c940ba884debaf8fdc2239d3dee0e76ab2))
+
+## [2.5.0-beta.2](https://github.com/f-aguzzi/tesi/compare/v2.5.0-beta.1...v2.5.0-beta.2) (2024-06-12)
+
+
+### Bug Fixes
+
+* **base:** settings inheritance ([8a14a00](https://github.com/f-aguzzi/tesi/commit/8a14a0036bf3791deda182ec0f3aa618806f180d))
+
+## [2.5.0-beta.1](https://github.com/f-aguzzi/tesi/compare/v2.4.0...v2.5.0-beta.1) (2024-06-12)
+
+
+### Features
+
+* **base:** added BaseActionClass as root class ([4f0bac8](https://github.com/f-aguzzi/tesi/commit/4f0bac831f6ca215212f6c801b01483ef2d0e5cc))
+* LDA and PCA inherit from BaseReducer ([7b797bd](https://github.com/f-aguzzi/tesi/commit/7b797bdfb00404b90fdeaf4178085f2d2bdb9c34))
+
+
+### chore
+
+* **docs:** upgrade to version 2.5.0 ([387499f](https://github.com/f-aguzzi/tesi/commit/387499f5add4ceae62d07891ffd531406607cdaf))
+
+
+### Docs
+
+* finish first case study ([21fcb01](https://github.com/f-aguzzi/tesi/commit/21fcb014b8b510fe7718e1443a1daad0afd09e0e))
+
 ## [2.4.0](https://github.com/f-aguzzi/tesi/compare/v2.3.0...v2.4.0) (2024-06-11)
 
 

diff --git a/chemfusekit/__base.py b/chemfusekit/__base.py
@@ -1,4 +1,5 @@
-'''A base class for all classifiers.'''
+"""Base classes shared across the library: data models, settings, classifiers and reducers."""
+from abc import ABC, abstractmethod
 
 import pandas as pd
 import numpy as np
@@ -10,7 +11,7 @@
 
 
 class BaseDataModel:
-    '''Models the output data from data-outputting operations'''
+    """Models the output data from data-outputting operations"""
 
     def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
         self.x_data = x_data
@@ -83,45 +84,39 @@ def export_to_file(self, export_path: str, sheet_name: str = 'Sheet1'):
             raise ValueError(f"Unsupported file format: {export_path}")
 
     def __getitem__(self, index):
-        '''Get an item with array-style indexing'''
+        """Get an item with array-style indexing"""
         return pd.DataFrame(self.x_data.iloc[index, :]).transpose()
 
 
 class BaseSettings:
-    '''Holds the settings for the BaseClassifier object.'''
+    """Holds the settings for all objects with settings."""
+
+    def __init__(self, output: GraphMode = GraphMode.NONE):
+        self.output = output
+
+
+class BaseClassifierSettings(BaseSettings):
+    """Holds the settings for the BaseClassifier object."""
 
     def __init__(self, output: GraphMode = GraphMode.NONE, test_split: bool = False):
+        super().__init__(output)
         if test_split is True and output is GraphMode.NONE:
             raise Warning(
                 "You selected test_split but it won't run because you disabled the output."
             )
-        self.output = output
         self.test_split = test_split
 
 
-class BaseClassifier:
-    '''Parent class for all classifiers, containing basic shared utilities.'''
-
+class BaseActionClass(ABC):
+    """Abstract base class for all reducers and classifiers."""
     def __init__(self, settings: BaseSettings, data: BaseDataModel):
         self.settings = settings
         self.data = data
         self.model: BaseEstimator | None = None
 
-    def import_model(self, import_path: str):
-        model = joblib.load(import_path)
-        if not isinstance(model, BaseEstimator):
-            raise ImportError("The file you tried importing is not a sklearn model!")
-        self.model = model
-
-    def export_model(self, export_path: str):
-        if self.model is not None:
-            joblib.dump(self.model, export_path)
-        else:
-            raise RuntimeError("You haven't trained the model yet! You cannot export it now.")
-
     @classmethod
     def from_file(cls, settings, model_path):
-        '''Creates a classifier instance from file'''
+        """Creates a classifier instance from file"""
         x_data = pd.DataFrame()
         y_dataframe = pd.DataFrame(columns=['Substance'])
         x_train = pd.concat([y_dataframe, x_data], axis=1)
@@ -134,13 +129,71 @@ def from_file(cls, settings, model_path):
         class_instance = cls(settings, data)
         class_instance.import_model(model_path)
         return class_instance
+
+    def import_model(self, import_path: str):
+        """Load a saved sklearn estimator from ``import_path`` into ``self.model``.
+
+        Raises:
+            ImportError: if joblib cannot load the file, or if the loaded
+                object is not a ``BaseEstimator``.
+        """
+        try:
+            candidate = joblib.load(import_path)
+        except Exception as load_error:
+            # Any loading failure is reported uniformly, keeping the original cause chained.
+            raise ImportError("The file you tried importing is not a valid Python object!") from load_error
+        if isinstance(candidate, BaseEstimator):
+            self.model = candidate
+            return
+        raise ImportError("The file you tried importing is not a sklearn model!")
+
+    def export_model(self, export_path: str):
+        """Dump the underlying sklearn model to ``export_path`` with joblib.
+
+        Raises:
+            RuntimeError: if ``self.model`` is still ``None``.
+        """
+        # Guard clause: there is nothing to export until a model exists.
+        if self.model is None:
+            raise RuntimeError("You haven't trained the model yet! You cannot export it now.")
+        joblib.dump(self.model, export_path)
+
+
+class BaseClassifier(BaseActionClass):
+    """Parent class for all classifiers, containing basic shared utilities."""
+
+    def __init__(self, settings: BaseClassifierSettings, data: BaseDataModel):
+        super().__init__(settings, data)
 
     def predict(self, x_data: pd.DataFrame):
-        '''Performs prediction once the model is trained.'''
+        """Performs prediction once the model is trained."""
         if x_data is None:
             raise TypeError(f"X data for {self.__class__.__name__} prediction must be non-empty.")
         if self.model is None:
             raise RuntimeError(f"The {self.__class__.__name__} model is not trained yet!")
 
         y_pred = self.model.predict(x_data)
         return y_pred
+
+
+class BaseReducer(BaseActionClass):
+    """Parent class for all reducers (decomposition-performing classes), containing basic shared utilities.
+
+    Concrete subclasses must implement ``export_data`` and the ``rescaled_data`` property.
+    """
+
+    def __init__(self, settings: BaseSettings, data: BaseDataModel):
+        super().__init__(settings, data)
+
+    @abstractmethod
+    def export_data(self) -> BaseDataModel:
+        """Export the reducer's data as a ``BaseDataModel``; defined by each subclass."""
+        pass
+
+    def reduce(self, data: BaseDataModel) -> BaseDataModel:
+        """Reduces dimensionality of data.
+
+        Applies the model's ``transform`` to ``data.x_data`` and returns a new
+        ``BaseDataModel`` with the transformed features and the labels from ``data.y``.
+
+        Raises:
+            RuntimeError: if ``self.model`` is ``None`` (nothing trained or imported yet).
+        """
+        if self.model is None:
+            raise RuntimeError(
+                "The model hasn't been trained yet! You cannot use it to reduce data dimensionality."
+            )
+        x_data = pd.DataFrame(self.model.transform(data.x_data))
+        # Name the label column 'Substance', as from_file does. An unnamed column would be
+        # labelled 0 and collide with the first transformed column (also labelled 0).
+        y_dataframe = pd.DataFrame(data.y, columns=['Substance'])
+        x_train = pd.concat(
+            [y_dataframe, x_data],
+            axis=1
+        )
+        return BaseDataModel(
+            x_data=x_data,
+            x_train=x_train,
+            y=data.y
+        )
+
+    @property
+    @abstractmethod
+    def rescaled_data(self) -> BaseDataModel:
+        """Rescaled data as a ``BaseDataModel``; defined by each subclass."""
+        pass
diff --git a/chemfusekit/__init__.py b/chemfusekit/__init__.py
@@ -1,4 +1,4 @@
-'''__init__.py file for the library'''
+"""__init__.py file for the library"""
 from beartype import BeartypeConf
 from beartype.claw import beartype_this_package
 beartype_this_package(conf=BeartypeConf(violation_type=TypeError))
diff --git a/chemfusekit/__utils.py b/chemfusekit/__utils.py
@@ -1,4 +1,4 @@
-'''Utilities module: functions that are shared between different classes'''
+"""Utilities module: functions that are shared between different classes"""
 from sklearn.cross_decomposition import PLSRegression as PLSR
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
@@ -16,13 +16,15 @@
 
 from enum import Enum, auto
 
+
 class GraphMode(Enum):
     """Output modes accepted by this module's printing and plotting helpers."""
     TEXT = auto()  # presumably plain-text output — TODO confirm against the helpers
     GRAPHIC = auto()  # helpers build and show plotly figures in this mode
     NONE = auto()  # helpers return early without producing any output
 
+
 def graph_output(scores, model, name: str, mode: GraphMode = GraphMode.GRAPHIC):
-    '''A reusable graphing function.'''
+    """A reusable graphing function."""
     # Return early if output is disabled
     if mode is GraphMode.NONE:
         return
@@ -38,7 +40,7 @@ def graph_output(scores, model, name: str, mode: GraphMode = GraphMode.GRAPHIC):
 
         # Display the explained variance ratio
         print("Explained Variance Ratio:", model.explained_variance_ratio_)
-    
+
     if mode is GraphMode.GRAPHIC:
         # Scores table
         print_table(
@@ -47,7 +49,7 @@ def graph_output(scores, model, name: str, mode: GraphMode = GraphMode.GRAPHIC):
             f"{name} Scores"
         )
 
-        #Scores plot
+        # Scores plot
         fig = px.scatter(scores, x="LV1", y="LV2", color="Substance", hover_data=['Substance'])
         fig.update_xaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
         fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
@@ -61,25 +63,25 @@ def graph_output(scores, model, name: str, mode: GraphMode = GraphMode.GRAPHIC):
         fig = px.scatter_3d(scores, x='LV1', y='LV2', z='LV3',
                             color='Substance', hover_data=['Substance'],
                             hover_name=scores.index
-        )
+                            )
         fig.update_layout(title_text=f"3D colored by Substance for {name}")
         fig.show()
 
 
 def print_table(header_values, cell_values, title: str, mode: GraphMode = GraphMode.GRAPHIC):
-    '''Multimodal table printing utility.'''
+    """Multimodal table printing utility."""
     # Return early if output is disabled
     if mode is GraphMode.NONE:
         return
     # Graphical table printing
     elif mode is GraphMode.GRAPHIC:
         fig = go.Figure(data=[go.Table(
-        header=dict(values=header_values,
-                    fill_color='paleturquoise',
-                    align='left'),
-        cells=dict(values=cell_values,
-                fill_color='lavender',
-                align='left'))
+            header=dict(values=header_values,
+                        fill_color='paleturquoise',
+                        align='left'),
+            cells=dict(values=cell_values,
+                       fill_color='lavender',
+                       align='left'))
         ])
         fig.update_layout(title=title)
         fig.show()
@@ -94,7 +96,7 @@ def print_table(header_values, cell_values, title: str, mode: GraphMode = GraphM
 
 
 def run_split_test(x, y, model, extended=False, mode: GraphMode = GraphMode.GRAPHIC):
-    '''A function to run split tests on trained models.'''
+    """A function to run split tests on trained models."""
 
     # Return early if there's nothing to print
     if mode is GraphMode.NONE:
@@ -121,11 +123,11 @@ def run_split_test(x, y, model, extended=False, mode: GraphMode = GraphMode.GRAP
 
         # See the classes the model used
         classes = model.classes_
-        if isinstance(model, LogisticRegression) and len(classes) == 2:     # Binary classifier
+        if isinstance(model, LogisticRegression) and len(classes) == 2:  # Binary classifier
             classes = [" / ".join(classes)]
             classes = np.asarray(classes)
             classes = classes.reshape((1, 1))
-        else:   # All other cases
+        else:  # All other cases
             classes = classes.reshape((1, len(classes)))
 
         coefficients = model.coef_.transpose()
@@ -192,7 +194,7 @@ def run_split_test(x, y, model, extended=False, mode: GraphMode = GraphMode.GRAP
 
 
 def print_confusion_matrix(y1, y2, title: str, mode: GraphMode = GraphMode.GRAPHIC):
-    '''Function to simplify the plotting of confusion matrices'''
+    """Function to simplify the plotting of confusion matrices"""
 
     # Return early if there's nothing to print
     if mode is GraphMode.NONE:
@@ -202,7 +204,7 @@ def print_confusion_matrix(y1, y2, title: str, mode: GraphMode = GraphMode.GRAPH
 
     # Get unique class labels from y_true
     class_labels = sorted(set(y2))
-    
+
     # Create the report
     report = classification_report(y1, y2, digits=2, output_dict=True)
 
@@ -236,5 +238,5 @@ def print_confusion_matrix(y1, y2, title: str, mode: GraphMode = GraphMode.GRAPH
         ['substance'] + list(cr.columns),
         [cr.index, cr['precision'], cr['recall'], cr['f1-score'], cr['support']],
         "Classification Report",
-        mode 
+        mode
     )
diff --git a/chemfusekit/knn.py b/chemfusekit/knn.py
@@ -1,19 +1,15 @@
-'''k-Nearest Neighbors Analysis module'''
+"""k-Nearest Neighbors Analysis module"""
 from copy import copy
-from typing import Optional
 from beartype.typing import Callable
 
 from sklearn.neighbors import KNeighborsClassifier
 
-import pandas as pd
-
-from chemfusekit.lldf import LLDFDataModel
 from chemfusekit.__utils import run_split_test, print_confusion_matrix, print_table, GraphMode
-from .__base import BaseSettings, BaseClassifier
+from .__base import BaseClassifierSettings, BaseClassifier, BaseDataModel
 
 
-class KNNSettings(BaseSettings):
-    '''Holds the settings for the kNN object.'''
+class KNNSettings(BaseClassifierSettings):
+    """Holds the settings for the kNN object."""
     def __init__(self, n_neighbors: int = 15, metric: str | Callable = 'euclidean', weights: str | Callable = 'uniform',
                  algorithm: str = 'auto', output: GraphMode = GraphMode.NONE, test_split: bool = False):
 
@@ -28,7 +24,7 @@ def __init__(self, n_neighbors: int = 15, metric: str | Callable = 'euclidean',
         if weights not in ['uniform', 'distance'] and not callable(weights):
             raise ValueError("Invalid weight: should be 'uniform', 'distance' or a callable")
         if algorithm not in ['auto', 'ball_tree', 'kd_tree', 'brute']:
-            raise  ValueError(
+            raise ValueError(
                 "Invalid algorithm: should be 'auto', 'ball_tree', 'kd_tree' or 'brute'."
             )
         self.n_neighbors = n_neighbors
@@ -38,12 +34,12 @@ def __init__(self, n_neighbors: int = 15, metric: str | Callable = 'euclidean',
 
 
 class KNN(BaseClassifier):
-    '''Class to store the data, methods and artifacts for k-Nearest Neighbors Analysis'''
-    def __init__(self, settings: KNNSettings, fused_data: LLDFDataModel):
-        super().__init__(settings, fused_data)
+    """Class to store the data, methods and artifacts for k-Nearest Neighbors Analysis"""
+    def __init__(self, settings: KNNSettings, data: BaseDataModel):
+        super().__init__(settings, data)
 
     def knn(self):
-        '''Performs k-Nearest Neighbors Analysis'''
+        """Performs k-Nearest Neighbors Analysis"""
         # Prepare and train the kNN model
         knn = KNeighborsClassifier(
             n_neighbors=self.settings.n_neighbors,
@@ -60,7 +56,7 @@ def knn(self):
         y_pred = knn.predict(self.data.x_data)
         print_table(
             ["Predictions"],
-            y_pred.reshape(1,len(y_pred)),
+            y_pred.reshape(1, len(y_pred)),
             "Data and predictions",
             self.settings.output
         )