From ebff95d7cf00d7c86d987d113543631acc0a0113 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Fri, 7 Jun 2024 18:39:34 +0200 Subject: [PATCH] docs: update to version 2.2.0 --- docs/cookbook/case-study-classifier.md | 2 +- docs/cookbook/case-study-hybrid.md | 2 +- docs/cookbook/case-study-realtime.md | 2 +- docs/cookbook/data-operations.md | 238 ++++++++++++++++++ docs/cookbook/structure.md | 2 +- .../version-2.2.0/case-study-classifier.md | 9 + .../version-2.2.0/case-study-hybrid.md | 9 + .../version-2.2.0/case-study-realtime.md | 9 + .../version-2.2.0/data-operations.md | 238 ++++++++++++++++++ .../version-2.2.0/introduction.md | 26 ++ .../version-2.2.0/structure.md | 197 +++++++++++++++ .../version-2.2.0-sidebars.json | 8 + docs/cookbook_versions.json | 1 + .../version-2.2.0/base/_category_.json | 8 + .../version-2.2.0/base/baseclassifier.md | 37 +++ .../version-2.2.0/base/basedatamodel.md | 28 +++ .../version-2.2.0/base/basesettings.md | 21 ++ .../version-2.2.0/complete-workflow.md | 31 +++ .../version-2.2.0/knn/_category_.json | 8 + docs/versioned_docs/version-2.2.0/knn/knn.md | 47 ++++ .../version-2.2.0/knn/knnsettings.md | 64 +++++ .../version-2.2.0/lda/_category_.json | 8 + docs/versioned_docs/version-2.2.0/lda/lda.md | 48 ++++ .../version-2.2.0/lda/ldasettings.md | 39 +++ .../version-2.2.0/lldf/_category_.json | 8 + .../version-2.2.0/lldf/index.mdx | 47 ++++ .../version-2.2.0/lldf/lldf-class.md | 67 +++++ .../version-2.2.0/lldf/lldfmodel.md | 29 +++ .../version-2.2.0/lldf/lldfsettings.md | 25 ++ .../version-2.2.0/lldf/table.md | 45 ++++ .../version-2.2.0/lr/_category_.json | 8 + docs/versioned_docs/version-2.2.0/lr/lr.md | 51 ++++ .../version-2.2.0/lr/lrsettings.md | 44 ++++ .../version-2.2.0/pca/_category_.json | 8 + docs/versioned_docs/version-2.2.0/pca/pca.md | 46 ++++ .../version-2.2.0/pca/pcadatamodel.md | 35 +++ .../version-2.2.0/pca/pcasettings.md | 41 +++ .../version-2.2.0/plsda/_category_.json | 8 + .../version-2.2.0/plsda/plsda.md | 47 ++++ .../version-2.2.0/plsda/plsdasettings.md | 38 +++ .../version-2.2.0/svm/_category_.json | 8 + docs/versioned_docs/version-2.2.0/svm/svm.md | 49 ++++ .../version-2.2.0/svm/svmsettings.md | 43 ++++ docs/versioned_docs/version-2.2.0/tutorial.md | 109 ++++++++ .../version-2.2.0/utils/_category_.json | 8 + .../version-2.2.0/utils/graphmode.md | 11 + .../version-2.2.0/utils/graphoutput.md | 20 ++ .../utils/printconfusionmatrix.md | 19 ++ .../version-2.2.0/utils/printtable.md | 20 ++ .../version-2.2.0/utils/runsplittests.md | 20 ++ .../version-2.2.0-sidebars.json | 8 + docs/versions.json | 1 + notes/.obsidian/workspace.json | 9 +- 53 files changed, 1946 insertions(+), 8 deletions(-) create mode 100644 docs/cookbook/data-operations.md create mode 100644 docs/cookbook_versioned_docs/version-2.2.0/case-study-classifier.md create mode 100644 docs/cookbook_versioned_docs/version-2.2.0/case-study-hybrid.md create mode 100644 docs/cookbook_versioned_docs/version-2.2.0/case-study-realtime.md create mode 100644 docs/cookbook_versioned_docs/version-2.2.0/data-operations.md create mode 100644 docs/cookbook_versioned_docs/version-2.2.0/introduction.md create mode 100644 docs/cookbook_versioned_docs/version-2.2.0/structure.md create mode 100644 docs/cookbook_versioned_sidebars/version-2.2.0-sidebars.json create mode 100644 docs/versioned_docs/version-2.2.0/base/_category_.json create mode 100644 docs/versioned_docs/version-2.2.0/base/baseclassifier.md create mode 100644 
docs/versioned_docs/version-2.2.0/base/basedatamodel.md create mode 100644 docs/versioned_docs/version-2.2.0/base/basesettings.md create mode 100644 docs/versioned_docs/version-2.2.0/complete-workflow.md create mode 100644 docs/versioned_docs/version-2.2.0/knn/_category_.json create mode 100644 docs/versioned_docs/version-2.2.0/knn/knn.md create mode 100644 docs/versioned_docs/version-2.2.0/knn/knnsettings.md create mode 100644 docs/versioned_docs/version-2.2.0/lda/_category_.json create mode 100644 docs/versioned_docs/version-2.2.0/lda/lda.md create mode 100644 docs/versioned_docs/version-2.2.0/lda/ldasettings.md create mode 100644 docs/versioned_docs/version-2.2.0/lldf/_category_.json create mode 100644 docs/versioned_docs/version-2.2.0/lldf/index.mdx create mode 100644 docs/versioned_docs/version-2.2.0/lldf/lldf-class.md create mode 100644 docs/versioned_docs/version-2.2.0/lldf/lldfmodel.md create mode 100644 docs/versioned_docs/version-2.2.0/lldf/lldfsettings.md create mode 100644 docs/versioned_docs/version-2.2.0/lldf/table.md create mode 100644 docs/versioned_docs/version-2.2.0/lr/_category_.json create mode 100644 docs/versioned_docs/version-2.2.0/lr/lr.md create mode 100644 docs/versioned_docs/version-2.2.0/lr/lrsettings.md create mode 100644 docs/versioned_docs/version-2.2.0/pca/_category_.json create mode 100644 docs/versioned_docs/version-2.2.0/pca/pca.md create mode 100644 docs/versioned_docs/version-2.2.0/pca/pcadatamodel.md create mode 100644 docs/versioned_docs/version-2.2.0/pca/pcasettings.md create mode 100644 docs/versioned_docs/version-2.2.0/plsda/_category_.json create mode 100644 docs/versioned_docs/version-2.2.0/plsda/plsda.md create mode 100644 docs/versioned_docs/version-2.2.0/plsda/plsdasettings.md create mode 100644 docs/versioned_docs/version-2.2.0/svm/_category_.json create mode 100644 docs/versioned_docs/version-2.2.0/svm/svm.md create mode 100644 docs/versioned_docs/version-2.2.0/svm/svmsettings.md create mode 100644 docs/versioned_docs/version-2.2.0/tutorial.md create mode 100644 docs/versioned_docs/version-2.2.0/utils/_category_.json create mode 100644 docs/versioned_docs/version-2.2.0/utils/graphmode.md create mode 100644 docs/versioned_docs/version-2.2.0/utils/graphoutput.md create mode 100644 docs/versioned_docs/version-2.2.0/utils/printconfusionmatrix.md create mode 100644 docs/versioned_docs/version-2.2.0/utils/printtable.md create mode 100644 docs/versioned_docs/version-2.2.0/utils/runsplittests.md create mode 100644 docs/versioned_sidebars/version-2.2.0-sidebars.json diff --git a/docs/cookbook/case-study-classifier.md b/docs/cookbook/case-study-classifier.md index 8b1d3d3..a460087 100644 --- a/docs/cookbook/case-study-classifier.md +++ b/docs/cookbook/case-study-classifier.md @@ -1,5 +1,5 @@ --- -sidebar_position: 3 +sidebar_position: 4 --- # Case study: training a classifier from lab data diff --git a/docs/cookbook/case-study-hybrid.md b/docs/cookbook/case-study-hybrid.md index a7c4c5d..d69b4b6 100644 --- a/docs/cookbook/case-study-hybrid.md +++ b/docs/cookbook/case-study-hybrid.md @@ -1,5 +1,5 @@ --- -sidebar_position: 4 +sidebar_position: 5 --- # Case study: hybrid workflow diff --git a/docs/cookbook/case-study-realtime.md b/docs/cookbook/case-study-realtime.md index 8725eac..95bbbef 100644 --- a/docs/cookbook/case-study-realtime.md +++ b/docs/cookbook/case-study-realtime.md @@ -1,5 +1,5 @@ --- -sidebar_position: 5 +sidebar_position: 6 --- # Case study: real-time data classification diff --git a/docs/cookbook/data-operations.md 
b/docs/cookbook/data-operations.md new file mode 100644 index 0000000..31369f4 --- /dev/null +++ b/docs/cookbook/data-operations.md @@ -0,0 +1,238 @@ +--- +sidebar_position: 3 +--- + +# 2. Data operations: import, export and data fusion + +`ChemFuseKit` can elaborate datasets in multiple different ways. + +For now, let's take a look at basic input / output operations on datasets. As stated in the previous chapter of this cookbook, all loaded datasets are contained into a `BaseDataModel` object or in one of its derived classes: + +```mermaid +classDiagram + class BaseDataModel { + +x_data: DataFrame + +x_train: DataFrame + +y: ndarray + __init__(x_data, x_train, y) + } + + class LLDFDataModel { + ... + __init__(...) + } + + class PCADataModel { + +array_scores: ndarray + +components: int + __init__(..., array_scores) + } + + BaseDataModel *-- LLDFDataModel + BaseDataModel *-- PCADataModel +``` + +With the help of these classes, we can perform three fundamental data operations: + +1. dataset loading +2. data fusion +3. dataset saving + +## Dataset loading + +`ChemFuseKit` can import Excel tables into its `BaseDataModel`-derived classes. `BaseDataModel` and both its derived classes offer the ability to import a single datasheet, while only `LLDF` (the name stands for *Low Level Data Fusion*) can perform data fusion by importing multiple tables at once into a single, unified dataset: + +```mermaid +flowchart TD + A[Import data] --> B{How many tables?} + B --> |One| C[BaseDataModel\nand its\nderived classes] + B --> |Many| D[LLDF only] +``` + +Let's say we have one single table, called `spectrometer_data.xlsx`, and we want to load it into our project, to then feed it to one of our classifiers. + +This is the schema of the table called `Spectral Samples` within `spectrometer_data.xlsx`: + +| Sample number | Class | 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 | +|---------------|-----------------|-------|-------|-------|--------|-----|--------|--------| +| 1 | Dichloromethane | 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 | +| 2 | N-hexane | 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 | +| 3 | Dioxane | 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 | +| ... | | | | | | | | | + +As we can observe, there's an index column called *Sample number*, a class column called *Class* and then a certain number of columns containing the spectral responses of the samples in the 8nm - 10nm range. + +
+ +Now, let's import this table into a `BaseDataModel`: + +```python +from chemfusekit.__base import BaseDataModel + +data = BaseDataModel.load_from_file( +    import_path='spectrometer_data.xlsx', +    sheet_name='Spectral samples', +    class_column='Class', +    index_column='Sample number' +) +``` + +Now our `data` variable is loaded and, as an instance of `BaseDataModel`, it contains three fields: + +- `x_data` +- `x_train` +- `y` + +
+ +`x_data` is a Pandas Dataframe with the following content: + +| 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 | +|-------|-------|-------|--------|-----|--------|--------| +| 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 | +| 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 | +| 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 | +| ... | | | | | | | + +As we can see, it only contains the spectral data. + +
+ +`x_train`, a Pandas Dataframe too, contains both the classes (with the column header renamed to *Substance*) and the spectral data: + +| Substance | 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 | +|-----------------|-------|-------|-------|--------|-----|--------|--------| +| Dichloromethane | 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 | +| N-hexane | 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 | +| Dioxane | 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 | +| ... | | | | | | | | + +
+ +`y`, a NumPy ndarray, only contains the classes (with the column header renamed to *Substance*): + +| Substance | +|-----------------| +| Dichloromethane | +| N-hexane | +| Dioxane | +| ... | + + + +## Data Fusion + +Let's build on from the previous example. Our file, `spectrometer_data.xlsx`, also contains a second sheet called `Gas Chromatography samples`, and we want to import it along our previous `Spectral samples` table. + +This is the schema of the second table: + +| Sample | class | Retention time | +|---------|-----------------|----------------| +| 1 | Dichloromethane | 123.78 | +| 2 | N-hexane | 44.19 | +| 3 | Dioxane | 22.34 | +| ... | | | + +Even though the header names are slightly different, the content of the first two columns corresponds to the first two columns of the previous table. The third column contains gas chromatography retention times in milliseconds. + +
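+If you only needed this second sheet on its own, you could load it with the same `load_from_file` call shown earlier. This is just an illustrative sketch (the data fusion workflow below does not require it), with the sheet and column names taken from the schema above: + +```python +from chemfusekit.__base import BaseDataModel + +# Illustrative: load only the chromatography sheet, using its own header names +chromatography_data = BaseDataModel.load_from_file( +    import_path='spectrometer_data.xlsx', +    sheet_name='Gas Chromatography samples', +    class_column='class', +    index_column='Sample' +) +``` + +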
+ +The `LLDF` module allows us to join these two tables (the one from the previous example and the current one) to form a single dataset that contains both spectral data and retention times. Let's see how. + +```python +from chemfusekit.lldf import LLDFSettings, LLDF, GraphMode, Table + +settings = LLDFSettings()  # Initialize the default settings + +# Set up the import settings for the first table (spectral data) +table1 = Table( +    file_path='spectrometer_data.xlsx', +    sheet_name='Spectral samples', +    preprocessing='snv', +    class_column='Class', +    index_column='Sample number' +) + +# Set up the import settings for the second table (chromatography data) +table2 = Table( +    file_path='spectrometer_data.xlsx', +    sheet_name='Gas Chromatography samples', +    preprocessing='none', +    class_column='class', +    index_column='Sample' +) + +# Now, let's make a list of the two tables +tables = [table1, table2] + +# Let's pass the settings and the tables to the LLDF constructor +lldf = LLDF(settings, tables) + +# Let's finally perform data fusion with the lldf() method! +lldf.lldf() +``` + +At the end of these operations, we can find our fused data object inside the `fused_data` property of our low-level data fusion object: + +```python +lldf.fused_data +``` + +The `fused_data` field is of class `LLDFDataModel`, which is derived from `BaseDataModel`, and contains the same fields (`x_data`, `x_train`, `y`). + +
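+As a quick, purely illustrative sanity check (assuming the `lldf.lldf()` call above succeeded), the fused fields can be inspected like any other `BaseDataModel`: + +```python +# Illustrative inspection of the fused data model +print(lldf.fused_data.x_data.shape)    # fused feature matrix: spectra plus retention time +print(lldf.fused_data.x_train.head())  # classes plus fused features, as a DataFrame +print(lldf.fused_data.y[:3])           # class labels, as a NumPy array +``` + +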
+ +This is the content of `x_data` (a Pandas DataFrame): + +| 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 |Retention time | +|-------|-------|-------|--------|-----|--------|--------|---------------| +| 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 |123.78 | +| 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 |44.19 | +| 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 |22.34 | +| ... | | | | | | | | + +
+ +This is `x_train` (a Pandas DataFrame): + +| Substance | 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 |Retention time | +|-----------------|-------|-------|-------|--------|-----|--------|--------|---------------| +| Dichloromethane | 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 |123.78 | +| N-hexane | 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 |44.19 | +| Dioxane | 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 |22.34 | +| ... | ... | | | | | | | | + +
+ +This is the content of `y` (a NumPy ndarray): + +| Substance | +|-----------------| +| Dichloromethane | +| N-hexane | +| Dioxane | +| ... | + + +## Dataset export + +`BaseDataModel` and its derived classes have an `export_to_file` method that exports the complete table (class names and data columns) to an Excel file. + +Let's say we want to export the fused dataset from the previous example into a file called `fused dataset.xlsx`. Here's how to do it, using the `fused_data` attribute of our `lldf` variable from the previous example (the `LLDF` instance with which we joined the two tables): + +```python +lldf.fused_data.export_to_file(export_path='fused dataset.xlsx', sheet_name='Sheet 1') +``` + +Et voilà! Now we have a new file called `fused dataset.xlsx`, inside of which there is a sheet called "Sheet 1" with the following content: + +| Substance       | 8     | 8.1   | 8.2   | 8.3    | ... | 9.9    | 10     | Retention time | +|-----------------|-------|-------|-------|--------|-----|--------|--------|----------------| +| Dichloromethane | 2.341 | 3.866 | 1.430 | 5.843  |     | 0.032  | 1.128  | 123.78         | +| N-hexane        | 5.745 | 8.346 | 2.985 | 6.842  |     | 1.832  | 3.543  | 44.19          | +| Dioxane         | 0.003 | 0.002 | 0.006 | 0.0013 |     | 11.483 | 10.445 | 22.34          | +| ...             |       |       |       |        |     |        |        |                | + +
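+Since the exported sheet uses *Substance* as its class column (the default expected by `load_from_file`), you could later reload it as a plain `BaseDataModel`. A hypothetical round-trip sketch: + +```python +from chemfusekit.__base import BaseDataModel + +# Illustrative: reload the fused dataset exported above. +# class_column defaults to 'Substance', matching the exported sheet. +reloaded = BaseDataModel.load_from_file( +    import_path='fused dataset.xlsx', +    sheet_name='Sheet 1' +) +``` + +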
+ +With this, you now know all the basics of data handling within `ChemFuseKit`. \ No newline at end of file diff --git a/docs/cookbook/structure.md b/docs/cookbook/structure.md index 64bec98..24b6ec3 100644 --- a/docs/cookbook/structure.md +++ b/docs/cookbook/structure.md @@ -2,7 +2,7 @@ sidebar_position: 2 --- -# Project structure +# 1. Project structure In this cookbook page, you will be shown how the project is structured, and the purpose of each module. diff --git a/docs/cookbook_versioned_docs/version-2.2.0/case-study-classifier.md b/docs/cookbook_versioned_docs/version-2.2.0/case-study-classifier.md new file mode 100644 index 0000000..a460087 --- /dev/null +++ b/docs/cookbook_versioned_docs/version-2.2.0/case-study-classifier.md @@ -0,0 +1,9 @@ +--- +sidebar_position: 4 +--- + +# Case study: training a classifier from lab data + +:::note +This case study is still **under construction**. +::: \ No newline at end of file diff --git a/docs/cookbook_versioned_docs/version-2.2.0/case-study-hybrid.md b/docs/cookbook_versioned_docs/version-2.2.0/case-study-hybrid.md new file mode 100644 index 0000000..d69b4b6 --- /dev/null +++ b/docs/cookbook_versioned_docs/version-2.2.0/case-study-hybrid.md @@ -0,0 +1,9 @@ +--- +sidebar_position: 5 +--- + +# Case study: hybrid workflow + +:::note +This case study is still **under construction**. +::: \ No newline at end of file diff --git a/docs/cookbook_versioned_docs/version-2.2.0/case-study-realtime.md b/docs/cookbook_versioned_docs/version-2.2.0/case-study-realtime.md new file mode 100644 index 0000000..95bbbef --- /dev/null +++ b/docs/cookbook_versioned_docs/version-2.2.0/case-study-realtime.md @@ -0,0 +1,9 @@ +--- +sidebar_position: 6 +--- + +# Case study: real-time data classification + +:::note +This case study is still **under construction**. +::: \ No newline at end of file diff --git a/docs/cookbook_versioned_docs/version-2.2.0/data-operations.md b/docs/cookbook_versioned_docs/version-2.2.0/data-operations.md new file mode 100644 index 0000000..31369f4 --- /dev/null +++ b/docs/cookbook_versioned_docs/version-2.2.0/data-operations.md @@ -0,0 +1,238 @@ +--- +sidebar_position: 3 +--- + +# 2. Data operations: import, export and data fusion + +`ChemFuseKit` can elaborate datasets in multiple different ways. + +For now, let's take a look at basic input / output operations on datasets. As stated in the previous chapter of this cookbook, all loaded datasets are contained into a `BaseDataModel` object or in one of its derived classes: + +```mermaid +classDiagram + class BaseDataModel { + +x_data: DataFrame + +x_train: DataFrame + +y: ndarray + __init__(x_data, x_train, y) + } + + class LLDFDataModel { + ... + __init__(...) + } + + class PCADataModel { + +array_scores: ndarray + +components: int + __init__(..., array_scores) + } + + BaseDataModel *-- LLDFDataModel + BaseDataModel *-- PCADataModel +``` + +With the help of these classes, we can perform three fundamental data operations: + +1. dataset loading +2. data fusion +3. dataset saving + +## Dataset loading + +`ChemFuseKit` can import Excel tables into its `BaseDataModel`-derived classes. 
`BaseDataModel` and both its derived classes offer the ability to import a single datasheet, while only `LLDF` (the name stands for *Low Level Data Fusion*) can perform data fusion by importing multiple tables at once into a single, unified dataset: + +```mermaid +flowchart TD + A[Import data] --> B{How many tables?} + B --> |One| C[BaseDataModel\nand its\nderived classes] + B --> |Many| D[LLDF only] +``` + +Let's say we have one single table, called `spectrometer_data.xlsx`, and we want to load it into our project, to then feed it to one of our classifiers. + +This is the schema of the table called `Spectral Samples` within `spectrometer_data.xlsx`: + +| Sample number | Class | 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 | +|---------------|-----------------|-------|-------|-------|--------|-----|--------|--------| +| 1 | Dichloromethane | 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 | +| 2 | N-hexane | 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 | +| 3 | Dioxane | 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 | +| ... | | | | | | | | | + +As we can observe, there's an index column called *Sample number*, a class column called *Class* and then a certain number of columns containing the spectral responses of the samples in the 8nm - 10nm range. + +
+ +Now, let's import this table into a `BaseDataModel`: + +```python +from chemfusekit.__base import BaseDataModel + +data = BaseDataModel.load_from_file( +    import_path='spectrometer_data.xlsx', +    sheet_name='Spectral samples', +    class_column='Class', +    index_column='Sample number' +) +``` + +Now our `data` variable is loaded and, as an instance of `BaseDataModel`, it contains three fields: + +- `x_data` +- `x_train` +- `y` + +
+ +`x_data` is a Pandas Dataframe with the following content: + +| 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 | +|-------|-------|-------|--------|-----|--------|--------| +| 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 | +| 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 | +| 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 | +| ... | | | | | | | + +As we can see, it only contains the spectral data. + +
+ +`x_train`, a Pandas Dataframe too, contains both the classes (with the column header renamed to *Substance*) and the spectral data: + +| Substance | 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 | +|-----------------|-------|-------|-------|--------|-----|--------|--------| +| Dichloromethane | 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 | +| N-hexane | 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 | +| Dioxane | 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 | +| ... | | | | | | | | + +
+ +`y`, a NumPy ndarray, only contains the classes (with the column header renamed to *Substance*): + +| Substance | +|-----------------| +| Dichloromethane | +| N-hexane | +| Dioxane | +| ... | + + + +## Data Fusion + +Let's build on from the previous example. Our file, `spectrometer_data.xlsx`, also contains a second sheet called `Gas Chromatography samples`, and we want to import it along our previous `Spectral samples` table. + +This is the schema of the second table: + +| Sample | class | Retention time | +|---------|-----------------|----------------| +| 1 | Dichloromethane | 123.78 | +| 2 | N-hexane | 44.19 | +| 3 | Dioxane | 22.34 | +| ... | | | + +Even though the header names are slightly different, the content of the first two columns corresponds to the first two columns of the previous table. The third column contains gas chromatography retention times in milliseconds. + +
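+If you only needed this second sheet on its own, you could load it with the same `load_from_file` call shown earlier. This is just an illustrative sketch (the data fusion workflow below does not require it), with the sheet and column names taken from the schema above: + +```python +from chemfusekit.__base import BaseDataModel + +# Illustrative: load only the chromatography sheet, using its own header names +chromatography_data = BaseDataModel.load_from_file( +    import_path='spectrometer_data.xlsx', +    sheet_name='Gas Chromatography samples', +    class_column='class', +    index_column='Sample' +) +``` + +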
+ +The `LLDF` module allows us to join these two tables (the one from the previous example and the current one) to form a single dataset that contains both spectral data and retention times. Let's see how. + +```python +from chemfusekit.lldf import LLDFSettings, LLDF, GraphMode, Table + +settings = LLDFSettings()  # Initialize the default settings + +# Set up the import settings for the first table (spectral data) +table1 = Table( +    file_path='spectrometer_data.xlsx', +    sheet_name='Spectral samples', +    preprocessing='snv', +    class_column='Class', +    index_column='Sample number' +) + +# Set up the import settings for the second table (chromatography data) +table2 = Table( +    file_path='spectrometer_data.xlsx', +    sheet_name='Gas Chromatography samples', +    preprocessing='none', +    class_column='class', +    index_column='Sample' +) + +# Now, let's make a list of the two tables +tables = [table1, table2] + +# Let's pass the settings and the tables to the LLDF constructor +lldf = LLDF(settings, tables) + +# Let's finally perform data fusion with the lldf() method! +lldf.lldf() +``` + +At the end of these operations, we can find our fused data object inside the `fused_data` property of our low-level data fusion object: + +```python +lldf.fused_data +``` + +The `fused_data` field is of class `LLDFDataModel`, which is derived from `BaseDataModel`, and contains the same fields (`x_data`, `x_train`, `y`). + +
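+As a quick, purely illustrative sanity check (assuming the `lldf.lldf()` call above succeeded), the fused fields can be inspected like any other `BaseDataModel`: + +```python +# Illustrative inspection of the fused data model +print(lldf.fused_data.x_data.shape)    # fused feature matrix: spectra plus retention time +print(lldf.fused_data.x_train.head())  # classes plus fused features, as a DataFrame +print(lldf.fused_data.y[:3])           # class labels, as a NumPy array +``` + +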
+ +This is the content of `x_data` (a Pandas DataFrame): + +| 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 |Retention time | +|-------|-------|-------|--------|-----|--------|--------|---------------| +| 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 |123.78 | +| 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 |44.19 | +| 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 |22.34 | +| ... | | | | | | | | + +
+ +This is `x_train` (a Pandas DataFrame): + +| Substance | 8 | 8.1 | 8.2 | 8.3 | ... | 9.9 | 10 |Retention time | +|-----------------|-------|-------|-------|--------|-----|--------|--------|---------------| +| Dichloromethane | 2.341 | 3.866 | 1.430 | 5.843 | | 0.032 | 1.128 |123.78 | +| N-hexane | 5.745 | 8.346 | 2.985 | 6.842 | | 1.832 | 3.543 |44.19 | +| Dioxane | 0.003 | 0.002 | 0.006 | 0.0013 | | 11.483 | 10.445 |22.34 | +| ... | ... | | | | | | | | + +
+ +This is the content of `y` (a NumPy ndarray): + +| Substance | +|-----------------| +| Dichloromethane | +| N-hexane | +| Dioxane | +| ... | + + +## Dataset export + +`BaseDataModel` and its derived classes have an `export_to_file` method that exports the complete table (class names and data columns) to an Excel file. + +Let's say we want to export the fused dataset from the previous example into a file called `fused dataset.xlsx`. Here's how to do it, using the `fused_data` attribute of our `lldf` variable from the previous example (the `LLDF` instance with which we joined the two tables): + +```python +lldf.fused_data.export_to_file(export_path='fused dataset.xlsx', sheet_name='Sheet 1') +``` + +Et voilà! Now we have a new file called `fused dataset.xlsx`, inside of which there is a sheet called "Sheet 1" with the following content: + +| Substance       | 8     | 8.1   | 8.2   | 8.3    | ... | 9.9    | 10     | Retention time | +|-----------------|-------|-------|-------|--------|-----|--------|--------|----------------| +| Dichloromethane | 2.341 | 3.866 | 1.430 | 5.843  |     | 0.032  | 1.128  | 123.78         | +| N-hexane        | 5.745 | 8.346 | 2.985 | 6.842  |     | 1.832  | 3.543  | 44.19          | +| Dioxane         | 0.003 | 0.002 | 0.006 | 0.0013 |     | 11.483 | 10.445 | 22.34          | +| ...             |       |       |       |        |     |        |        |                | + +
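+Since the exported sheet uses *Substance* as its class column (the default expected by `load_from_file`), you could later reload it as a plain `BaseDataModel`. A hypothetical round-trip sketch: + +```python +from chemfusekit.__base import BaseDataModel + +# Illustrative: reload the fused dataset exported above. +# class_column defaults to 'Substance', matching the exported sheet. +reloaded = BaseDataModel.load_from_file( +    import_path='fused dataset.xlsx', +    sheet_name='Sheet 1' +) +``` + +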
+ +With this, you now know all the basics of data handling within `ChemFuseKit`. \ No newline at end of file diff --git a/docs/cookbook_versioned_docs/version-2.2.0/introduction.md b/docs/cookbook_versioned_docs/version-2.2.0/introduction.md new file mode 100644 index 0000000..3a4d2ef --- /dev/null +++ b/docs/cookbook_versioned_docs/version-2.2.0/introduction.md @@ -0,0 +1,26 @@ +--- +sidebar_position: 1 +--- + +# The ChemFuseKit Cookbook: an introduction + +*What is a cookbook, exactly?* + +> A cookbook is a comprehensive collection of recipes that guide users through +the process of learning and mastering the use of a specific library or +programming technique, by providing step-by-step instructions, explanations and +examples. + +## What you'll learn + +In this cookbook you will learn the basic principles of operation of `ChemFuseKit` through practical examples and case studies. You will be shown that all modules follow a basic structure, and once you've learned it for one module, you will be able to reapply that knowledge for all modules. + +You will be shown how to use the library on its own, and also how to use it as a part of a bigger pipeline. + +## Cookbook sectioning + +Here we go: + +- first of all, you will be shown the basic principles and structure; +- then, you will be shown three case studies; +- finally, you'll receive instructions on how to modify and expand this library for your own purposes. \ No newline at end of file diff --git a/docs/cookbook_versioned_docs/version-2.2.0/structure.md b/docs/cookbook_versioned_docs/version-2.2.0/structure.md new file mode 100644 index 0000000..24b6ec3 --- /dev/null +++ b/docs/cookbook_versioned_docs/version-2.2.0/structure.md @@ -0,0 +1,197 @@ +--- +sidebar_position: 2 +--- + +# 1. Project structure + +In this cookbook page, you will be shown how the project is structured, and the purpose of each module. + +## Project Hierarchy + +``` +chemfusekit + │ + ├── lda + │ ├── LDASettings + │ └── LDA + │ + ├── lr + │ ├── LRSettings + │ └── LR + │ + ├── plsda + │ ├── PLSDASettings + │ └── PLSDA + │ + ├── pca + │ ├── PCASettings + │ ├── PCA + │ └── PCADataModel + │ + ├── lldf + │ ├── LLDFSettings + │ ├── LLDF + │ └── LLDFDataModel + │ + ├── svm + │ ├── SVMSettings + │ └── SVM + │ + └── knn + ├── KNNSettings + └── KNN +``` + +As you can see, each module contains a class with the same name of the module, and a settings class. That's because this project tries to be as modular and as regular as possible, for clarity and interoperability. + + +## Modular design features + +The entire library was streamlined to make operations as smooth and easy as possible. Any operation (import and export of both data and classifier models, training, processing, prediction, ...) looks the same on any class. + +
+ +> *Want to update the settings in a classifier?* + +You'll find the settings for `LDA` in `LDA.settings`, and the settings for `PCA` in `PCA.settings`. Where are the settings for `SVM`? In `SVM.settings`, of course. You get the idea. + +
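+For instance, here is a minimal sketch that swaps the settings of an already-constructed (hypothetical) `lda` object and retrains it: + +```python +from chemfusekit.lda import LDASettings, GraphMode + +# Hypothetical: `lda` is an existing LDA instance; give it new settings and retrain +lda.settings = LDASettings(components=2, output=GraphMode.TEXT) +lda.lda() +``` + +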
+ +> *Want to inspect the underlying `sklearn` model in one of the classifiers?* + +Let's say you're using an `LR` object. Its underlying sklearn classifier is in `LR.model`, just as the underlying sklearn classifier of `KNN` is in `KNN.model`. + +
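+A minimal sketch, assuming a `knn` object that has already been trained with `knn.knn()`: + +```python +# The fitted scikit-learn estimator is exposed directly on the wrapper +print(type(knn.model))         # e.g. KNeighborsClassifier +print(knn.model.get_params())  # standard scikit-learn introspection +``` + +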
+ +> *Want to swap out the data in a model and retrain it?* + +Let's assume your new data is called `new_data`. Knowing that the training data, when present, is located in the `.data` field, just do this: + +```python +knn.data = new_data +knn.knn() +``` + +The training method is always called like its container class, but in lower case. To train a `KNN` model, like in this case, you just have to call `.knn()` on it. Same goes for `.lda()` on `LDA`, `.lldf()` on `LLDF`, and so on. + + +### Modular settings + +The settings for all classifiers (that is, all classes except `LLDF` and `PCA`) inherit from a base class called [`BaseSettings`](/docs/base/basesettings) in the `base` module: + +```mermaid +classDiagram + class BaseSettings { + +output: GraphMode + +test_split: bool + __init__(output, test_split) + } + + class KNNSettings { + ... + } + + class LDASettings { + ... + } + + class LRSettings { + ... + } + + class PLSDASettings { + ... + } + + class SVMSettings { + ... + } + + BaseSettings *-- KNNSettings + BaseSettings *-- LDASettings + BaseSettings *-- LRSettings + BaseSettings *-- PLSDASettings + BaseSettings *-- SVMSettings +``` + + +### Modular classifiers + +The classifiers themselves all inherit from a base class called [`BaseClassifier`](/docs/base/baseclassifier) in the `base` module: + +```mermaid +classDiagram + + class BaseClassifier { + +settings: BaseSettings + +data: BaseDataModel + +model: sklearn model + __init__(settings, data) + import_model(import_path: str) + export_model(export_path: str) + predict(x_data: pd.DataFrame) + } + + class KNN { + ... + } + + class LDA { + ... + } + + class LR { + ... + } + + class PLSDA { + ... + } + + class SVM { + ... + } + + BaseClassifier *-- KNN + BaseClassifier *-- LDA + BaseClassifier *-- LR + BaseClassifier *-- PLSDA + BaseClassifier *-- SVM +``` + + +### Modular data types + +The data types are modular and interexchangeable too. Both [`LLDFDataModel`](/docs/lldf/lldfmodel) and [`PCADataModel`](/docs/pca/pcadatamodel) inherit from [`BaseDataModel`](/docs/base/basedatamodel) as shown in the following diagram: + +```mermaid +classDiagram + class BaseDataModel { + +x_data: DataFrame + +x_train: DataFrame + +y: ndarray + __init__(x_data, x_train, y) + } + + class LLDFDataModel { + ... + __init__(...) + } + + class PCADataModel { + +array_scores: ndarray + +components: int + __init__(..., array_scores) + } + + BaseDataModel *-- LLDFDataModel + BaseDataModel *-- PCADataModel +``` + +This allows all the classifiers to use the `LLDF` data, dimension-reduced `PCA` data, or any other type of data as long as it follows the `BaseDataModel` template. + +## File import and export + +All the data models (`BaseDataModel`, and its derived, `LLDFDataModel` and `PCADataModel`) can export their content to Excel tables. + +All classifiers derived from `BaseClassifier` (`KNN`, `LDA`, `LR`, `PLSDA`, `SVM`) can import and export their sklearn data model from and to file. \ No newline at end of file diff --git a/docs/cookbook_versioned_sidebars/version-2.2.0-sidebars.json b/docs/cookbook_versioned_sidebars/version-2.2.0-sidebars.json new file mode 100644 index 0000000..caea0c0 --- /dev/null +++ b/docs/cookbook_versioned_sidebars/version-2.2.0-sidebars.json @@ -0,0 +1,8 @@ +{ + "tutorialSidebar": [ + { + "type": "autogenerated", + "dirName": "." 
+ } + ] +} diff --git a/docs/cookbook_versions.json b/docs/cookbook_versions.json index dc236fe..293afb9 100644 --- a/docs/cookbook_versions.json +++ b/docs/cookbook_versions.json @@ -1,4 +1,5 @@ [ + "2.2.0", "2.1.0", "2.0.0" ] diff --git a/docs/versioned_docs/version-2.2.0/base/_category_.json b/docs/versioned_docs/version-2.2.0/base/_category_.json new file mode 100644 index 0000000..ac3f2d9 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/base/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "Base module", + "position": 9, + "link": { + "type": "generated-index", + "description": "A module containing base classes for all the other modules." + } +} diff --git a/docs/versioned_docs/version-2.2.0/base/baseclassifier.md b/docs/versioned_docs/version-2.2.0/base/baseclassifier.md new file mode 100644 index 0000000..05448ad --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/base/baseclassifier.md @@ -0,0 +1,37 @@ +--- +sidebar_position: 1 +--- + +# BaseClassifier class + +A base class from which all classifiers inherit. + +## Syntax + +```python +BaseClassifier(settings: BaseSettings, data: BaseDataModel) +``` + +## Constructor parameters + +- `settings`: object of type [`BaseSettings`](./basesettings.md). Contains the settings for + the `BaseClassifier` object. +- `data`: object of type [`BaseDataModel`](../base/basedatamodel.md). Contains the data to be analyzed. + +## Fields + +- `settings`: object of type [`KNNSettings`](/tesi/docs/knn/knnsettings). Contains the settings for + the `BaseClassifier` object. +- `data`: object of type [`BaseDataModel`](../base/basedatamodel.md). Contains the data to be analyzed. +- `model`: a `sklearn` model from `scikit-learn`. Defaults to `None`. + +## Methods + +- `import_model(import_path: str)`: loads a model from file +- `export_model(export_path: str)`: exports a model to file + - *raises*: + - `RuntimeError("You haven't trained the model yet! You cannot export it now.")` when trying to export an untrained model +- `predict(x_data: pd.DataFrame)`: performs prediction through the `model` + - *raises*: + - `TypeError("X data for prediction must be non-empty.")` on empty `x_data` + - `RuntimeError("The model is not trained yet!")` when run with an untrained `model` diff --git a/docs/versioned_docs/version-2.2.0/base/basedatamodel.md b/docs/versioned_docs/version-2.2.0/base/basedatamodel.md new file mode 100644 index 0000000..fc9efb3 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/base/basedatamodel.md @@ -0,0 +1,28 @@ +--- +sidebar_position: 3 +--- + +# BaseDataModel class + +This class models the output data for all data-outputting operations (currently, the [`LLDF`](../lldf/lldf-class.md) operation and the [`PCA`](../pca/pca.md) operation). 
+ +## Syntax + +```python +BaseDataModel(x_data: pd.DataFrame, x_train: pd.DataFrame, y: pd.DataFrame) +``` + +## Fields and constructor parameters + +The first two are `Pandas` `DataFrame` objects: +- `x_data` +- `x_train` +The last is a `NumPy` `ndarray`: +- `y` + +## Methods + +Both methods are inherited from [`BaseDataModel`](../base/basedatamodel.md): + +- `@classmethod load_from_file(cls, import_path: str, sheet_name: str = 'Sheet1', class_column: str = 'Substance', index_column: str | None = None):`: creates a `BaseDataModel` instance from an Excel file +- `export_to_file(export_path: str, sheet_name: str = 'Sheet1')`: exports the `BaseDataModel` contents to an Excel table diff --git a/docs/versioned_docs/version-2.2.0/base/basesettings.md b/docs/versioned_docs/version-2.2.0/base/basesettings.md new file mode 100644 index 0000000..740581b --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/base/basesettings.md @@ -0,0 +1,21 @@ +--- +sidebar-position: 1 +--- + +# BaseSettings class + +Holds the settings for all classifier object. It's not meant for direct usage, only for inheritance. + + +## Syntax + +```python +BaseSettings(output: GraphMode, test_split: false) +``` + +## Fields and constructor parameters +- `output`: toggles graph output mode. Defaults to [`GraphMode.NONE`](../utils/graphmode.md). +- `test_split`: toggles the training split test phase. Defaults to `False`. Requires `output` to be set to `True` to work. + +The constructor raises: +- `Warning("You selected test_split but it won't run because you disabled the output.")` if `test_split` is run with `output` set to false (split tests only produce graphical output, and are useless when run with disabled output). diff --git a/docs/versioned_docs/version-2.2.0/complete-workflow.md b/docs/versioned_docs/version-2.2.0/complete-workflow.md new file mode 100644 index 0000000..f607520 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/complete-workflow.md @@ -0,0 +1,31 @@ +--- +sidebar-position: 7 +--- + +# Complete workflow + +Here's a sequence diagram to represent an example workflow, from the raw data +tables to classification, including data fusion, PCA and training. + +```plantuml +actor User +participant LLDF +participant PCA +participant Classifier + +User -> LLDF : Upload training tables +User -> LLDF : Set parameters +User -> Classifier : (optional) Upload model + +LLDF -> PCA : Pass preprocessed / fused tables +LLDF --> User : Download fused tables +LLDF -> Classifier : Pass preprocessed / fused tables \nRun classification +PCA -> Classifier : (optional) Set number of components + +Classifier --> User : classification results, graphs +PCA --> User : classification results, graphs +Classifier --> User : (optional) download trained model + +User -> Classifier : pass data to classify +Classifier --> User : classification results +``` diff --git a/docs/versioned_docs/version-2.2.0/knn/_category_.json b/docs/versioned_docs/version-2.2.0/knn/_category_.json new file mode 100644 index 0000000..e670264 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/knn/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "kNN module", + "position": 6, + "link": { + "type": "generated-index", + "description": "A module for k-nearest neighbors analysis." 
+ } +} diff --git a/docs/versioned_docs/version-2.2.0/knn/knn.md new file mode 100644 index 0000000..2cecad5 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/knn/knn.md @@ -0,0 +1,47 @@ +--- +sidebar_position: 1 +--- + +# KNN class + +A class to store the data, methods and artifacts for _k-Nearest Neighbors Analysis_. + +## Syntax + +```python +KNN(settings: KNNSettings, data: BaseDataModel) +``` + +## Constructor parameters + +- `settings`: object of type [`KNNSettings`](knnsettings.md). Contains the settings for +  the `KNN` object. +- `data`: object of type [`BaseDataModel`](../base/basedatamodel.md). Contains the data to be analyzed. + +## Fields + +- `settings`: object of type [`KNNSettings`](./knnsettings.md). Contains the settings for +  the `KNN` object. +- `fused_data`: object of type [`LLDFDataModel`](../lldf/lldfmodel.md). Contains the +  artifacts from the data fusion process. +- `model`: a `KNeighborsClassifier` model from `scikit-learn`. Defaults to `None`. + +## Methods + +- `knn(self)`: trains the k-Nearest Neighbors Analysis model +- `predict(self, x_data)`: performs kNN prediction once the model is trained. +  - *raises*: +    - `RuntimeError("The kNN model is not trained yet!")` if the `KNN` model hasn't been trained yet + +## Example + +```python +from chemfusekit.knn import KNN + +# Initialize and run the KNN class +knn = KNN(settings, lldf.fused_data) +knn.knn() + +# Run predictions +knn.predict(x_data) +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/knn/knnsettings.md new file mode 100644 index 0000000..81f9486 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/knn/knnsettings.md @@ -0,0 +1,64 @@ +--- +sidebar_position: 2 +--- + +# KNNSettings class + +Holds the settings for the [`KNN`](knn.md) object. + +Inherits from [`BaseSettings`](../base/basesettings.md). + +## Syntax + +```python +KNNSettings( +    n_neighbors: int, +    metric: str | Callable, +    weights: str | Callable, +    algorithm: str, +    output: GraphMode, +    test_split: bool +) +``` + +## Fields and constructor parameters +- `n_neighbors`: the number of neighbors to be used in the `KNN` model. Defaults to 15. +- `metric`: the distance metric for the model. It can take one of the following values: +  - `minkowski` +  - `precomputed` +  - `euclidean` +  or be a callable object. +- `weights`: the weight metric for the model. It can take one of the following values: +  - `uniform` +  - `distance` +  or be a callable object. +- `algorithm`: the algorithm for the model. It can take one of the following values: +  - `auto` +  - `ball_tree` +  - `kd_tree` +  - `brute` +- `output`: toggles graph output mode. Defaults to [`GraphMode.NONE`](../utils/graphmode.md). +- `test_split`: toggles the training split test phase. Defaults to `False`. Requires `output` to not be set to `GraphMode.NONE` to work. + +The constructor raises: +- `ValueError("Invalid n_neighbors number: should be a positive integer.")` if the number of neighbors is not valid. +- `ValueError("Invalid metric: should be 'minkwoski', 'precomputed', 'euclidean' or a callable.")` if the chosen metric is neither available nor a callable function. +- `ValueError("Invalid weight: should be 'uniform', 'distance' or a callable")` if the chosen weight is neither available nor a callable function. +- `ValueError("Invalid algorithm: should be 'auto', 'ball_tree', 'kd_tree' or 'brute'.")` if the chosen algorithm does not exist. 
+- `Warning("You selected test_split but it won't run because you disabled the output.")` if `test_split` is run with `output` set to false (split tests only produce graphical output, and are useless when run with disabled output). + +## Example + +```python +from chemfusekit.knn import KNNSettings, GraphMode + +settings = KNNSettings( + n_neighbors=20, # pick 20 neighbors + metric='minkowski', # choose the metric + weights='distance', # choose the weight metric + algorithm='auto', # the best algorithm gets chosen automatically + output=GraphMode.GRAPHIC, # graph output is enabled + test_split=True # the model will be split-tested at the end of the training +) +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/lda/_category_.json b/docs/versioned_docs/version-2.2.0/lda/_category_.json new file mode 100644 index 0000000..8d1f6af --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lda/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "LDA module", + "position": 4, + "link": { + "type": "generated-index", + "description": "A module for linear discriminant analysis." + } +} diff --git a/docs/versioned_docs/version-2.2.0/lda/lda.md b/docs/versioned_docs/version-2.2.0/lda/lda.md new file mode 100644 index 0000000..2b8f19e --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lda/lda.md @@ -0,0 +1,48 @@ +--- +sidebar_position: 1 +--- + +# LDA class + +A class to store the data, methods and artifacts for _Linear Discriminant Analysis_. + +## Syntax + +```python +LDA(settings: LDASettings, data: BaseDataModel) +``` + +## Constructor parameters + +- `settings`: object of type [`LDASettings`](./ldasettings.md). Contains the settings for + the `LDA` object. +- `data`: object of type [`BaseDataModel`](../base/basedatamodel.md). Contains the data to be analyzed. + +## Fields + +- `settings`: object of type [`LDASettings`](./ldasettings.md). Contains the settings for + the `LDA` object. +- Fused data fields: + - `x_data` + - `x_train` + - `y` +- `model`: a `LinearDiscriminantAnalysis` model from `scikit-learn`. Defaults to `None`. + +## Methods + +- `lda(self)`: performs Linear Discriminant Analysis +- `__print_prediction_graphs(self, y_test, y_pred)`: helper function to print + graphs and stats about LDA predictions +- `predict(self, x_data)`: performs LDA prediction once the model is trained. + - *raises*: + - `RuntimeError("The LDA model is not trained yet!")` if the LDA model hasn't been trained yet + +## Example + +```python +from chemfusekit.lda import LDA + +# Initialize and run the LDA class +lda = LDA(lldf.fused_data, settings) +lda.lda() +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/lda/ldasettings.md b/docs/versioned_docs/version-2.2.0/lda/ldasettings.md new file mode 100644 index 0000000..c9c850e --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lda/ldasettings.md @@ -0,0 +1,39 @@ +--- +sidebar_position: 2 +--- + +# LDASettings class + +Holds the settings for the [`LDA`](./lda.md) object. + +Inherits from [`BaseSettings`](../base/basesettings.md). + +## Syntax + +```python +LDASettings(components: int, output: GraphMode, split_test: bool) +``` + +## Fields and constructor parameters + +- `components`: the amount of components to be used in the LDA model. Defaults to 3. +- `output`: toggles graph output. Defaults to [`GraphMode.NONE`](../utils/graphmode.md). +- `test_split`: toggles split testing. Defaults to `False`. 
+ + +The constructor raises: +- `ValueError("Invalid component number: must be a > 1 integer.")` if the number of + components is not valid. +- `Warning("You selected test_split but it won't run because you disabled the output.")` if split tests are run with `output` disabled + +## Example + +```python +from chemfusekit.lda import LDASettings, GraphMode + +settings = LDASettings( + components=(pca.components - 1), # one less component than the number determined by PCA + output=GraphMode.GRAPHIC, # graphs will be printed + test_split=True # split testing is enabled +) +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/lldf/_category_.json b/docs/versioned_docs/version-2.2.0/lldf/_category_.json new file mode 100644 index 0000000..096ed0c --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lldf/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "LLDF Module", + "position": 2, + "link": { + "type": "doc", + "id": "index" + } +} diff --git a/docs/versioned_docs/version-2.2.0/lldf/index.mdx b/docs/versioned_docs/version-2.2.0/lldf/index.mdx new file mode 100644 index 0000000..01c1d16 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lldf/index.mdx @@ -0,0 +1,47 @@ +import DocCardList from '@theme/DocCardList'; + +# LLDF Module + +A module for low-level data fusion. + +# Members + + + +# UML + +```mermaid +classDiagram + class Table { + +str file_path + +str sheet_name + +str preprocessing + __init__(file_path, sheet_name, preprocessing) + } + + class LLDFSettings { + +GraphOutput output + __init__(output) + } + + class LLDF { + +LLDFSettings settings + +Table[] tables + +LLDFDataModel | None fused_data + lldf() + -_snv() + +export_data(export_path: str) + __init__(settings, tables[]) + } + + class LLDFDataModel { + +x_data: pd.DataFrame + +x_train: pd.DataFrame + +y: np.ndarray + __init__(x_data, x_train, y) + } + + LLDF *-- LLDFModel + LLDF *-- Table + LLDF *-- LLDFSettings +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/lldf/lldf-class.md b/docs/versioned_docs/version-2.2.0/lldf/lldf-class.md new file mode 100644 index 0000000..1a0be38 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lldf/lldf-class.md @@ -0,0 +1,67 @@ +--- +sidebar_position: 1 +--- + +# LLDF class + +The `LLDF` class is used for _low-level data fusion_. + +## Syntax + +```python +LLDF(tables: List[Table], lldf_settings: LLDFSettings) +``` + +## Constructor parameters + +- `tables`: `List[`[`Table`](./table.md)`]` + + A list of `Table` objects containing info about the files to import + +- `lldf_settings`: [`LLDFSettings`](./lldfsettings) + + The settings for the LLDF object. + +## Fields + +- `settings`: [`LLDFSettings`](./lldfsettings) + + The settings for the LLDF object. + +- `tables`: `List[`[`Table`](./table.md)`]` + + A list of `Table` objects containing info about the files to import + +- `fused_data`: [`LLDFModel`](./lldfmodel.md) + + The resulting model containing the data fusion artifacts. 
+ +## Methods + +- `_snv(self, input_data)`: static method to rescale input arrays +- `lldf(self)`: performs low-level data fusion on the data passed in the settings + - *raises*: + - `FileNotFoundError("Error opening the selected files.")` + if the files specified in the settings are not valid + - `SyntaxError("LLDF: this type of preprocessing does not exist")` + if the preprocessing method specified in the settings is not valid +- `export_data(self, export_path)`: exports the data fusion artifacts to an Excel file + - *raises*: + - `RuntimeError("Cannot export data before data fusion.")` if export is + attempted before fusing the data + - `RuntimeError("Could not export data to the selected path.")` if any error + happens during the export phase + + +## Example + +```python +from chemfusekit.lldf import LLDF + +# Initialize and run low-level data fusion +lldf = LLDF(tables, lldf_settings) +lldf.lldf() + +# Export the LLDF data to an Excel file +lldf.export_data('output_file.xlsx') +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/lldf/lldfmodel.md b/docs/versioned_docs/version-2.2.0/lldf/lldfmodel.md new file mode 100644 index 0000000..98291e3 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lldf/lldfmodel.md @@ -0,0 +1,29 @@ +--- +sidebar_position: 3 +--- + +# LLDFDataModel class + +This class models the output data from the [`LLDF`](./lldf-class.md) operation. + +It inherits from the [`BaseDataModel`](../base/basedatamodel.md). + +## Syntax + +```python +LLDFModel(x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray) +``` + +## Fields and constructor parameters + +The first two are `Pandas` `DataFrame` objects: +- `x_data` +- `x_train` + +The last is a `NumPy` `ndarray`: +- `y` + +## Methods + +- `@classmethod def load_from_file(cls, import_path: str, sheet_name: str = 'Sheet1', class_column: str = 'Substance', index_column: str | None = None)`: creates an `LLDFDataModel` instance from an Excel file +- `export_to_file(export_path: str, sheet_name: str = 'Sheet1')`: exports the `LLDFDataModel` contents to an Excel table \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/lldf/lldfsettings.md b/docs/versioned_docs/version-2.2.0/lldf/lldfsettings.md new file mode 100644 index 0000000..30f985f --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lldf/lldfsettings.md @@ -0,0 +1,25 @@ +--- +sidebar_position: 2 +--- + +# LLDFSettings class + +Holds the settings for the [`LLDF`](./lldf-class.md) object. + +## Syntax + +```python +LLDFSettings(output: GraphMode) +``` + +## Fields and constructor parameters +- `output`: toggles graph output. Defaults to [`GraphMode.NONE`](../utils/graphmode.md). + +## Example + +```python +from chemfusekit.lldf import LLDFSettings + +# Initialize the settings for low-level data fusion +lldf_settings = LLDFSettings(output=GraphMode.TEXT) +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/lldf/table.md b/docs/versioned_docs/version-2.2.0/lldf/table.md new file mode 100644 index 0000000..200181e --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lldf/table.md @@ -0,0 +1,45 @@ +--- +sidebar_position: 4 +--- + +# Table class + +Holds the information for a single table to import. + +The [`LLDF`](./lldf-class.md) object takes a list of `Table` as a parameter. 
+ +## Syntax + +```python +Table( +    file_path: str, +    sheet_name: str, +    preprocessing: str, +    class_column: str, +    index_column: str | None +) +``` + +## Fields and constructor parameters + +- `file_path`: a `str` containing the path to the Excel datasheet +- `sheet_name`: a `str` containing the name of the sheet to select within the Excel file +- `preprocessing`: a `str` with the name of the preprocessing to be applied to the table. +  Available options: `snv` (normalization), `savgol` (Savitzky-Golay smoothing), `savgol+snv` (both), `none` (no processing). +- `class_column`: a `str` indicating the name of the class column within the Excel datasheet. Defaults to `'Substance'`. +- `index_column`: a `str | None` indicating the name of the index column within the Excel datasheet. Defaults to `None` (in that case, the first column will be treated as the index). + +## Example + +```python +from chemfusekit.lldf import Table + +# Create a table +table1 = Table( +    file_path='tests/qepas.xlsx', +    sheet_name='Sheet1', +    preprocessing='snv',      # normalization preprocessing; other options: savgol, savgol+snv or none +    class_column='substance', # the column called 'substance' in the dataset will be treated as the class column +    index_column='sample'     # the column called 'sample' in the dataset will be treated as the index column +) +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/lr/_category_.json new file mode 100644 index 0000000..c9a9071 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lr/_category_.json @@ -0,0 +1,8 @@ +{ +  "label": "LR module", +  "position": 6, +  "link": { +    "type": "generated-index", +    "description": "A module for logistic regression." +  } +} diff --git a/docs/versioned_docs/version-2.2.0/lr/lr.md new file mode 100644 index 0000000..f7ded6f --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lr/lr.md @@ -0,0 +1,51 @@ +--- +sidebar_position: 1 +--- + +# LR class + +A class to store the data, methods and artifacts for _Logistic Regression_. + +## Syntax + +```python +LR(settings: LRSettings, array_scores: np.ndarray, y: np.ndarray) +``` + +## Constructor parameters + +- `settings`: object of type [`LRSettings`](./lrsettings.md). Contains the settings for +  the `LR` object. +- `array_scores`: `np.ndarray`, product of [`PCA` analysis](../pca/). +- `y`: `np.ndarray`, product of [`PCA` analysis](../pca/). + +## Fields + +- `settings`: object of type [`LRSettings`](./lrsettings.md). Contains the settings for +  the `LR` object. +- `array_scores`: product of [`PCA` analysis](../pca/). +- `y`: product of [`PCA` analysis](../pca/). +- `model`: a `LogisticRegression` model from `scikit-learn`. Defaults to `None`. + +## Methods + +- `lr(self)`: performs Logistic Regression. +- `predict(self, x_sample)`: performs LR-based classification on input data. +  - *raises*: +    - `RuntimeError("The LR model is not trained yet!")` if prediction is +      started without training the model first; +    - `TypeError("X data for LDA prediction must be non-empty.")` if +      the data passed as argument is empty. 
+ +## Example + +```python +from chemfusekit.lr import LR + +# Initialize and train the LR class +lr = LR(settings, array_scores, y) +lr.lr() + +# Perform prediction +lr.predict(x_sample) +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/lr/lrsettings.md b/docs/versioned_docs/version-2.2.0/lr/lrsettings.md new file mode 100644 index 0000000..847c002 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/lr/lrsettings.md @@ -0,0 +1,44 @@ +--- +sidebar_position: 2 +--- + +# LRSettings class + +Holds the settings for the [`LR`](./lr.md) object. + +Inherits from [`BaseSettings`](../base/basesettings.md). + +## Syntax + +```python +LRSettings(algorithm: str, output: GraphMode, test_split: bool) +``` + +## Fields and constructor parameters + +- `algorithm`: the amount of components to be used in the LDA model. Defaults to + `liblinear`. Other available options: + - `lbfgs` + - `newton-cg` + - `newton-cholesky` + - `sag` + - `saga` +- `output`: toggles graph output. Defaults to [`GraphMode.NONE`](../utils/graphmode.md). +- `test_split`: toggles split testing. Defaults to `False`. + +The constructor raises: +- `ValueError("This algorithm does not exist.")` if the selected `algorithm` + is not a valid option. +- `Warning("You selected test_split but it won't run because you disabled the output.")` if split tests are run with `output` disabled + +## Example + +```python +from chemfusekit.lr import LRSettings, GraphMode + +settings = LRSettings( + algorithm='newton-cg', + output=GraphMode.GRAPHIC, # graphs will be printed + test_split=True # split testing is enabled +) +``` \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/pca/_category_.json b/docs/versioned_docs/version-2.2.0/pca/_category_.json new file mode 100644 index 0000000..266d36f --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/pca/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "PCA module", + "position": 3, + "link": { + "type": "generated-index", + "description": "A module for principal component analysis." + } +} diff --git a/docs/versioned_docs/version-2.2.0/pca/pca.md b/docs/versioned_docs/version-2.2.0/pca/pca.md new file mode 100644 index 0000000..1a07127 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/pca/pca.md @@ -0,0 +1,46 @@ +--- +sidebar_position: 1 +--- + +# PCA class + +A class to store the data, methods and artifacts for _Principal Component Analysis_. + +## Syntax + +```python +PCA(settings: PCASettings, data: BaseDataModel) +``` + +## Constructor parameters + +- `settings`: object of type [`PCASettings`](./pcasettings.md). Contains the settings for + the `PCA` object. +- `data`: object of type [`BaseDataModel`](../base/basedatamodel.md). Contains the data to be analyzed. + +## Fields + +- `fused_data`: object of type [`LLDF`](../lldf/lldf-class.md). Contains the data to be analyzed. +- `components`: Number of components for the PCA analysis. Defaults to 0. +- `pca_model`: A `PCA` model from `scikit-learn`. Defaults to `None`. +- `settings`: object of type [`PCASettings`](./pcasettings.md). Contains the settings for + the `PCA` object. + +## Methods + +- `pca(self)`: performs Principal Component Analysis +- `pca_stats(self)` produces PCA-related statistics and graphs. 
+
+## Example
+
+```python
+from chemfusekit.pca import PCA
+
+# Initialize and run the PCA class (settings first, then data, as in the syntax above)
+pca = PCA(pca_settings, lldf.fused_data)
+pca.pca()
+
+# Print the number of components and the statistics
+print(pca.components)
+pca.pca_stats()
+```
\ No newline at end of file
diff --git a/docs/versioned_docs/version-2.2.0/pca/pcadatamodel.md b/docs/versioned_docs/version-2.2.0/pca/pcadatamodel.md
new file mode 100644
index 0000000..f76b156
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/pca/pcadatamodel.md
@@ -0,0 +1,35 @@
+---
+sidebar_position: 3
+---
+
+# PCADataModel class
+
+This class models the output data from the [`PCA`](./pca.md) operation.
+
+It inherits from the [`BaseDataModel`](../base/basedatamodel.md).
+
+## Syntax
+
+```python
+PCADataModel(x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray, array_scores: np.ndarray, components: int)
+```
+
+## Fields and constructor parameters
+
+The first two are `Pandas` `DataFrame` objects:
+- `x_data`
+- `x_train`
+
+The next two are `NumPy` `ndarray`s:
+- `y`
+- `array_scores`
+
+The last is an integer:
+- `components`
+
+## Methods
+
+Both methods are inherited from [`BaseDataModel`](../base/basedatamodel.md):
+
+- `@classmethod def load_from_file(import_path: str, sheet_name: str = 'Sheet1')`: creates a `BaseDataModel` instance from an Excel file
+- `export_to_file(export_path: str, sheet_name: str = 'Sheet1')`: exports the `BaseDataModel` contents to an Excel table
diff --git a/docs/versioned_docs/version-2.2.0/pca/pcasettings.md b/docs/versioned_docs/version-2.2.0/pca/pcasettings.md
new file mode 100644
index 0000000..ca533ea
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/pca/pcasettings.md
@@ -0,0 +1,41 @@
+---
+sidebar_position: 2
+---
+
+# PCASettings class
+
+Holds the settings for the [`PCA`](./pca.md) object.
+
+## Syntax
+
+```python
+PCASettings(
+    target_variance: float,
+    confidence_level: float,
+    initial_components: int,
+    output: GraphMode
+)
+```
+
+## Fields and constructor parameters
+
+- `target_variance`: the minimum cumulative explained variance to reach in the analysis.
+  Defaults to 0.95.
+- `confidence_level`: the confidence level for statistical tests. Defaults to 0.05.
+- `initial_components`: the minimum number of components to be used in the PCA model.
+  Defaults to 10.
+- `output`: toggles graph output. Defaults to [`GraphMode.NONE`](../utils/graphmode.md).
+
+## Example
+
+```python
+from chemfusekit.pca import PCASettings, GraphMode
+
+# Initialize the settings for Principal Component Analysis
+pca_settings = PCASettings(
+    target_variance=0.99,
+    confidence_level=0.05,
+    initial_components=10,
+    output=GraphMode.GRAPHIC # graphs will be printed
+)
+```
\ No newline at end of file
diff --git a/docs/versioned_docs/version-2.2.0/plsda/_category_.json b/docs/versioned_docs/version-2.2.0/plsda/_category_.json
new file mode 100644
index 0000000..aaf0cbf
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/plsda/_category_.json
@@ -0,0 +1,8 @@
+{
+  "label": "PLSDA module",
+  "position": 7,
+  "link": {
+    "type": "generated-index",
+    "description": "A module for partial least squares discriminant analysis."
+  }
+}
diff --git a/docs/versioned_docs/version-2.2.0/plsda/plsda.md b/docs/versioned_docs/version-2.2.0/plsda/plsda.md
new file mode 100644
index 0000000..3287fd6
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/plsda/plsda.md
@@ -0,0 +1,47 @@
+---
+sidebar_position: 1
+---
+
+# PLSDA class
+
+A class to store the data, methods and artifacts for _Partial Least Squares Discriminant Analysis_.
+
+## Syntax
+
+```python
+PLSDA(settings: PLSDASettings, data: BaseDataModel)
+```
+
+## Constructor parameters
+
+- `settings`: object of type [`PLSDASettings`](plsdasettings.md). Contains the settings for
+  the `PLSDA` object.
+- `data`: object of type [`BaseDataModel`](../base/basedatamodel.md). Contains the data to be analyzed.
+
+## Fields
+
+- `settings`: object of type [`PLSDASettings`](./plsdasettings.md). Contains the settings for
+  the `PLSDA` object.
+- `fused_data`: object of type [`LLDFModel`](../lldf/lldfmodel.md). Contains the
+  artifacts from the data fusion process.
+- `model`: a `PLSRegression` model from `scikit-learn`. Defaults to `None`.
+
+## Methods
+
+- `plsda(self)`: trains the Partial Least Squares Discriminant Analysis model.
+- `predict(self, x_data)`: performs PLSDA prediction once the model is trained.
+  - *raises*:
+    - `RuntimeError("The PLSDA model is not trained yet!")` if the `PLSDA` model hasn't been trained yet
+
+## Example
+
+```python
+from chemfusekit.plsda import PLSDA
+
+# Initialize and run the PLSDA class
+plsda = PLSDA(settings, lldf.fused_data)
+plsda.plsda()
+
+# Run predictions
+plsda.predict(x_data)
+```
\ No newline at end of file
diff --git a/docs/versioned_docs/version-2.2.0/plsda/plsdasettings.md b/docs/versioned_docs/version-2.2.0/plsda/plsdasettings.md
new file mode 100644
index 0000000..7b547a7
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/plsda/plsdasettings.md
@@ -0,0 +1,38 @@
+---
+sidebar_position: 2
+---
+
+# PLSDASettings class
+
+Holds the settings for the [`PLSDA`](./plsda.md) object.
+
+Inherits from [`BaseSettings`](../base/basesettings.md).
+
+## Syntax
+
+```python
+PLSDASettings(n_components: int, output: GraphMode, test_split: bool)
+```
+
+## Fields and constructor parameters
+
+- `n_components`: number of components for the PLSDA analysis. Defaults to 3.
+- `output`: toggles graph output. Defaults to [`GraphMode.NONE`](../utils/graphmode.md).
+- `test_split`: toggles the training split test phase. Defaults to `False`. Requires `output` to not be set to `GraphMode.NONE` to work.
+
+The constructor raises:
+- `ValueError("Invalid n_components number: should be a positive integer.")` if the number of components is below 1.
+- `Warning("You selected test_split but it won't run because you disabled the output.")` if `test_split` is run with `output` set to `GraphMode.NONE` (split tests only produce graphical output, and are useless when run with disabled output).
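+
+As a minimal sketch of this validation (hypothetical values only, relying on the defaults for the other parameters), an invalid component count is rejected at construction time:
+
+```python
+from chemfusekit.plsda import PLSDASettings
+
+try:
+    PLSDASettings(n_components=0)  # not a positive integer
+except ValueError as err:
+    print(err)  # "Invalid n_components number: should be a positive integer."
+```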
+
+## Example
+
+```python
+from chemfusekit.plsda import PLSDASettings, GraphMode
+
+# Initialize the settings for Partial Least Squares Discriminant Analysis
+plsda_settings = PLSDASettings(
+    n_components=5,
+    output=GraphMode.GRAPHIC, # graphs will be printed
+    test_split=False # no split testing
+)
+```
\ No newline at end of file
diff --git a/docs/versioned_docs/version-2.2.0/svm/_category_.json b/docs/versioned_docs/version-2.2.0/svm/_category_.json
new file mode 100644
index 0000000..f0ebda8
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/svm/_category_.json
@@ -0,0 +1,8 @@
+{
+  "label": "SVM module",
+  "position": 5,
+  "link": {
+    "type": "generated-index",
+    "description": "A module for support vector machine analysis."
+  }
+}
diff --git a/docs/versioned_docs/version-2.2.0/svm/svm.md b/docs/versioned_docs/version-2.2.0/svm/svm.md
new file mode 100644
index 0000000..b984ff7
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/svm/svm.md
@@ -0,0 +1,49 @@
+---
+sidebar_position: 1
+---
+
+# SVM class
+
+A class to store the data, methods and artifacts for _Support Vector Machine Analysis_.
+
+## Syntax
+
+```python
+SVM(settings: SVMSettings, data: BaseDataModel)
+```
+
+## Constructor parameters
+
+- `settings`: object of type [`SVMSettings`](./svmsettings.md). Contains the settings for
+  the `SVM` object.
+- `data`: object of type [`BaseDataModel`](../base/basedatamodel.md). Contains the data to be analyzed.
+
+The constructor raises:
+- `ValueError("Fused data input cannot be empty.")` if the input data is null
+- `ValueError("Settings cannot be empty.")` if the settings are null
+
+## Fields
+
+- `fused_data`: object of type [`LLDFModel`](../lldf/lldfmodel.md). Contains the data to be analyzed.
+- `settings`: object of type [`SVMSettings`](./svmsettings.md). Contains the settings for
+  the `SVM` object.
+- `pca_model`: an `SVM` model from `scikit-learn`. Defaults to `None`.
+
+## Methods
+
+- `svm(self)`: performs Support Vector Machine analysis.
+  - *raises*:
+    - `ValueError("SVM: this type of kernel does not exist.")` if the kernel type is invalid
+- `predict(self, x_data)`: performs classification based on SVM.
+  - *raises*:
+    - `RuntimeError("The model hasn't been trained yet!")` if the model is null
+
+## Example
+
+```python
+from chemfusekit.svm import SVM
+
+# Initialize and run the SVM class
+svm = SVM(settings, lldf.fused_data)
+svm.svm()
+```
\ No newline at end of file
diff --git a/docs/versioned_docs/version-2.2.0/svm/svmsettings.md b/docs/versioned_docs/version-2.2.0/svm/svmsettings.md
new file mode 100644
index 0000000..7ebf852
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/svm/svmsettings.md
@@ -0,0 +1,43 @@
+---
+sidebar_position: 2
+---
+
+# SVMSettings class
+
+Holds the settings for the [`SVM`](./svm.md) object.
+
+Inherits from [`BaseSettings`](../base/basesettings.md).
+
+## Syntax
+
+```python
+SVMSettings(kernel: str, output: GraphMode, test_split: bool)
+```
+
+## Fields and constructor parameters
+
+- `kernel`: the type of kernel to use in the SVM analysis. Available options:
+  - `linear`
+  - `poly`
+  - `gaussian`
+  - `sigmoid`
+  Defaults to `linear`.
+- `output`: toggles graph output. Defaults to [`GraphMode.NONE`](../utils/graphmode.md).
+- `test_split`: toggles split testing. Defaults to `False`.
+
+The constructor raises:
+- `ValueError("Invalid type: must be linear, poly, gaussian or sigmoid")` if the selected kernel is not one of the available options
+- `Warning("You selected test_split but it won't run because you disabled the output.")` if split tests are run with `output` disabled
+
+## Example
+
+```python
+from chemfusekit.svm import SVMSettings, GraphMode
+
+# Initialize the settings for Support Vector Machine
+svm_settings = SVMSettings(
+    kernel='linear',
+    output=GraphMode.GRAPHIC, # graphs will be printed
+    test_split=True # split testing is enabled
+)
+```
\ No newline at end of file
diff --git a/docs/versioned_docs/version-2.2.0/tutorial.md b/docs/versioned_docs/version-2.2.0/tutorial.md
new file mode 100644
index 0000000..7945b21
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/tutorial.md
@@ -0,0 +1,109 @@
+---
+sidebar_position: 1
+---
+
+# Tutorial
+
+Let's discover **Data Fusion**.
+
+As a simple example, we will train an LDA model and use it for classification.
+
+First of all, let's install the package:
+```bash
+pip install chemfusekit
+```
+
+## First step: data fusion
+
+We will load the `LLDFSettings` with the paths to some Excel datasheets, containing
+respectively the data from a QEPAS spectrometer and a GC chromatograph.
+
+We will pick normalization as the preprocessing technique for the data.
+
+The `LLDF` class will take these settings and perform low-level data fusion on the
+two Excel tables we picked.
+
+```python
+from chemfusekit.lldf import LLDFSettings, LLDF
+
+# Initialize the settings for low-level data fusion
+lldf_settings = LLDFSettings(
+    qepas_path='tests/qepas.xlsx',
+    qepas_sheet='Sheet1',
+    rt_path='tests/rt.xlsx',
+    rt_sheet='Sheet1',
+    preprocessing='snv' # normalization preprocessing; other options: savgol or both
+)
+
+# Initialize and run low-level data fusion
+lldf = LLDF(lldf_settings)
+lldf.lldf()
+```
+
+Optionally, we can export the fused data into a new, single Excel datasheet:
+
+```python
+# (optional) export the LLDF data to an Excel file
+lldf.export_data('output_file.xlsx')
+```
+
+## Second step: PCA
+
+A run of Principal Component Analysis (`PCA`) will help us pick the right number
+of components for the subsequent `LDA` analysis step.
+
+As in the previous case, we will set it up with the help of the `PCASettings` class.
+
+```python
+from chemfusekit.pca import PCASettings, PCA, GraphMode
+
+# Initialize the settings for Principal Component Analysis
+pca_settings = PCASettings(
+    target_variance=0.99, # the minimum acceptable level of cumulative explained variance
+    confidence_level=0.05, # the desired level of confidence
+    initial_components=10, # the initial number of components for the iterative analysis
+    output=GraphMode.GRAPHIC # graphs will be printed
+)
+
+# Initialize and run the PCA class
+pca = PCA(pca_settings, lldf.fused_data)
+pca.pca()
+
+# Print the number of components and the statistics
+print(pca.components)
+pca.pca_stats()
+```
+
+## Third step: LDA training
+
+In this step we will set up the `LDASettings` and then run the `LDA` analysis with one less
+component than what we figured out from the `PCA` analysis of the previous step.
+
+```python
+from chemfusekit.lda import LDASettings, LDA
+
+settings = LDASettings(
+    components=(pca.components - 1), # one less component than the number determined by PCA
+    output=GraphMode.GRAPHIC, # graphs will be printed
+    test_split=True # split testing is enabled
+)
+
+# Initialize and run the LDA class
+lda = LDA(settings, lldf.fused_data)
+lda.lda()
+```
+
+## Fourth step: prediction
+
+We will pick a random sample from the dataset and see whether the trained `LDA` model
+can identify it correctly.
+
+```python
+# Let's pick a random sample from the dataset and see if it gets recognized correctly:
+x_data_sample = lldf.fused_data.x_train.iloc[119] # should be DMMP
+x_data_sample = x_data_sample.iloc[1:].to_frame().transpose()
+
+# Let's run the prediction:
+predictions = lda.predict(x_data_sample)
+print(predictions)
+```
\ No newline at end of file
diff --git a/docs/versioned_docs/version-2.2.0/utils/_category_.json b/docs/versioned_docs/version-2.2.0/utils/_category_.json
new file mode 100644
index 0000000..322950e
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/utils/_category_.json
@@ -0,0 +1,8 @@
+{
+  "label": "Utilities module",
+  "position": 8,
+  "link": {
+    "type": "generated-index",
+    "description": "A module containing helper functions."
+  }
+}
diff --git a/docs/versioned_docs/version-2.2.0/utils/graphmode.md b/docs/versioned_docs/version-2.2.0/utils/graphmode.md
new file mode 100644
index 0000000..bdfaa0a
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/utils/graphmode.md
@@ -0,0 +1,11 @@
+---
+sidebar-position: 1
+---
+
+# GraphMode enum
+
+The `GraphMode` enum defines the three possible output modes that the other classes can use:
+
+- `GRAPHIC`: graphs, tables and stats will be rendered with `Plotly`, `MatPlotLib` or `Seaborn`. Best used with `Jupyter Notebook`;
+- `TEXT`: output will be rendered as plain text. The best option for offline batch processing;
+- `NONE`: output will be suppressed completely.
\ No newline at end of file
diff --git a/docs/versioned_docs/version-2.2.0/utils/graphoutput.md b/docs/versioned_docs/version-2.2.0/utils/graphoutput.md
new file mode 100644
index 0000000..362bd7f
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/utils/graphoutput.md
@@ -0,0 +1,20 @@
+---
+sidebar-position: 2
+---
+
+# `graph_output` function
+
+A (partially) reusable graphing function shared by different classes. Not meant for direct usage.
+
+## Syntax
+
+```python
+graph_output(scores, model, name: str, mode: GraphMode)
+```
+
+## Parameters
+
+- `scores`: the scores that are output by the model fitting function
+- `model`: a `scikit-learn` classification model
+- `name`: a `str` representing the name of the analysis technique
+- `mode`: a [`GraphMode`](./graphmode.md) enum that acts as an output selector
diff --git a/docs/versioned_docs/version-2.2.0/utils/printconfusionmatrix.md b/docs/versioned_docs/version-2.2.0/utils/printconfusionmatrix.md
new file mode 100644
index 0000000..826c2f2
--- /dev/null
+++ b/docs/versioned_docs/version-2.2.0/utils/printconfusionmatrix.md
@@ -0,0 +1,19 @@
+---
+sidebar-position: 5
+---
+
+# `print_confusion_matrix` function
+
+A multimodal confusion matrix and classification report printer utility. Not meant for direct usage.
+ +## Syntax + +```python +print_confusion_matrix(y1, y2, title: str, mode: GraphMode) +``` + +## Parameters + +- `y1` and `y2`: the true and predicted values +- `title`: a `str` representing the title for the confusion matrix and classification report +- `mode`: a [`GraphMode`](./graphmode.md) enum that acts as an output selector \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/utils/printtable.md b/docs/versioned_docs/version-2.2.0/utils/printtable.md new file mode 100644 index 0000000..a07206e --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/utils/printtable.md @@ -0,0 +1,20 @@ +--- +sidebar-position: 3 +--- + +# `print_table` function + +A multimodal table printing utility. It can output tables as `Plotly` plots or as plain text. Not meant for direct usage. + +## Syntax + +```python +print_table(header_values, cell_values, title: str, mode: GraphMode) +``` + +## Parameters + +- `header_values`: the column titles +- `cell_values`: a row array of column arrays +- `title`: a `str` containing the title for the table +- `mode`: a [`GraphMode`](./graphmode.md) enum that acts as an output selector \ No newline at end of file diff --git a/docs/versioned_docs/version-2.2.0/utils/runsplittests.md b/docs/versioned_docs/version-2.2.0/utils/runsplittests.md new file mode 100644 index 0000000..410eb65 --- /dev/null +++ b/docs/versioned_docs/version-2.2.0/utils/runsplittests.md @@ -0,0 +1,20 @@ +--- +sidebar-position: 4 +--- + +# `run_split_tests` function + +A reusable function for split testing a generic model. Not meant for direct usage. + +## Syntax + +```python +run_split_test(x, y, model, extended=False, mode: GraphMode) +``` + +## Parameters + +- `x` and `y`: the regressor and target arrays +- `model`: a `scikit-learn` classifier +- `extended`: a `bool` that selects whether a longer split analysis will be carried out +- `mode`: a [`GraphMode`](./graphmode.md) enum that acts as an output selector diff --git a/docs/versioned_sidebars/version-2.2.0-sidebars.json b/docs/versioned_sidebars/version-2.2.0-sidebars.json new file mode 100644 index 0000000..caea0c0 --- /dev/null +++ b/docs/versioned_sidebars/version-2.2.0-sidebars.json @@ -0,0 +1,8 @@ +{ + "tutorialSidebar": [ + { + "type": "autogenerated", + "dirName": "." + } + ] +} diff --git a/docs/versions.json b/docs/versions.json index b619bc6..02120e2 100644 --- a/docs/versions.json +++ b/docs/versions.json @@ -1,4 +1,5 @@ [ + "2.2.0", "2.1.0", "2.0.0", "1.2.0", diff --git a/notes/.obsidian/workspace.json b/notes/.obsidian/workspace.json index 0d22638..0aae246 100644 --- a/notes/.obsidian/workspace.json +++ b/notes/.obsidian/workspace.json @@ -25,7 +25,7 @@ "state": { "type": "markdown", "state": { - "file": "UML/Components.md", + "file": "To do.md", "mode": "source", "source": false } @@ -106,7 +106,7 @@ "state": { "type": "backlink", "state": { - "file": "UML/Components.md", + "file": "To do.md", "collapseAll": false, "extraContext": false, "sortOrder": "alphabetical", @@ -123,7 +123,7 @@ "state": { "type": "outgoing-link", "state": { - "file": "UML/Components.md", + "file": "To do.md", "linksCollapsed": false, "unlinkedCollapsed": true } @@ -146,7 +146,7 @@ "state": { "type": "outline", "state": { - "file": "UML/Components.md" + "file": "To do.md" } } } @@ -168,6 +168,7 @@ }, "active": "24e13959c0a297de", "lastOpenFiles": [ + "UML/Components.md", "SVM.md", "PEP coding standards.md", "Beartype.md",